aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390')
-rw-r--r--arch/s390/Kbuild4
-rw-r--r--arch/s390/Kconfig391
-rw-r--r--arch/s390/Kconfig.debug29
-rw-r--r--arch/s390/Makefile100
-rw-r--r--arch/s390/appldata/appldata_base.c11
-rw-r--r--arch/s390/appldata/appldata_mem.c4
-rw-r--r--arch/s390/appldata/appldata_net_sum.c4
-rw-r--r--arch/s390/appldata/appldata_os.c12
-rw-r--r--arch/s390/boot/.gitignore4
-rw-r--r--arch/s390/boot/Makefile86
-rw-r--r--arch/s390/boot/als.c6
-rw-r--r--arch/s390/boot/boot.h23
-rw-r--r--arch/s390/boot/clz_ctz.c2
-rw-r--r--arch/s390/boot/compressed/.gitignore2
-rw-r--r--arch/s390/boot/compressed/Makefile68
-rw-r--r--arch/s390/boot/decompressor.c (renamed from arch/s390/boot/compressed/decompressor.c)14
-rw-r--r--arch/s390/boot/decompressor.h (renamed from arch/s390/boot/compressed/decompressor.h)10
-rw-r--r--arch/s390/boot/head.S449
-rw-r--r--arch/s390/boot/head_kdump.S8
-rwxr-xr-x[-rw-r--r--]arch/s390/boot/install.sh23
-rw-r--r--arch/s390/boot/ipl_data.c84
-rw-r--r--arch/s390/boot/ipl_parm.c151
-rw-r--r--arch/s390/boot/ipl_report.c6
-rw-r--r--arch/s390/boot/kaslr.c175
-rw-r--r--arch/s390/boot/mem_detect.c96
-rw-r--r--arch/s390/boot/pgm_check_info.c218
-rw-r--r--arch/s390/boot/sclp_early_core.c9
-rw-r--r--arch/s390/boot/startup.c234
-rw-r--r--arch/s390/boot/string.c1
-rw-r--r--arch/s390/boot/uv.c66
-rw-r--r--arch/s390/boot/uv.h22
-rw-r--r--arch/s390/boot/version.c1
-rw-r--r--arch/s390/boot/vmlinux.lds.S (renamed from arch/s390/boot/compressed/vmlinux.lds.S)86
-rw-r--r--arch/s390/configs/btf.config1
-rw-r--r--arch/s390/configs/debug_defconfig239
-rw-r--r--arch/s390/configs/defconfig211
-rw-r--r--arch/s390/configs/kasan.config3
-rw-r--r--arch/s390/configs/zfcpdump_defconfig32
-rw-r--r--arch/s390/crypto/Kconfig135
-rw-r--r--arch/s390/crypto/Makefile4
-rw-r--r--arch/s390/crypto/aes_s390.c11
-rw-r--r--arch/s390/crypto/arch_random.c107
-rw-r--r--arch/s390/crypto/chacha-glue.c130
-rw-r--r--arch/s390/crypto/chacha-s390.S907
-rw-r--r--arch/s390/crypto/chacha-s390.h14
-rw-r--r--arch/s390/crypto/crc32-vx.c2
-rw-r--r--arch/s390/crypto/crc32be-vx.S4
-rw-r--r--arch/s390/crypto/des_s390.c4
-rw-r--r--arch/s390/crypto/ghash_s390.c2
-rw-r--r--arch/s390/crypto/paes_s390.c28
-rw-r--r--arch/s390/crypto/prng.c77
-rw-r--r--arch/s390/crypto/sha.h3
-rw-r--r--arch/s390/crypto/sha1_s390.c16
-rw-r--r--arch/s390/crypto/sha256_s390.c4
-rw-r--r--arch/s390/crypto/sha3_256_s390.c3
-rw-r--r--arch/s390/crypto/sha3_512_s390.c3
-rw-r--r--arch/s390/crypto/sha512_s390.c36
-rw-r--r--arch/s390/hypfs/hypfs_diag.c2
-rw-r--r--arch/s390/hypfs/hypfs_diag0c.c14
-rw-r--r--arch/s390/hypfs/hypfs_sprp.c13
-rw-r--r--arch/s390/hypfs/hypfs_vm.c13
-rw-r--r--arch/s390/hypfs/inode.c2
-rw-r--r--arch/s390/include/asm/Kbuild17
-rw-r--r--arch/s390/include/asm/abs_lowcore.h17
-rw-r--r--arch/s390/include/asm/airq.h7
-rw-r--r--arch/s390/include/asm/alternative-asm.h70
-rw-r--r--arch/s390/include/asm/alternative.h102
-rw-r--r--arch/s390/include/asm/ap.h229
-rw-r--r--arch/s390/include/asm/archrandom.h37
-rw-r--r--arch/s390/include/asm/asm-const.h12
-rw-r--r--arch/s390/include/asm/asm-extable.h88
-rw-r--r--arch/s390/include/asm/atomic.h110
-rw-r--r--arch/s390/include/asm/atomic_ops.h76
-rw-r--r--arch/s390/include/asm/barrier.h24
-rw-r--r--arch/s390/include/asm/bitops.h181
-rw-r--r--arch/s390/include/asm/bug.h7
-rw-r--r--arch/s390/include/asm/cache.h2
-rw-r--r--arch/s390/include/asm/ccwdev.h26
-rw-r--r--arch/s390/include/asm/ccwgroup.h19
-rw-r--r--arch/s390/include/asm/checksum.h141
-rw-r--r--arch/s390/include/asm/chsc.h69
-rw-r--r--arch/s390/include/asm/cio.h9
-rw-r--r--arch/s390/include/asm/clocksource.h7
-rw-r--r--arch/s390/include/asm/clp.h3
-rw-r--r--arch/s390/include/asm/cmpxchg.h212
-rw-r--r--arch/s390/include/asm/compat.h150
-rw-r--r--arch/s390/include/asm/cpacf.h208
-rw-r--r--arch/s390/include/asm/cpu.h3
-rw-r--r--arch/s390/include/asm/cpu_mcf.h46
-rw-r--r--arch/s390/include/asm/cpu_mf.h17
-rw-r--r--arch/s390/include/asm/cpufeature.h23
-rw-r--r--arch/s390/include/asm/cputime.h2
-rw-r--r--arch/s390/include/asm/crw.h1
-rw-r--r--arch/s390/include/asm/css_chars.h4
-rw-r--r--arch/s390/include/asm/ctl_reg.h40
-rw-r--r--arch/s390/include/asm/debug.h137
-rw-r--r--arch/s390/include/asm/delay.h11
-rw-r--r--arch/s390/include/asm/diag.h26
-rw-r--r--arch/s390/include/asm/dma.h6
-rw-r--r--arch/s390/include/asm/eadm.h4
-rw-r--r--arch/s390/include/asm/elf.h98
-rw-r--r--arch/s390/include/asm/entry-common.h68
-rw-r--r--arch/s390/include/asm/extable.h52
-rw-r--r--arch/s390/include/asm/facility.h31
-rw-r--r--arch/s390/include/asm/fcx.h4
-rw-r--r--arch/s390/include/asm/fpu/api.h3
-rw-r--r--arch/s390/include/asm/ftrace.h110
-rw-r--r--arch/s390/include/asm/ftrace.lds.h21
-rw-r--r--arch/s390/include/asm/futex.h12
-rw-r--r--arch/s390/include/asm/gmap.h45
-rw-r--r--arch/s390/include/asm/hardirq.h1
-rw-r--r--arch/s390/include/asm/hugetlb.h46
-rw-r--r--arch/s390/include/asm/hw_irq.h1
-rw-r--r--arch/s390/include/asm/idals.h2
-rw-r--r--arch/s390/include/asm/idle.h16
-rw-r--r--arch/s390/include/asm/io.h15
-rw-r--r--arch/s390/include/asm/ipl.h26
-rw-r--r--arch/s390/include/asm/irq.h9
-rw-r--r--arch/s390/include/asm/irq_work.h12
-rw-r--r--arch/s390/include/asm/irqflags.h16
-rw-r--r--arch/s390/include/asm/jump_label.h7
-rw-r--r--arch/s390/include/asm/kasan.h35
-rw-r--r--arch/s390/include/asm/kdebug.h2
-rw-r--r--arch/s390/include/asm/kexec.h32
-rw-r--r--arch/s390/include/asm/kfence.h42
-rw-r--r--arch/s390/include/asm/kprobes.h3
-rw-r--r--arch/s390/include/asm/kvm_host.h216
-rw-r--r--arch/s390/include/asm/kvm_para.h229
-rw-r--r--arch/s390/include/asm/linkage.h31
-rw-r--r--arch/s390/include/asm/livepatch.h21
-rw-r--r--arch/s390/include/asm/lowcore.h94
-rw-r--r--arch/s390/include/asm/maccess.h17
-rw-r--r--arch/s390/include/asm/mem_encrypt.h2
-rw-r--r--arch/s390/include/asm/mmu.h19
-rw-r--r--arch/s390/include/asm/mmu_context.h102
-rw-r--r--arch/s390/include/asm/module.h14
-rw-r--r--arch/s390/include/asm/nmi.h18
-rw-r--r--arch/s390/include/asm/nospec-branch.h5
-rw-r--r--arch/s390/include/asm/nospec-insn.h155
-rw-r--r--arch/s390/include/asm/numa.h13
-rw-r--r--arch/s390/include/asm/os_info.h3
-rw-r--r--arch/s390/include/asm/page.h82
-rw-r--r--arch/s390/include/asm/pai.h78
-rw-r--r--arch/s390/include/asm/pci.h94
-rw-r--r--arch/s390/include/asm/pci_clp.h41
-rw-r--r--arch/s390/include/asm/pci_debug.h7
-rw-r--r--arch/s390/include/asm/pci_dma.h42
-rw-r--r--arch/s390/include/asm/pci_insn.h29
-rw-r--r--arch/s390/include/asm/pci_io.h17
-rw-r--r--arch/s390/include/asm/percpu.h51
-rw-r--r--arch/s390/include/asm/pgalloc.h52
-rw-r--r--arch/s390/include/asm/pgtable.h564
-rw-r--r--arch/s390/include/asm/preempt.h35
-rw-r--r--arch/s390/include/asm/processor.h137
-rw-r--r--arch/s390/include/asm/ptdump.h14
-rw-r--r--arch/s390/include/asm/ptrace.h99
-rw-r--r--arch/s390/include/asm/qdio.h145
-rw-r--r--arch/s390/include/asm/sclp.h35
-rw-r--r--arch/s390/include/asm/scsw.h89
-rw-r--r--arch/s390/include/asm/seccomp.h9
-rw-r--r--arch/s390/include/asm/sections.h20
-rw-r--r--arch/s390/include/asm/set_memory.h10
-rw-r--r--arch/s390/include/asm/setup.h70
-rw-r--r--arch/s390/include/asm/sigp.h14
-rw-r--r--arch/s390/include/asm/smp.h15
-rw-r--r--arch/s390/include/asm/softirq_stack.h14
-rw-r--r--arch/s390/include/asm/spinlock.h13
-rw-r--r--arch/s390/include/asm/spinlock_types.h4
-rw-r--r--arch/s390/include/asm/stacktrace.h176
-rw-r--r--arch/s390/include/asm/stp.h100
-rw-r--r--arch/s390/include/asm/string.h63
-rw-r--r--arch/s390/include/asm/syscall.h94
-rw-r--r--arch/s390/include/asm/syscall_wrapper.h132
-rw-r--r--arch/s390/include/asm/sysinfo.h6
-rw-r--r--arch/s390/include/asm/termios.h26
-rw-r--r--arch/s390/include/asm/text-patching.h16
-rw-r--r--arch/s390/include/asm/thread_info.h11
-rw-r--r--arch/s390/include/asm/timex.h115
-rw-r--r--arch/s390/include/asm/tlb.h9
-rw-r--r--arch/s390/include/asm/tlbflush.h8
-rw-r--r--arch/s390/include/asm/topology.h13
-rw-r--r--arch/s390/include/asm/tpi.h37
-rw-r--r--arch/s390/include/asm/types.h19
-rw-r--r--arch/s390/include/asm/uaccess.h480
-rw-r--r--arch/s390/include/asm/unistd.h1
-rw-r--r--arch/s390/include/asm/unwind.h21
-rw-r--r--arch/s390/include/asm/user.h4
-rw-r--r--arch/s390/include/asm/uv.h346
-rw-r--r--arch/s390/include/asm/vdso.h71
-rw-r--r--arch/s390/include/asm/vdso/clocksource.h8
-rw-r--r--arch/s390/include/asm/vdso/data.h13
-rw-r--r--arch/s390/include/asm/vdso/gettimeofday.h63
-rw-r--r--arch/s390/include/asm/vdso/processor.h7
-rw-r--r--arch/s390/include/asm/vdso/vsyscall.h26
-rw-r--r--arch/s390/include/asm/vtime.h15
-rw-r--r--arch/s390/include/asm/vtimer.h2
-rw-r--r--arch/s390/include/asm/vx-insn.h121
-rw-r--r--arch/s390/include/uapi/asm/dasd.h14
-rw-r--r--arch/s390/include/uapi/asm/debug.h35
-rw-r--r--arch/s390/include/uapi/asm/hwctrset.h51
-rw-r--r--arch/s390/include/uapi/asm/ipl.h25
-rw-r--r--arch/s390/include/uapi/asm/kvm.h8
-rw-r--r--arch/s390/include/uapi/asm/pkey.h79
-rw-r--r--arch/s390/include/uapi/asm/ptrace.h5
-rw-r--r--arch/s390/include/uapi/asm/schid.h3
-rw-r--r--arch/s390/include/uapi/asm/setup.h13
-rw-r--r--arch/s390/include/uapi/asm/sie.h2
-rw-r--r--arch/s390/include/uapi/asm/signal.h26
-rw-r--r--arch/s390/include/uapi/asm/termios.h50
-rw-r--r--arch/s390/include/uapi/asm/uvdevice.h51
-rw-r--r--arch/s390/include/uapi/asm/zcrypt.h158
-rw-r--r--arch/s390/kernel/.gitignore1
-rw-r--r--arch/s390/kernel/Makefile31
-rw-r--r--arch/s390/kernel/abs_lowcore.c95
-rw-r--r--arch/s390/kernel/alternative.c84
-rw-r--r--arch/s390/kernel/asm-offsets.c119
-rw-r--r--arch/s390/kernel/audit.c12
-rw-r--r--arch/s390/kernel/base.S63
-rw-r--r--arch/s390/kernel/cache.c7
-rw-r--r--arch/s390/kernel/compat_audit.c13
-rw-r--r--arch/s390/kernel/compat_linux.h89
-rw-r--r--arch/s390/kernel/compat_signal.c14
-rw-r--r--arch/s390/kernel/cpcmd.c40
-rw-r--r--arch/s390/kernel/cpufeature.c46
-rw-r--r--arch/s390/kernel/crash_dump.c171
-rw-r--r--arch/s390/kernel/debug.c328
-rw-r--r--arch/s390/kernel/diag.c56
-rw-r--r--arch/s390/kernel/dis.c27
-rw-r--r--arch/s390/kernel/dumpstack.c31
-rw-r--r--arch/s390/kernel/early.c73
-rw-r--r--arch/s390/kernel/early_printk.c2
-rw-r--r--arch/s390/kernel/earlypgm.S23
-rw-r--r--arch/s390/kernel/entry.S1379
-rw-r--r--arch/s390/kernel/entry.h62
-rw-r--r--arch/s390/kernel/fpu.c88
-rw-r--r--arch/s390/kernel/ftrace.c299
-rw-r--r--arch/s390/kernel/ftrace.h24
-rw-r--r--arch/s390/kernel/head64.S9
-rw-r--r--arch/s390/kernel/idle.c64
-rw-r--r--arch/s390/kernel/ipl.c427
-rw-r--r--arch/s390/kernel/ipl_vmparm.c2
-rw-r--r--arch/s390/kernel/irq.c157
-rw-r--r--arch/s390/kernel/jump_label.c42
-rw-r--r--arch/s390/kernel/kprobes.c258
-rw-r--r--arch/s390/kernel/kprobes_insn_page.S22
-rw-r--r--arch/s390/kernel/lgr.c5
-rw-r--r--arch/s390/kernel/machine_kexec.c64
-rw-r--r--arch/s390/kernel/machine_kexec_file.c105
-rw-r--r--arch/s390/kernel/machine_kexec_reloc.c1
-rw-r--r--arch/s390/kernel/mcount.S131
-rw-r--r--arch/s390/kernel/module.c225
-rw-r--r--arch/s390/kernel/nmi.c254
-rw-r--r--arch/s390/kernel/nospec-branch.c48
-rw-r--r--arch/s390/kernel/nospec-sysfs.c4
-rw-r--r--arch/s390/kernel/numa.c35
-rw-r--r--arch/s390/kernel/os_info.c24
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c1198
-rw-r--r--arch/s390/kernel/perf_cpum_cf_common.c74
-rw-r--r--arch/s390/kernel/perf_cpum_cf_diag.c705
-rw-r--r--arch/s390/kernel/perf_cpum_cf_events.c277
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c63
-rw-r--r--arch/s390/kernel/perf_event.c23
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c699
-rw-r--r--arch/s390/kernel/perf_pai_ext.c672
-rw-r--r--arch/s390/kernel/perf_regs.c3
-rw-r--r--arch/s390/kernel/pgm_check.S147
-rw-r--r--arch/s390/kernel/process.c112
-rw-r--r--arch/s390/kernel/processor.c213
-rw-r--r--arch/s390/kernel/ptrace.c449
-rw-r--r--arch/s390/kernel/relocate_kernel.S6
-rw-r--r--arch/s390/kernel/runtime_instr.c2
-rw-r--r--arch/s390/kernel/setup.c598
-rw-r--r--arch/s390/kernel/signal.c55
-rw-r--r--arch/s390/kernel/smp.c444
-rw-r--r--arch/s390/kernel/stacktrace.c13
-rw-r--r--arch/s390/kernel/sthyi.c13
-rw-r--r--arch/s390/kernel/suspend.c240
-rw-r--r--arch/s390/kernel/swsusp.S276
-rw-r--r--arch/s390/kernel/syscall.c (renamed from arch/s390/kernel/sys_s390.c)68
-rw-r--r--arch/s390/kernel/syscalls/Makefile3
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl43
-rw-r--r--arch/s390/kernel/sysinfo.c20
-rw-r--r--arch/s390/kernel/text_amode31.S (renamed from arch/s390/boot/text_dma.S)97
-rw-r--r--arch/s390/kernel/time.c492
-rw-r--r--arch/s390/kernel/topology.c89
-rw-r--r--arch/s390/kernel/trace.c2
-rw-r--r--arch/s390/kernel/traps.c158
-rw-r--r--arch/s390/kernel/unwind_bc.c12
-rw-r--r--arch/s390/kernel/uprobes.c23
-rw-r--r--arch/s390/kernel/uv.c675
-rw-r--r--arch/s390/kernel/vdso.c422
-rw-r--r--arch/s390/kernel/vdso32/.gitignore2
-rw-r--r--arch/s390/kernel/vdso32/Makefile77
-rwxr-xr-xarch/s390/kernel/vdso32/gen_vdso_offsets.sh15
-rw-r--r--arch/s390/kernel/vdso32/note.S13
-rw-r--r--arch/s390/kernel/vdso32/vdso32.lds.S142
-rw-r--r--arch/s390/kernel/vdso32/vdso32_wrapper.S15
-rw-r--r--arch/s390/kernel/vdso32/vdso_user_wrapper.S21
-rw-r--r--arch/s390/kernel/vdso64/.gitignore1
-rw-r--r--arch/s390/kernel/vdso64/Makefile42
-rw-r--r--arch/s390/kernel/vdso64/clock_getres.S50
-rw-r--r--arch/s390/kernel/vdso64/clock_gettime.S163
-rwxr-xr-xarch/s390/kernel/vdso64/gen_vdso_offsets.sh15
-rw-r--r--arch/s390/kernel/vdso64/getcpu.S31
-rw-r--r--arch/s390/kernel/vdso64/getcpu.c21
-rw-r--r--arch/s390/kernel/vdso64/gettimeofday.S71
-rw-r--r--arch/s390/kernel/vdso64/vdso.h14
-rw-r--r--arch/s390/kernel/vdso64/vdso64.lds.S14
-rw-r--r--arch/s390/kernel/vdso64/vdso64_generic.c19
-rw-r--r--arch/s390/kernel/vdso64/vdso_user_wrapper.S56
-rw-r--r--arch/s390/kernel/vmlinux.lds.S42
-rw-r--r--arch/s390/kernel/vtime.c62
-rw-r--r--arch/s390/kvm/Kconfig12
-rw-r--r--arch/s390/kvm/Makefile8
-rw-r--r--arch/s390/kvm/diag.c54
-rw-r--r--arch/s390/kvm/gaccess.c589
-rw-r--r--arch/s390/kvm/gaccess.h150
-rw-r--r--arch/s390/kvm/guestdbg.c8
-rw-r--r--arch/s390/kvm/intercept.c161
-rw-r--r--arch/s390/kvm/interrupt.c588
-rw-r--r--arch/s390/kvm/kvm-s390.c1851
-rw-r--r--arch/s390/kvm/kvm-s390.h121
-rw-r--r--arch/s390/kvm/pci.c702
-rw-r--r--arch/s390/kvm/pci.h87
-rw-r--r--arch/s390/kvm/priv.c191
-rw-r--r--arch/s390/kvm/pv.c545
-rw-r--r--arch/s390/kvm/sigp.c46
-rw-r--r--arch/s390/kvm/vsie.c138
-rw-r--r--arch/s390/lib/Makefile9
-rw-r--r--arch/s390/lib/delay.c114
-rw-r--r--arch/s390/lib/error-inject.c14
-rw-r--r--arch/s390/lib/expoline/Makefile3
-rw-r--r--arch/s390/lib/expoline/expoline.S12
-rw-r--r--arch/s390/lib/spinlock.c6
-rw-r--r--arch/s390/lib/string.c182
-rw-r--r--arch/s390/lib/test_kprobes.c75
-rw-r--r--arch/s390/lib/test_kprobes.h10
-rw-r--r--arch/s390/lib/test_kprobes_asm.S45
-rw-r--r--arch/s390/lib/test_modules.c32
-rw-r--r--arch/s390/lib/test_modules.h53
-rw-r--r--arch/s390/lib/test_modules_helpers.c13
-rw-r--r--arch/s390/lib/test_unwind.c419
-rw-r--r--arch/s390/lib/uaccess.c426
-rw-r--r--arch/s390/lib/xor.c26
-rw-r--r--arch/s390/mm/Makefile4
-rw-r--r--arch/s390/mm/cmm.c74
-rw-r--r--arch/s390/mm/dump_pagetables.c405
-rw-r--r--arch/s390/mm/extable.c81
-rw-r--r--arch/s390/mm/extmem.c34
-rw-r--r--arch/s390/mm/fault.c329
-rw-r--r--arch/s390/mm/gmap.c424
-rw-r--r--arch/s390/mm/hugetlbpage.c93
-rw-r--r--arch/s390/mm/init.c53
-rw-r--r--arch/s390/mm/kasan_init.c157
-rw-r--r--arch/s390/mm/maccess.c234
-rw-r--r--arch/s390/mm/mmap.c67
-rw-r--r--arch/s390/mm/page-states.c70
-rw-r--r--arch/s390/mm/pageattr.c85
-rw-r--r--arch/s390/mm/pgalloc.c383
-rw-r--r--arch/s390/mm/pgtable.c183
-rw-r--r--arch/s390/mm/vmem.c835
-rw-r--r--arch/s390/net/bpf_jit_comp.c469
-rw-r--r--arch/s390/numa/Makefile4
-rw-r--r--arch/s390/numa/mode_emu.c577
-rw-r--r--arch/s390/numa/numa.c171
-rw-r--r--arch/s390/numa/numa_mode.h25
-rw-r--r--arch/s390/numa/toptree.c351
-rw-r--r--arch/s390/numa/toptree.h61
-rw-r--r--arch/s390/oprofile/Makefile10
-rw-r--r--arch/s390/oprofile/init.c37
-rw-r--r--arch/s390/pci/Makefile4
-rw-r--r--arch/s390/pci/pci.c675
-rw-r--r--arch/s390/pci/pci_bus.c386
-rw-r--r--arch/s390/pci/pci_bus.h43
-rw-r--r--arch/s390/pci/pci_clp.c301
-rw-r--r--arch/s390/pci/pci_debug.c2
-rw-r--r--arch/s390/pci/pci_dma.c97
-rw-r--r--arch/s390/pci/pci_event.c345
-rw-r--r--arch/s390/pci/pci_insn.c177
-rw-r--r--arch/s390/pci/pci_iov.c99
-rw-r--r--arch/s390/pci/pci_iov.h30
-rw-r--r--arch/s390/pci/pci_irq.c142
-rw-r--r--arch/s390/pci/pci_kvm_hook.c11
-rw-r--r--arch/s390/pci/pci_mmio.c292
-rw-r--r--arch/s390/pci/pci_sysfs.c75
-rw-r--r--arch/s390/purgatory/.gitignore1
-rw-r--r--arch/s390/purgatory/Makefile7
-rw-r--r--arch/s390/purgatory/head.S39
-rw-r--r--arch/s390/purgatory/purgatory.c2
-rw-r--r--arch/s390/scripts/Makefile.chkbss20
-rw-r--r--arch/s390/tools/.gitignore1
-rwxr-xr-xarch/s390/tools/gcc-thunk-extern.sh24
-rw-r--r--arch/s390/tools/gen_facilities.c13
-rw-r--r--arch/s390/tools/opcodes.txt21
394 files changed, 25997 insertions, 15555 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index e63940bb57cd..76e362277179 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -7,5 +7,7 @@ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
obj-$(CONFIG_APPLDATA_BASE) += appldata/
obj-y += net/
obj-$(CONFIG_PCI) += pci/
-obj-$(CONFIG_NUMA) += numa/
obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8abe77536d9d..de575af02ffe 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -2,9 +2,6 @@
config MMU
def_bool y
-config ZONE_DMA
- def_bool y
-
config CPU_BIG_ENDIAN
def_bool y
@@ -30,14 +27,11 @@ config GENERIC_BUG_RELATIVE_POINTERS
def_bool y
config GENERIC_LOCKBREAK
- def_bool y if PREEMPTTION
+ def_bool y if PREEMPTION
config PGSTE
def_bool y if KVM
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
- def_bool y
-
config AUDIT_ARCH
def_bool y
@@ -53,25 +47,40 @@ config ARCH_SUPPORTS_UPROBES
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
- default 0x18000000000000 if KASAN_S390_4_LEVEL_PAGING
- default 0x30000000000
+ default 0x1C000000000000
config S390
def_bool y
+ #
+ # Note: keep this list sorted alphabetically
+ #
+ imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+ select ALTERNATE_USER_ADDRESS_SPACE
+ select ARCH_32BIT_USTAT_F_TINODE
select ARCH_BINFMT_ELF_STATE
+ select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+ select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+ select ARCH_ENABLE_MEMORY_HOTREMOVE
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+ select ARCH_HAS_CURRENT_STACK_POINTER
+ select ARCH_HAS_DEBUG_VM_PGTABLE
+ select ARCH_HAS_DEBUG_WX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_SCALED_CPUTIME
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_VDSO_DATA
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK
select ARCH_INLINE_READ_LOCK_BH
@@ -101,53 +110,66 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
- select ARCH_KEEP_MEMBLOCK
- select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_NO_INSTR
+ select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
+ select DMA_OPS if PCI
select DYNAMIC_FTRACE if FUNCTION_TRACER
- select GENERIC_CLOCKEVENTS
+ select GCC12_NO_ARRAY_BOUNDS
+ select GENERIC_ALLOCATOR
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
- select GENERIC_FIND_FIRST_BIT
+ select GENERIC_ENTRY
+ select GENERIC_GETTIMEOFDAY
+ select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
+ select GENERIC_VDSO_TIME_NS
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN
select HAVE_ARCH_KASAN_VMALLOC
- select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
+ select HAVE_ARCH_KCSAN
+ select HAVE_ARCH_KFENCE
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_VMAP_STACK
select HAVE_ASM_MODVERSIONS
- select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
- select HAVE_COPY_THREAD_TLS
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_FAST_GUP
+ select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_FAST_GUP
select HAVE_FENTRY
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
+ select HAVE_GENERIC_VDSO
+ select HAVE_IOREMAP_PROT if PCI
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZ4
@@ -155,29 +177,34 @@ config S390
select HAVE_KERNEL_LZO
select HAVE_KERNEL_UNCOMPRESSED
select HAVE_KERNEL_XZ
+ select HAVE_KERNEL_ZSTD
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH
- select HAVE_PERF_REGS
- select HAVE_PERF_USER_STACK_DUMP
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MEMBLOCK_PHYS_MAP
- select MMU_GATHER_NO_GATHER
select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_NMI
select HAVE_NOP_MCOUNT
- select HAVE_OPROFILE
select HAVE_PCI
select HAVE_PERF_EVENTS
- select MMU_GATHER_RCU_TABLE_FREE
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
+ select HAVE_SAMPLE_FTRACE_DIRECT
+ select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
+ select HAVE_VIRT_CPU_ACCOUNTING_IDLE
select IOMMU_HELPER if PCI
select IOMMU_SUPPORT if PCI
+ select MMU_GATHER_NO_GATHER
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PCI
select NEED_SG_DMA_LENGTH if PCI
@@ -185,17 +212,16 @@ config S390
select OLD_SIGSUSPEND3
select PCI_DOMAINS if PCI
select PCI_MSI if PCI
+ select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
select SPARSE_IRQ
+ select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select TTY
select VIRT_CPU_ACCOUNTING
- select ARCH_HAS_SCALED_CPUTIME
- select HAVE_NMI
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
- select SWIOTLB
- select GENERIC_ALLOCATOR
-
+ select ZONE_DMA
+ # Note: keep the above list sorted alphabetically
config SCHED_OMIT_FRAME_POINTER
def_bool y
@@ -208,20 +234,8 @@ source "kernel/livepatch/Kconfig"
menu "Processor type and features"
-config HAVE_MARCH_Z900_FEATURES
- def_bool n
-
-config HAVE_MARCH_Z990_FEATURES
- def_bool n
- select HAVE_MARCH_Z900_FEATURES
-
-config HAVE_MARCH_Z9_109_FEATURES
- def_bool n
- select HAVE_MARCH_Z990_FEATURES
-
config HAVE_MARCH_Z10_FEATURES
def_bool n
- select HAVE_MARCH_Z9_109_FEATURES
config HAVE_MARCH_Z196_FEATURES
def_bool n
@@ -243,45 +257,21 @@ config HAVE_MARCH_Z15_FEATURES
def_bool n
select HAVE_MARCH_Z14_FEATURES
+config HAVE_MARCH_Z16_FEATURES
+ def_bool n
+ select HAVE_MARCH_Z15_FEATURES
+
choice
prompt "Processor type"
default MARCH_Z196
-config MARCH_Z900
- bool "IBM zSeries model z800 and z900"
- select HAVE_MARCH_Z900_FEATURES
- depends on $(cc-option,-march=z900)
- help
- Select this to enable optimizations for model z800/z900 (2064 and
- 2066 series). This will enable some optimizations that are not
- available on older ESA/390 (31 Bit) only CPUs.
-
-config MARCH_Z990
- bool "IBM zSeries model z890 and z990"
- select HAVE_MARCH_Z990_FEATURES
- depends on $(cc-option,-march=z990)
- help
- Select this to enable optimizations for model z890/z990 (2084 and
- 2086 series). The kernel will be slightly faster but will not work
- on older machines.
-
-config MARCH_Z9_109
- bool "IBM System z9"
- select HAVE_MARCH_Z9_109_FEATURES
- depends on $(cc-option,-march=z9-109)
- help
- Select this to enable optimizations for IBM System z9 (2094 and
- 2096 series). The kernel will be slightly faster but will not work
- on older machines.
-
config MARCH_Z10
bool "IBM System z10"
select HAVE_MARCH_Z10_FEATURES
depends on $(cc-option,-march=z10)
help
- Select this to enable optimizations for IBM System z10 (2097 and
- 2098 series). The kernel will be slightly faster but will not work
- on older machines.
+ Select this to enable optimizations for IBM System z10 (2097 and 2098
+ series). This is the oldest machine generation currently supported.
config MARCH_Z196
bool "IBM zEnterprise 114 and 196"
@@ -328,16 +318,15 @@ config MARCH_Z15
and 8561 series). The kernel will be slightly faster but will not
work on older machines.
-endchoice
-
-config MARCH_Z900_TUNE
- def_bool TUNE_Z900 || MARCH_Z900 && TUNE_DEFAULT
-
-config MARCH_Z990_TUNE
- def_bool TUNE_Z990 || MARCH_Z990 && TUNE_DEFAULT
+config MARCH_Z16
+ bool "IBM z16"
+ select HAVE_MARCH_Z16_FEATURES
+ depends on $(cc-option,-march=z16)
+ help
+ Select this to enable optimizations for IBM z16 (3931 and
+ 3932 series).
-config MARCH_Z9_109_TUNE
- def_bool TUNE_Z9_109 || MARCH_Z9_109 && TUNE_DEFAULT
+endchoice
config MARCH_Z10_TUNE
def_bool TUNE_Z10 || MARCH_Z10 && TUNE_DEFAULT
@@ -357,6 +346,9 @@ config MARCH_Z14_TUNE
config MARCH_Z15_TUNE
def_bool TUNE_Z15 || MARCH_Z15 && TUNE_DEFAULT
+config MARCH_Z16_TUNE
+ def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT
+
choice
prompt "Tune code generation"
default TUNE_DEFAULT
@@ -374,21 +366,8 @@ config TUNE_DEFAULT
Tune the generated code for the target processor for which the kernel
will be compiled.
-config TUNE_Z900
- bool "IBM zSeries model z800 and z900"
- depends on $(cc-option,-mtune=z900)
-
-config TUNE_Z990
- bool "IBM zSeries model z890 and z990"
- depends on $(cc-option,-mtune=z990)
-
-config TUNE_Z9_109
- bool "IBM System z9"
- depends on $(cc-option,-mtune=z9-109)
-
config TUNE_Z10
bool "IBM System z10"
- depends on $(cc-option,-mtune=z10)
config TUNE_Z196
bool "IBM zEnterprise 114 and 196"
@@ -410,28 +389,37 @@ config TUNE_Z15
bool "IBM z15"
depends on $(cc-option,-mtune=z15)
+config TUNE_Z16
+ bool "IBM z16"
+ depends on $(cc-option,-mtune=z16)
+
endchoice
config 64BIT
def_bool y
+config COMMAND_LINE_SIZE
+ int "Maximum size of kernel command line"
+ default 4096
+ range 896 1048576
+ help
+ This allows you to specify the maximum length of the kernel command
+ line.
+
config COMPAT
def_bool y
prompt "Kernel support for 31 bit emulation"
- select COMPAT_BINFMT_ELF if BINFMT_ELF
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
select HAVE_UID16
depends on MULTIUSER
+ depends on !CC_IS_CLANG
help
Select this option if you want to enable your system kernel to
handle system-calls from ELF binaries for 31 bit ESA. This option
(and some other stuff like libraries and such) is needed for
executing 31 bit applications. It is safe to say "Y".
-config SYSVIPC_COMPAT
- def_bool y if COMPAT && SYSVIPC
-
config SMP
def_bool y
@@ -450,14 +438,6 @@ config NR_CPUS
config HOTPLUG_CPU
def_bool y
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details. <- They meant memory holes!
-config NODES_SPAN_OTHER_NODES
- def_bool NUMA
-
config NUMA
bool "NUMA support"
depends on SCHED_TOPOLOGY
@@ -467,58 +447,10 @@ config NUMA
This option adds NUMA support to the kernel.
- An operation mode can be selected by appending
- numa=<method> to the kernel command line.
-
- The default behaviour is identical to appending numa=plain to
- the command line. This will create just one node with all
- available memory and all CPUs in it.
-
config NODES_SHIFT
- int "Maximum NUMA nodes (as a power of 2)"
- range 1 10
- depends on NUMA
- default "4"
- help
- Specify the maximum number of NUMA nodes available on the target
- system. Increases memory reserved to accommodate various tables.
-
-menu "Select NUMA modes"
+ int
depends on NUMA
-
-config NUMA_EMU
- bool "NUMA emulation"
- default y
- help
- Numa emulation mode will split the available system memory into
- equal chunks which then are distributed over the configured number
- of nodes in a round-robin manner.
-
- The number of fake nodes is limited by the number of available memory
- chunks (i.e. memory size / fake size) and the number of supported
- nodes in the kernel.
-
- The CPUs are assigned to the nodes in a way that partially respects
- the original machine topology (if supported by the machine).
- Fair distribution of the CPUs is not guaranteed.
-
-config EMU_SIZE
- hex "NUMA emulation memory chunk size"
- default 0x10000000
- range 0x400000 0x100000000
- depends on NUMA_EMU
- help
- Select the default size by which the memory is chopped and then
- assigned to emulated NUMA nodes.
-
- This can be overridden by specifying
-
- emu_size=<n>
-
- on the kernel command line where also suffixes K, M, G, and T are
- supported.
-
-endmenu
+ default "1"
config SCHED_SMT
def_bool n
@@ -553,7 +485,6 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
select KEXEC_CORE
- select BUILD_BIN2C
depends on CRYPTO
depends on CRYPTO_SHA256
depends on CRYPTO_SHA256_S390
@@ -577,21 +508,6 @@ config KEXEC_SIG
verification for the corresponding kernel image type being
loaded in order for this to work.
-config ARCH_RANDOM
- def_bool y
- prompt "s390 architectural random number generation API"
- help
- Enable the s390 architectural random number generation API
- to provide random data for all consumers within the Linux
- kernel.
-
- When enabled the arch_random_* functions declared in linux/random.h
- are implemented. The implementation is based on the s390 CPACF
- instruction subfunction TRNG which provides a real true random
- number generator.
-
- If unsure, say Y.
-
config KERNEL_NOBP
def_bool n
prompt "Enable modified branch prediction for the kernel by default"
@@ -611,6 +527,7 @@ config KERNEL_NOBP
config EXPOLINE
def_bool n
+ depends on $(cc-option,-mindirect-branch=thunk)
prompt "Avoid speculative indirect branches in the kernel"
help
Compile the kernel with the expoline compiler options to guard
@@ -621,6 +538,19 @@ config EXPOLINE
If unsure, say N.
+config EXPOLINE_EXTERN
+ def_bool n
+ depends on EXPOLINE
+ depends on CC_IS_GCC && GCC_VERSION >= 110200
+ depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
+ prompt "Generate expolines as extern functions."
+ help
+ This option is required for some tooling like kpatch. The kernel is
+ compiled with -mindirect-branch=thunk-extern and requires a newer
+ compiler.
+
+ If unsure, say N.
+
choice
prompt "Expoline default"
depends on EXPOLINE
@@ -638,9 +568,7 @@ config EXPOLINE_FULL
endchoice
config RELOCATABLE
- bool "Build a relocatable kernel"
- select MODULE_REL_CRCS if MODVERSIONS
- default y
+ def_bool y
help
This builds a kernel image that retains relocation information
so it can be loaded at an arbitrary address.
@@ -649,10 +577,11 @@ config RELOCATABLE
bootup process.
The relocations make the kernel image about 15% larger (compressed
10%), but are discarded at runtime.
+ Note: this option exists only for documentation purposes, please do
+ not remove it.
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
- depends on RELOCATABLE
default y
help
In support of Kernel Address Space Layout Randomization (KASLR),
@@ -672,19 +601,6 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SPARSEMEM_DEFAULT
def_bool y
-config ARCH_ENABLE_MEMORY_HOTPLUG
- def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
- def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- def_bool y
-
-config FORCE_MAX_ZONEORDER
- int
- default "9"
-
config MAX_PHYSMEM_BITS
int "Maximum size of supported physical memory in bits (42-53)"
range 42 53
@@ -695,20 +611,6 @@ config MAX_PHYSMEM_BITS
Increasing the number of bits also increases the kernel image size.
By default 46 bits (64TB) are supported.
-config PACK_STACK
- def_bool y
- prompt "Pack kernel stack"
- help
- This option enables the compiler option -mkernel-backchain if it
- is available. If the option is available the compiler supports
- the new stack layout which dramatically reduces the minimum stack
- frame size. With an old compiler a non-leaf function needs a
- minimum of 96 bytes on 31 bit and 160 bytes on 64 bit. With
- -mkernel-backchain the minimum size drops to 16 byte on 31 bit
- and 24 byte on 64 bit.
-
- Say Y if you are unsure.
-
config CHECK_STACK
def_bool y
depends on !VMAP_STACK
@@ -735,16 +637,6 @@ config STACK_GUARD
The minimum size for the stack guard should be 256 for 31 bit and
512 for 64 bit.
-config WARN_DYNAMIC_STACK
- def_bool n
- prompt "Emit compiler warnings for function with dynamic stack usage"
- help
- This option enables the compiler option -mwarn-dynamicstack. If the
- compiler supports this options generates warnings for functions
- that dynamically allocate stack space using alloca.
-
- Say N if you are unsure.
-
endmenu
menu "I/O subsystem"
@@ -752,7 +644,7 @@ menu "I/O subsystem"
config QDIO
def_tristate y
prompt "QDIO support"
- ---help---
+ help
This driver provides the Queued Direct I/O base support for
IBM System z.
@@ -766,7 +658,7 @@ if PCI
config PCI_NR_FUNCTIONS
int "Maximum number of PCI functions (1-4096)"
range 1 4096
- default "128"
+ default "512"
help
This allows you to specify the maximum number of PCI functions which
this kernel will support.
@@ -823,13 +715,14 @@ config VFIO_CCW
config VFIO_AP
def_tristate n
prompt "VFIO support for AP devices"
- depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
+ depends on S390_AP_IOMMU && VFIO_MDEV && KVM
+ depends on ZCRYPT
help
- This driver grants access to Adjunct Processor (AP) devices
- via the VFIO mediated device interface.
+ This driver grants access to Adjunct Processor (AP) devices
+ via the VFIO mediated device interface.
- To compile this driver as a module, choose M here: the module
- will be called vfio_ap.
+ To compile this driver as a module, choose M here: the module
+ will be called vfio_ap.
endmenu
@@ -849,32 +742,6 @@ config CRASH_DUMP
endmenu
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y.
-
-menu "Power Management"
-
-config ARCH_HIBERNATION_POSSIBLE
- def_bool y
-
-source "kernel/power/Kconfig"
-
-endmenu
-
config CCW
def_bool y
@@ -932,7 +799,7 @@ config CMM_IUCV
config APPLDATA_BASE
def_bool n
prompt "Linux - VM Monitor Stream, base infrastructure"
- depends on PROC_FS
+ depends on PROC_SYSCTL
help
This provides a kernel interface for creating and updating z/VM APPLDATA
monitor records. The monitor records are updated at certain time
@@ -1009,7 +876,6 @@ config S390_GUEST
select TTY
select VIRTUALIZATION
select VIRTIO
- select VIRTIO_CONSOLE
help
Enabling this option adds support for virtio based paravirtual device
drivers on s390.
@@ -1019,10 +885,15 @@ config S390_GUEST
endmenu
+config S390_MODULES_SANITY_TEST_HELPERS
+ def_bool n
+
menu "Selftests"
config S390_UNWIND_SELFTEST
def_tristate n
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
prompt "Test unwind functions"
help
This option enables s390 specific stack unwinder testing kernel
@@ -1031,4 +902,28 @@ config S390_UNWIND_SELFTEST
Say N if you are unsure.
+config S390_KPROBES_SANITY_TEST
+ def_tristate n
+ prompt "Enable s390 specific kprobes tests"
+ depends on KPROBES
+ depends on KUNIT
+ help
+ This option enables an s390 specific kprobes test module. This option
+ is not useful for distributions or general kernels, but only for kernel
+ developers working on architecture code.
+
+ Say N if you are unsure.
+
+config S390_MODULES_SANITY_TEST
+ def_tristate n
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ prompt "Enable s390 specific modules tests"
+ select S390_MODULES_SANITY_TEST_HELPERS
+ help
+ This option enables an s390 specific modules test. This option is
+ not useful for distributions or general kernels, but only for
+ kernel developers working on architecture code.
+
+ Say N if you are unsure.
endmenu
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 190527560b2c..c4300ea4abf8 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -1,19 +1,22 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
+config EARLY_PRINTK
def_bool y
-config S390_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
depends on DEBUG_KERNEL
- select DEBUG_FS
- ---help---
- Say Y here if you want to show the kernel pagetable layout in a
- debugfs file. This information is only useful for kernel developers
- who are working in architecture specific areas of the kernel.
- It is probably not a good idea to enable this feature in a production
- kernel.
- If in doubt, say "N"
+ help
+ This option enables sanity checks in s390 low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
-config EARLY_PRINTK
- def_bool y
+ If unsure, say N.
+
+config CIO_INJECT
+ bool "CIO Inject interfaces"
+ depends on DEBUG_KERNEL && DEBUG_FS
+ help
+ This option provides a debugging facility to inject certain artificial events
+ and instruction responses into the CIO layer of the Linux kernel. The newly
+ created debugfs user interfaces will be at /sys/kernel/debug/s390/cio/*
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 8dfa2cf1f05c..b3235ab0ace8 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -3,9 +3,7 @@
# s390/Makefile
#
# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
#
# Copyright (C) 1994 by Linus Torvalds
#
@@ -16,51 +14,50 @@ KBUILD_AFLAGS_MODULE += -fPIC
KBUILD_CFLAGS_MODULE += -fPIC
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
-ifeq ($(CONFIG_RELOCATABLE),y)
KBUILD_CFLAGS += -fPIE
LDFLAGS_vmlinux := -pie
-endif
aflags_dwarf := -Wa,-gdwarf-2
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
+ifndef CONFIG_AS_IS_LLVM
KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
-KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2
+endif
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack
KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
-KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain
KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector
KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
+
UTS_MACHINE := s390x
STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384)
CHECKFLAGS += -D__s390__ -D__s390x__
export LD_BFD
-mflags-$(CONFIG_MARCH_Z900) := -march=z900
-mflags-$(CONFIG_MARCH_Z990) := -march=z990
-mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109
mflags-$(CONFIG_MARCH_Z10) := -march=z10
mflags-$(CONFIG_MARCH_Z196) := -march=z196
mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12
mflags-$(CONFIG_MARCH_Z13) := -march=z13
mflags-$(CONFIG_MARCH_Z14) := -march=z14
mflags-$(CONFIG_MARCH_Z15) := -march=z15
+mflags-$(CONFIG_MARCH_Z16) := -march=z16
export CC_FLAGS_MARCH := $(mflags-y)
aflags-y += $(mflags-y)
cflags-y += $(mflags-y)
-cflags-$(CONFIG_MARCH_Z900_TUNE) += -mtune=z900
-cflags-$(CONFIG_MARCH_Z990_TUNE) += -mtune=z990
-cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109
cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10
cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196
cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12
cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13
cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14
cflags-$(CONFIG_MARCH_Z15_TUNE) += -mtune=z15
+cflags-$(CONFIG_MARCH_Z16_TUNE) += -mtune=z16
cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
@@ -69,44 +66,38 @@ cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
#
cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls
-ifeq ($(call cc-option-yn,-mpacked-stack -mbackchain -msoft-float),y)
-cflags-$(CONFIG_PACK_STACK) += -mpacked-stack -D__PACK_STACK
-aflags-$(CONFIG_PACK_STACK) += -D__PACK_STACK
-endif
-
KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y)
KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y)
-ifeq ($(call cc-option-yn,-mstack-size=8192 -mstack-guard=128),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE)
-ifneq ($(call cc-option-yn,-mstack-size=8192),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD)
-endif
-endif
-
-ifdef CONFIG_WARN_DYNAMIC_STACK
- ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y)
- KBUILD_CFLAGS += -mwarn-dynamicstack
- KBUILD_CFLAGS_DECOMPRESSOR += -mwarn-dynamicstack
+ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),)
+ CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE)
+ ifeq ($(call cc-option,-mstack-size=8192),)
+ CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD)
endif
+ export CC_FLAGS_CHECK_STACK
+ cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK)
endif
ifdef CONFIG_EXPOLINE
- ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y)
+ ifdef CONFIG_EXPOLINE_EXTERN
+ KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
+ CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
+ CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
+ else
CC_FLAGS_EXPOLINE := -mindirect-branch=thunk
CC_FLAGS_EXPOLINE += -mfunction-return=thunk
- CC_FLAGS_EXPOLINE += -mindirect-branch-table
- export CC_FLAGS_EXPOLINE
- cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
- aflags-y += -DCC_USING_EXPOLINE
endif
+ CC_FLAGS_EXPOLINE += -mindirect-branch-table
+ export CC_FLAGS_EXPOLINE
+ cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
+ aflags-y += -DCC_USING_EXPOLINE
endif
ifdef CONFIG_FUNCTION_TRACER
- ifeq ($(call cc-option-yn,-mfentry -mnop-mcount),n)
+ ifeq ($(call cc-option,-mfentry -mnop-mcount),)
# make use of hotpatch feature if the compiler supports it
cc_hotpatch := -mhotpatch=0,3
- ifeq ($(call cc-option-yn,$(cc_hotpatch)),y)
+ ifneq ($(call cc-option,$(cc_hotpatch)),)
CC_FLAGS_FTRACE := $(cc_hotpatch)
KBUILD_AFLAGS += -DCC_USING_HOTPATCH
KBUILD_CFLAGS += -DCC_USING_HOTPATCH
@@ -117,7 +108,7 @@ endif
# Test CFI features of binutils
cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1)
-KBUILD_CFLAGS += -mbackchain -msoft-float $(cflags-y)
+KBUILD_CFLAGS += -mpacked-stack -mbackchain -msoft-float $(cflags-y)
KBUILD_CFLAGS += -pipe -Wno-sign-compare
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi)
KBUILD_AFLAGS += $(aflags-y) $(cfi)
@@ -126,17 +117,9 @@ export KBUILD_CFLAGS_DECOMPRESSOR
OBJCOPYFLAGS := -O binary
-head-y := arch/s390/kernel/head64.o
-
-# See arch/s390/Kbuild for content of core part of the kernel
-core-y += arch/s390/
-
libs-y += arch/s390/lib/
drivers-y += drivers/s390/
-# must be linked after kernel
-drivers-$(CONFIG_OPROFILE) += arch/s390/oprofile/
-
boot := arch/s390/boot
syscalls := arch/s390/kernel/syscalls
tools := arch/s390/tools
@@ -147,7 +130,7 @@ all: bzImage
KBUILD_IMAGE := $(boot)/bzImage
install:
- $(Q)$(MAKE) $(build)=$(boot) $@
+ $(call cmd,install)
bzImage: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
@@ -158,16 +141,31 @@ zfcpdump:
vdso_install:
$(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@
-archclean:
- $(Q)$(MAKE) $(clean)=$(boot)
- $(Q)$(MAKE) $(clean)=$(tools)
-
archheaders:
$(Q)$(MAKE) $(build)=$(syscalls) uapi
archprepare:
$(Q)$(MAKE) $(build)=$(syscalls) kapi
$(Q)$(MAKE) $(build)=$(tools) kapi
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+ $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+ $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+ $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+
+ifdef CONFIG_EXPOLINE_EXTERN
+modules_prepare: expoline_prepare
+expoline_prepare:
+ $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
+endif
+endif
# Don't use tabs in echo arguments
define archhelp
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index aa738cad1338..d74a4c7d5df6 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev;
*/
static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
static int appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static int appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static struct ctl_table_header *appldata_sysctl_header;
static struct ctl_table appldata_table[] = {
@@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd)
*/
static int
appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int timer_active = appldata_timer_active;
int rc;
@@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int interval = appldata_interval;
int rc;
@@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_generic_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct appldata_ops *ops = NULL, *tmp_ops;
struct list_head *lh;
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index e68136c3c23a..21c3147bd92a 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -29,10 +29,6 @@
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be
* updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_mem_data {
u64 timestamp;
diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 8bc14b0d1def..59c282ca002f 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -25,10 +25,6 @@
* This is accessed as binary data by z/VM. If changes to it can't be avoided,
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_net_sum_data {
u64 timestamp;
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 54f375627532..a363d30ce739 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -32,10 +32,6 @@
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be
* updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_os_per_cpu {
u32 per_cpu_user; /* timer ticks spent in user mode */
@@ -75,7 +71,7 @@ struct appldata_os_data {
(waiting for I/O) */
/* per cpu data */
- struct appldata_os_per_cpu os_cpu[0];
+ struct appldata_os_per_cpu os_cpu[];
} __attribute__((packed));
static struct appldata_os_data *appldata_os_data;
@@ -133,8 +129,7 @@ static void appldata_get_os_data(void *data)
os_data->nr_cpus = j;
- new_size = sizeof(struct appldata_os_data) +
- (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu));
+ new_size = struct_size(os_data, os_cpu, os_data->nr_cpus);
if (ops.size != new_size) {
if (ops.active) {
rc = appldata_diag(APPLDATA_RECORD_OS_ID,
@@ -169,8 +164,7 @@ static int __init appldata_os_init(void)
{
int rc, max_size;
- max_size = sizeof(struct appldata_os_data) +
- (num_possible_cpus() * sizeof(struct appldata_os_per_cpu));
+ max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus());
if (max_size > APPLDATA_MAX_REC_SIZE) {
pr_err("Maximum OS record size %i exceeds the maximum "
"record size %i\n", max_size, APPLDATA_MAX_REC_SIZE);
diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore
index 16ff906e4610..f56591bc0897 100644
--- a/arch/s390/boot/.gitignore
+++ b/arch/s390/boot/.gitignore
@@ -1,3 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
image
bzImage
section_cmp.*
+vmlinux
+vmlinux.lds
+vmlinux.syms
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index 0ff9261c915e..d52c3e2e16bc 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -7,6 +7,7 @@ KCOV_INSTRUMENT := n
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
@@ -36,14 +37,20 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y += version.o pgm_check_info.o ctype.o text_dma.o
-obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o
-obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o
+obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
-targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
-subdir- := compressed
+obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
+obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o
+obj-all := $(obj-y) piggy.o syms.o
+
+targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
+targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
+targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
+targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
OBJECTS := $(addprefix $(obj)/,$(obj-y))
+OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
quiet_cmd_section_cmp = SECTCMP $*
define cmd_section_cmp
@@ -58,22 +65,67 @@ define cmd_section_cmp
touch $@
endef
-$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
+$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
$(call if_changed,objcopy)
-$(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE
+$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
$(call if_changed,section_cmp)
-$(obj)/compressed/vmlinux: $(obj)/startup.a FORCE
- $(Q)$(MAKE) $(build)=$(obj)/compressed $@
+LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup --build-id=sha1 -T
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
+ $(call if_changed,ld)
+
+LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
+$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
+ $(call if_changed,ld)
+
+quiet_cmd_dumpsyms = DUMPSYMS $<
+define cmd_dumpsyms
+ $(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@"
+endef
+
+$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE
+ $(call if_changed,dumpsyms)
+
+OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms
+$(obj)/syms.o: $(obj)/syms.bin FORCE
+ $(call if_changed,objcopy)
+
+OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
+$(obj)/info.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
-$(obj)/startup.a: $(OBJECTS) FORCE
- $(call if_changed,ar)
+OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
+$(obj)/info.o: $(obj)/info.bin FORCE
+ $(call if_changed,objcopy)
-install:
- sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
- System.map "$(INSTALL_PATH)"
+OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
+$(obj)/vmlinux.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
-chkbss := $(obj-y)
-chkbss-target := startup.a
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
+suffix-$(CONFIG_KERNEL_GZIP) := .gz
+suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
+suffix-$(CONFIG_KERNEL_LZ4) := .lz4
+suffix-$(CONFIG_KERNEL_LZMA) := .lzma
+suffix-$(CONFIG_KERNEL_LZO) := .lzo
+suffix-$(CONFIG_KERNEL_XZ) := .xz
+suffix-$(CONFIG_KERNEL_ZSTD) := .zst
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,bzip2_with_size)
+$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lzma_with_size)
+$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lzo_with_size)
+$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,xzkern_with_size)
+$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,zstd22_with_size)
+
+OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
+$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
+ $(call if_changed,objcopy)
diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c
index ff6801d401c4..47c48fbfb563 100644
--- a/arch/s390/boot/als.c
+++ b/arch/s390/boot/als.c
@@ -68,7 +68,7 @@ void print_missing_facilities(void)
first = 1;
for (i = 0; i < ARRAY_SIZE(als); i++) {
- val = ~S390_lowcore.stfle_fac_list[i] & als[i];
+ val = ~stfle_fac_list[i] & als[i];
for (j = 0; j < BITS_PER_LONG; j++) {
if (!(val & (1UL << (BITS_PER_LONG - 1 - j))))
continue;
@@ -106,9 +106,9 @@ void verify_facilities(void)
{
int i;
- __stfle(S390_lowcore.stfle_fac_list, ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+ __stfle(stfle_fac_list, ARRAY_SIZE(stfle_fac_list));
for (i = 0; i < ARRAY_SIZE(als); i++) {
- if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i])
+ if ((stfle_fac_list[i] & als[i]) != als[i])
facility_mismatch();
}
}
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 2ea603f70c3b..70418389414d 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -2,20 +2,37 @@
#ifndef BOOT_BOOT_H
#define BOOT_BOOT_H
+#include <linux/types.h>
+
+#define IPL_START 0x200
+
+#ifndef __ASSEMBLY__
+
void startup_kernel(void);
-void detect_memory(void);
+unsigned long detect_memory(void);
+bool is_ipl_block_dump(void);
void store_ipl_parmblock(void);
void setup_boot_command_line(void);
void parse_boot_command_line(void);
-void setup_memory_end(void);
void verify_facilities(void);
void print_missing_facilities(void);
+void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long get_random_base(unsigned long safe_addr);
+void __printf(1, 2) decompressor_printk(const char *fmt, ...);
-extern int kaslr_enabled;
+/* Symbols defined by linker scripts */
extern const char kernel_version[];
+extern unsigned long memory_limit;
+extern unsigned long vmalloc_size;
+extern int vmalloc_size_set;
+extern int kaslr_enabled;
+extern char __boot_data_start[], __boot_data_end[];
+extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
+extern char _decompressor_syms_start[], _decompressor_syms_end[];
+extern char _stack_start[], _stack_end[];
unsigned long read_ipl_report(unsigned long safe_offset);
+#endif /* __ASSEMBLY__ */
#endif /* BOOT_BOOT_H */
diff --git a/arch/s390/boot/clz_ctz.c b/arch/s390/boot/clz_ctz.c
new file mode 100644
index 000000000000..c3ebf248596b
--- /dev/null
+++ b/arch/s390/boot/clz_ctz.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/clz_ctz.c"
diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore
deleted file mode 100644
index e72fcd7ecebb..000000000000
--- a/arch/s390/boot/compressed/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-vmlinux
-vmlinux.lds
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
deleted file mode 100644
index fa529c5b4486..000000000000
--- a/arch/s390/boot/compressed/Makefile
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/s390/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-KCOV_INSTRUMENT := n
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
-KASAN_SANITIZE := n
-
-obj-y := $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) piggy.o info.o
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
-targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
-targets += info.bin $(obj-y)
-
-KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
-KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
-OBJCOPYFLAGS :=
-
-OBJECTS := $(addprefix $(obj)/,$(obj-y))
-
-LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
-$(obj)/info.bin: vmlinux FORCE
- $(call if_changed,objcopy)
-
-OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
-$(obj)/info.o: $(obj)/info.bin FORCE
- $(call if_changed,objcopy)
-
-OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
-$(obj)/vmlinux.bin: vmlinux FORCE
- $(call if_changed,objcopy)
-
-vmlinux.bin.all-y := $(obj)/vmlinux.bin
-
-suffix-$(CONFIG_KERNEL_GZIP) := .gz
-suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
-suffix-$(CONFIG_KERNEL_LZ4) := .lz4
-suffix-$(CONFIG_KERNEL_LZMA) := .lzma
-suffix-$(CONFIG_KERNEL_LZO) := .lzo
-suffix-$(CONFIG_KERNEL_XZ) := .xz
-
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,xzkern)
-
-OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
-$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
- $(call if_changed,objcopy)
-
-chkbss := $(filter-out piggy.o info.o, $(obj-y))
-chkbss-target := vmlinux.bin
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/decompressor.c
index 368fd372c875..e27c2140d620 100644
--- a/arch/s390/boot/compressed/decompressor.c
+++ b/arch/s390/boot/decompressor.c
@@ -16,7 +16,6 @@
* gzip declarations
*/
#define STATIC static
-#define STATIC_RW_DATA static __section(.data)
#undef memset
#undef memcpy
@@ -24,13 +23,10 @@
#define memmove memmove
#define memzero(s, n) memset((s), 0, (n))
-/* Symbols defined by linker scripts */
-extern char _end[];
-extern unsigned char _compressed_start[];
-extern unsigned char _compressed_end[];
-
-#ifdef CONFIG_HAVE_KERNEL_BZIP2
+#ifdef CONFIG_KERNEL_BZIP2
#define BOOT_HEAP_SIZE 0x400000
+#elif CONFIG_KERNEL_ZSTD
+#define BOOT_HEAP_SIZE 0x30000
#else
#define BOOT_HEAP_SIZE 0x10000
#endif
@@ -62,6 +58,10 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE;
#include "../../../../lib/decompress_unxz.c"
#endif
+#ifdef CONFIG_KERNEL_ZSTD
+#include "../../../../lib/decompress_unzstd.c"
+#endif
+
#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE)
unsigned long mem_safe_offset(void)
diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/decompressor.h
index c15eb7114d83..f75cc31a77dd 100644
--- a/arch/s390/boot/compressed/decompressor.h
+++ b/arch/s390/boot/decompressor.h
@@ -2,8 +2,10 @@
#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
#define BOOT_COMPRESSED_DECOMPRESSOR_H
+#include <linux/stddef.h>
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
-static inline void *decompress_kernel(void) {}
+static inline void *decompress_kernel(void) { return NULL; }
#else
void *decompress_kernel(void);
#endif
@@ -22,9 +24,15 @@ struct vmlinux_info {
unsigned long dynsym_start;
unsigned long rela_dyn_start;
unsigned long rela_dyn_end;
+ unsigned long amode31_size;
};
+/* Symbols defined by linker scripts */
+extern char _end[];
+extern unsigned char _compressed_start[];
+extern unsigned char _compressed_end[];
extern char _vmlinux_info[];
+
#define vmlinux (*(struct vmlinux_info *)_vmlinux_info)
#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S
index dae10961d072..3f79b9efb803 100644
--- a/arch/s390/boot/head.S
+++ b/arch/s390/boot/head.S
@@ -5,7 +5,6 @@
* Author(s): Hartmut Penner <hp@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Rob van der Heij <rvdhei@iae.nl>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*
* There are 5 different IPL methods
* 1) load the image directly into ram at address 0 and do an PSW restart
@@ -25,242 +24,184 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/sclp.h>
+#include "boot.h"
-#define ARCH_OFFSET 4
+#define EP_OFFSET 0x10008
+#define EP_STRING "S390EP"
+#define IPL_BS 0x730
__HEAD
-
-#define IPL_BS 0x730
- .org 0
- .long 0x00080000,0x80000000+iplstart # The first 24 bytes are loaded
- .long 0x02000018,0x60000050 # by ipl to addresses 0-23.
- .long 0x02000068,0x60000050 # (a PSW and two CCWs).
- .fill 80-24,1,0x40 # bytes 24-79 are discarded !!
- .long 0x020000f0,0x60000050 # The next 160 byte are loaded
- .long 0x02000140,0x60000050 # to addresses 0x18-0xb7
- .long 0x02000190,0x60000050 # They form the continuation
- .long 0x020001e0,0x60000050 # of the CCW program started
- .long 0x02000230,0x60000050 # by ipl and load the range
- .long 0x02000280,0x60000050 # 0x0f0-0x730 from the image
- .long 0x020002d0,0x60000050 # to the range 0x0f0-0x730
- .long 0x02000320,0x60000050 # in memory. At the end of
- .long 0x02000370,0x60000050 # the channel program the PSW
- .long 0x020003c0,0x60000050 # at location 0 is loaded.
- .long 0x02000410,0x60000050 # Initial processing starts
- .long 0x02000460,0x60000050 # at 0x200 = iplstart.
- .long 0x020004b0,0x60000050
- .long 0x02000500,0x60000050
- .long 0x02000550,0x60000050
- .long 0x020005a0,0x60000050
- .long 0x020005f0,0x60000050
- .long 0x02000640,0x60000050
- .long 0x02000690,0x60000050
- .long 0x020006e0,0x20000050
-
- .org __LC_RST_NEW_PSW # 0x1a0
- .quad 0,iplstart
- .org __LC_PGM_NEW_PSW # 0x1d0
- .quad 0x0000000180000000,startup_pgm_check_handler
-
- .org 0x200
-
+ipl_start:
+ mvi __LC_AR_MODE_ID,1 # set esame flag
+ slr %r0,%r0 # set cpuid to zero
+ lhi %r1,2 # mode 2 = esame (dump)
+ sigp %r1,%r0,0x12 # switch to esame mode
+ sam64 # switch to 64 bit addressing mode
+ lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number
+ brctg %r1,.Lnoload # is valid
+ llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number
+ lghi %r2,IPL_BS # load start address
+ bras %r14,.Lloader # load rest of ipl image
+ larl %r12,parmarea # pointer to parameter area
+ stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number
+#
+# load parameter file from ipl device
+#
+.Lagain1:
+ larl %r2,_end # ramdisk loc. is temp
+ bras %r14,.Lloader # load parameter file
+ ltgr %r2,%r2 # got anything ?
+ jz .Lnopf
+ lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12)
+ aghi %r3,-1
+ clgr %r2,%r3
+ jl .Lnotrunc
+ lgr %r2,%r3
+.Lnotrunc:
+ larl %r4,_end
+ larl %r13,.L_hdr
+ clc 0(3,%r4),0(%r13) # if it is HDRx
+ jz .Lagain1 # skip dataset header
+ larl %r13,.L_eof
+ clc 0(3,%r4),0(%r13) # if it is EOFx
+ jz .Lagain1 # skip dataset trailer
+ lgr %r5,%r2
+ la %r6,COMMAND_LINE-PARMAREA(%r12)
+ lgr %r7,%r2
+ aghi %r7,1
+ mvcl %r6,%r4
+.Lnopf:
+#
+# load ramdisk from ipl device
+#
+.Lagain2:
+ larl %r2,_end # addr of ramdisk
+ stg %r2,INITRD_START-PARMAREA(%r12)
+ bras %r14,.Lloader # load ramdisk
+ stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd
+ ltgr %r2,%r2
+ jnz .Lrdcont
+ stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found
+.Lrdcont:
+ larl %r2,_end
+ larl %r13,.L_hdr # skip HDRx and EOFx
+ clc 0(3,%r2),0(%r13)
+ jz .Lagain2
+ larl %r13,.L_eof
+ clc 0(3,%r2),0(%r13)
+ jz .Lagain2
+#
+# reset files in VM reader
+#
+ larl %r13,.Lcpuid
+ stidp 0(%r13) # store cpuid
+ tm 0(%r13),0xff # running VM ?
+ jno .Lnoreset
+ larl %r2,.Lreset
+ lghi %r3,26
+ diag %r2,%r3,8
+ larl %r5,.Lirb
+ stsch 0(%r5) # check if irq is pending
+ tm 30(%r5),0x0f # by verifying if any of the
+ jnz .Lwaitforirq # activity or status control
+ tm 31(%r5),0xff # bits is set in the schib
+ jz .Lnoreset
+.Lwaitforirq:
+ bras %r14,.Lirqwait # wait for IO interrupt
+ c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
+ jne .Lwaitforirq
+ larl %r5,.Lirb
+ tsch 0(%r5)
+.Lnoreset:
+ j .Lnoload
+#
+# everything loaded, go for it
+#
+.Lnoload:
+ jg startup
#
# subroutine to wait for end I/O
#
.Lirqwait:
- mvc __LC_IO_NEW_PSW(16),.Lnewpsw # set up IO interrupt psw
- lpsw .Lwaitpsw
+ larl %r13,.Lnewpswmask # set up IO interrupt psw
+ mvc __LC_IO_NEW_PSW(8),0(%r13)
+ stg %r14,__LC_IO_NEW_PSW+8
+ larl %r13,.Lwaitpsw
+ lpswe 0(%r13)
.Lioint:
- br %r14
- .align 8
-.Lnewpsw:
- .quad 0x0000000080000000,.Lioint
-.Lwaitpsw:
- .long 0x020a0000,0x80000000+.Lioint
-
#
# subroutine for loading cards from the reader
#
.Lloader:
- la %r4,0(%r14)
- la %r3,.Lorb # r2 = address of orb into r2
- la %r5,.Lirb # r4 = address of irb
- la %r6,.Lccws
- la %r7,20
+ lgr %r4,%r14
+ larl %r3,.Lorb # r3 = address of orb
+ larl %r5,.Lirb # r5 = address of irb
+ larl %r6,.Lccws
+ lghi %r7,20
.Linit:
st %r2,4(%r6) # initialize CCW data addresses
la %r2,0x50(%r2)
la %r6,8(%r6)
- bct 7,.Linit
-
- lctl %c6,%c6,.Lcr6 # set IO subclass mask
- slr %r2,%r2
+ brctg %r7,.Linit
+ larl %r13,.Lcr6
+ lctlg %c6,%c6,0(%r13)
+ xgr %r2,%r2
.Lldlp:
ssch 0(%r3) # load chunk of 1600 bytes
- bnz .Llderr
+ jnz .Llderr
.Lwait4irq:
- bas %r14,.Lirqwait
+ bras %r14,.Lirqwait
c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
- bne .Lwait4irq
+ jne .Lwait4irq
tsch 0(%r5)
-
- slr %r0,%r0
+ xgr %r0,%r0
ic %r0,8(%r5) # get device status
- chi %r0,8 # channel end ?
- be .Lcont
- chi %r0,12 # channel end + device end ?
- be .Lcont
-
- l %r0,4(%r5)
- s %r0,8(%r3) # r0/8 = number of ccws executed
- mhi %r0,10 # *10 = number of bytes in ccws
- lh %r3,10(%r5) # get residual count
- sr %r0,%r3 # #ccws*80-residual=#bytes read
- ar %r2,%r0
-
+ cghi %r0,8 # channel end ?
+ je .Lcont
+ cghi %r0,12 # channel end + device end ?
+ je .Lcont
+ llgf %r0,4(%r5)
+ sgf %r0,8(%r3) # r0/8 = number of ccws executed
+ mghi %r0,10 # *10 = number of bytes in ccws
+ llgh %r3,10(%r5) # get residual count
+ sgr %r0,%r3 # #ccws*80-residual=#bytes read
+ agr %r2,%r0
br %r4 # r2 contains the total size
-
.Lcont:
- ahi %r2,0x640 # add 0x640 to total size
- la %r6,.Lccws
- la %r7,20
+ aghi %r2,0x640 # add 0x640 to total size
+ larl %r6,.Lccws
+ lghi %r7,20
.Lincr:
l %r0,4(%r6) # update CCW data addresses
- ahi %r0,0x640
+ aghi %r0,0x640
st %r0,4(%r6)
- ahi %r6,8
- bct 7,.Lincr
-
- b .Lldlp
+ aghi %r6,8
+ brctg %r7,.Lincr
+ j .Lldlp
.Llderr:
- lpsw .Lcrash
+ larl %r13,.Lcrash
+ lpsw 0(%r13)
.align 8
+.Lwaitpsw:
+ .quad 0x0202000180000000,.Lioint
+.Lnewpswmask:
+ .quad 0x0000000180000000
+ .align 8
.Lorb: .long 0x00000000,0x0080ff00,.Lccws
.Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.Lcr6: .long 0xff000000
-.Lloadp:.long 0,0
+ .align 8
+.Lcr6: .quad 0x00000000ff000000
.align 8
.Lcrash:.long 0x000a0000,0x00000000
-
.align 8
.Lccws: .rept 19
.long 0x02600050,0x00000000
.endr
.long 0x02200050,0x00000000
-
-iplstart:
- mvi __LC_AR_MODE_ID,1 # set esame flag
- slr %r0,%r0 # set cpuid to zero
- lhi %r1,2 # mode 2 = esame (dump)
- sigp %r1,%r0,0x12 # switch to esame mode
- bras %r13,0f
- .fill 16,4,0x0
-0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs
- sam31 # switch to 31 bit addressing mode
- lh %r1,__LC_SUBCHANNEL_ID # test if subchannel number
- bct %r1,.Lnoload # is valid
- l %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number
- la %r2,IPL_BS # load start address
- bas %r14,.Lloader # load rest of ipl image
- l %r12,.Lparm # pointer to parameter area
- st %r1,IPL_DEVICE+ARCH_OFFSET-PARMAREA(%r12) # save ipl device number
-
-#
-# load parameter file from ipl device
-#
-.Lagain1:
- l %r2,.Linitrd # ramdisk loc. is temp
- bas %r14,.Lloader # load parameter file
- ltr %r2,%r2 # got anything ?
- bz .Lnopf
- chi %r2,895
- bnh .Lnotrunc
- la %r2,895
-.Lnotrunc:
- l %r4,.Linitrd
- clc 0(3,%r4),.L_hdr # if it is HDRx
- bz .Lagain1 # skip dataset header
- clc 0(3,%r4),.L_eof # if it is EOFx
- bz .Lagain1 # skip dateset trailer
- la %r5,0(%r4,%r2)
- lr %r3,%r2
- la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line
- mvc 0(256,%r3),0(%r4)
- mvc 256(256,%r3),256(%r4)
- mvc 512(256,%r3),512(%r4)
- mvc 768(122,%r3),768(%r4)
- slr %r0,%r0
- b .Lcntlp
-.Ldelspc:
- ic %r0,0(%r2,%r3)
- chi %r0,0x20 # is it a space ?
- be .Lcntlp
- ahi %r2,1
- b .Leolp
-.Lcntlp:
- brct %r2,.Ldelspc
-.Leolp:
- slr %r0,%r0
- stc %r0,0(%r2,%r3) # terminate buffer
-.Lnopf:
-
-#
-# load ramdisk from ipl device
-#
-.Lagain2:
- l %r2,.Linitrd # addr of ramdisk
- st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12)
- bas %r14,.Lloader # load ramdisk
- st %r2,INITRD_SIZE+ARCH_OFFSET-PARMAREA(%r12) # store size of rd
- ltr %r2,%r2
- bnz .Lrdcont
- st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) # no ramdisk found
-.Lrdcont:
- l %r2,.Linitrd
-
- clc 0(3,%r2),.L_hdr # skip HDRx and EOFx
- bz .Lagain2
- clc 0(3,%r2),.L_eof
- bz .Lagain2
-
-#
-# reset files in VM reader
-#
- stidp .Lcpuid # store cpuid
- tm .Lcpuid,0xff # running VM ?
- bno .Lnoreset
- la %r2,.Lreset
- lhi %r3,26
- diag %r2,%r3,8
- la %r5,.Lirb
- stsch 0(%r5) # check if irq is pending
- tm 30(%r5),0x0f # by verifying if any of the
- bnz .Lwaitforirq # activity or status control
- tm 31(%r5),0xff # bits is set in the schib
- bz .Lnoreset
-.Lwaitforirq:
- bas %r14,.Lirqwait # wait for IO interrupt
- c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
- bne .Lwaitforirq
- la %r5,.Lirb
- tsch 0(%r5)
-.Lnoreset:
- b .Lnoload
-
-#
-# everything loaded, go for it
-#
-.Lnoload:
- l %r1,.Lstartup
- br %r1
-
-.Linitrd:.long _end # default address of initrd
-.Lparm: .long PARMAREA
-.Lstartup: .long startup
.Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40
.byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6
.byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold"
@@ -270,14 +211,14 @@ iplstart:
.Lcpuid:.fill 8,1,0
#
-# startup-code at 0x10000, running in absolute addressing mode
+# normal startup-code, running in absolute addressing mode
# this is called either by the ipl loader or directly by PSW restart
# or linload or SALIPL
#
- .org 0x10000
-ENTRY(startup)
- j .Lep_startup_normal
- .org EP_OFFSET
+ .org STARTUP_NORMAL_OFFSET - IPL_START
+SYM_CODE_START(startup)
+ j startup_normal
+ .org EP_OFFSET - IPL_START
#
# This is a list of s390 kernel entry points. At address 0x1000f the number of
# valid entry points is stored.
@@ -287,12 +228,12 @@ ENTRY(startup)
.ascii EP_STRING
.byte 0x00,0x01
#
-# kdump startup-code at 0x10010, running in 64 bit absolute addressing mode
+# kdump startup-code, running in 64 bit absolute addressing mode
#
- .org 0x10010
-ENTRY(startup_kdump)
- j .Lep_startup_kdump
-.Lep_startup_normal:
+ .org STARTUP_KDUMP_OFFSET - IPL_START
+ j startup_kdump
+SYM_CODE_END(startup)
+SYM_CODE_START_LOCAL(startup_normal)
mvi __LC_AR_MODE_ID,1 # set esame flag
slr %r0,%r0 # set cpuid to zero
lhi %r1,2 # mode 2 = esame (dump)
@@ -301,32 +242,43 @@ ENTRY(startup_kdump)
.fill 16,4,0x0
0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs
sam64 # switch to 64 bit addressing mode
- basr %r13,0 # get base
-.LPG0:
+ larl %r13,.Lext_new_psw
+ mvc __LC_EXT_NEW_PSW(16),0(%r13)
+ larl %r13,.Lpgm_new_psw
+ mvc __LC_PGM_NEW_PSW(16),0(%r13)
+ larl %r13,.Lio_new_psw
+ mvc __LC_IO_NEW_PSW(16),0(%r13)
xc 0x200(256),0x200 # partially clear lowcore
xc 0x300(256),0x300
xc 0xe00(256),0xe00
xc 0xf00(256),0xf00
- lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers
+ larl %r13,.Lctl
+ lctlg %c0,%c15,0(%r13) # load control registers
stcke __LC_BOOT_CLOCK
mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1
- spt 6f-.LPG0(%r13)
- mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13)
- l %r15,.Lstack-.LPG0(%r13)
+ larl %r13,6f
+ spt 0(%r13)
+ mvc __LC_LAST_UPDATE_TIMER(8),0(%r13)
+ larl %r15,_stack_end-STACK_FRAME_OVERHEAD
+ brasl %r14,sclp_early_setup_buffer
brasl %r14,verify_facilities
brasl %r14,startup_kernel
+SYM_CODE_END(startup_normal)
-.Lstack:
- .long 0x8000 + (1<<(PAGE_SHIFT+BOOT_STACK_ORDER)) - STACK_FRAME_OVERHEAD
.align 8
6: .long 0x7fffffff,0xffffffff
-
+.Lext_new_psw:
+ .quad 0x0002000180000000,0x1b0 # disabled wait
+.Lpgm_new_psw:
+ .quad 0x0000000180000000,startup_pgm_check_handler
+.Lio_new_psw:
+ .quad 0x0002000180000000,0x1f0 # disabled wait
.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space
.quad 0 # cr1: primary space segment table
- .quad .Lduct # cr2: dispatchable unit control table
+ .quad 0 # cr2: dispatchable unit control table
.quad 0 # cr3: instruction authorization
.quad 0xffff # cr4: instruction authorization
- .quad .Lduct # cr5: primary-aste origin
+ .quad 0 # cr5: primary-aste origin
.quad 0 # cr6: I/O interrupts
.quad 0 # cr7: secondary space segment table
.quad 0x0000000000008000 # cr8: access registers translation
@@ -336,20 +288,7 @@ ENTRY(startup_kdump)
.quad 0 # cr12: tracing off
.quad 0 # cr13: home space segment table
.quad 0xc0000000 # cr14: machine check handling off
- .quad .Llinkage_stack # cr15: linkage stack operations
-
- .section .dma.data,"aw",@progbits
-.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0
- .long 0,0,0,0,0,0,0,0
-.Llinkage_stack:
- .long 0,0,0x89000000,0,0,0,0x8a000000,0
- .align 64
-.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0
- .align 128
-.Lduald:.rept 8
- .long 0x80000000,0,0,0 # invalid access-list entries
- .endr
- .previous
+ .quad 0 # cr15: linkage stack operations
#include "head_kdump.S"
@@ -359,45 +298,23 @@ ENTRY(startup_kdump)
# It simply saves general/control registers and psw in
# the save area and does disabled wait with a faulty address.
#
-ENTRY(startup_pgm_check_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_SYNC
- la %r1,4095
- stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r1)
- mvc __LC_GPREGS_SAVE_AREA-4095(128,%r1),__LC_SAVE_AREA_SYNC
- mvc __LC_PSW_SAVE_AREA-4095(16,%r1),__LC_PGM_OLD_PSW
+SYM_CODE_START_LOCAL(startup_pgm_check_handler)
+ stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ la %r8,4095
+ stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8)
+ stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8)
+ mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC
+ mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW
mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW
ni __LC_RETURN_PSW,0xfc # remove IO and EX bits
ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit
oi __LC_RETURN_PSW+1,0x2 # set wait state bit
- larl %r2,.Lold_psw_disabled_wait
- stg %r2,__LC_PGM_NEW_PSW+8
- l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r2)
+ larl %r9,.Lold_psw_disabled_wait
+ stg %r9,__LC_PGM_NEW_PSW+8
+ larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD
brasl %r14,print_pgm_check_info
.Lold_psw_disabled_wait:
- la %r1,4095
- lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)
+ la %r8,4095
+ lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8)
lpswe __LC_RETURN_PSW # disabled wait
-.Ldump_info_stack:
- .long 0x5000 + PAGE_SIZE - STACK_FRAME_OVERHEAD
-ENDPROC(startup_pgm_check_handler)
-
-#
-# params at 10400 (setup.h)
-# Must be keept in sync with struct parmarea in setup.h
-#
- .org PARMAREA
- .quad 0 # IPL_DEVICE
- .quad 0 # INITRD_START
- .quad 0 # INITRD_SIZE
- .quad 0 # OLDMEM_BASE
- .quad 0 # OLDMEM_SIZE
- .quad kernel_version # points to kernel version string
-
- .org COMMAND_LINE
- .byte "root=/dev/ram0 ro"
- .byte 0
-
- .org EARLY_SCCB_OFFSET
- .fill 4096
-
- .org HEAD_END
+SYM_CODE_END(startup_pgm_check_handler)
diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S
index 174d6959bf5b..f015469e7db9 100644
--- a/arch/s390/boot/head_kdump.S
+++ b/arch/s390/boot/head_kdump.S
@@ -19,8 +19,7 @@
# Note: This code has to be position independent
#
-.align 2
-.Lep_startup_kdump:
+SYM_CODE_START_LOCAL(startup_kdump)
lhi %r1,2 # mode 2 = esame (dump)
sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to esame mode
sam64 # Switch to 64 bit addressing
@@ -87,14 +86,15 @@
startup_kdump_relocated:
basr %r13,0
0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel...
+SYM_CODE_END(startup_kdump)
.align 8
.Lrestart_psw:
.quad 0x0000000080000000,0x0000000000000000 + startup
#else
-.align 2
-.Lep_startup_kdump:
+SYM_CODE_START_LOCAL(startup_kdump)
larl %r13,startup_kdump_crash
lpswe 0(%r13)
+SYM_CODE_END(startup_kdump)
.align 8
startup_kdump_crash:
.quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash
diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh
index bed227f267ae..616ba1660f08 100644..100755
--- a/arch/s390/boot/install.sh
+++ b/arch/s390/boot/install.sh
@@ -14,22 +14,11 @@
# $2 - kernel image file
# $3 - kernel map file
# $4 - default install path (blank if root directory)
-#
-
-# User may have a custom install script
-
-if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
-if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
-
-# Default install - same as make zlilo
-
-if [ -f $4/vmlinuz ]; then
- mv $4/vmlinuz $4/vmlinuz.old
-fi
-if [ -f $4/System.map ]; then
- mv $4/System.map $4/System.old
-fi
+echo "Warning: '${INSTALLKERNEL}' command not available - additional " \
+ "bootloader config required" >&2
+if [ -f $4/vmlinuz-$1 ]; then mv $4/vmlinuz-$1 $4/vmlinuz-$1.old; fi
+if [ -f $4/System.map-$1 ]; then mv $4/System.map-$1 $4/System.map-$1.old; fi
-cat $2 > $4/vmlinuz
-cp $3 $4/System.map
+cat $2 > $4/vmlinuz-$1
+cp $3 $4/System.map-$1
diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c
new file mode 100644
index 000000000000..0846e2b249c6
--- /dev/null
+++ b/arch/s390/boot/ipl_data.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <asm/cio.h>
+#include <asm/asm-offsets.h>
+#include "boot.h"
+
+#define CCW0(cmd, addr, cnt, flg) \
+ { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, }
+
+#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA)
+
+struct ipl_lowcore {
+ psw_t32 ipl_psw; /* 0x0000 */
+ struct ccw0 ccwpgm[2]; /* 0x0008 */
+ u8 fill[56]; /* 0x0018 */
+ struct ccw0 ccwpgmcc[20]; /* 0x0050 */
+ u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */
+ psw_t restart_psw; /* 0x01a0 */
+ psw_t external_new_psw; /* 0x01b0 */
+ psw_t svc_new_psw; /* 0x01c0 */
+ psw_t program_new_psw; /* 0x01d0 */
+ psw_t mcck_new_psw; /* 0x01e0 */
+ psw_t io_new_psw; /* 0x01f0 */
+};
+
+/*
+ * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to
+ * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded.
+ * The next 160 bytes are loaded to addresses 0x18-0xb7. They form
+ * the continuation of the CCW program started by IPL and load the
+ * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in
+ * memory. At the end of the channel program the PSW at location 0 is
+ * loaded.
+ * Initial processing starts at 0x200 = ipl_start.
+ *
+ * The restart psw points to ipl_start, which allows loading a kernel
+ * image into memory and starting it by a psw restart on any cpu. All
+ * other default psw new locations contain a disabled wait psw where
+ * the address indicates which psw was loaded.
+ *
+ * Note that the 'file' utility can detect s390 kernel images. For
+ * that to succeed the two initial CCWs, and the 0x40 fill bytes must
+ * be present.
+ */
+static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = {
+ .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START },
+ .ccwpgm = {
+ [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ },
+ .fill = {
+ [ 0 ... 55] = 0x40,
+ },
+ .ccwpgmcc = {
+ [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI),
+ },
+ .restart_psw = { .mask = 0, .addr = IPL_START, },
+ .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, },
+ .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, },
+ .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, },
+ .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, },
+ .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, },
+};
diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c
index 357adad991d2..ca78d6162245 100644
--- a/arch/s390/boot/ipl_parm.c
+++ b/arch/s390/boot/ipl_parm.c
@@ -2,49 +2,61 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/ctype.h>
+#include <linux/pgtable.h>
#include <asm/ebcdic.h>
#include <asm/sclp.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
+#include <asm/setup.h>
#include <asm/uv.h>
#include "boot.h"
+struct parmarea parmarea __section(".parmarea") = {
+ .kernel_version = (unsigned long)kernel_version,
+ .max_command_line_size = COMMAND_LINE_SIZE,
+ .command_line = "root=/dev/ram0 ro",
+};
+
char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
+int __bootdata(noexec_disabled);
+
+unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL;
struct ipl_parameter_block __bootdata_preserved(ipl_block);
int __bootdata_preserved(ipl_block_valid);
-unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL;
-
-unsigned long __bootdata(vmalloc_size) = VMALLOC_DEFAULT_SIZE;
-unsigned long __bootdata(memory_end);
-int __bootdata(memory_end_set);
-int __bootdata(noexec_disabled);
-int kaslr_enabled __section(.data);
+unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE;
+unsigned long memory_limit;
+int vmalloc_size_set;
+int kaslr_enabled;
static inline int __diag308(unsigned long subcode, void *addr)
{
- register unsigned long _addr asm("0") = (unsigned long)addr;
- register unsigned long _rc asm("1") = 0;
unsigned long reg1, reg2;
- psw_t old = S390_lowcore.program_new_psw;
+ union register_pair r1;
+ psw_t old;
+ r1.even = (unsigned long) addr;
+ r1.odd = 0;
asm volatile(
- " epsw %0,%1\n"
- " st %0,%[psw_pgm]\n"
- " st %1,%[psw_pgm]+4\n"
- " larl %0,1f\n"
- " stg %0,%[psw_pgm]+8\n"
- " diag %[addr],%[subcode],0x308\n"
- "1: nopr %%r7\n"
- : "=&d" (reg1), "=&a" (reg2),
- [psw_pgm] "=Q" (S390_lowcore.program_new_psw),
- [addr] "+d" (_addr), "+d" (_rc)
- : [subcode] "d" (subcode)
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " diag %[r1],%[subcode],0x308\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [r1] "+&d" (r1.pair),
+ [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ "+Q" (S390_lowcore.program_new_psw),
+ "=Q" (old)
+ : [subcode] "d" (subcode),
+ [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw)
: "cc", "memory");
- S390_lowcore.program_new_psw = old;
- return _rc;
+ return r1.odd;
}
void store_ipl_parmblock(void)
@@ -57,6 +69,17 @@ void store_ipl_parmblock(void)
ipl_block_valid = 1;
}
+bool is_ipl_block_dump(void)
+{
+ if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP &&
+ ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP)
+ return true;
+ if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME &&
+ ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP)
+ return true;
+ return false;
+}
+
static size_t scpdata_length(const u8 *buf, size_t count)
{
while (count) {
@@ -70,30 +93,44 @@ static size_t scpdata_length(const u8 *buf, size_t count)
static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size,
const struct ipl_parameter_block *ipb)
{
- size_t count;
- size_t i;
+ const __u8 *scp_data;
+ __u32 scp_data_len;
int has_lowercase;
+ size_t count = 0;
+ size_t i;
- count = min(size - 1, scpdata_length(ipb->fcp.scp_data,
- ipb->fcp.scp_data_len));
+ switch (ipb->pb0_hdr.pbt) {
+ case IPL_PBT_FCP:
+ scp_data_len = ipb->fcp.scp_data_len;
+ scp_data = ipb->fcp.scp_data;
+ break;
+ case IPL_PBT_NVME:
+ scp_data_len = ipb->nvme.scp_data_len;
+ scp_data = ipb->nvme.scp_data;
+ break;
+ default:
+ goto out;
+ }
+
+ count = min(size - 1, scpdata_length(scp_data, scp_data_len));
if (!count)
goto out;
has_lowercase = 0;
for (i = 0; i < count; i++) {
- if (!isascii(ipb->fcp.scp_data[i])) {
+ if (!isascii(scp_data[i])) {
count = 0;
goto out;
}
- if (!has_lowercase && islower(ipb->fcp.scp_data[i]))
+ if (!has_lowercase && islower(scp_data[i]))
has_lowercase = 1;
}
if (has_lowercase)
- memcpy(dest, ipb->fcp.scp_data, count);
+ memcpy(dest, scp_data, count);
else
for (i = 0; i < count; i++)
- dest[i] = tolower(ipb->fcp.scp_data[i]);
+ dest[i] = tolower(scp_data[i]);
out:
dest[count] = '\0';
return count;
@@ -115,6 +152,7 @@ static void append_ipl_block_parm(void)
parm, COMMAND_LINE_SIZE - len - 1, &ipl_block);
break;
case IPL_PBT_FCP:
+ case IPL_PBT_NVME:
rc = ipl_block_get_ascii_scpdata(
parm, COMMAND_LINE_SIZE - len - 1, &ipl_block);
break;
@@ -139,12 +177,12 @@ static inline int has_ebcdic_char(const char *str)
void setup_boot_command_line(void)
{
- COMMAND_LINE[ARCH_COMMAND_LINE_SIZE - 1] = 0;
+ parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0;
/* convert arch command line to ascii if necessary */
- if (has_ebcdic_char(COMMAND_LINE))
- EBCASC(COMMAND_LINE, ARCH_COMMAND_LINE_SIZE);
+ if (has_ebcdic_char(parmarea.command_line))
+ EBCASC(parmarea.command_line, COMMAND_LINE_SIZE);
/* copy arch command line */
- strcpy(early_command_line, strim(COMMAND_LINE));
+ strcpy(early_command_line, strim(parmarea.command_line));
/* append IPL PARM data to the boot command line */
if (!is_prot_virt_guest() && ipl_block_valid)
@@ -154,9 +192,9 @@ void setup_boot_command_line(void)
static void modify_facility(unsigned long nr, bool clear)
{
if (clear)
- __clear_facility(nr, S390_lowcore.stfle_fac_list);
+ __clear_facility(nr, stfle_fac_list);
else
- __set_facility(nr, S390_lowcore.stfle_fac_list);
+ __set_facility(nr, stfle_fac_list);
}
static void check_cleared_facilities(void)
@@ -165,7 +203,7 @@ static void check_cleared_facilities(void)
int i;
for (i = 0; i < ARRAY_SIZE(als); i++) {
- if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i]) {
+ if ((stfle_fac_list[i] & als[i]) != als[i]) {
sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n");
print_missing_facilities();
break;
@@ -209,7 +247,7 @@ static void modify_fac_list(char *str)
check_cleared_facilities();
}
-static char command_line_buf[COMMAND_LINE_SIZE] __section(.data);
+static char command_line_buf[COMMAND_LINE_SIZE];
void parse_boot_command_line(void)
{
char *param, *val;
@@ -222,15 +260,15 @@ void parse_boot_command_line(void)
while (*args) {
args = next_arg(args, &param, &val);
- if (!strcmp(param, "mem") && val) {
- memory_end = round_down(memparse(val, NULL), PAGE_SIZE);
- memory_end_set = 1;
- }
+ if (!strcmp(param, "mem") && val)
+ memory_limit = round_down(memparse(val, NULL), PAGE_SIZE);
- if (!strcmp(param, "vmalloc") && val)
+ if (!strcmp(param, "vmalloc") && val) {
vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE);
+ vmalloc_size_set = 1;
+ }
- if (!strcmp(param, "dfltcc")) {
+ if (!strcmp(param, "dfltcc") && val) {
if (!strcmp(val, "off"))
zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED;
else if (!strcmp(val, "on"))
@@ -254,20 +292,13 @@ void parse_boot_command_line(void)
if (!strcmp(param, "nokaslr"))
kaslr_enabled = 0;
- }
-}
-void setup_memory_end(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE) {
- kaslr_enabled = 0;
- } else if (ipl_block_valid &&
- ipl_block.pb0_hdr.pbt == IPL_PBT_FCP &&
- ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) {
- kaslr_enabled = 0;
- if (!sclp_early_get_hsa_size(&memory_end) && memory_end)
- memory_end_set = 1;
- }
+#if IS_ENABLED(CONFIG_KVM)
+ if (!strcmp(param, "prot_virt")) {
+ rc = kstrtobool(val, &enabled);
+ if (!rc && enabled)
+ prot_virt_host = 1;
+ }
#endif
+ }
}
diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c
index 0b4965573656..9b14045065b6 100644
--- a/arch/s390/boot/ipl_report.c
+++ b/arch/s390/boot/ipl_report.c
@@ -54,9 +54,9 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
* not overlap with any component or any certificate.
*/
repeat:
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
- intersects(INITRD_START, INITRD_SIZE, safe_addr, size))
- safe_addr = INITRD_START + INITRD_SIZE;
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size &&
+ intersects(initrd_data.start, initrd_data.size, safe_addr, size))
+ safe_addr = initrd_data.start + initrd_data.size;
for_each_rb_entry(comp, comps)
if (intersects(safe_addr, size, comp->addr, comp->len)) {
safe_addr = comp->addr + comp->len;
diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c
index 5591243d673e..e8d74d4f62aa 100644
--- a/arch/s390/boot/kaslr.c
+++ b/arch/s390/boot/kaslr.c
@@ -2,12 +2,13 @@
/*
* Copyright IBM Corp. 2019
*/
+#include <linux/pgtable.h>
#include <asm/mem_detect.h>
-#include <asm/pgtable.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
#include <asm/sclp.h>
-#include "compressed/decompressor.h"
+#include <asm/kasan.h>
+#include "decompressor.h"
#include "boot.h"
#define PRNG_MODE_TDES 1
@@ -42,7 +43,7 @@ static int check_prng(void)
return PRNG_MODE_TDES;
}
-static unsigned long get_random(unsigned long limit)
+static int get_random(unsigned long limit, unsigned long *value)
{
struct prng_parm prng = {
/* initial parameter block for tdes mode, copied from libica */
@@ -84,87 +85,125 @@ static unsigned long get_random(unsigned long limit)
(u8 *) &random, sizeof(random));
break;
default:
- random = 0;
+ return -1;
}
- return random % limit;
+ *value = random % limit;
+ return 0;
}
-unsigned long get_random_base(unsigned long safe_addr)
+/*
+ * To randomize kernel base address we have to consider several facts:
+ * 1. physical online memory might not be contiguous and may have holes. mem_detect
+ * info contains list of online memory ranges we should consider.
+ * 2. we have several memory regions which are occupied and we should not
+ * overlap and destroy them. Currently safe_addr tells us the border below
+ * which all those occupied regions are. We are safe to use anything above
+ * safe_addr.
+ * 3. the upper limit might apply as well, even if memory above that limit is
+ * online. Currently those limitations are:
+ * 3.1. Limit set by "mem=" kernel command line option
+ * 3.2. memory reserved at the end for kasan initialization.
+ * 4. kernel base address must be aligned to THREAD_SIZE (kernel stack size).
+ * Which is required for CONFIG_CHECK_STACK. Currently THREAD_SIZE is 4 pages
+ * (16 pages when the kernel is built with kasan enabled)
+ * Assumptions:
+ * 1. kernel size (including .bss size) and upper memory limit are page aligned.
+ * 2. mem_detect memory region start is THREAD_SIZE aligned / end is PAGE_SIZE
+ * aligned (in practice memory configurations granularity on z/VM and LPAR
+ * is 1mb).
+ *
+ * To guarantee uniform distribution of kernel base address among all suitable
+ * addresses we generate random value just once. For that we need to build a
+ * continuous range in which every value would be suitable. We can build this
+ * range by simply counting all suitable addresses (let's call them positions)
+ * which would be valid as kernel base address. To count positions we iterate
+ * over online memory ranges. For each range which is big enough for the
+ * kernel image we count all suitable addresses we can put the kernel image at
+ * that is
+ * (end - start - kernel_size) / THREAD_SIZE + 1
+ * Two functions count_valid_kernel_positions and position_to_address help
+ * to count positions in memory range given and then convert position back
+ * to address.
+ */
+static unsigned long count_valid_kernel_positions(unsigned long kernel_size,
+ unsigned long _min,
+ unsigned long _max)
{
- unsigned long memory_limit = memory_end_set ? memory_end : 0;
- unsigned long base, start, end, kernel_size;
- unsigned long block_sum, offset;
- unsigned long kasan_needs;
+ unsigned long start, end, pos = 0;
int i;
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) {
- if (safe_addr < INITRD_START + INITRD_SIZE)
- safe_addr = INITRD_START + INITRD_SIZE;
+ for_each_mem_detect_block(i, &start, &end) {
+ if (_min >= end)
+ continue;
+ if (start >= _max)
+ break;
+ start = max(_min, start);
+ end = min(_max, end);
+ if (end - start < kernel_size)
+ continue;
+ pos += (end - start - kernel_size) / THREAD_SIZE + 1;
}
- safe_addr = ALIGN(safe_addr, THREAD_SIZE);
- if ((IS_ENABLED(CONFIG_KASAN))) {
- /*
- * Estimate kasan memory requirements, which it will reserve
- * at the very end of available physical memory. To estimate
- * that, we take into account that kasan would require
- * 1/8 of available physical memory (for shadow memory) +
- * creating page tables for the whole memory + shadow memory
- * region (1 + 1/8). To keep page tables estimates simple take
- * the double of combined ptes size.
- */
- memory_limit = get_mem_detect_end();
- if (memory_end_set && memory_limit > memory_end)
- memory_limit = memory_end;
-
- /* for shadow memory */
- kasan_needs = memory_limit / 8;
- /* for paging structures */
- kasan_needs += (memory_limit + kasan_needs) / PAGE_SIZE /
- _PAGE_ENTRIES * _PAGE_TABLE_SIZE * 2;
- memory_limit -= kasan_needs;
- }
+ return pos;
+}
+
+static unsigned long position_to_address(unsigned long pos, unsigned long kernel_size,
+ unsigned long _min, unsigned long _max)
+{
+ unsigned long start, end;
+ int i;
- kernel_size = vmlinux.image_size + vmlinux.bss_size;
- block_sum = 0;
for_each_mem_detect_block(i, &start, &end) {
- if (memory_limit) {
- if (start >= memory_limit)
- break;
- if (end > memory_limit)
- end = memory_limit;
- }
+ if (_min >= end)
+ continue;
+ if (start >= _max)
+ break;
+ start = max(_min, start);
+ end = min(_max, end);
if (end - start < kernel_size)
continue;
- block_sum += end - start - kernel_size;
+ if ((end - start - kernel_size) / THREAD_SIZE + 1 >= pos)
+ return start + (pos - 1) * THREAD_SIZE;
+ pos -= (end - start - kernel_size) / THREAD_SIZE + 1;
+ }
+
+ return 0;
+}
+
+unsigned long get_random_base(unsigned long safe_addr)
+{
+ unsigned long memory_limit = get_mem_detect_end();
+ unsigned long base_pos, max_pos, kernel_size;
+ unsigned long kasan_needs;
+ int i;
+
+ memory_limit = min(memory_limit, ident_map_size);
+
+ /*
+ * Avoid putting kernel in the end of physical memory
+ * which kasan will use for shadow memory and early pgtable
+ * mapping allocations.
+ */
+ memory_limit -= kasan_estimate_memory_needs(memory_limit);
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size) {
+ if (safe_addr < initrd_data.start + initrd_data.size)
+ safe_addr = initrd_data.start + initrd_data.size;
}
- if (!block_sum) {
+ safe_addr = ALIGN(safe_addr, THREAD_SIZE);
+
+ kernel_size = vmlinux.image_size + vmlinux.bss_size;
+ if (safe_addr + kernel_size > memory_limit)
+ return 0;
+
+ max_pos = count_valid_kernel_positions(kernel_size, safe_addr, memory_limit);
+ if (!max_pos) {
sclp_early_printk("KASLR disabled: not enough memory\n");
return 0;
}
- base = get_random(block_sum);
- if (base == 0)
+ /* we need a value in the range [1, max_pos] inclusive */
+ if (get_random(max_pos, &base_pos))
return 0;
- if (base < safe_addr)
- base = safe_addr;
- block_sum = offset = 0;
- for_each_mem_detect_block(i, &start, &end) {
- if (memory_limit) {
- if (start >= memory_limit)
- break;
- if (end > memory_limit)
- end = memory_limit;
- }
- if (end - start < kernel_size)
- continue;
- block_sum += end - start - kernel_size;
- if (base <= block_sum) {
- base = start + base - offset;
- base = ALIGN_DOWN(base, THREAD_SIZE);
- break;
- }
- offset = block_sum;
- }
- return base;
+ return position_to_address(base_pos + 1, kernel_size, safe_addr, memory_limit);
}
diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c
index 62e7c13ce85c..7fa1a32ea0f3 100644
--- a/arch/s390/boot/mem_detect.c
+++ b/arch/s390/boot/mem_detect.c
@@ -1,14 +1,15 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/errno.h>
#include <linux/init.h>
+#include <asm/setup.h>
+#include <asm/processor.h>
#include <asm/sclp.h>
#include <asm/sections.h>
#include <asm/mem_detect.h>
#include <asm/sparsemem.h>
-#include "compressed/decompressor.h"
+#include "decompressor.h"
#include "boot.h"
-unsigned long __bootdata(max_physmem_end);
struct mem_detect_info __bootdata(mem_detect);
/* up to 256 storage elements, 1020 subincrements each */
@@ -25,9 +26,9 @@ static void *mem_detect_alloc_extended(void)
{
unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64));
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
- INITRD_START < offset + ENTRIES_EXTENDED_MAX)
- offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64));
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size &&
+ initrd_data.start < offset + ENTRIES_EXTENDED_MAX)
+ offset = ALIGN(initrd_data.start + initrd_data.size, sizeof(u64));
return (void *)offset;
}
@@ -65,30 +66,37 @@ void add_mem_detect_block(u64 start, u64 end)
static int __diag260(unsigned long rx1, unsigned long rx2)
{
- register unsigned long _rx1 asm("2") = rx1;
- register unsigned long _rx2 asm("3") = rx2;
- register unsigned long _ry asm("4") = 0x10; /* storage configuration */
- int rc = -1; /* fail */
- unsigned long reg1, reg2;
- psw_t old = S390_lowcore.program_new_psw;
-
+ unsigned long reg1, reg2, ry;
+ union register_pair rx;
+ psw_t old;
+ int rc;
+
+ rx.even = rx1;
+ rx.odd = rx2;
+ ry = 0x10; /* storage configuration */
+ rc = -1; /* fail */
asm volatile(
- " epsw %0,%1\n"
- " st %0,%[psw_pgm]\n"
- " st %1,%[psw_pgm]+4\n"
- " larl %0,1f\n"
- " stg %0,%[psw_pgm]+8\n"
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
" diag %[rx],%[ry],0x260\n"
" ipm %[rc]\n"
" srl %[rc],28\n"
- "1:\n"
- : "=&d" (reg1), "=&a" (reg2),
- [psw_pgm] "=Q" (S390_lowcore.program_new_psw),
- [rc] "+&d" (rc), [ry] "+d" (_ry)
- : [rx] "d" (_rx1), "d" (_rx2)
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ [rc] "+&d" (rc),
+ [ry] "+&d" (ry),
+ "+Q" (S390_lowcore.program_new_psw),
+ "=Q" (old)
+ : [rx] "d" (rx.pair),
+ [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw)
: "cc", "memory");
- S390_lowcore.program_new_psw = old;
- return rc == 0 ? _ry : -1;
+ return rc == 0 ? ry : -1;
}
static int diag260(void)
@@ -112,24 +120,30 @@ static int diag260(void)
static int tprot(unsigned long addr)
{
- unsigned long pgm_addr;
+ unsigned long reg1, reg2;
int rc = -EFAULT;
- psw_t old = S390_lowcore.program_new_psw;
+ psw_t old;
- S390_lowcore.program_new_psw.mask = __extract_psw();
asm volatile(
- " larl %[pgm_addr],1f\n"
- " stg %[pgm_addr],%[psw_pgm_addr]\n"
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
" tprot 0(%[addr]),0\n"
" ipm %[rc]\n"
" srl %[rc],28\n"
- "1:\n"
- : [pgm_addr] "=&d"(pgm_addr),
- [psw_pgm_addr] "=Q"(S390_lowcore.program_new_psw.addr),
- [rc] "+&d"(rc)
- : [addr] "a"(addr)
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ [rc] "+&d" (rc),
+ "=Q" (S390_lowcore.program_new_psw.addr),
+ "=Q" (old)
+ : [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw),
+ [addr] "a" (addr)
: "cc", "memory");
- S390_lowcore.program_new_psw = old;
return rc;
}
@@ -149,27 +163,29 @@ static void search_mem_end(void)
add_mem_detect_block(0, (offset + 1) << 20);
}
-void detect_memory(void)
+unsigned long detect_memory(void)
{
+ unsigned long max_physmem_end;
+
sclp_early_get_memsize(&max_physmem_end);
if (!sclp_early_read_storage_info()) {
mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO;
- return;
+ return max_physmem_end;
}
if (!diag260()) {
mem_detect.info_source = MEM_DETECT_DIAG260;
- return;
+ return max_physmem_end;
}
if (max_physmem_end) {
add_mem_detect_block(0, max_physmem_end);
mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO;
- return;
+ return max_physmem_end;
}
search_mem_end();
mem_detect.info_source = MEM_DETECT_BIN_SEARCH;
- max_physmem_end = get_mem_detect_end();
+ return get_mem_detect_end();
}
diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c
index 83b5b7915c32..c2a1defc79da 100644
--- a/arch/s390/boot/pgm_check_info.c
+++ b/arch/s390/boot/pgm_check_info.c
@@ -1,90 +1,180 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/stdarg.h>
#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/stacktrace.h>
+#include <asm/boot_data.h>
#include <asm/lowcore.h>
+#include <asm/setup.h>
#include <asm/sclp.h>
+#include <asm/uv.h>
#include "boot.h"
const char hex_asc[] = "0123456789abcdef";
-#define add_val_as_hex(dst, val) \
- __add_val_as_hex(dst, (const unsigned char *)&val, sizeof(val))
+static char *as_hex(char *dst, unsigned long val, int pad)
+{
+ char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1);
+
+ for (*p-- = 0; p >= dst; val >>= 4)
+ *p-- = hex_asc[val & 0x0f];
+ return end;
+}
-static char *__add_val_as_hex(char *dst, const unsigned char *src, size_t count)
+static char *symstart(char *p)
{
- while (count--)
- dst = hex_byte_pack(dst, *src++);
- return dst;
+ while (*p)
+ p--;
+ return p + 1;
}
-static char *add_str(char *dst, char *src)
+static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len)
{
- strcpy(dst, src);
- return dst + strlen(dst);
+ /* symbol entries are in a form "10000 c4 startup\0" */
+ char *a = _decompressor_syms_start;
+ char *b = _decompressor_syms_end;
+ unsigned long start;
+ unsigned long size;
+ char *pivot;
+ char *endp;
+
+ while (a < b) {
+ pivot = symstart(a + (b - a) / 2);
+ start = simple_strtoull(pivot, &endp, 16);
+ size = simple_strtoull(endp + 1, &endp, 16);
+ if (ip < start) {
+ b = pivot;
+ continue;
+ }
+ if (ip > start + size) {
+ a = pivot + strlen(pivot) + 1;
+ continue;
+ }
+ *off = ip - start;
+ *len = size;
+ return endp + 1;
+ }
+ return NULL;
}
-void print_pgm_check_info(void)
+static noinline char *strsym(void *ip)
{
- struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area);
- unsigned short ilc = S390_lowcore.pgm_ilc >> 1;
- char buf[256];
- int row, col;
+ static char buf[64];
+ unsigned short off;
+ unsigned short len;
char *p;
- add_str(buf, "Linux version ");
- strlcat(buf, kernel_version, sizeof(buf));
- sclp_early_printk(buf);
+ p = findsym((unsigned long)ip, &off, &len);
+ if (p) {
+ strncpy(buf, p, sizeof(buf));
+ /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */
+ p = buf + strnlen(buf, sizeof(buf) - 15);
+ strcpy(p, "+0x");
+ p = as_hex(p + 3, off, 0);
+ strcpy(p, "/0x");
+ as_hex(p + 3, len, 0);
+ } else {
+ as_hex(buf, (unsigned long)ip, 16);
+ }
+ return buf;
+}
- p = add_str(buf, "Kernel fault: interruption code ");
- p = add_val_as_hex(buf + strlen(buf), S390_lowcore.pgm_code);
- p = add_str(p, " ilc:");
- *p++ = hex_asc_lo(ilc);
- add_str(p, "\n");
- sclp_early_printk(buf);
+void decompressor_printk(const char *fmt, ...)
+{
+ char buf[1024] = { 0 };
+ char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */
+ unsigned long pad;
+ char *p = buf;
+ va_list args;
- p = add_str(buf, "PSW : ");
- p = add_val_as_hex(p, S390_lowcore.psw_save_area.mask);
- p = add_str(p, " ");
- p = add_val_as_hex(p, S390_lowcore.psw_save_area.addr);
- add_str(p, "\n");
+ va_start(args, fmt);
+ for (; p < end && *fmt; fmt++) {
+ if (*fmt != '%') {
+ *p++ = *fmt;
+ continue;
+ }
+ pad = isdigit(*++fmt) ? simple_strtol(fmt, (char **)&fmt, 10) : 0;
+ switch (*fmt) {
+ case 's':
+ p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf));
+ break;
+ case 'p':
+ if (*++fmt != 'S')
+ goto out;
+ p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf));
+ break;
+ case 'l':
+ if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad))
+ goto out;
+ p = as_hex(p, va_arg(args, unsigned long), pad);
+ break;
+ case 'x':
+ if (end - p <= max(sizeof(int) * 2, pad))
+ goto out;
+ p = as_hex(p, va_arg(args, unsigned int), pad);
+ break;
+ default:
+ goto out;
+ }
+ }
+out:
+ va_end(args);
sclp_early_printk(buf);
+}
- p = add_str(buf, " R:");
- *p++ = hex_asc_lo(psw->per);
- p = add_str(p, " T:");
- *p++ = hex_asc_lo(psw->dat);
- p = add_str(p, " IO:");
- *p++ = hex_asc_lo(psw->io);
- p = add_str(p, " EX:");
- *p++ = hex_asc_lo(psw->ext);
- p = add_str(p, " Key:");
- *p++ = hex_asc_lo(psw->key);
- p = add_str(p, " M:");
- *p++ = hex_asc_lo(psw->mcheck);
- p = add_str(p, " W:");
- *p++ = hex_asc_lo(psw->wait);
- p = add_str(p, " P:");
- *p++ = hex_asc_lo(psw->pstate);
- p = add_str(p, " AS:");
- *p++ = hex_asc_lo(psw->as);
- p = add_str(p, " CC:");
- *p++ = hex_asc_lo(psw->cc);
- p = add_str(p, " PM:");
- *p++ = hex_asc_lo(psw->pm);
- p = add_str(p, " RI:");
- *p++ = hex_asc_lo(psw->ri);
- p = add_str(p, " EA:");
- *p++ = hex_asc_lo(psw->eaba);
- add_str(p, "\n");
- sclp_early_printk(buf);
+static noinline void print_stacktrace(void)
+{
+ struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start,
+ (unsigned long)_stack_end };
+ unsigned long sp = S390_lowcore.gpregs_save_area[15];
+ bool first = true;
- for (row = 0; row < 4; row++) {
- p = add_str(buf, row == 0 ? "GPRS:" : " ");
- for (col = 0; col < 4; col++) {
- p = add_str(p, " ");
- p = add_val_as_hex(p, S390_lowcore.gpregs_save_area[row * 4 + col]);
- }
- add_str(p, "\n");
- sclp_early_printk(buf);
+ decompressor_printk("Call Trace:\n");
+ while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) {
+ struct stack_frame *sf = (struct stack_frame *)sp;
+
+ decompressor_printk(first ? "(sp:%016lx [<%016lx>] %pS)\n" :
+ " sp:%016lx [<%016lx>] %pS\n",
+ sp, sf->gprs[8], (void *)sf->gprs[8]);
+ if (sf->back_chain <= sp)
+ break;
+ sp = sf->back_chain;
+ first = false;
}
}
+
+void print_pgm_check_info(void)
+{
+ unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area;
+ struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area);
+
+ decompressor_printk("Linux version %s\n", kernel_version);
+ if (!is_prot_virt_guest() && early_command_line[0])
+ decompressor_printk("Kernel command line: %s\n", early_command_line);
+ decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
+ S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
+ if (kaslr_enabled)
+ decompressor_printk("Kernel random base: %lx\n", __kaslr_offset);
+ decompressor_printk("PSW : %016lx %016lx (%pS)\n",
+ S390_lowcore.psw_save_area.mask,
+ S390_lowcore.psw_save_area.addr,
+ (void *)S390_lowcore.psw_save_area.addr);
+ decompressor_printk(
+ " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n",
+ psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck,
+ psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri,
+ psw->eaba);
+ decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n",
+ gpregs[0], gpregs[1], gpregs[2], gpregs[3]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[4], gpregs[5], gpregs[6], gpregs[7]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[8], gpregs[9], gpregs[10], gpregs[11]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[12], gpregs[13], gpregs[14], gpregs[15]);
+ print_stacktrace();
+ decompressor_printk("Last Breaking-Event-Address:\n");
+ decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break,
+ (void *)S390_lowcore.pgm_last_break);
+}
diff --git a/arch/s390/boot/sclp_early_core.c b/arch/s390/boot/sclp_early_core.c
index 5a19fd7020b5..6f30646afbd0 100644
--- a/arch/s390/boot/sclp_early_core.c
+++ b/arch/s390/boot/sclp_early_core.c
@@ -1,2 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include "boot.h"
#include "../../../drivers/s390/char/sclp_early_core.c"
+
+/* SCLP early buffer must stay page-aligned and below 2GB */
+static char __sclp_early_sccb[EXT_SCCB_READ_SCP] __aligned(PAGE_SIZE);
+
+void sclp_early_setup_buffer(void)
+{
+ sclp_early_set_buffer(&__sclp_early_sccb);
+}
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 3b3a11f95269..47ca3264c023 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -1,55 +1,37 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/elf.h>
+#include <asm/boot_data.h>
#include <asm/sections.h>
+#include <asm/cpu_mf.h>
#include <asm/setup.h>
+#include <asm/kasan.h>
#include <asm/kexec.h>
#include <asm/sclp.h>
#include <asm/diag.h>
#include <asm/uv.h>
-#include "compressed/decompressor.h"
+#include <asm/abs_lowcore.h>
+#include "decompressor.h"
#include "boot.h"
+#include "uv.h"
-extern char __boot_data_start[], __boot_data_end[];
-extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata_preserved(__abs_lowcore);
+unsigned long __bootdata_preserved(__memcpy_real_area);
+unsigned long __bootdata(__amode31_base);
+unsigned long __bootdata_preserved(VMALLOC_START);
+unsigned long __bootdata_preserved(VMALLOC_END);
+struct page *__bootdata_preserved(vmemmap);
+unsigned long __bootdata_preserved(vmemmap_size);
+unsigned long __bootdata_preserved(MODULES_VADDR);
+unsigned long __bootdata_preserved(MODULES_END);
+unsigned long __bootdata(ident_map_size);
+int __bootdata(is_full_image) = 1;
+struct initrd_data __bootdata(initrd_data);
-/*
- * Some code and data needs to stay below 2 GB, even when the kernel would be
- * relocated above 2 GB, because it has to use 31 bit addresses.
- * Such code and data is part of the .dma section, and its location is passed
- * over to the decompressed / relocated kernel via the .boot.preserved.data
- * section.
- */
-extern char _sdma[], _edma[];
-extern char _stext_dma[], _etext_dma[];
-extern struct exception_table_entry _start_dma_ex_table[];
-extern struct exception_table_entry _stop_dma_ex_table[];
-unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma);
-unsigned long __bootdata_preserved(__edma) = __pa(&_edma);
-unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma);
-unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma);
-struct exception_table_entry *
- __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table;
-struct exception_table_entry *
- __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table;
-
-int _diag210_dma(struct diag210 *addr);
-int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode);
-int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode);
-void _diag0c_dma(struct hypfs_diag0c_entry *entry);
-void _diag308_reset_dma(void);
-struct diag_ops __bootdata_preserved(diag_dma_ops) = {
- .diag210 = _diag210_dma,
- .diag26c = _diag26c_dma,
- .diag14 = _diag14_dma,
- .diag0c = _diag0c_dma,
- .diag308_reset = _diag308_reset_dma
-};
-static struct diag210 _diag210_tmp_dma __section(.dma.data);
-struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma;
-void _swsusp_reset_dma(void);
-unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma);
+u64 __bootdata_preserved(stfle_fac_list[16]);
+u64 __bootdata_preserved(alt_stfle_fac_list[16]);
+struct oldmem_data __bootdata_preserved(oldmem_data);
void error(char *x)
{
@@ -60,6 +42,14 @@ void error(char *x)
disabled_wait();
}
+static void setup_lpp(void)
+{
+ S390_lowcore.current_pid = 0;
+ S390_lowcore.lpp = LPP_MAGIC;
+ if (test_facility(40))
+ lpp(&S390_lowcore.lpp);
+}
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void)
{
@@ -71,12 +61,12 @@ static void rescue_initrd(unsigned long addr)
{
if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD))
return;
- if (!INITRD_START || !INITRD_SIZE)
+ if (!initrd_data.start || !initrd_data.size)
return;
- if (addr <= INITRD_START)
+ if (addr <= initrd_data.start)
return;
- memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE);
- INITRD_START = addr;
+ memmove((void *)addr, (void *)initrd_data.start, initrd_data.size);
+ initrd_data.start = addr;
}
static void copy_bootdata(void)
@@ -120,41 +110,176 @@ static void handle_relocs(unsigned long offset)
}
}
+/*
+ * Merge information from several sources into a single ident_map_size value.
+ * "ident_map_size" represents the upper limit of physical memory we may ever
+ * reach. It might not be all online memory, but also include standby (offline)
+ * memory. "ident_map_size" could be lower than actual standby or even online
+ * memory present, due to limiting factors. We should never go above this limit.
+ * It is the size of our identity mapping.
+ *
+ * Consider the following factors:
+ * 1. max_physmem_end - end of physical memory online or standby.
+ * Always >= end of the last online memory block (get_mem_detect_end()).
+ * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the
+ * kernel is able to support.
+ * 3. "mem=" kernel command line option which limits physical memory usage.
+ * 4. OLDMEM_BASE which is a kdump memory limit when the kernel is executed as
+ * crash kernel.
+ * 5. "hsa" size which is a memory limit when the kernel is executed during
+ * zfcp/nvme dump.
+ */
+static void setup_ident_map_size(unsigned long max_physmem_end)
+{
+ unsigned long hsa_size;
+
+ ident_map_size = max_physmem_end;
+ if (memory_limit)
+ ident_map_size = min(ident_map_size, memory_limit);
+ ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS);
+
+#ifdef CONFIG_CRASH_DUMP
+ if (oldmem_data.start) {
+ kaslr_enabled = 0;
+ ident_map_size = min(ident_map_size, oldmem_data.size);
+ } else if (ipl_block_valid && is_ipl_block_dump()) {
+ kaslr_enabled = 0;
+ if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size)
+ ident_map_size = min(ident_map_size, hsa_size);
+ }
+#endif
+}
+
+static void setup_kernel_memory_layout(void)
+{
+ unsigned long vmemmap_start;
+ unsigned long rte_size;
+ unsigned long pages;
+ unsigned long vmax;
+
+ pages = ident_map_size / PAGE_SIZE;
+ /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
+ vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
+
+ /* choose kernel address space layout: 4 or 3 levels. */
+ vmemmap_start = round_up(ident_map_size, _REGION3_SIZE);
+ if (IS_ENABLED(CONFIG_KASAN) ||
+ vmalloc_size > _REGION2_SIZE ||
+ vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN >
+ _REGION2_SIZE) {
+ vmax = _REGION1_SIZE;
+ rte_size = _REGION2_SIZE;
+ } else {
+ vmax = _REGION2_SIZE;
+ rte_size = _REGION3_SIZE;
+ }
+ /*
+ * forcing modules and vmalloc area under the ultravisor
+ * secure storage limit, so that any vmalloc allocation
+ * we do could be used to back secure guest storage.
+ */
+ vmax = adjust_to_uv_max(vmax);
+#ifdef CONFIG_KASAN
+ /* force vmalloc and modules below kasan shadow */
+ vmax = min(vmax, KASAN_SHADOW_START);
+#endif
+ __memcpy_real_area = round_down(vmax - PAGE_SIZE, PAGE_SIZE);
+ __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
+ sizeof(struct lowcore));
+ MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
+ MODULES_VADDR = MODULES_END - MODULES_LEN;
+ VMALLOC_END = MODULES_VADDR;
+
+ /* allow vmalloc area to occupy up to about 1/2 of the remaining virtual space */
+ vmalloc_size = min(vmalloc_size, round_down(VMALLOC_END / 2, _REGION3_SIZE));
+ VMALLOC_START = VMALLOC_END - vmalloc_size;
+
+ /* split remaining virtual space between 1:1 mapping & vmemmap array */
+ pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
+ pages = SECTION_ALIGN_UP(pages);
+ /* keep vmemmap_start aligned to a top level region table entry */
+ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
+ /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */
+ vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
+ /* make sure identity map doesn't overlap with vmemmap */
+ ident_map_size = min(ident_map_size, vmemmap_start);
+ vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
+ /* make sure vmemmap doesn't overlap with vmalloc area */
+ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
+ vmemmap = (struct page *)vmemmap_start;
+}
+
+/*
+ * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's.
+ */
static void clear_bss_section(void)
{
memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size);
}
+/*
+ * Set vmalloc area size to an 8th of (potential) physical memory
+ * size, unless size has been set by kernel command line parameter.
+ */
+static void setup_vmalloc_size(void)
+{
+ unsigned long size;
+
+ if (vmalloc_size_set)
+ return;
+ size = round_up(ident_map_size / 8, _SEGMENT_SIZE);
+ vmalloc_size = max(size, vmalloc_size);
+}
+
+static void offset_vmlinux_info(unsigned long offset)
+{
+ vmlinux.default_lma += offset;
+ *(unsigned long *)(&vmlinux.entry) += offset;
+ vmlinux.bootdata_off += offset;
+ vmlinux.bootdata_preserved_off += offset;
+ vmlinux.rela_dyn_start += offset;
+ vmlinux.rela_dyn_end += offset;
+ vmlinux.dynsym_start += offset;
+}
+
+/*
+ * Reserve room for the kernel's amode31 area right above safe_addr.
+ *
+ * The area's base (__amode31_base) is safe_addr rounded up by PAGE_ALIGN(),
+ * so the end of the reservation must be derived from that aligned base and
+ * not from safe_addr itself; otherwise an unaligned safe_addr would leave
+ * the tail of the area outside of the reserved range.
+ *
+ * Returns the first address past the reserved area, which the caller uses
+ * as its new safe_addr.
+ */
+static unsigned long reserve_amode31(unsigned long safe_addr)
+{
+	__amode31_base = PAGE_ALIGN(safe_addr);
+	return __amode31_base + vmlinux.amode31_size;
+}
+
void startup_kernel(void)
{
unsigned long random_lma;
unsigned long safe_addr;
void *img;
+ initrd_data.start = parmarea.initrd_start;
+ initrd_data.size = parmarea.initrd_size;
+ oldmem_data.start = parmarea.oldmem_base;
+ oldmem_data.size = parmarea.oldmem_size;
+
+ setup_lpp();
store_ipl_parmblock();
safe_addr = mem_safe_offset();
+ safe_addr = reserve_amode31(safe_addr);
safe_addr = read_ipl_report(safe_addr);
uv_query_info();
rescue_initrd(safe_addr);
sclp_early_read_info();
setup_boot_command_line();
parse_boot_command_line();
- setup_memory_end();
- detect_memory();
+ sanitize_prot_virt_host();
+ setup_ident_map_size(detect_memory());
+ setup_vmalloc_size();
+ setup_kernel_memory_layout();
- random_lma = __kaslr_offset = 0;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) {
random_lma = get_random_base(safe_addr);
if (random_lma) {
__kaslr_offset = random_lma - vmlinux.default_lma;
img = (void *)vmlinux.default_lma;
- vmlinux.default_lma += __kaslr_offset;
- vmlinux.entry += __kaslr_offset;
- vmlinux.bootdata_off += __kaslr_offset;
- vmlinux.bootdata_preserved_off += __kaslr_offset;
- vmlinux.rela_dyn_start += __kaslr_offset;
- vmlinux.rela_dyn_end += __kaslr_offset;
- vmlinux.dynsym_start += __kaslr_offset;
+ offset_vmlinux_info(__kaslr_offset);
}
}
@@ -166,8 +291,7 @@ void startup_kernel(void)
clear_bss_section();
copy_bootdata();
- if (IS_ENABLED(CONFIG_RELOCATABLE))
- handle_relocs(__kaslr_offset);
+ handle_relocs(__kaslr_offset);
if (__kaslr_offset) {
/*
diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c
index b11e8108773a..faccb33b462c 100644
--- a/arch/s390/boot/string.c
+++ b/arch/s390/boot/string.c
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
#include "../lib/string.c"
int strncmp(const char *cs, const char *ct, size_t count)
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 3f501159ee9f..0a077c0a2056 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -1,9 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/uv.h>
+#include <asm/boot_data.h>
#include <asm/facility.h>
#include <asm/sections.h>
+#include "boot.h"
+#include "uv.h"
+
+/* will be used in arch/s390/kernel/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
int __bootdata_preserved(prot_virt_guest);
+#endif
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+#endif
+struct uv_info __bootdata_preserved(uv_info);
void uv_query_info(void)
{
@@ -19,7 +30,62 @@ void uv_query_info(void)
if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100)
return;
+ if (IS_ENABLED(CONFIG_KVM)) {
+ memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list));
+ uv_info.uv_base_stor_len = uvcb.uv_base_stor_len;
+ uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len;
+ uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len;
+ uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len;
+ uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len;
+ uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
+ uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
+ uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
+ uv_info.uv_feature_indications = uvcb.uv_feature_indications;
+ uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions;
+ uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf;
+ uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len;
+ uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
+ uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
+ uv_info.supp_att_pflags = uvcb.supp_att_pflags;
+ }
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) &&
test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list))
prot_virt_guest = 1;
+#endif
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+unsigned long adjust_to_uv_max(unsigned long limit)
+{
+ if (is_prot_virt_host() && uv_info.max_sec_stor_addr)
+ limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr);
+ return limit;
+}
+
+static int is_prot_virt_host_capable(void)
+{
+ /* disable if no prot_virt=1 given on command-line */
+ if (!is_prot_virt_host())
+ return 0;
+ /* disable if protected guest virtualization is enabled */
+ if (is_prot_virt_guest())
+ return 0;
+ /* disable if no hardware support */
+ if (!test_facility(158))
+ return 0;
+ /* disable if kdump */
+ if (oldmem_data.start)
+ return 0;
+ /* disable if stand-alone dump */
+ if (ipl_block_valid && is_ipl_block_dump())
+ return 0;
+ return 1;
+}
+
+void sanitize_prot_virt_host(void)
+{
+ prot_virt_host = is_prot_virt_host_capable();
}
+#endif
diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h
new file mode 100644
index 000000000000..0f3070856f8d
--- /dev/null
+++ b/arch/s390/boot/uv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_UV_H
+#define BOOT_UV_H
+
+#if IS_ENABLED(CONFIG_KVM)
+unsigned long adjust_to_uv_max(unsigned long limit);
+void sanitize_prot_virt_host(void);
+#else
+static inline unsigned long adjust_to_uv_max(unsigned long limit)
+{
+ return limit;
+}
+static inline void sanitize_prot_virt_host(void) {}
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+void uv_query_info(void);
+#else
+static inline void uv_query_info(void) {}
+#endif
+
+#endif /* BOOT_UV_H */
diff --git a/arch/s390/boot/version.c b/arch/s390/boot/version.c
index d32e58bdda6a..fd32f038777f 100644
--- a/arch/s390/boot/version.c
+++ b/arch/s390/boot/version.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <generated/utsversion.h>
#include <generated/utsrelease.h>
#include <generated/compile.h>
#include "boot.h"
diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index 44561b2c3712..fa9d33b01b85 100644
--- a/arch/s390/boot/compressed/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -1,6 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm-generic/vmlinux.lds.h>
#include <asm/vmlinux.lds.h>
+#include <asm/thread_info.h>
+#include <asm/page.h>
+#include <asm/sclp.h>
+#include "boot.h"
OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390")
OUTPUT_ARCH(s390:64-bit)
@@ -10,11 +14,19 @@ ENTRY(startup)
SECTIONS
{
. = 0;
+ .ipldata : {
+ *(.ipldata)
+ }
+ . = IPL_START;
.head.text : {
_head = . ;
HEAD_TEXT
_ehead = . ;
}
+ . = PARMAREA;
+ .parmarea : {
+ *(.parmarea)
+ }
.text : {
_text = .; /* Text */
*(.text)
@@ -27,38 +39,42 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+ NOTES
.data : {
_data = . ;
*(.data)
*(.data.*)
_edata = . ;
}
- /*
- * .dma section for code, data, ex_table that need to stay below 2 GB,
- * even when the kernel is relocate: above 2 GB.
- */
- . = ALIGN(PAGE_SIZE);
- _sdma = .;
- .dma.text : {
- _stext_dma = .;
- *(.dma.text)
- . = ALIGN(PAGE_SIZE);
- _etext_dma = .;
- }
- . = ALIGN(16);
- .dma.ex_table : {
- _start_dma_ex_table = .;
- KEEP(*(.dma.ex_table))
- _stop_dma_ex_table = .;
- }
- .dma.data : { *(.dma.data) }
- . = ALIGN(PAGE_SIZE);
- _edma = .;
BOOT_DATA
BOOT_DATA_PRESERVED
/*
+ * This is the BSS section of the decompressor and not of the decompressed Linux kernel.
+ * It will consume space in the decompressor's image.
+ */
+ . = ALIGN(8);
+ .bss : {
+ _bss = . ;
+ *(.bss)
+ *(.bss.*)
+ *(COMMON)
+ /*
+ * Stacks for the decompressor
+ */
+ . = ALIGN(PAGE_SIZE);
+ _dump_info_stack_start = .;
+ . += PAGE_SIZE;
+ _dump_info_stack_end = .;
+ . = ALIGN(PAGE_SIZE);
+ _stack_start = .;
+ . += BOOT_STACK_SIZE;
+ _stack_end = .;
+ _ebss = .;
+ }
+
+ /*
* uncompressed image info used by the decompressor it should match
* struct vmlinux_info. It comes from .vmlinux.info section of
* uncompressed vmlinux in a form of info.o
@@ -69,6 +85,14 @@ SECTIONS
*(.vmlinux.info)
}
+ .decompressor.syms : {
+ . += 1; /* make sure we have \0 before the first entry */
+ . = ALIGN(2);
+ _decompressor_syms_start = .;
+ *(.decompressor.syms)
+ _decompressor_syms_end = .;
+ }
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
. = 0x100000;
#else
@@ -78,17 +102,17 @@ SECTIONS
_compressed_start = .;
*(.vmlinux.bin.compressed)
_compressed_end = .;
- FILL(0xff);
- . = ALIGN(4096);
}
- . = ALIGN(256);
- .bss : {
- _bss = . ;
- *(.bss)
- *(.bss.*)
- *(COMMON)
- . = ALIGN(8); /* For convenience during zeroing */
- _ebss = .;
+
+#define SB_TRAILER_SIZE 32
+ /* Trailer needed for Secure Boot */
+ . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
+ . = ALIGN(4096) - SB_TRAILER_SIZE;
+ .sb.trailer : {
+ QUAD(0)
+ QUAD(0)
+ QUAD(0)
+ QUAD(0x000000207a49504c)
}
_end = .;
diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config
new file mode 100644
index 000000000000..39227b4511af
--- /dev/null
+++ b/arch/s390/configs/btf.config
@@ -0,0 +1 @@
+CONFIG_DEBUG_INFO_BTF=y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 0c86ba19fa2b..63807bd0b536 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -1,9 +1,16 @@
+CONFIG_UAPI_HEADER_TEST=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_LSM=y
CONFIG_PREEMPT=y
+CONFIG_SCHED_CORE=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
@@ -14,7 +21,6 @@ CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
@@ -27,18 +33,16 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_MISC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
-CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_BPF_SYSCALL=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_LIVEPATCH=y
+CONFIG_MARCH_ZEC12=y
CONFIG_TUNE_ZEC12=y
CONFIG_NR_CPUS=512
CONFIG_NUMA=y
@@ -51,32 +55,34 @@ CONFIG_CHSC_SCH=y
CONFIG_VFIO_CCW=m
CONFIG_VFIO_AP=m
CONFIG_CRASH_DUMP=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_DEBUG=y
CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
CONFIG_KVM=m
-CONFIG_VHOST_NET=m
-CONFIG_VHOST_VSOCK=m
-CONFIG_OPROFILE=m
+CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_STATIC_KEYS_SELFTEST=y
+CONFIG_SECCOMP_CACHE_DEBUG=y
CONFIG_LOCK_EVENT_COUNTS=y
+# CONFIG_GCC_PLUGINS is not set
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
-CONFIG_BLK_DEV_INTEGRITY=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
+CONFIG_BLK_INLINE_ENCRYPTION=y
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_IBM_PARTITION=y
CONFIG_BSD_DISKLABEL=y
@@ -86,23 +92,25 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
+CONFIG_ZSWAP=y
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_SLUB_STATS=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
CONFIG_CMA_DEBUG=y
CONFIG_CMA_DEBUGFS=y
+CONFIG_CMA_SYSFS=y
+CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZBUD=m
-CONFIG_ZSMALLOC=m
-CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
+CONFIG_ANON_VMA_NAME=y
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -130,6 +138,7 @@ CONFIG_SYN_COOKIES=y
CONFIG_NET_IPVTI=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
+CONFIG_INET_ESPINTCP=y
CONFIG_INET_IPCOMP=m
CONFIG_INET_DIAG=m
CONFIG_INET_UDP_DIAG=m
@@ -144,6 +153,7 @@ CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
+CONFIG_INET6_ESPINTCP=y
CONFIG_INET6_IPCOMP=m
CONFIG_IPV6_MIP6=m
CONFIG_IPV6_VTI=m
@@ -151,9 +161,14 @@ CONFIG_IPV6_SIT=m
CONFIG_IPV6_GRE=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_RPL_LWTUNNEL=y
+CONFIG_MPTCP=y
CONFIG_NETFILTER=y
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -170,13 +185,16 @@ CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_CT_NETLINK=m
CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
+CONFIG_NFT_OBJREF=m
+CONFIG_NFT_REJECT=m
CONFIG_NFT_COMPAT=m
CONFIG_NFT_HASH=m
+CONFIG_NFT_FIB_INET=m
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_AUDIT=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
@@ -265,10 +283,12 @@ CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m
+CONFIG_IP_VS_TWOS=m
CONFIG_IP_VS_FTP=m
CONFIG_IP_VS_PE_SIP=m
-CONFIG_NF_TABLES_IPV4=y
+CONFIG_NFT_FIB_IPV4=m
CONFIG_NF_TABLES_ARP=y
+CONFIG_NF_LOG_IPV4=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
@@ -287,7 +307,7 @@ CONFIG_IP_NF_SECURITY=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_TABLES_IPV6=y
+CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -316,7 +336,8 @@ CONFIG_L2TP_DEBUGFS=m
CONFIG_L2TP_V3=y
CONFIG_L2TP_IP=m
CONFIG_L2TP_ETH=m
-CONFIG_BRIDGE=m
+CONFIG_BRIDGE=y
+CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
@@ -341,6 +362,7 @@ CONFIG_NET_SCH_CODEL=m
CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
+CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
@@ -364,24 +386,26 @@ CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_GATE=m
CONFIG_DNS_RESOLVER=y
CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
-CONFIG_BPF_JIT=y
CONFIG_NET_PKTGEN=m
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_PCI=y
+# CONFIG_PCIEASPM is not set
CONFIG_PCI_DEBUG=y
+CONFIG_PCI_IOV=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_CONNECTOR=y
-CONFIG_ZRAM=m
+CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
@@ -402,12 +426,12 @@ CONFIG_SCSI_ENCLOSURE=m
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_SCSI_DEBUG=m
-CONFIG_ZFCP=y
+CONFIG_ZFCP=m
CONFIG_SCSI_VIRTIO=m
CONFIG_SCSI_DH=y
CONFIG_SCSI_DH_RDAC=m
@@ -421,7 +445,7 @@ CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_MD_CLUSTER=m
CONFIG_BCACHE=m
-CONFIG_BLK_DEV_DM=m
+CONFIG_BLK_DEV_DM=y
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
@@ -435,12 +459,16 @@ CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_MULTIPATH_HST=m
+CONFIG_DM_MULTIPATH_IOA=m
CONFIG_DM_DELAY=m
+CONFIG_DM_INIT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_FLAKEY=m
CONFIG_DM_VERITY=m
CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y
CONFIG_DM_SWITCH=m
+CONFIG_DM_INTEGRITY=m
CONFIG_NETDEVICES=y
CONFIG_BONDING=m
CONFIG_DUMMY=m
@@ -448,6 +476,9 @@ CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
+CONFIG_VXLAN=m
+CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -461,40 +492,45 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_AURORA is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_CADENCE is not set
# CONFIG_NET_VENDOR_CAVIUM is not set
# CONFIG_NET_VENDOR_CHELSIO is not set
# CONFIG_NET_VENDOR_CISCO is not set
# CONFIG_NET_VENDOR_CORTINA is not set
+# CONFIG_NET_VENDOR_DAVICOM is not set
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
+# CONFIG_NET_VENDOR_FUNGIBLE is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
# CONFIG_NET_VENDOR_INTEL is not set
+# CONFIG_NET_VENDOR_WANGXUN is not set
+# CONFIG_NET_VENDOR_LITEX is not set
# CONFIG_NET_VENDOR_MARVELL is not set
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-# CONFIG_MLXFW is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
+# CONFIG_NET_VENDOR_MICROSOFT is not set
# CONFIG_NET_VENDOR_MYRI is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETERION is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
-# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
# CONFIG_NET_VENDOR_PACKET_ENGINES is not set
# CONFIG_NET_VENDOR_PENSANDO is not set
# CONFIG_NET_VENDOR_QLOGIC is not set
+# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RDC is not set
# CONFIG_NET_VENDOR_REALTEK is not set
@@ -502,9 +538,9 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_ROCKER is not set
# CONFIG_NET_VENDOR_SAMSUNG is not set
# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
+# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SMSC is not set
# CONFIG_NET_VENDOR_SOCIONEXT is not set
# CONFIG_NET_VENDOR_STMICRO is not set
@@ -512,8 +548,10 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
# CONFIG_NET_VENDOR_WIZNET is not set
+# CONFIG_NET_VENDOR_XILINX is not set
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
@@ -531,11 +569,12 @@ CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_SERIO is not set
CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_NULL_TTY=m
+CONFIG_VIRTIO_CONSOLE=m
CONFIG_HW_RANDOM_VIRTIO=m
-CONFIG_RAW_DRIVER=m
CONFIG_HANGCHECK_TIMER=m
CONFIG_TN3270_FS=y
+# CONFIG_RANDOM_TRUST_CPU is not set
+# CONFIG_RANDOM_TRUST_BOOTLOADER is not set
CONFIG_PPS=m
# CONFIG_PTP_1588_CLOCK is not set
# CONFIG_HWMON is not set
@@ -543,6 +582,7 @@ CONFIG_WATCHDOG=y
CONFIG_WATCHDOG_NOWAYOUT=y
CONFIG_SOFT_WATCHDOG=m
CONFIG_DIAG288_WATCHDOG=m
+# CONFIG_DRM_DEBUG_MODESET_LOCK is not set
CONFIG_FB=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
@@ -555,11 +595,13 @@ CONFIG_MLX5_INFINIBAND=m
CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
+CONFIG_MLX5_VFIO_PCI=m
CONFIG_VFIO_MDEV=m
-CONFIG_VFIO_MDEV_DEVICE=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
+CONFIG_VHOST_NET=m
+CONFIG_VHOST_VSOCK=m
CONFIG_S390_CCW_IOMMU=y
CONFIG_S390_AP_IOMMU=y
CONFIG_EXT4_FS=y
@@ -599,6 +641,7 @@ CONFIG_FUSE_FS=y
CONFIG_CUSE=m
CONFIG_VIRTIO_FS=m
CONFIG_OVERLAY_FS=m
+CONFIG_NETFS_STATS=y
CONFIG_FSCACHE=m
CONFIG_CACHEFILES=m
CONFIG_ISO9660_FS=y
@@ -607,11 +650,13 @@ CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_EXFAT_FS=m
CONFIG_NTFS_FS=m
CONFIG_NTFS_RW=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_INODE64=y
CONFIG_HUGETLBFS=y
CONFIG_CONFIGFS_FS=m
CONFIG_ECRYPT_FS=m
@@ -632,13 +677,12 @@ CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
CONFIG_CIFS=m
-CONFIG_CIFS_STATS2=y
-CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG is not set
CONFIG_CIFS_DFS_UPCALL=y
+CONFIG_CIFS_SWN_UPCALL=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -649,16 +693,18 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
-CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
+CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_FORTIFY_SOURCE=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y
+CONFIG_SECURITY_LANDLOCK=y
CONFIG_INTEGRITY_SIGNATURE=y
CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
CONFIG_IMA=y
@@ -673,41 +719,46 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
+CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
-CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CFB=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_KEYWRAP=m
-CONFIG_CRYPTO_ADIANTUM=m
-CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
-CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_SM2=m
+CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_CFB=m
+CONFIG_CRYPTO_HCTR2=m
+CONFIG_CRYPTO_KEYWRAP=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_AEGIS128=m
+CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_RMD160=m
+CONFIG_CRYPTO_SHA3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
+CONFIG_CRYPTO_VMAC=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_842=m
CONFIG_CRYPTO_LZ4=m
CONFIG_CRYPTO_LZ4HC=m
@@ -718,40 +769,44 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_ZCRYPT=m
-CONFIG_PKEY=m
-CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
-CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
-CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_DES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
+CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
+CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC32_SELFTEST=y
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
CONFIG_RANDOM32_SELFTEST=y
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
-CONFIG_DMA_API_DEBUG=y
-CONFIG_STRING_SELFTEST=y
CONFIG_PRINTK_TIME=y
CONFIG_DYNAMIC_DEBUG=y
-CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_GDB_SCRIPTS=y
-CONFIG_FRAME_WARN=1024
CONFIG_HEADERS_INSTALL=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
CONFIG_PAGE_OWNER=y
CONFIG_DEBUG_RODATA_TEST=y
+CONFIG_DEBUG_WX=y
+CONFIG_PTDUMP_DEBUGFS=y
CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_SELFTEST=y
CONFIG_DEBUG_OBJECTS_FREE=y
@@ -759,33 +814,35 @@ CONFIG_DEBUG_OBJECTS_TIMERS=y
CONFIG_DEBUG_OBJECTS_WORK=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
-CONFIG_SLUB_DEBUG_ON=y
-CONFIG_SLUB_STATS=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_VM_VMACACHE=y
-CONFIG_DEBUG_VM_RB=y
CONFIG_DEBUG_VM_PGFLAGS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_KFENCE=y
+CONFIG_KFENCE_DEFERRABLE=y
+CONFIG_KFENCE_STATIC_KEYS=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_WQ_WATCHDOG=y
-CONFIG_DEBUG_TIMEKEEPING=y
+CONFIG_TEST_LOCKUP=m
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
-CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
+CONFIG_DEBUG_IRQFLAGS=y
CONFIG_DEBUG_SG=y
CONFIG_DEBUG_NOTIFIERS=y
CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_DEBUG_CREDENTIALS=y
CONFIG_RCU_TORTURE_TEST=m
+CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=300
+# CONFIG_RCU_TRACE is not set
CONFIG_LATENCYTOP=y
+CONFIG_BOOTTIME_TRACING=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
CONFIG_IRQSOFF_TRACER=y
@@ -793,24 +850,38 @@ CONFIG_PREEMPT_TRACER=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_BPF_KPROBE_OVERRIDE=y
CONFIG_HIST_TRIGGERS=y
-CONFIG_S390_PTDUMP=y
+CONFIG_FTRACE_STARTUP_TEST=y
+# CONFIG_EVENT_TRACE_STARTUP_TEST is not set
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_DEBUG_ENTRY=y
+CONFIG_CIO_INJECT=y
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
CONFIG_NOTIFIER_ERROR_INJECTION=m
CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m
CONFIG_FAULT_INJECTION=y
CONFIG_FAILSLAB=y
CONFIG_FAIL_PAGE_ALLOC=y
+CONFIG_FAULT_INJECTION_USERCOPY=y
CONFIG_FAIL_MAKE_REQUEST=y
CONFIG_FAIL_IO_TIMEOUT=y
CONFIG_FAIL_FUTEX=y
CONFIG_FAULT_INJECTION_DEBUG_FS=y
+CONFIG_FAIL_FUNCTION=y
CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
CONFIG_LKDTM=m
-CONFIG_TEST_LIST_SORT=y
-CONFIG_TEST_SORT=y
-CONFIG_KPROBES_SANITY_TEST=y
+CONFIG_TEST_MIN_HEAP=y
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_RBTREE_TEST=y
CONFIG_INTERVAL_TREE_TEST=m
CONFIG_PERCPU_TEST=m
CONFIG_ATOMIC64_SELFTEST=y
+CONFIG_STRING_SELFTEST=y
+CONFIG_TEST_BITOPS=m
CONFIG_TEST_BPF=m
+CONFIG_TEST_LIVEPATCH=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 6b27d861a9a3..4f9a98247442 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -1,8 +1,14 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_LSM=y
+CONFIG_SCHED_CORE=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
@@ -13,7 +19,6 @@ CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
CONFIG_RT_GROUP_SCHED=y
@@ -26,22 +31,19 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_MISC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
-CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_BPF_SYSCALL=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_LIVEPATCH=y
+CONFIG_MARCH_ZEC12=y
CONFIG_TUNE_ZEC12=y
CONFIG_NR_CPUS=512
CONFIG_NUMA=y
-# CONFIG_NUMA_EMU is not set
CONFIG_HZ_100=y
CONFIG_KEXEC_FILE=y
CONFIG_KEXEC_SIG=y
@@ -51,29 +53,31 @@ CONFIG_CHSC_SCH=y
CONFIG_VFIO_CCW=m
CONFIG_VFIO_AP=m
CONFIG_CRASH_DUMP=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_DEBUG=y
CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
CONFIG_KVM=m
-CONFIG_VHOST_NET=m
-CONFIG_VHOST_VSOCK=m
-CONFIG_OPROFILE=m
+CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
+# CONFIG_GCC_PLUGINS is not set
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
+CONFIG_BLK_INLINE_ENCRYPTION=y
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_IBM_PARTITION=y
CONFIG_BSD_DISKLABEL=y
@@ -83,21 +87,21 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
+CONFIG_ZSWAP=y
+CONFIG_ZSMALLOC_STAT=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
+CONFIG_CMA_SYSFS=y
+CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZBUD=m
-CONFIG_ZSMALLOC=m
-CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_ANON_VMA_NAME=y
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -125,6 +129,7 @@ CONFIG_SYN_COOKIES=y
CONFIG_NET_IPVTI=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
+CONFIG_INET_ESPINTCP=y
CONFIG_INET_IPCOMP=m
CONFIG_INET_DIAG=m
CONFIG_INET_UDP_DIAG=m
@@ -139,6 +144,7 @@ CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
+CONFIG_INET6_ESPINTCP=y
CONFIG_INET6_IPCOMP=m
CONFIG_IPV6_MIP6=m
CONFIG_IPV6_VTI=m
@@ -146,9 +152,14 @@ CONFIG_IPV6_SIT=m
CONFIG_IPV6_GRE=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_RPL_LWTUNNEL=y
+CONFIG_MPTCP=y
CONFIG_NETFILTER=y
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -165,13 +176,16 @@ CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_CT_NETLINK=m
CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
+CONFIG_NFT_OBJREF=m
+CONFIG_NFT_REJECT=m
CONFIG_NFT_COMPAT=m
CONFIG_NFT_HASH=m
+CONFIG_NFT_FIB_INET=m
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_AUDIT=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
@@ -260,10 +274,12 @@ CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m
+CONFIG_IP_VS_TWOS=m
CONFIG_IP_VS_FTP=m
CONFIG_IP_VS_PE_SIP=m
-CONFIG_NF_TABLES_IPV4=y
+CONFIG_NFT_FIB_IPV4=m
CONFIG_NF_TABLES_ARP=y
+CONFIG_NF_LOG_IPV4=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
@@ -282,7 +298,7 @@ CONFIG_IP_NF_SECURITY=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_TABLES_IPV6=y
+CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -310,7 +326,8 @@ CONFIG_L2TP_DEBUGFS=m
CONFIG_L2TP_V3=y
CONFIG_L2TP_IP=m
CONFIG_L2TP_ETH=m
-CONFIG_BRIDGE=m
+CONFIG_BRIDGE=y
+CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
@@ -335,6 +352,7 @@ CONFIG_NET_SCH_CODEL=m
CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
+CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
@@ -358,29 +376,30 @@ CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_GATE=m
CONFIG_DNS_RESOLVER=y
CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
-CONFIG_BPF_JIT=y
CONFIG_NET_PKTGEN=m
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_PCI=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCI_IOV=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_UEVENT_HELPER=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_CONNECTOR=y
-CONFIG_ZRAM=m
+CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=32768
-# CONFIG_BLK_DEV_XPRAM is not set
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_RBD=m
CONFIG_BLK_DEV_NVME=m
@@ -397,12 +416,12 @@ CONFIG_SCSI_ENCLOSURE=m
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_SCSI_DEBUG=m
-CONFIG_ZFCP=y
+CONFIG_ZFCP=m
CONFIG_SCSI_VIRTIO=m
CONFIG_SCSI_DH=y
CONFIG_SCSI_DH_RDAC=m
@@ -416,7 +435,7 @@ CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_MD_CLUSTER=m
CONFIG_BCACHE=m
-CONFIG_BLK_DEV_DM=m
+CONFIG_BLK_DEV_DM=y
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
@@ -430,7 +449,10 @@ CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_MULTIPATH_HST=m
+CONFIG_DM_MULTIPATH_IOA=m
CONFIG_DM_DELAY=m
+CONFIG_DM_INIT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_FLAKEY=m
CONFIG_DM_VERITY=m
@@ -444,6 +466,9 @@ CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
+CONFIG_VXLAN=m
+CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -457,40 +482,45 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_AURORA is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_CADENCE is not set
# CONFIG_NET_VENDOR_CAVIUM is not set
# CONFIG_NET_VENDOR_CHELSIO is not set
# CONFIG_NET_VENDOR_CISCO is not set
# CONFIG_NET_VENDOR_CORTINA is not set
+# CONFIG_NET_VENDOR_DAVICOM is not set
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
+# CONFIG_NET_VENDOR_FUNGIBLE is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
# CONFIG_NET_VENDOR_INTEL is not set
+# CONFIG_NET_VENDOR_WANGXUN is not set
+# CONFIG_NET_VENDOR_LITEX is not set
# CONFIG_NET_VENDOR_MARVELL is not set
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-# CONFIG_MLXFW is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
+# CONFIG_NET_VENDOR_MICROSOFT is not set
# CONFIG_NET_VENDOR_MYRI is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETERION is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
-# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
# CONFIG_NET_VENDOR_PACKET_ENGINES is not set
# CONFIG_NET_VENDOR_PENSANDO is not set
# CONFIG_NET_VENDOR_QLOGIC is not set
+# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RDC is not set
# CONFIG_NET_VENDOR_REALTEK is not set
@@ -498,9 +528,9 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_ROCKER is not set
# CONFIG_NET_VENDOR_SAMSUNG is not set
# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
+# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SMSC is not set
# CONFIG_NET_VENDOR_SOCIONEXT is not set
# CONFIG_NET_VENDOR_STMICRO is not set
@@ -508,8 +538,10 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
# CONFIG_NET_VENDOR_WIZNET is not set
+# CONFIG_NET_VENDOR_XILINX is not set
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
@@ -527,11 +559,12 @@ CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_SERIO is not set
CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_NULL_TTY=m
+CONFIG_VIRTIO_CONSOLE=m
CONFIG_HW_RANDOM_VIRTIO=m
-CONFIG_RAW_DRIVER=m
CONFIG_HANGCHECK_TIMER=m
CONFIG_TN3270_FS=y
+# CONFIG_RANDOM_TRUST_CPU is not set
+# CONFIG_RANDOM_TRUST_BOOTLOADER is not set
# CONFIG_PTP_1588_CLOCK is not set
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
@@ -551,11 +584,13 @@ CONFIG_MLX5_INFINIBAND=m
CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
+CONFIG_MLX5_VFIO_PCI=m
CONFIG_VFIO_MDEV=m
-CONFIG_VFIO_MDEV_DEVICE=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
+CONFIG_VHOST_NET=m
+CONFIG_VHOST_VSOCK=m
CONFIG_S390_CCW_IOMMU=y
CONFIG_S390_AP_IOMMU=y
CONFIG_EXT4_FS=y
@@ -591,6 +626,7 @@ CONFIG_FUSE_FS=y
CONFIG_CUSE=m
CONFIG_VIRTIO_FS=m
CONFIG_OVERLAY_FS=m
+CONFIG_NETFS_STATS=y
CONFIG_FSCACHE=m
CONFIG_CACHEFILES=m
CONFIG_ISO9660_FS=y
@@ -599,11 +635,13 @@ CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_EXFAT_FS=m
CONFIG_NTFS_FS=m
CONFIG_NTFS_RW=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_INODE64=y
CONFIG_HUGETLBFS=y
CONFIG_CONFIGFS_FS=m
CONFIG_ECRYPT_FS=m
@@ -624,13 +662,12 @@ CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
CONFIG_CIFS=m
-CONFIG_CIFS_STATS2=y
-CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG is not set
CONFIG_CIFS_DFS_UPCALL=y
+CONFIG_CIFS_SWN_UPCALL=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -641,8 +678,8 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
-CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
+CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
@@ -650,6 +687,7 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y
+CONFIG_SECURITY_LANDLOCK=y
CONFIG_INTEGRITY_SIGNATURE=y
CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
CONFIG_IMA=y
@@ -665,42 +703,47 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
+CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
-CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CFB=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_KEYWRAP=m
-CONFIG_CRYPTO_ADIANTUM=m
-CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
-CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_SM2=m
+CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_CFB=m
+CONFIG_CRYPTO_HCTR2=m
+CONFIG_CRYPTO_KEYWRAP=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_OFB=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_AEGIS128=m
+CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_RMD160=m
+CONFIG_CRYPTO_SHA3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
+CONFIG_CRYPTO_VMAC=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_842=m
CONFIG_CRYPTO_LZ4=m
CONFIG_CRYPTO_LZ4HC=m
@@ -711,45 +754,63 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_ZCRYPT=m
-CONFIG_PKEY=m
-CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
-CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
-CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_DES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
+CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
+CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
+CONFIG_PRIME_NUMBERS=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
+CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_GDB_SCRIPTS=y
-CONFIG_FRAME_WARN=1024
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_WX=y
+CONFIG_PTDUMP_DEBUGFS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_PANIC_ON_OOPS=y
+CONFIG_TEST_LOCKUP=m
CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_RCU_TORTURE_TEST=m
+CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_LATENCYTOP=y
+CONFIG_BOOTTIME_TRACING=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_BPF_KPROBE_OVERRIDE=y
CONFIG_HIST_TRIGGERS=y
-CONFIG_S390_PTDUMP=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
CONFIG_LKDTM=m
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_PERCPU_TEST=m
CONFIG_ATOMIC64_SELFTEST=y
CONFIG_TEST_BPF=m
+CONFIG_TEST_LIVEPATCH=m
diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config
new file mode 100644
index 000000000000..700a8b25c3ff
--- /dev/null
+++ b/arch/s390/configs/kasan.config
@@ -0,0 +1,3 @@
+CONFIG_KASAN=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 20c51e5d9353..5fe9948be644 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -1,38 +1,42 @@
-# CONFIG_SWAP is not set
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
# CONFIG_CPU_ISOLATION is not set
# CONFIG_UTS_NS is not set
+# CONFIG_TIME_NS is not set
# CONFIG_PID_NS is not set
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_COMPAT_BRK is not set
+CONFIG_MARCH_ZEC12=y
CONFIG_TUNE_ZEC12=y
# CONFIG_COMPAT is not set
CONFIG_NR_CPUS=2
CONFIG_HZ_100=y
-# CONFIG_ARCH_RANDOM is not set
# CONFIG_RELOCATABLE is not set
# CONFIG_CHSC_SCH is not set
# CONFIG_SCM_BUS is not set
CONFIG_CRASH_DUMP=y
-# CONFIG_SECCOMP is not set
# CONFIG_PFAULT is not set
# CONFIG_S390_HYPFS_FS is not set
# CONFIG_VIRTUALIZATION is not set
# CONFIG_S390_GUEST is not set
+# CONFIG_SECCOMP is not set
+# CONFIG_GCC_PLUGINS is not set
+# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set
CONFIG_PARTITION_ADVANCED=y
-CONFIG_IBM_PARTITION=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_SWAP is not set
+# CONFIG_COMPAT_BRK is not set
# CONFIG_COMPACTION is not set
# CONFIG_MIGRATION is not set
-# CONFIG_BOUNCE is not set
CONFIG_NET=y
# CONFIG_IUCV is not set
+# CONFIG_PCPU_DEV_REFCNT is not set
+# CONFIG_ETHTOOL_NETLINK is not set
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_BLK_DEV_XPRAM is not set
# CONFIG_DCSSBLK is not set
# CONFIG_DASD is not set
CONFIG_ENCLOSURE_SERVICES=y
@@ -48,26 +52,32 @@ CONFIG_ZFCP=y
# CONFIG_SERIO is not set
# CONFIG_HVC_IUCV is not set
# CONFIG_HW_RANDOM_S390 is not set
-CONFIG_RAW_DRIVER=y
# CONFIG_HMC_DRV is not set
+# CONFIG_S390_UV_UAPI is not set
# CONFIG_S390_TAPE is not set
# CONFIG_VMCP is not set
# CONFIG_MONWRITER is not set
# CONFIG_S390_VMUR is not set
+# CONFIG_RANDOM_TRUST_BOOTLOADER is not set
# CONFIG_HID is not set
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_VHOST_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
# CONFIG_DNOTIFY is not set
# CONFIG_INOTIFY_USER is not set
-CONFIG_CONFIGFS_FS=y
# CONFIG_MISC_FILESYSTEMS is not set
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_LSM="yama,loadpin,safesetid,integrity"
+# CONFIG_ZLIB_DFLTCC is not set
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_FS=y
+# CONFIG_SYMBOLIC_ERRNAME is not set
CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_DEBUG_FS=y
CONFIG_PANIC_ON_OOPS=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=60
+# CONFIG_RCU_TRACE is not set
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set
diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig
new file mode 100644
index 000000000000..06ee706b0d78
--- /dev/null
+++ b/arch/s390/crypto/Kconfig
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (s390)"
+
+config CRYPTO_CRC32_S390
+ tristate "CRC32c and CRC32"
+ depends on S390
+ select CRYPTO_HASH
+ select CRC32
+ help
+ CRC32c and CRC32 CRC algorithms
+
+ Architecture: s390
+
+ It is available with IBM z13 or later.
+
+config CRYPTO_SHA512_S390
+ tristate "Hash functions: SHA-384 and SHA-512"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z10.
+
+config CRYPTO_SHA1_S390
+ tristate "Hash functions: SHA-1"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z990.
+
+config CRYPTO_SHA256_S390
+ tristate "Hash functions: SHA-224 and SHA-256"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z9.
+
+config CRYPTO_SHA3_256_S390
+ tristate "Hash functions: SHA3-224 and SHA3-256"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202)
+
+ Architecture: s390
+
+ It is available as of z14.
+
+config CRYPTO_SHA3_512_S390
+ tristate "Hash functions: SHA3-384 and SHA3-512"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202)
+
+ Architecture: s390
+
+ It is available as of z14.
+
+config CRYPTO_GHASH_S390
+ tristate "Hash functions: GHASH"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ GCM GHASH hash function (NIST SP800-38D)
+
+ Architecture: s390
+
+ It is available as of z196.
+
+config CRYPTO_AES_S390
+ tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM"
+ depends on S390
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ help
+ Block cipher: AES cipher algorithms (FIPS 197)
+ AEAD cipher: AES with GCM
+ Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes
+
+ Architecture: s390
+
+ As of z9 the ECB and CBC modes are hardware accelerated
+ for 128 bit keys.
+
+ As of z10 the ECB and CBC modes are hardware accelerated
+ for all AES key sizes.
+
+ As of z196 the CTR mode is hardware accelerated for all AES
+ key sizes and XTS mode is hardware accelerated for 256 and
+ 512 bit keys.
+
+config CRYPTO_DES_S390
+ tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR"
+ depends on S390
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_DES
+ help
+ Block ciphers: DES (FIPS 46-2) cipher algorithm
+ Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm
+ Length-preserving ciphers: DES with ECB, CBC, and CTR modes
+ Length-preserving ciphers: Triple DES EDE with ECB, CBC, and CTR modes
+
+ Architecture: s390
+
+ As of z990 the ECB and CBC modes are hardware accelerated.
+ As of z196 the CTR mode is hardware accelerated.
+
+config CRYPTO_CHACHA_S390
+ tristate "Ciphers: ChaCha20"
+ depends on S390
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving cipher: ChaCha20 stream cipher (RFC 7539)
+
+ Architecture: s390
+
+ It is available as of z13.
+
+endmenu
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 12889d4652cc..1b1cc478fa94 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
+obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
-obj-$(CONFIG_ARCH_RANDOM) += arch_random.o
+obj-y += arch_random.o
crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
+chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 1c23d84a9097..526c3f40f6a2 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -21,6 +21,7 @@
#include <crypto/algapi.h>
#include <crypto/ghash.h>
#include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
@@ -342,6 +343,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier)
memcpy(walk.iv, param.iv, AES_BLOCK_SIZE);
ret = skcipher_walk_done(&walk, nbytes - n);
}
+ memzero_explicit(&param, sizeof(param));
return ret;
}
@@ -470,6 +472,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier)
walk.dst.virt.addr, walk.src.virt.addr, n);
ret = skcipher_walk_done(&walk, nbytes - n);
}
+ memzero_explicit(&pcc_param, sizeof(pcc_param));
+ memzero_explicit(&xts_param, sizeof(xts_param));
return ret;
}
@@ -697,7 +701,7 @@ static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw,
unsigned int nbytes)
{
gw->walk_bytes_remain -= nbytes;
- scatterwalk_unmap(&gw->walk);
+ scatterwalk_unmap(gw->walk_ptr);
scatterwalk_advance(&gw->walk, nbytes);
scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain);
gw->walk_ptr = NULL;
@@ -772,7 +776,7 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded)
goto out;
}
- scatterwalk_unmap(&gw->walk);
+ scatterwalk_unmap(gw->walk_ptr);
gw->walk_ptr = NULL;
gw->ptr = gw->buf;
@@ -1045,10 +1049,11 @@ out_err:
return ret;
}
-module_cpu_feature_match(MSA, aes_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init);
module_exit(aes_s390_fini);
MODULE_ALIAS_CRYPTO("aes-all");
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c
index dd95cdbd22ce..1f2d40993c4d 100644
--- a/arch/s390/crypto/arch_random.c
+++ b/arch/s390/crypto/arch_random.c
@@ -2,122 +2,17 @@
/*
* s390 arch random implementation.
*
- * Copyright IBM Corp. 2017, 2018
+ * Copyright IBM Corp. 2017, 2020
* Author(s): Harald Freudenberger
- *
- * The s390_arch_random_generate() function may be called from random.c
- * in interrupt context. So this implementation does the best to be very
- * fast. There is a buffer of random data which is asynchronously checked
- * and filled by a workqueue thread.
- * If there are enough bytes in the buffer the s390_arch_random_generate()
- * just delivers these bytes. Otherwise false is returned until the
- * worker thread refills the buffer.
- * The worker fills the rng buffer by pulling fresh entropy from the
- * high quality (but slow) true hardware random generator. This entropy
- * is then spread over the buffer with an pseudo random generator PRNG.
- * As the arch_get_random_seed_long() fetches 8 bytes and the calling
- * function add_interrupt_randomness() counts this as 1 bit entropy the
- * distribution needs to make sure there is in fact 1 bit entropy contained
- * in 8 bytes of the buffer. The current values pull 32 byte entropy
- * and scatter this into a 2048 byte buffer. So 8 byte in the buffer
- * will contain 1 bit of entropy.
- * The worker thread is rescheduled based on the charge level of the
- * buffer but at least with 500 ms delay to avoid too much CPU consumption.
- * So the max. amount of rng data delivered via arch_get_random_seed is
- * limited to 4k bytes per second.
*/
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/random.h>
-#include <linux/slab.h>
#include <linux/static_key.h>
-#include <linux/workqueue.h>
#include <asm/cpacf.h>
DEFINE_STATIC_KEY_FALSE(s390_arch_random_available);
atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0);
EXPORT_SYMBOL(s390_arch_random_counter);
-
-#define ARCH_REFILL_TICKS (HZ/2)
-#define ARCH_PRNG_SEED_SIZE 32
-#define ARCH_RNG_BUF_SIZE 2048
-
-static DEFINE_SPINLOCK(arch_rng_lock);
-static u8 *arch_rng_buf;
-static unsigned int arch_rng_buf_idx;
-
-static void arch_rng_refill_buffer(struct work_struct *);
-static DECLARE_DELAYED_WORK(arch_rng_work, arch_rng_refill_buffer);
-
-bool s390_arch_random_generate(u8 *buf, unsigned int nbytes)
-{
- /* lock rng buffer */
- if (!spin_trylock(&arch_rng_lock))
- return false;
-
- /* try to resolve the requested amount of bytes from the buffer */
- arch_rng_buf_idx -= nbytes;
- if (arch_rng_buf_idx < ARCH_RNG_BUF_SIZE) {
- memcpy(buf, arch_rng_buf + arch_rng_buf_idx, nbytes);
- atomic64_add(nbytes, &s390_arch_random_counter);
- spin_unlock(&arch_rng_lock);
- return true;
- }
-
- /* not enough bytes in rng buffer, refill is done asynchronously */
- spin_unlock(&arch_rng_lock);
-
- return false;
-}
-EXPORT_SYMBOL(s390_arch_random_generate);
-
-static void arch_rng_refill_buffer(struct work_struct *unused)
-{
- unsigned int delay = ARCH_REFILL_TICKS;
-
- spin_lock(&arch_rng_lock);
- if (arch_rng_buf_idx > ARCH_RNG_BUF_SIZE) {
- /* buffer is exhausted and needs refill */
- u8 seed[ARCH_PRNG_SEED_SIZE];
- u8 prng_wa[240];
- /* fetch ARCH_PRNG_SEED_SIZE bytes of entropy */
- cpacf_trng(NULL, 0, seed, sizeof(seed));
- /* blow this entropy up to ARCH_RNG_BUF_SIZE with PRNG */
- memset(prng_wa, 0, sizeof(prng_wa));
- cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED,
- &prng_wa, NULL, 0, seed, sizeof(seed));
- cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
- &prng_wa, arch_rng_buf, ARCH_RNG_BUF_SIZE, NULL, 0);
- arch_rng_buf_idx = ARCH_RNG_BUF_SIZE;
- }
- delay += (ARCH_REFILL_TICKS * arch_rng_buf_idx) / ARCH_RNG_BUF_SIZE;
- spin_unlock(&arch_rng_lock);
-
- /* kick next check */
- queue_delayed_work(system_long_wq, &arch_rng_work, delay);
-}
-
-static int __init s390_arch_random_init(void)
-{
- /* all the needed PRNO subfunctions available ? */
- if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG) &&
- cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) {
-
- /* alloc arch random working buffer */
- arch_rng_buf = kmalloc(ARCH_RNG_BUF_SIZE, GFP_KERNEL);
- if (!arch_rng_buf)
- return -ENOMEM;
-
- /* kick worker queue job to fill the random buffer */
- queue_delayed_work(system_long_wq,
- &arch_rng_work, ARCH_REFILL_TICKS);
-
- /* enable arch random to the outside world */
- static_branch_enable(&s390_arch_random_available);
- }
-
- return 0;
-}
-arch_initcall(s390_arch_random_init);
diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c
new file mode 100644
index 000000000000..7752bd314558
--- /dev/null
+++ b/arch/s390/crypto/chacha-glue.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#define KMSG_COMPONENT "chacha_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/algapi.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/fpu/api.h>
+#include "chacha-s390.h"
+
+static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
+ unsigned int nbytes, const u32 *key,
+ u32 *counter) /* note: state itself is not referenced in this body */
+{
+ struct kernel_fpu vxstate;
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR); /* vector routine runs between kernel_fpu_begin/end */
+ chacha20_vx(dst, src, nbytes, key, counter);
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+ *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; /* advance by the block count, a partial block counts as one */
+}
+
+static int chacha20_s390(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 state[CHACHA_STATE_WORDS] __aligned(16);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int rc;
+
+ rc = skcipher_walk_virt(&walk, req, false); /* rc is not tested here; the loop below is driven by walk.nbytes */
+ chacha_init_generic(state, ctx->key, req->iv); /* local state built from the tfm key and the request IV */
+
+ while (walk.nbytes > 0) {
+ nbytes = walk.nbytes;
+ if (nbytes < walk.total) /* more data follows this chunk */
+ nbytes = round_down(nbytes, walk.stride); /* process whole strides only */
+
+ if (nbytes <= CHACHA_BLOCK_SIZE) { /* one block or less: generic code */
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else { /* larger chunk: vector code, key = &state[4], counter = &state[12] */
+ chacha20_crypt_s390(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ &state[4], &state[12]);
+ }
+ rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); /* second argument: bytes of this step left unprocessed */
+ }
+ return rc;
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ /* TODO: implement hchacha_block_arch() in assembly */
+ hchacha_block_generic(state, stream, nrounds); /* until then, forward unchanged to the generic code */
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv); /* no s390-specific setup: forwards to the generic initializer */
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ /* The s390 chacha20 vector code has 20 rounds hard-coded and
+ * cannot handle a single block of data or less; such requests go
+ * to the generic code, anything larger may be of arbitrary size.
+ */
+ if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20)
+ chacha_crypt_generic(state, dst, src, bytes, nrounds); /* small input or non-20-round variant */
+ else
+ chacha20_crypt_s390(state, dst, src, bytes,
+ &state[4], &state[12]); /* key words start at state[4], counter at state[12] */
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static struct skcipher_alg chacha_algs[] = { /* algorithms (un)registered by chacha_mod_init/fini */
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-s390",
+ .base.cra_priority = 900,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE, /* min == max: exactly one key size is accepted */
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha20_s390, /* one handler serves both directions */
+ .decrypt = chacha20_s390,
+ }
+};
+
+static int __init chacha_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? /* register only when the skcipher API is reachable */
+ crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; /* otherwise report success without registering */
+}
+
+static void __exit chacha_mod_fini(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) /* same guard as in chacha_mod_init() */
+ crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
+}
+
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init);
+module_exit(chacha_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha20 stream cipher");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S
new file mode 100644
index 000000000000..9b033622191c
--- /dev/null
+++ b/arch/s390/crypto/chacha-s390.S
@@ -0,0 +1,907 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/vx-insn.h>
+
+#define SP %r15
+#define FRAME (16 * 8 + 4 * 8)
+
+.data
+.align 32
+
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
+.long 1,0,0,0 # offset 0x10: increment by 1 (loaded as T1 by chacha20_vx)
+.long 2,0,0,0 # offset 0x20: increment by 2 (loaded as T2 by chacha20_vx)
+.long 3,0,0,0 # offset 0x30: increment by 3 (loaded as T3 by chacha20_vx)
+.long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+
+.long 0,1,2,3 # offset 0x50: per-lane offsets (loaded as CTR by chacha20_vx_4x)
+.long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
+.long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
+.long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
+.long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
+
+.previous
+
+ GEN_BR_THUNK %r14
+
+.text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+# const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+#define CTR %v26
+
+#define K0 %v16
+#define K1 %v17
+#define K2 %v18
+#define K3 %v19
+
+#define XA0 %v0
+#define XA1 %v1
+#define XA2 %v2
+#define XA3 %v3
+
+#define XB0 %v4
+#define XB1 %v5
+#define XB2 %v6
+#define XB3 %v7
+
+#define XC0 %v8
+#define XC1 %v9
+#define XC2 %v10
+#define XC3 %v11
+
+#define XD0 %v12
+#define XD1 %v13
+#define XD2 %v14
+#define XD3 %v15
+
+#define XT0 %v27
+#define XT1 %v28
+#define XT2 %v29
+#define XT3 %v30
+
+ENTRY(chacha20_vx_4x)
+ stmg %r6,%r7,6*8(SP) # save %r6/%r7; reloaded by lmg on both exit paths
+
+ larl %r7,.Lsigma # %r7 = base of the constant table
+ lhi %r0,10 # iteration count for .Loop_4x (brct below)
+ lhi %r1,0
+
+ VL K0,0,,%r7 # load sigma
+ VL K1,0,,KEY # load key
+ VL K2,16,,KEY
+ VL K3,0,,COUNTER # load counter
+
+ VL BEPERM,0x40,,%r7 # byte swap permute mask from the table
+ VL CTR,0x50,,%r7 # lane offsets 0,1,2,3 from the table
+
+ VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
+
+ VREPF XB0,K1,0 # smash the key
+ VREPF XB1,K1,1
+ VREPF XB2,K1,2
+ VREPF XB3,K1,3
+
+ VREPF XD0,K3,0
+ VREPF XD1,K3,1
+ VREPF XD2,K3,2
+ VREPF XD3,K3,3
+ VAF XD0,XD0,CTR # add lane offsets 0..3 to the replicated first word of K3
+
+ VREPF XC0,K2,0
+ VREPF XC1,K2,1
+ VREPF XC2,K2,2
+ VREPF XC3,K2,3
+
+.Loop_4x:
+ VAF XA0,XA0,XB0 # first half: operands paired as (XAi,XBi,XCi,XDi)
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,16
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,16
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,16
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,16
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,12
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,12
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,12
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,12
+
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,8
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,8
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,8
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,8
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,7
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,7
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,7
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,7
+
+ VAF XA0,XA0,XB1 # second half: operands paired as (XA0,XB1,XC2,XD3) and rotations thereof
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,16
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,16
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,16
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,16
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,12
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,12
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,12
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,12
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,8
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,8
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,8
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,8
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,7
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,7
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,7
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,7
+ brct %r0,.Loop_4x
+
+ VAF XD0,XD0,CTR # add the lane offsets again ahead of the K3 addition below
+
+ VMRHF XT0,XA0,XA1 # transpose data
+ VMRHF XT1,XA2,XA3
+ VMRLF XT2,XA0,XA1
+ VMRLF XT3,XA2,XA3
+ VPDI XA0,XT0,XT1,0b0000
+ VPDI XA1,XT0,XT1,0b0101
+ VPDI XA2,XT2,XT3,0b0000
+ VPDI XA3,XT2,XT3,0b0101
+
+ VMRHF XT0,XB0,XB1
+ VMRHF XT1,XB2,XB3
+ VMRLF XT2,XB0,XB1
+ VMRLF XT3,XB2,XB3
+ VPDI XB0,XT0,XT1,0b0000
+ VPDI XB1,XT0,XT1,0b0101
+ VPDI XB2,XT2,XT3,0b0000
+ VPDI XB3,XT2,XT3,0b0101
+
+ VMRHF XT0,XC0,XC1
+ VMRHF XT1,XC2,XC3
+ VMRLF XT2,XC0,XC1
+ VMRLF XT3,XC2,XC3
+ VPDI XC0,XT0,XT1,0b0000
+ VPDI XC1,XT0,XT1,0b0101
+ VPDI XC2,XT2,XT3,0b0000
+ VPDI XC3,XT2,XT3,0b0101
+
+ VMRHF XT0,XD0,XD1
+ VMRHF XT1,XD2,XD3
+ VMRLF XT2,XD0,XD1
+ VMRLF XT3,XD2,XD3
+ VPDI XD0,XT0,XT1,0b0000
+ VPDI XD1,XT0,XT1,0b0101
+ VPDI XD2,XT2,XT3,0b0000
+ VPDI XD3,XT2,XT3,0b0101
+
+ VAF XA0,XA0,K0 # 1st 64 bytes: add K0..K3 to the transposed result
+ VAF XB0,XB0,K1
+ VAF XC0,XC0,K2
+ VAF XD0,XD0,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ VLM XT0,XT3,0,INP,0 # no length check before the 1st 64 bytes; callers in chacha-glue.c pass more than one block
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+
+ VAF XA0,XA1,K0 # 2nd 64 bytes, from XA1/XB1/XC1/XD1
+ VAF XB0,XB1,K1
+ VAF XC0,XC1,K2
+ VAF XD0,XD1,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x # fewer than 64 bytes left: byte-wise tail
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA2,K0 # 3rd 64 bytes, from XA2/XB2/XC2/XD2
+ VAF XB0,XB2,K1
+ VAF XC0,XC2,K2
+ VAF XD0,XD2,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA3,K0 # 4th 64 bytes, from XA3/XB3/XC3/XD3
+ VAF XB0,XB3,K1
+ VAF XC0,XC3,K2
+ VAF XD0,XD3,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+ lmg %r6,%r7,6*8(SP) # restore %r6/%r7 saved at entry
+ BR_EX %r14
+
+.Ltail_4x:
+ VLR XT0,XC0
+ VLR XT1,XD0
+
+ VST XA0,8*8+0x00,,SP # spill the pending 64 bytes to 8*8(SP)..8*8+0x3f(SP)
+ VST XB0,8*8+0x10,,SP
+ VST XT0,8*8+0x20,,SP
+ VST XT1,8*8+0x30,,SP
+
+ lghi %r1,0 # %r1 = byte index
+
+.Loop_tail_4x:
+ llgc %r5,0(%r1,INP) # input byte (%r5, i.e. KEY, is reused as scratch)
+ llgc %r6,8*8(%r1,SP) # spilled byte (%r6, i.e. COUNTER, is reused as scratch)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_4x # LEN counts the remaining bytes
+
+ lmg %r6,%r7,6*8(SP) # restore %r6/%r7 saved at entry
+ BR_EX %r14
+ENDPROC(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+# const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+
+#define K0 %v27
+#define K1 %v24
+#define K2 %v25
+#define K3 %v26
+
+#define A0 %v0
+#define B0 %v1
+#define C0 %v2
+#define D0 %v3
+
+#define A1 %v4
+#define B1 %v5
+#define C1 %v6
+#define D1 %v7
+
+#define A2 %v8
+#define B2 %v9
+#define C2 %v10
+#define D2 %v11
+
+#define A3 %v12
+#define B3 %v13
+#define C3 %v14
+#define D3 %v15
+
+#define A4 %v16
+#define B4 %v17
+#define C4 %v18
+#define D4 %v19
+
+#define A5 %v20
+#define B5 %v21
+#define C5 %v22
+#define D5 %v23
+
+#define T0 %v27
+#define T1 %v28
+#define T2 %v29
+#define T3 %v30
+
+ENTRY(chacha20_vx)
+ clgfi LEN,256
+ jle chacha20_vx_4x # len <= 256: branch (not call) to the 4x routine, which returns via %r14
+ stmg %r6,%r7,6*8(SP) # save %r6/%r7 before the frame is allocated
+
+ lghi %r1,-FRAME
+ lgr %r0,SP # %r0 = incoming stack pointer
+ la SP,0(%r1,SP) # allocate FRAME bytes
+ stg %r0,0(SP) # back-chain
+
+ larl %r7,.Lsigma # %r7 = base of the constant table
+ lhi %r0,10 # iteration count for .Loop_vx (brct below)
+
+ VLM K1,K2,0,KEY,0 # load key
+ VL K3,0,,COUNTER # load counter
+
+ VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
+
+.Loop_outer_vx:
+ VLR A0,K0
+ VLR B0,K1
+ VLR A1,K0
+ VLR B1,K1
+ VLR A2,K0
+ VLR B2,K1
+ VLR A3,K0
+ VLR B3,K1
+ VLR A4,K0
+ VLR B4,K1
+ VLR A5,K0
+ VLR B5,K1
+
+ VLR D0,K3
+ VAF D1,K3,T1 # K[3]+1
+ VAF D2,K3,T2 # K[3]+2
+ VAF D3,K3,T3 # K[3]+3
+ VAF D4,D2,T2 # K[3]+4
+ VAF D5,D2,T3 # K[3]+5
+
+ VLR C0,K2
+ VLR C1,K2
+ VLR C2,K2
+ VLR C3,K2
+ VLR C4,K2
+ VLR C5,K2
+
+ VLR T1,D1 # keep copies of D1..D3 in T1..T3 for the additions after the loop
+ VLR T2,D2
+ VLR T3,D3
+
+.Loop_vx:
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8 # rotate the C/B/D vectors by 8/4/12 bytes for the second half
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,4
+ VSLDB B1,B1,B1,4
+ VSLDB B2,B2,B2,4
+ VSLDB B3,B3,B3,4
+ VSLDB B4,B4,B4,4
+ VSLDB B5,B5,B5,4
+ VSLDB D0,D0,D0,12
+ VSLDB D1,D1,D1,12
+ VSLDB D2,D2,D2,12
+ VSLDB D3,D3,D3,12
+ VSLDB D4,D4,D4,12
+ VSLDB D5,D5,D5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8 # rotate by 8/12/4 bytes: undoes the 8/4/12 rotation above
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,12
+ VSLDB B1,B1,B1,12
+ VSLDB B2,B2,B2,12
+ VSLDB B3,B3,B3,12
+ VSLDB B4,B4,B4,12
+ VSLDB B5,B5,B5,12
+ VSLDB D0,D0,D0,4
+ VSLDB D1,D1,D1,4
+ VSLDB D2,D2,D2,4
+ VSLDB D3,D3,D3,4
+ VSLDB D4,D4,D4,4
+ VSLDB D5,D5,D5,4
+ brct %r0,.Loop_vx
+
+ VAF A0,A0,K0
+ VAF B0,B0,K1
+ VAF C0,C0,K2
+ VAF D0,D0,K3
+ VAF A1,A1,K0
+ VAF D1,D1,T1 # +K[3]+1
+
+ VPERM A0,A0,A0,BEPERM
+ VPERM B0,B0,B0,BEPERM
+ VPERM C0,C0,C0,BEPERM
+ VPERM D0,D0,D0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx # fewer than 64 bytes left: byte-wise tail
+
+ VAF D2,D2,T2 # +K[3]+2
+ VAF D3,D3,T3 # +K[3]+3
+ VLM T0,T3,0,INP,0 # T0..T3 now hold input data (T0 shares %v27 with K0)
+
+ VX A0,A0,T0
+ VX B0,B0,T1
+ VX C0,C0,T2
+ VX D0,D0,T3
+
+ VLM K0,T3,0,%r7,4 # re-load sigma and increments
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF B1,B1,K1
+ VAF C1,C1,K2
+
+ VPERM A0,A1,A1,BEPERM
+ VPERM B0,B1,B1,BEPERM
+ VPERM C0,C1,C1,BEPERM
+ VPERM D0,D1,D1,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0 # A1..D1 are free again and serve as the input buffer from here on
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A2,A2,K0
+ VAF B2,B2,K1
+ VAF C2,C2,K2
+
+ VPERM A0,A2,A2,BEPERM
+ VPERM B0,B2,B2,BEPERM
+ VPERM C0,C2,C2,BEPERM
+ VPERM D0,D2,D2,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A3,A3,K0
+ VAF B3,B3,K1
+ VAF C3,C3,K2
+ VAF D2,K3,T3 # K[3]+3
+
+ VPERM A0,A3,A3,BEPERM
+ VPERM B0,B3,B3,BEPERM
+ VPERM C0,C3,C3,BEPERM
+ VPERM D0,D3,D3,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D3,D2,T1 # K[3]+4
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A4,A4,K0
+ VAF B4,B4,K1
+ VAF C4,C4,K2
+ VAF D4,D4,D3 # +K[3]+4
+ VAF D3,D3,T1 # K[3]+5
+ VAF K3,D2,T3 # K[3]+=6
+
+ VPERM A0,A4,A4,BEPERM
+ VPERM B0,B4,B4,BEPERM
+ VPERM C0,C4,C4,BEPERM
+ VPERM D0,D4,D4,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A5,A5,K0
+ VAF B5,B5,K1
+ VAF C5,C5,K2
+ VAF D5,D5,D3 # +K[3]+5
+
+ VPERM A0,A5,A5,BEPERM
+ VPERM B0,B5,B5,BEPERM
+ VPERM C0,C5,C5,BEPERM
+ VPERM D0,D5,D5,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ lhi %r0,10 # reset the .Loop_vx count for the next outer iteration
+ aghi LEN,-0x40
+ jne .Loop_outer_vx # more data left: next batch of six 64 byte blocks
+
+.Ldone_vx:
+ lmg %r6,%r7,FRAME+6*8(SP) # restore %r6/%r7 from the slots written before the frame was allocated
+ la SP,FRAME(SP) # release the frame
+ BR_EX %r14
+
+.Ltail_vx:
+ VSTM A0,D0,8*8,SP,3 # spill the pending 64 bytes to 8*8(SP) inside the frame
+ lghi %r1,0 # %r1 = byte index
+
+.Loop_tail_vx:
+ llgc %r5,0(%r1,INP) # input byte (%r5, i.e. KEY, is reused as scratch)
+ llgc %r6,8*8(%r1,SP) # spilled byte (%r6, i.e. COUNTER, is reused as scratch)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_vx # LEN counts the remaining bytes
+
+ lmg %r6,%r7,FRAME+6*8(SP) # restore %r6/%r7 from the slots written before the frame was allocated
+ la SP,FRAME(SP) # release the frame
+ BR_EX %r14
+ENDPROC(chacha20_vx)
+
+.previous
diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h
new file mode 100644
index 000000000000..733744ce30f5
--- /dev/null
+++ b/arch/s390/crypto/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+ const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
index fafecad20752..017143e9cef7 100644
--- a/arch/s390/crypto/crc32-vx.c
+++ b/arch/s390/crypto/crc32-vx.c
@@ -298,7 +298,7 @@ static void __exit crc_vx_mod_exit(void)
crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs));
}
-module_cpu_feature_match(VXRS, crc_vx_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init);
module_exit(crc_vx_mod_exit);
MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S
index 0099044e2c86..6b3d1009c392 100644
--- a/arch/s390/crypto/crc32be-vx.S
+++ b/arch/s390/crypto/crc32be-vx.S
@@ -32,7 +32,7 @@
* process particular chunks of the input data stream in parallel.
*
* For the CRC-32 variants, the constants are precomputed according to
- * these defintions:
+ * these definitions:
*
* R1 = x4*128+64 mod P(x)
* R2 = x4*128 mod P(x)
@@ -189,7 +189,7 @@ ENTRY(crc32_be_vgfm_16)
* Note: To compensate the division by x^32, use the vector unpack
* instruction to move the leftmost word into the leftmost doubleword
* of the vector register. The rightmost doubleword is multiplied
- * with zero to not contribute to the intermedate results.
+ * with zero to not contribute to the intermediate results.
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c
index bfbafd35bcbd..8e75b83a5ddc 100644
--- a/arch/s390/crypto/des_s390.c
+++ b/arch/s390/crypto/des_s390.c
@@ -194,7 +194,7 @@ static struct skcipher_alg cbc_des_alg = {
* same as DES. Implementers MUST reject keys that exhibit this
* property.
*
- * In fips mode additinally check for all 3 keys are unique.
+ * In fips mode additionally check that all 3 keys are unique.
*
*/
static int des3_setkey(struct crypto_tfm *tfm, const u8 *key,
@@ -492,7 +492,7 @@ out_err:
return ret;
}
-module_cpu_feature_match(MSA, des_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init);
module_exit(des_s390_exit);
MODULE_ALIAS_CRYPTO("des");
diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c
index 6b07a2f1ce8a..0800a2a5799f 100644
--- a/arch/s390/crypto/ghash_s390.c
+++ b/arch/s390/crypto/ghash_s390.c
@@ -145,7 +145,7 @@ static void __exit ghash_mod_exit(void)
crypto_unregister_shash(&ghash_alg);
}
-module_cpu_feature_match(MSA, ghash_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init);
module_exit(ghash_mod_exit);
MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index f3caeb17c85b..a279b7d23a5e 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -22,6 +22,7 @@
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
+#include <linux/delay.h>
#include <crypto/internal/skcipher.h>
#include <crypto/xts.h>
#include <asm/cpacf.h>
@@ -128,6 +129,9 @@ static inline int __paes_keyblob2pkey(struct key_blob *kb,
/* try three times in case of failure */
for (i = 0; i < 3; i++) {
+ if (i > 0 && ret == -EAGAIN && in_task())
+ if (msleep_interruptible(1000))
+ return -EINTR;
ret = pkey_keyblob2pkey(kb->key, kb->keylen, pk);
if (ret == 0)
break;
@@ -138,10 +142,12 @@ static inline int __paes_keyblob2pkey(struct key_blob *kb,
static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
{
+ int ret;
struct pkey_protkey pkey;
- if (__paes_keyblob2pkey(&ctx->kb, &pkey))
- return -EINVAL;
+ ret = __paes_keyblob2pkey(&ctx->kb, &pkey);
+ if (ret)
+ return ret;
spin_lock_bh(&ctx->pk_lock);
memcpy(&ctx->pk, &pkey, sizeof(pkey));
@@ -169,10 +175,12 @@ static void ecb_paes_exit(struct crypto_skcipher *tfm)
static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx)
{
+ int rc;
unsigned long fc;
- if (__paes_convert_key(ctx))
- return -EINVAL;
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
@@ -282,10 +290,12 @@ static void cbc_paes_exit(struct crypto_skcipher *tfm)
static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
{
+ int rc;
unsigned long fc;
- if (__paes_convert_key(ctx))
- return -EINVAL;
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
@@ -577,10 +587,12 @@ static void ctr_paes_exit(struct crypto_skcipher *tfm)
static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
{
+ int rc;
unsigned long fc;
- if (__paes_convert_key(ctx))
- return -EINVAL;
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index d977643fa627..a077087bc6cc 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void)
{
pr_debug("The prng module stopped "
"after running in triple DES mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
@@ -414,7 +414,7 @@ static int __init prng_sha512_instantiate(void)
}
/* append the seed by 16 bytes of unique nonce */
- get_tod_clock_ext(seed + seedlen);
+ store_tod_clock_ext((union tod_clock *)(seed + seedlen));
seedlen += 16;
/* now initial seed of the prno drng */
@@ -442,7 +442,7 @@ outfree:
static void prng_sha512_deinstantiate(void)
{
pr_debug("The prng module stopped after running in SHA-512 mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
@@ -528,7 +528,7 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf,
/* give mutex free before calling schedule() */
mutex_unlock(&prng_data->mutex);
schedule();
- /* occopy mutex again */
+ /* occupy mutex again */
if (mutex_lock_interruptible(&prng_data->mutex)) {
if (ret == 0)
ret = -ERESTARTSYS;
@@ -674,26 +674,12 @@ static const struct file_operations prng_tdes_fops = {
.llseek = noop_llseek,
};
-static struct miscdevice prng_sha512_dev = {
- .name = "prandom",
- .minor = MISC_DYNAMIC_MINOR,
- .mode = 0644,
- .fops = &prng_sha512_fops,
-};
-static struct miscdevice prng_tdes_dev = {
- .name = "prandom",
- .minor = MISC_DYNAMIC_MINOR,
- .mode = 0644,
- .fops = &prng_tdes_fops,
-};
-
-
/* chunksize attribute (ro) */
static ssize_t prng_chunksize_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size);
}
static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL);
@@ -712,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev,
counter = prng_data->prngws.byte_counter;
mutex_unlock(&prng_data->mutex);
- return snprintf(buf, PAGE_SIZE, "%llu\n", counter);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", counter);
}
static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL);
@@ -721,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag);
}
static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL);
@@ -731,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev,
char *buf)
{
if (prng_mode == PRNG_MODE_TDES)
- return snprintf(buf, PAGE_SIZE, "TDES\n");
+ return scnprintf(buf, PAGE_SIZE, "TDES\n");
else
- return snprintf(buf, PAGE_SIZE, "SHA512\n");
+ return scnprintf(buf, PAGE_SIZE, "SHA512\n");
}
static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL);
@@ -756,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit);
}
static ssize_t prng_reseed_limit_store(struct device *dev,
struct device_attribute *attr,
@@ -787,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "256\n");
+ return scnprintf(buf, PAGE_SIZE, "256\n");
}
static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL);
@@ -801,18 +787,30 @@ static struct attribute *prng_sha512_dev_attrs[] = {
&dev_attr_strength.attr,
NULL
};
+ATTRIBUTE_GROUPS(prng_sha512_dev);
+
static struct attribute *prng_tdes_dev_attrs[] = {
&dev_attr_chunksize.attr,
&dev_attr_byte_counter.attr,
&dev_attr_mode.attr,
NULL
};
+ATTRIBUTE_GROUPS(prng_tdes_dev);
-static struct attribute_group prng_sha512_dev_attr_group = {
- .attrs = prng_sha512_dev_attrs
+static struct miscdevice prng_sha512_dev = {
+ .name = "prandom",
+ .minor = MISC_DYNAMIC_MINOR,
+ .mode = 0644,
+ .fops = &prng_sha512_fops,
+ .groups = prng_sha512_dev_groups,
};
-static struct attribute_group prng_tdes_dev_attr_group = {
- .attrs = prng_tdes_dev_attrs
+
+static struct miscdevice prng_tdes_dev = {
+ .name = "prandom",
+ .minor = MISC_DYNAMIC_MINOR,
+ .mode = 0644,
+ .fops = &prng_tdes_fops,
+ .groups = prng_tdes_dev_groups,
};
@@ -867,13 +865,6 @@ static int __init prng_init(void)
prng_sha512_deinstantiate();
goto out;
}
- ret = sysfs_create_group(&prng_sha512_dev.this_device->kobj,
- &prng_sha512_dev_attr_group);
- if (ret) {
- misc_deregister(&prng_sha512_dev);
- prng_sha512_deinstantiate();
- goto out;
- }
} else {
@@ -898,14 +889,6 @@ static int __init prng_init(void)
prng_tdes_deinstantiate();
goto out;
}
- ret = sysfs_create_group(&prng_tdes_dev.this_device->kobj,
- &prng_tdes_dev_attr_group);
- if (ret) {
- misc_deregister(&prng_tdes_dev);
- prng_tdes_deinstantiate();
- goto out;
- }
-
}
out:
@@ -916,17 +899,13 @@ out:
static void __exit prng_exit(void)
{
if (prng_mode == PRNG_MODE_SHA512) {
- sysfs_remove_group(&prng_sha512_dev.this_device->kobj,
- &prng_sha512_dev_attr_group);
misc_deregister(&prng_sha512_dev);
prng_sha512_deinstantiate();
} else {
- sysfs_remove_group(&prng_tdes_dev.this_device->kobj,
- &prng_tdes_dev_attr_group);
misc_deregister(&prng_tdes_dev);
prng_tdes_deinstantiate();
}
}
-module_cpu_feature_match(MSA, prng_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init);
module_exit(prng_exit);
diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h
index ada2f98c27b7..65ea12fc87a1 100644
--- a/arch/s390/crypto/sha.h
+++ b/arch/s390/crypto/sha.h
@@ -11,7 +11,8 @@
#define _CRYPTO_ARCH_S390_SHA_H
#include <linux/crypto.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <crypto/sha3.h>
/* must be big enough for the largest SHA variant */
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index 7c15542d3685..bc3a22704e09 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -22,12 +22,12 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <asm/cpacf.h>
#include "sha.h"
-static int sha1_init(struct shash_desc *desc)
+static int s390_sha1_init(struct shash_desc *desc)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
@@ -42,7 +42,7 @@ static int sha1_init(struct shash_desc *desc)
return 0;
}
-static int sha1_export(struct shash_desc *desc, void *out)
+static int s390_sha1_export(struct shash_desc *desc, void *out)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
struct sha1_state *octx = out;
@@ -53,7 +53,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
return 0;
}
-static int sha1_import(struct shash_desc *desc, const void *in)
+static int s390_sha1_import(struct shash_desc *desc, const void *in)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
const struct sha1_state *ictx = in;
@@ -67,11 +67,11 @@ static int sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_init,
+ .init = s390_sha1_init,
.update = s390_sha_update,
.final = s390_sha_final,
- .export = sha1_export,
- .import = sha1_import,
+ .export = s390_sha1_export,
+ .import = s390_sha1_import,
.descsize = sizeof(struct s390_sha_ctx),
.statesize = sizeof(struct sha1_state),
.base = {
@@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void)
crypto_unregister_shash(&alg);
}
-module_cpu_feature_match(MSA, sha1_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init);
module_exit(sha1_s390_fini);
MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
index b52c87e44939..6f1ccdf93d3e 100644
--- a/arch/s390/crypto/sha256_s390.c
+++ b/arch/s390/crypto/sha256_s390.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/cpacf.h>
#include "sha.h"
@@ -134,7 +134,7 @@ static void __exit sha256_s390_fini(void)
crypto_unregister_shash(&sha256_alg);
}
-module_cpu_feature_match(MSA, sha256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init);
module_exit(sha256_s390_fini);
MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c
index 460cbbbaa44a..e1350e033a32 100644
--- a/arch/s390/crypto/sha3_256_s390.c
+++ b/arch/s390/crypto/sha3_256_s390.c
@@ -12,7 +12,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
#include <crypto/sha3.h>
#include <asm/cpacf.h>
@@ -138,7 +137,7 @@ static void __exit sha3_256_s390_fini(void)
crypto_unregister_shash(&sha3_256_alg);
}
-module_cpu_feature_match(MSA, sha3_256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init);
module_exit(sha3_256_s390_fini);
MODULE_ALIAS_CRYPTO("sha3-256");
diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c
index 72cf460a53e5..06c142ed9bb1 100644
--- a/arch/s390/crypto/sha3_512_s390.c
+++ b/arch/s390/crypto/sha3_512_s390.c
@@ -11,7 +11,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
#include <crypto/sha3.h>
#include <asm/cpacf.h>
@@ -148,7 +147,7 @@ static void __exit fini(void)
crypto_unregister_shash(&sha3_384_alg);
}
-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
index ad29db085a18..04f11c407763 100644
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -8,7 +8,7 @@
* Author(s): Jan Glauber (jang@de.ibm.com)
*/
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -22,14 +22,14 @@ static int sha512_init(struct shash_desc *desc)
{
struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
- *(__u64 *)&ctx->state[0] = 0x6a09e667f3bcc908ULL;
- *(__u64 *)&ctx->state[2] = 0xbb67ae8584caa73bULL;
- *(__u64 *)&ctx->state[4] = 0x3c6ef372fe94f82bULL;
- *(__u64 *)&ctx->state[6] = 0xa54ff53a5f1d36f1ULL;
- *(__u64 *)&ctx->state[8] = 0x510e527fade682d1ULL;
- *(__u64 *)&ctx->state[10] = 0x9b05688c2b3e6c1fULL;
- *(__u64 *)&ctx->state[12] = 0x1f83d9abfb41bd6bULL;
- *(__u64 *)&ctx->state[14] = 0x5be0cd19137e2179ULL;
+ *(__u64 *)&ctx->state[0] = SHA512_H0;
+ *(__u64 *)&ctx->state[2] = SHA512_H1;
+ *(__u64 *)&ctx->state[4] = SHA512_H2;
+ *(__u64 *)&ctx->state[6] = SHA512_H3;
+ *(__u64 *)&ctx->state[8] = SHA512_H4;
+ *(__u64 *)&ctx->state[10] = SHA512_H5;
+ *(__u64 *)&ctx->state[12] = SHA512_H6;
+ *(__u64 *)&ctx->state[14] = SHA512_H7;
ctx->count = 0;
ctx->func = CPACF_KIMD_SHA_512;
@@ -87,14 +87,14 @@ static int sha384_init(struct shash_desc *desc)
{
struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
- *(__u64 *)&ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
- *(__u64 *)&ctx->state[2] = 0x629a292a367cd507ULL;
- *(__u64 *)&ctx->state[4] = 0x9159015a3070dd17ULL;
- *(__u64 *)&ctx->state[6] = 0x152fecd8f70e5939ULL;
- *(__u64 *)&ctx->state[8] = 0x67332667ffc00b31ULL;
- *(__u64 *)&ctx->state[10] = 0x8eb44a8768581511ULL;
- *(__u64 *)&ctx->state[12] = 0xdb0c2e0d64f98fa7ULL;
- *(__u64 *)&ctx->state[14] = 0x47b5481dbefa4fa4ULL;
+ *(__u64 *)&ctx->state[0] = SHA384_H0;
+ *(__u64 *)&ctx->state[2] = SHA384_H1;
+ *(__u64 *)&ctx->state[4] = SHA384_H2;
+ *(__u64 *)&ctx->state[6] = SHA384_H3;
+ *(__u64 *)&ctx->state[8] = SHA384_H4;
+ *(__u64 *)&ctx->state[10] = SHA384_H5;
+ *(__u64 *)&ctx->state[12] = SHA384_H6;
+ *(__u64 *)&ctx->state[14] = SHA384_H7;
ctx->count = 0;
ctx->func = CPACF_KIMD_SHA_512;
@@ -142,7 +142,7 @@ static void __exit fini(void)
crypto_unregister_shash(&sha384_alg);
}
-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index f0bc4dc3e9bf..6511d15ace45 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -437,7 +437,7 @@ __init int hypfs_diag_init(void)
int rc;
if (diag204_probe()) {
- pr_err("The hardware system does not support hypfs\n");
+ pr_info("The hardware system does not support hypfs\n");
return -ENODATA;
}
diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c
index 3235e4d82f2d..9a2786079e3a 100644
--- a/arch/s390/hypfs/hypfs_diag0c.c
+++ b/arch/s390/hypfs/hypfs_diag0c.c
@@ -21,7 +21,7 @@
static void diag0c_fn(void *data)
{
diag_stat_inc(DIAG_STAT_X00C);
- diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]);
+ diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]);
}
/*
@@ -33,12 +33,12 @@ static void *diag0c_store(unsigned int *count)
unsigned int cpu_count, cpu, i;
void **cpu_vec;
- get_online_cpus();
+ cpus_read_lock();
cpu_count = num_online_cpus();
cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec),
GFP_KERNEL);
if (!cpu_vec)
- goto fail_put_online_cpus;
+ goto fail_unlock_cpus;
/* Note: Diag 0c needs 8 byte alignment and real storage */
diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count),
GFP_KERNEL | GFP_DMA);
@@ -54,13 +54,13 @@ static void *diag0c_store(unsigned int *count)
on_each_cpu(diag0c_fn, cpu_vec, 1);
*count = cpu_count;
kfree(cpu_vec);
- put_online_cpus();
+ cpus_read_unlock();
return diag0c_data;
fail_kfree_cpu_vec:
kfree(cpu_vec);
-fail_put_online_cpus:
- put_online_cpus();
+fail_unlock_cpus:
+ cpus_read_unlock();
return ERR_PTR(-ENOMEM);
}
@@ -84,7 +84,7 @@ static int dbfs_diag0c_create(void **data, void **data_free_ptr, size_t *size)
if (IS_ERR(diag0c_data))
return PTR_ERR(diag0c_data);
memset(&diag0c_data->hdr, 0, sizeof(diag0c_data->hdr));
- get_tod_clock_ext(diag0c_data->hdr.tod_ext);
+ store_tod_clock_ext((union tod_clock *)diag0c_data->hdr.tod_ext);
diag0c_data->hdr.len = count * sizeof(struct hypfs_diag0c_entry);
diag0c_data->hdr.version = DBFS_D0C_HDR_VERSION;
diag0c_data->hdr.count = count;
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index 7d9fb496d155..f5f7e78ddc0c 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -25,14 +25,13 @@
static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd)
{
- register unsigned long _data asm("2") = (unsigned long) data;
- register unsigned long _rc asm("3");
- register unsigned long _cmd asm("4") = cmd;
+ union register_pair r1 = { .even = (unsigned long)data, };
- asm volatile("diag %1,%2,0x304\n"
- : "=d" (_rc) : "d" (_data), "d" (_cmd) : "memory");
-
- return _rc;
+ asm volatile("diag %[r1],%[r3],0x304\n"
+ : [r1] "+&d" (r1.pair)
+ : [r3] "d" (cmd)
+ : "memory");
+ return r1.odd;
}
static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd)
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index e1fcc03159ef..a3d881ca0a98 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -10,6 +10,7 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
+#include <asm/extable.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/timex.h>
@@ -20,6 +21,7 @@
static char local_guest[] = " ";
static char all_guests[] = "* ";
+static char *all_groups = all_guests;
static char *guest_query;
struct diag2fc_data {
@@ -62,10 +64,11 @@ static int diag2fc(int size, char* query, void *addr)
memcpy(parm_list.userid, query, NAME_LEN);
ASCEBC(parm_list.userid, NAME_LEN);
- parm_list.addr = (unsigned long) addr ;
+ memcpy(parm_list.aci_grp, all_groups, NAME_LEN);
+ ASCEBC(parm_list.aci_grp, NAME_LEN);
+ parm_list.addr = (unsigned long)addr;
parm_list.size = size;
parm_list.fmt = 0x02;
- memset(parm_list.aci_grp, 0x40, NAME_LEN);
rc = -1;
diag_stat_inc(DIAG_STAT_X2FC);
@@ -187,7 +190,7 @@ int hypfs_vm_create_files(struct dentry *root)
if (IS_ERR(data))
return PTR_ERR(data);
- /* Hpervisor Info */
+ /* Hypervisor Info */
dir = hypfs_mkdir(root, "hyp");
if (IS_ERR(dir)) {
rc = PTR_ERR(dir);
@@ -234,7 +237,7 @@ failed:
struct dbfs_d2fc_hdr {
u64 len; /* Length of d2fc buffer without header */
u16 version; /* Version of header */
- char tod_ext[STORE_CLOCK_EXT_SIZE]; /* TOD clock for d2fc */
+ union tod_clock tod_ext; /* TOD clock for d2fc */
u64 count; /* Number of VM guests in d2fc buffer */
char reserved[30];
} __attribute__ ((packed));
@@ -252,7 +255,7 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size)
d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr));
if (IS_ERR(d2fc))
return PTR_ERR(d2fc);
- get_tod_clock_ext(d2fc->hdr.tod_ext);
+ store_tod_clock_ext(&d2fc->hdr.tod_ext);
d2fc->hdr.len = count * sizeof(struct diag2fc_data);
d2fc->hdr.version = DBFS_D2FC_HDR_VERSION;
d2fc->hdr.count = count;
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 5c97f48cea91..ee919bfc8186 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -496,9 +496,9 @@ fail_hypfs_sprp_exit:
hypfs_vm_exit();
fail_hypfs_diag_exit:
hypfs_diag_exit();
+ pr_err("Initialization of hypfs failed with rc=%i\n", rc);
fail_dbfs_exit:
hypfs_dbfs_exit();
- pr_err("Initialization of hypfs failed with rc=%i\n", rc);
return rc;
}
device_initcall(hypfs_init)
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 1832ae6442ef..1a18d7b82f86 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,21 +5,6 @@ generated-y += syscall_table.h
generated-y += unistd_nr.h
generic-y += asm-offsets.h
-generic-y += cacheflush.h
-generic-y += device.h
-generic-y += dma-mapping.h
-generic-y += div64.h
-generic-y += emergency-restart.h
generic-y += export.h
-generic-y += fb.h
-generic-y += irq_regs.h
-generic-y += irq_work.h
-generic-y += kmap_types.h
-generic-y += local.h
-generic-y += local64.h
+generic-y += kvm_types.h
generic-y += mcs_spinlock.h
-generic-y += mm-arch-hooks.h
-generic-y += mmiowb.h
-generic-y += trace_clock.h
-generic-y += unaligned.h
-generic-y += word-at-a-time.h
diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h
new file mode 100644
index 000000000000..4c61b14ee928
--- /dev/null
+++ b/arch/s390/include/asm/abs_lowcore.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ABS_LOWCORE_H
+#define _ASM_S390_ABS_LOWCORE_H
+
+#include <asm/lowcore.h>
+
+#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore))
+
+extern unsigned long __abs_lowcore;
+extern bool abs_lowcore_mapped;
+
+struct lowcore *get_abs_lowcore(unsigned long *flags);
+void put_abs_lowcore(struct lowcore *lc, unsigned long flags);
+int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc);
+void abs_lowcore_unmap(int cpu);
+
+#endif /* _ASM_S390_ABS_LOWCORE_H */
diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h
index 01936fdfaddb..e82e5626e139 100644
--- a/arch/s390/include/asm/airq.h
+++ b/arch/s390/include/asm/airq.h
@@ -12,10 +12,11 @@
#include <linux/bit_spinlock.h>
#include <linux/dma-mapping.h>
+#include <asm/tpi.h>
struct airq_struct {
struct hlist_node list; /* Handler queueing. */
- void (*handler)(struct airq_struct *airq, bool floating);
+ void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info);
u8 *lsi_ptr; /* Local-Summary-Indicator pointer */
u8 lsi_mask; /* Local-Summary-Indicator mask */
u8 isc; /* Interrupt-subclass */
@@ -46,8 +47,10 @@ struct airq_iv {
#define AIRQ_IV_PTR 4 /* Allocate the ptr array */
#define AIRQ_IV_DATA 8 /* Allocate the data array */
#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */
+#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */
-struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);
+struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
+ unsigned long *vec);
void airq_iv_release(struct airq_iv *iv);
unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num);
void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num);
diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h
index 955d620db23e..7db046596b93 100644
--- a/arch/s390/include/asm/alternative-asm.h
+++ b/arch/s390/include/asm/alternative-asm.h
@@ -5,19 +5,6 @@
#ifdef __ASSEMBLY__
/*
- * Check the length of an instruction sequence. The length may not be larger
- * than 254 bytes and it has to be divisible by 2.
- */
-.macro alt_len_check start,end
- .if ( \end - \start ) > 254
- .error "cpu alternatives does not support instructions blocks > 254 bytes\n"
- .endif
- .if ( \end - \start ) % 2
- .error "cpu alternatives instructions length is odd\n"
- .endif
-.endm
-
-/*
* Issue one struct alt_instr descriptor entry (need to put it into
* the section .altinstructions, see below). This entry contains
* enough information for the alternatives patching code to patch an
@@ -28,60 +15,29 @@
.long \alt_start - .
.word \feature
.byte \orig_end - \orig_start
- .byte \alt_end - \alt_start
-.endm
-
-/*
- * Fill up @bytes with nops. The macro emits 6-byte nop instructions
- * for the bulk of the area, possibly followed by a 4-byte and/or
- * a 2-byte nop if the size of the area is not divisible by 6.
- */
-.macro alt_pad_fill bytes
- .fill ( \bytes ) / 6, 6, 0xc0040000
- .fill ( \bytes ) % 6 / 4, 4, 0x47000000
- .fill ( \bytes ) % 6 % 4 / 2, 2, 0x0700
-.endm
-
-/*
- * Fill up @bytes with nops. If the number of bytes is larger
- * than 6, emit a jg instruction to branch over all nops, then
- * fill an area of size (@bytes - 6) with nop instructions.
- */
-.macro alt_pad bytes
- .if ( \bytes > 0 )
- .if ( \bytes > 6 )
- jg . + \bytes
- alt_pad_fill \bytes - 6
- .else
- alt_pad_fill \bytes
- .endif
- .endif
+ .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start )
+ .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start )
.endm
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
+ * @newinstr.
*/
.macro ALTERNATIVE oldinstr, newinstr, feature
.pushsection .altinstr_replacement,"ax"
770: \newinstr
771: .popsection
772: \oldinstr
-773: alt_len_check 770b, 771b
- alt_len_check 772b, 773b
- alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) )
-774: .pushsection .altinstructions,"a"
- alt_entry 772b, 774b, 770b, 771b, \feature
+773: .pushsection .altinstructions,"a"
+ alt_entry 772b, 773b, 770b, 771b, \feature
.popsection
.endm
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
+ * @newinstr.
*/
.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
.pushsection .altinstr_replacement,"ax"
@@ -89,17 +45,9 @@
771: \newinstr2
772: .popsection
773: \oldinstr
-774: alt_len_check 770b, 771b
- alt_len_check 771b, 772b
- alt_len_check 773b, 774b
- .if ( 771b - 770b > 772b - 771b )
- alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) )
- .else
- alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) )
- .endif
-775: .pushsection .altinstructions,"a"
- alt_entry 773b, 775b, 770b, 771b,\feature1
- alt_entry 773b, 775b, 771b, 772b,\feature2
+774: .pushsection .altinstructions,"a"
+ alt_entry 773b, 774b, 770b, 771b,\feature1
+ alt_entry 773b, 774b, 771b, 772b,\feature2
.popsection
.endm
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 1c8a38f762a3..904dd049f954 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -13,32 +13,25 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */
u16 facility; /* facility bit set for replacement */
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction */
} __packed;
void apply_alternative_instructions(void);
void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
/*
- * |661: |662: |6620 |663:
- * +-----------+---------------------+
- * | oldinstr | oldinstr_padding |
- * | +----------+----------+
- * | | | |
- * | | >6 bytes |6/4/2 nops|
- * | |6 bytes jg----------->
- * +-----------+---------------------+
- * ^^ static padding ^^
+ * +---------------------------------+
+ * |661: |662:
+ * | oldinstr |
+ * +---------------------------------+
*
* .altinstr_replacement section
- * +---------------------+-----------+
+ * +---------------------------------+
* |6641: |6651:
* | alternative instr 1 |
- * +-----------+---------+- - - - - -+
- * |6642: |6652: |
- * | alternative instr 2 | padding
- * +---------------------+- - - - - -+
- * ^ runtime ^
+ * +---------------------------------+
+ * |6642: |6652:
+ * | alternative instr 2 |
+ * +---------------------------------+
*
* .altinstructions section
* +---------------------------------+
@@ -47,70 +40,31 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
* +---------------------------------+
*/
-#define b_altinstr(num) "664"#num
-#define e_altinstr(num) "665"#num
-
-#define e_oldinstr_pad_end "663"
+#define b_altinstr(num) "664"#num
+#define e_altinstr(num) "665"#num
#define oldinstr_len "662b-661b"
-#define oldinstr_total_len e_oldinstr_pad_end"b-661b"
#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b"
-#define oldinstr_pad_len(num) \
- "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \
- "((" altinstr_len(num) ")-(" oldinstr_len "))"
-
-#define INSTR_LEN_SANITY_CHECK(len) \
- ".if " len " > 254\n" \
- "\t.error \"cpu alternatives does not support instructions " \
- "blocks > 254 bytes\"\n" \
- ".endif\n" \
- ".if (" len ") %% 2\n" \
- "\t.error \"cpu alternatives instructions length is odd\"\n" \
- ".endif\n"
-
-#define OLDINSTR_PADDING(oldinstr, num) \
- ".if " oldinstr_pad_len(num) " > 6\n" \
- "\tjg " e_oldinstr_pad_end "f\n" \
- "6620:\n" \
- "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \
- ".else\n" \
- "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \
- "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \
- "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \
- ".endif\n"
-
-#define OLDINSTR(oldinstr, num) \
- "661:\n\t" oldinstr "\n662:\n" \
- OLDINSTR_PADDING(oldinstr, num) \
- e_oldinstr_pad_end ":\n" \
- INSTR_LEN_SANITY_CHECK(oldinstr_len)
-
-#define OLDINSTR_2(oldinstr, num1, num2) \
- "661:\n\t" oldinstr "\n662:\n" \
- ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \
- OLDINSTR_PADDING(oldinstr, num2) \
- ".else\n" \
- OLDINSTR_PADDING(oldinstr, num1) \
- ".endif\n" \
- e_oldinstr_pad_end ":\n" \
- INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define OLDINSTR(oldinstr) \
+ "661:\n\t" oldinstr "\n662:\n"
#define ALTINSTR_ENTRY(facility, num) \
"\t.long 661b - .\n" /* old instruction */ \
"\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \
"\t.word " __stringify(facility) "\n" /* facility bit */ \
- "\t.byte " oldinstr_total_len "\n" /* source len */ \
- "\t.byte " altinstr_len(num) "\n" /* alt instruction len */
+ "\t.byte " oldinstr_len "\n" /* instruction len */ \
+ "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \
+ "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n"
#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \
- b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \
- INSTR_LEN_SANITY_CHECK(altinstr_len(num))
+ b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, altinstr, facility) \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(altinstr, 1) \
".popsection\n" \
- OLDINSTR(oldinstr, 1) \
+ OLDINSTR(oldinstr) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(facility, 1) \
".popsection\n"
@@ -120,7 +74,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
ALTINSTR_REPLACEMENT(altinstr1, 1) \
ALTINSTR_REPLACEMENT(altinstr2, 2) \
".popsection\n" \
- OLDINSTR_2(oldinstr, 1, 2) \
+ OLDINSTR(oldinstr) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(facility1, 1) \
ALTINSTR_ENTRY(facility2, 2) \
@@ -145,6 +99,22 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \
altinstr2, facility2) ::: "memory")
+/* Alternative inline assembly with input. */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, altinstr, facility, output, input...) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \
+ : output : input)
+
+/* Use this macro if more than one output parameter is needed. */
+#define ASM_OUTPUT2(a...) a
+
+/* Use this macro if clobbers are needed without inputs. */
+#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_ALTERNATIVE_H */
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index aea32dda3d14..f508f5025e38 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -12,6 +12,9 @@
#ifndef _ASM_S390_AP_H_
#define _ASM_S390_AP_H_
+#include <linux/io.h>
+#include <asm/asm-extable.h>
+
/**
* The ap_qid_t identifier of an ap queue.
* If the AP facilities test (APFT) facility is available,
@@ -53,18 +56,20 @@ struct ap_queue_status {
*/
static inline bool ap_instructions_available(void)
{
- register unsigned long reg0 asm ("0") = AP_MKQID(0, 0);
- register unsigned long reg1 asm ("1") = 0;
- register unsigned long reg2 asm ("2") = 0;
+ unsigned long reg0 = AP_MKQID(0, 0);
+ unsigned long reg1 = 0;
asm volatile(
- " .long 0xb2af0000\n" /* PQAP(TAPQ) */
- "0: la %0,1\n"
+ " lgr 0,%[reg0]\n" /* qid into gr0 */
+ " lghi 1,0\n" /* 0 into gr1 */
+ " lghi 2,0\n" /* 0 into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */
+ "0: la %[reg1],1\n" /* 1 into reg1 */
"1:\n"
EX_TABLE(0b, 1b)
- : "+d" (reg1), "+d" (reg2)
- : "d" (reg0)
- : "cc");
+ : [reg1] "+&d" (reg1)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1", "2");
return reg1 != 0;
}
@@ -77,14 +82,18 @@ static inline bool ap_instructions_available(void)
*/
static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info)
{
- register unsigned long reg0 asm ("0") = qid;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm ("2");
-
- asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */
- : "=d" (reg1), "=d" (reg2)
- : "d" (reg0)
- : "cc");
+ struct ap_queue_status reg1;
+ unsigned long reg2;
+
+ asm volatile(
+ " lgr 0,%[qid]\n" /* qid into gr0 */
+ " lghi 2,0\n" /* 0 into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* gr2 into reg2 */
+ : [reg1] "=&d" (reg1), [reg2] "=&d" (reg2)
+ : [qid] "d" (qid)
+ : "cc", "0", "1", "2");
if (info)
*info = reg2;
return reg1;
@@ -115,14 +124,16 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid,
*/
static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
{
- register unsigned long reg0 asm ("0") = qid | (1UL << 24);
- register struct ap_queue_status reg1 asm ("1");
+ unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */
+ struct ap_queue_status reg1;
asm volatile(
- ".long 0xb2af0000" /* PQAP(RAPQ) */
- : "=d" (reg1)
- : "d" (reg0)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1");
return reg1;
}
@@ -134,14 +145,16 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
*/
static inline struct ap_queue_status ap_zapq(ap_qid_t qid)
{
- register unsigned long reg0 asm ("0") = qid | (2UL << 24);
- register struct ap_queue_status reg1 asm ("1");
+ unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */
+ struct ap_queue_status reg1;
asm volatile(
- ".long 0xb2af0000" /* PQAP(ZAPQ) */
- : "=d" (reg1)
- : "d" (reg0)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1");
return reg1;
}
@@ -172,18 +185,20 @@ struct ap_config_info {
*/
static inline int ap_qci(struct ap_config_info *config)
{
- register unsigned long reg0 asm ("0") = 4UL << 24;
- register unsigned long reg1 asm ("1") = -EOPNOTSUPP;
- register struct ap_config_info *reg2 asm ("2") = config;
+ unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */
+ unsigned long reg1 = -EOPNOTSUPP;
+ struct ap_config_info *reg2 = config;
asm volatile(
- ".long 0xb2af0000\n" /* PQAP(QCI) */
- "0: la %0,0\n"
+ " lgr 0,%[reg0]\n" /* QCI fc into gr0 */
+ " lgr 2,%[reg2]\n" /* ptr to config into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(QCI) */
+ "0: la %[reg1],0\n" /* good case, QCI fc available */
"1:\n"
EX_TABLE(0b, 1b)
- : "+d" (reg1)
- : "d" (reg0), "d" (reg2)
- : "cc", "memory");
+ : [reg1] "+&d" (reg1)
+ : [reg0] "d" (reg0), [reg2] "d" (reg2)
+ : "cc", "memory", "0", "2");
return reg1;
}
@@ -212,29 +227,33 @@ struct ap_qirq_ctrl {
* ap_aqic(): Control interruption for a specific AP.
* @qid: The AP queue number
* @qirqctrl: struct ap_qirq_ctrl (64 bit value)
- * @ind: The notification indicator byte
+ * @pa_ind: Physical address of the notification indicator byte
*
* Returns AP queue status.
*/
static inline struct ap_queue_status ap_aqic(ap_qid_t qid,
struct ap_qirq_ctrl qirqctrl,
- void *ind)
+ phys_addr_t pa_ind)
{
- register unsigned long reg0 asm ("0") = qid | (3UL << 24);
- register union {
+ unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */
+ union {
unsigned long value;
struct ap_qirq_ctrl qirqctrl;
struct ap_queue_status status;
- } reg1 asm ("1");
- register void *reg2 asm ("2") = ind;
+ } reg1;
+ unsigned long reg2 = pa_ind;
reg1.qirqctrl = qirqctrl;
asm volatile(
- ".long 0xb2af0000" /* PQAP(AQIC) */
- : "+d" (reg1)
- : "d" (reg0), "d" (reg2)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */
+ " lgr 2,%[reg2]\n" /* ni addr into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "+&d" (reg1)
+ : [reg0] "d" (reg0), [reg2] "d" (reg2)
+ : "cc", "0", "1", "2");
return reg1.status;
}
@@ -268,21 +287,24 @@ union ap_qact_ap_info {
static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit,
union ap_qact_ap_info *apinfo)
{
- register unsigned long reg0 asm ("0") = qid | (5UL << 24)
- | ((ifbit & 0x01) << 22);
- register union {
+ unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22);
+ union {
unsigned long value;
struct ap_queue_status status;
- } reg1 asm ("1");
- register unsigned long reg2 asm ("2");
+ } reg1;
+ unsigned long reg2;
reg1.value = apinfo->val;
asm volatile(
- ".long 0xb2af0000" /* PQAP(QACT) */
- : "+d" (reg1), "=d" (reg2)
- : "d" (reg0)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lgr 1,%[reg1]\n" /* qact in info into gr1 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* qact out info into reg2 */
+ : [reg1] "+&d" (reg1), [reg2] "=&d" (reg2)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1", "2");
apinfo->val = reg2;
return reg1.status;
}
@@ -303,19 +325,24 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
unsigned long long psmid,
void *msg, size_t length)
{
- register unsigned long reg0 asm ("0") = qid | 0x40000000UL;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm ("2") = (unsigned long) msg;
- register unsigned long reg3 asm ("3") = (unsigned long) length;
- register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32);
- register unsigned long reg5 asm ("5") = psmid & 0xffffffff;
+ unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */
+ union register_pair nqap_r1, nqap_r2;
+ struct ap_queue_status reg1;
+
+ nqap_r1.even = (unsigned int)(psmid >> 32);
+ nqap_r1.odd = psmid & 0xffffffff;
+ nqap_r2.even = (unsigned long)msg;
+ nqap_r2.odd = (unsigned long)length;
asm volatile (
- "0: .long 0xb2ad0042\n" /* NQAP */
- " brc 2,0b"
- : "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3)
- : "d" (reg4), "d" (reg5)
- : "cc", "memory");
+ " lgr 0,%[reg0]\n" /* qid param in gr0 */
+ "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n"
+ " brc 2,0b\n" /* handle partial completion */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1),
+ [nqap_r2] "+&d" (nqap_r2.pair)
+ : [nqap_r1] "d" (nqap_r1.pair)
+ : "cc", "memory", "0", "1");
return reg1;
}
@@ -325,6 +352,8 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
* @psmid: Pointer to program supplied message identifier
* @msg: The message text
* @length: The message length
+ * @reslength: Residual length on return
+ * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content
*
* Returns AP queue status structure.
* Condition code 1 on DQAP means the receive has taken place
@@ -336,27 +365,65 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
* Note that gpr2 is used by the DQAP instruction to keep track of
* any 'residual' length, in case the instruction gets interrupted.
* Hence it gets zeroed before the instruction.
+ * If the message does not fit into the buffer, this function will
+ * return with a truncated message and the reply in the firmware queue
+ * is not removed. This is indicated to the caller with an
+ * ap_queue_status response_code value of all bits on (0xFF) and (if
+ * the reslength ptr is given) the remaining length is stored in
+ * *reslength and (if the resgr0 ptr is given) the updated gr0 value
+ * for further processing of this msg entry is stored in *resgr0. The
+ * caller needs to detect this situation and should invoke ap_dqap
+ * with a valid resgr0 ptr and a value in there != 0 to indicate that
+ * *resgr0 is to be used instead of qid to further process this entry.
*/
static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
unsigned long long *psmid,
- void *msg, size_t length)
+ void *msg, size_t length,
+ size_t *reslength,
+ unsigned long *resgr0)
{
- register unsigned long reg0 asm("0") = qid | 0x80000000UL;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm("2") = 0UL;
- register unsigned long reg4 asm("4") = (unsigned long) msg;
- register unsigned long reg5 asm("5") = (unsigned long) length;
- register unsigned long reg6 asm("6") = 0UL;
- register unsigned long reg7 asm("7") = 0UL;
+ unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL;
+ struct ap_queue_status reg1;
+ unsigned long reg2;
+ union register_pair rp1, rp2;
+ rp1.even = 0UL;
+ rp1.odd = 0UL;
+ rp2.even = (unsigned long)msg;
+ rp2.odd = (unsigned long)length;
asm volatile(
- "0: .long 0xb2ae0064\n" /* DQAP */
- " brc 6,0b\n"
- : "+d" (reg0), "=d" (reg1), "+d" (reg2),
- "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7)
- : : "cc", "memory");
- *psmid = (((unsigned long long) reg6) << 32) + reg7;
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lghi 2,0\n" /* 0 into gr2 (res length) */
+ "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */
+ " jz 2f\n" /* go out if buf len is 0 */
+ "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n"
+ " brc 6,0b\n" /* handle partial complete */
+ "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */
+ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2),
+ [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair)
+ :
+ : "cc", "memory", "0", "1", "2");
+
+ if (reslength)
+ *reslength = reg2;
+ if (reg2 != 0 && rp2.odd == 0) {
+ /*
+ * Partially complete, status in gr1 is not set.
+ * Signal the caller that this dqap is only partially received
+ * with a special status response code 0xFF and *resgr0 updated
+ */
+ reg1.response_code = 0xFF;
+ if (resgr0)
+ *resgr0 = reg0;
+ } else {
+ *psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd;
+ if (resgr0)
+ *resgr0 = 0;
+ }
+
return reg1;
}
@@ -368,7 +435,7 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
#if IS_ENABLED(CONFIG_ZCRYPT)
void ap_bus_cfg_chg(void);
#else
-static inline void ap_bus_cfg_chg(void){};
+static inline void ap_bus_cfg_chg(void){}
#endif
#endif /* _ASM_S390_AP_H_ */
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
index de61ce562052..1594049893e0 100644
--- a/arch/s390/include/asm/archrandom.h
+++ b/arch/s390/include/asm/archrandom.h
@@ -2,7 +2,7 @@
/*
* Kernel interface for the s390 arch_random_* functions
*
- * Copyright IBM Corp. 2017
+ * Copyright IBM Corp. 2017, 2022
*
* Author: Harald Freudenberger <freude@de.ibm.com>
*
@@ -11,41 +11,28 @@
#ifndef _ASM_S390_ARCHRANDOM_H
#define _ASM_S390_ARCHRANDOM_H
-#ifdef CONFIG_ARCH_RANDOM
-
#include <linux/static_key.h>
+#include <linux/preempt.h>
#include <linux/atomic.h>
+#include <asm/cpacf.h>
DECLARE_STATIC_KEY_FALSE(s390_arch_random_available);
extern atomic64_t s390_arch_random_counter;
-bool s390_arch_random_generate(u8 *buf, unsigned int nbytes);
-
-static inline bool __must_check arch_get_random_long(unsigned long *v)
-{
- return false;
-}
-
-static inline bool __must_check arch_get_random_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
{
- return false;
-}
-
-static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
-{
- if (static_branch_likely(&s390_arch_random_available)) {
- return s390_arch_random_generate((u8 *)v, sizeof(*v));
- }
- return false;
+ return 0;
}
-static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
{
- if (static_branch_likely(&s390_arch_random_available)) {
- return s390_arch_random_generate((u8 *)v, sizeof(*v));
+ if (static_branch_likely(&s390_arch_random_available) &&
+ in_task()) {
+ cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v));
+ atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter);
+ return max_longs;
}
- return false;
+ return 0;
}
-#endif /* CONFIG_ARCH_RANDOM */
#endif /* _ASM_S390_ARCHRANDOM_H */
diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h
new file mode 100644
index 000000000000..11f615eb0066
--- /dev/null
+++ b/arch/s390/include/asm/asm-const.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ASM_CONST_H
+#define _ASM_S390_ASM_CONST_H
+
+#ifdef __ASSEMBLY__
+# define stringify_in_c(...) __VA_ARGS__
+#else
+/* This version of stringify will deal with commas... */
+# define __stringify_in_c(...) #__VA_ARGS__
+# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
+#endif
+#endif /* _ASM_S390_ASM_CONST_H */
diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h
new file mode 100644
index 000000000000..b74f1070ddb2
--- /dev/null
+++ b/arch/s390/include/asm/asm-extable.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_EXTABLE_H
+#define __ASM_EXTABLE_H
+
+#include <linux/stringify.h>
+#include <linux/bits.h>
+#include <asm/asm-const.h>
+
+#define EX_TYPE_NONE 0
+#define EX_TYPE_FIXUP 1
+#define EX_TYPE_BPF 2
+#define EX_TYPE_UA_STORE 3
+#define EX_TYPE_UA_LOAD_MEM 4
+#define EX_TYPE_UA_LOAD_REG 5
+
+#define EX_DATA_REG_ERR_SHIFT 0
+#define EX_DATA_REG_ERR GENMASK(3, 0)
+
+#define EX_DATA_REG_ADDR_SHIFT 4
+#define EX_DATA_REG_ADDR GENMASK(7, 4)
+
+#define EX_DATA_LEN_SHIFT 8
+#define EX_DATA_LEN GENMASK(11, 8)
+
+#define __EX_TABLE(_section, _fault, _target, _type) \
+ stringify_in_c(.section _section,"a";) \
+ stringify_in_c(.align 4;) \
+ stringify_in_c(.long (_fault) - .;) \
+ stringify_in_c(.long (_target) - .;) \
+ stringify_in_c(.short (_type);) \
+ stringify_in_c(.short 0;) \
+ stringify_in_c(.previous)
+
+#define __EX_TABLE_UA(_section, _fault, _target, _type, _regerr, _regaddr, _len)\
+ stringify_in_c(.section _section,"a";) \
+ stringify_in_c(.align 4;) \
+ stringify_in_c(.long (_fault) - .;) \
+ stringify_in_c(.long (_target) - .;) \
+ stringify_in_c(.short (_type);) \
+ stringify_in_c(.macro extable_reg regerr, regaddr;) \
+ stringify_in_c(.set .Lfound, 0;) \
+ stringify_in_c(.set .Lcurr, 0;) \
+ stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \
+ stringify_in_c( .ifc "\regerr", "%%r\rs";) \
+ stringify_in_c( .set .Lfound, 1;) \
+ stringify_in_c( .set .Lregerr, .Lcurr;) \
+ stringify_in_c( .endif;) \
+ stringify_in_c( .set .Lcurr, .Lcurr+1;) \
+ stringify_in_c(.endr;) \
+ stringify_in_c(.ifne (.Lfound != 1);) \
+ stringify_in_c( .error "extable_reg: bad register argument1";) \
+ stringify_in_c(.endif;) \
+ stringify_in_c(.set .Lfound, 0;) \
+ stringify_in_c(.set .Lcurr, 0;) \
+ stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \
+ stringify_in_c( .ifc "\regaddr", "%%r\rs";) \
+ stringify_in_c( .set .Lfound, 1;) \
+ stringify_in_c( .set .Lregaddr, .Lcurr;) \
+ stringify_in_c( .endif;) \
+ stringify_in_c( .set .Lcurr, .Lcurr+1;) \
+ stringify_in_c(.endr;) \
+ stringify_in_c(.ifne (.Lfound != 1);) \
+ stringify_in_c( .error "extable_reg: bad register argument2";) \
+ stringify_in_c(.endif;) \
+ stringify_in_c(.short .Lregerr << EX_DATA_REG_ERR_SHIFT | \
+ .Lregaddr << EX_DATA_REG_ADDR_SHIFT | \
+ _len << EX_DATA_LEN_SHIFT;) \
+ stringify_in_c(.endm;) \
+ stringify_in_c(extable_reg _regerr,_regaddr;) \
+ stringify_in_c(.purgem extable_reg;) \
+ stringify_in_c(.previous)
+
+#define EX_TABLE(_fault, _target) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP)
+
+#define EX_TABLE_AMODE31(_fault, _target) \
+ __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP)
+
+#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \
+ __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0)
+
+#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \
+ __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len)
+
+#define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \
+ __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0)
+
+#endif /* __ASM_EXTABLE_H */
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 491ad53a0d4e..7138d189cc42 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -15,56 +15,46 @@
#include <asm/barrier.h>
#include <asm/cmpxchg.h>
-#define ATOMIC_INIT(i) { (i) }
-
-static inline int atomic_read(const atomic_t *v)
+static inline int arch_atomic_read(const atomic_t *v)
{
- int c;
-
- asm volatile(
- " l %0,%1\n"
- : "=d" (c) : "Q" (v->counter));
- return c;
+ return __atomic_read(v);
}
+#define arch_atomic_read arch_atomic_read
-static inline void atomic_set(atomic_t *v, int i)
+static inline void arch_atomic_set(atomic_t *v, int i)
{
- asm volatile(
- " st %1,%0\n"
- : "=Q" (v->counter) : "d" (i));
+ __atomic_set(v, i);
}
+#define arch_atomic_set arch_atomic_set
-static inline int atomic_add_return(int i, atomic_t *v)
+static inline int arch_atomic_add_return(int i, atomic_t *v)
{
return __atomic_add_barrier(i, &v->counter) + i;
}
+#define arch_atomic_add_return arch_atomic_add_return
-static inline int atomic_fetch_add(int i, atomic_t *v)
+static inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
return __atomic_add_barrier(i, &v->counter);
}
+#define arch_atomic_fetch_add arch_atomic_fetch_add
-static inline void atomic_add(int i, atomic_t *v)
+static inline void arch_atomic_add(int i, atomic_t *v)
{
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
- if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
- __atomic_add_const(i, &v->counter);
- return;
- }
-#endif
__atomic_add(i, &v->counter);
}
+#define arch_atomic_add arch_atomic_add
-#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v)
-#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v)
-#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v)
+#define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v)
+#define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v)
+#define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v)
#define ATOMIC_OPS(op) \
-static inline void atomic_##op(int i, atomic_t *v) \
+static inline void arch_atomic_##op(int i, atomic_t *v) \
{ \
__atomic_##op(i, &v->counter); \
} \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
+static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \
{ \
return __atomic_##op##_barrier(i, &v->counter); \
}
@@ -75,66 +65,67 @@ ATOMIC_OPS(xor)
#undef ATOMIC_OPS
-#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+#define arch_atomic_and arch_atomic_and
+#define arch_atomic_or arch_atomic_or
+#define arch_atomic_xor arch_atomic_xor
+#define arch_atomic_fetch_and arch_atomic_fetch_and
+#define arch_atomic_fetch_or arch_atomic_fetch_or
+#define arch_atomic_fetch_xor arch_atomic_fetch_xor
+
+#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new))
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
return __atomic_cmpxchg(&v->counter, old, new);
}
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg
#define ATOMIC64_INIT(i) { (i) }
-static inline s64 atomic64_read(const atomic64_t *v)
+static inline s64 arch_atomic64_read(const atomic64_t *v)
{
- s64 c;
-
- asm volatile(
- " lg %0,%1\n"
- : "=d" (c) : "Q" (v->counter));
- return c;
+ return __atomic64_read(v);
}
+#define arch_atomic64_read arch_atomic64_read
-static inline void atomic64_set(atomic64_t *v, s64 i)
+static inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
- asm volatile(
- " stg %1,%0\n"
- : "=Q" (v->counter) : "d" (i));
+ __atomic64_set(v, i);
}
+#define arch_atomic64_set arch_atomic64_set
-static inline s64 atomic64_add_return(s64 i, atomic64_t *v)
+static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
return __atomic64_add_barrier(i, (long *)&v->counter) + i;
}
+#define arch_atomic64_add_return arch_atomic64_add_return
-static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v)
+static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
return __atomic64_add_barrier(i, (long *)&v->counter);
}
+#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static inline void atomic64_add(s64 i, atomic64_t *v)
+static inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
- if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
- __atomic64_add_const(i, (long *)&v->counter);
- return;
- }
-#endif
__atomic64_add(i, (long *)&v->counter);
}
+#define arch_atomic64_add arch_atomic64_add
-#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
-static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
return __atomic64_cmpxchg((long *)&v->counter, old, new);
}
+#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
#define ATOMIC64_OPS(op) \
-static inline void atomic64_##op(s64 i, atomic64_t *v) \
+static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \
{ \
__atomic64_##op(i, (long *)&v->counter); \
} \
-static inline long atomic64_fetch_##op(s64 i, atomic64_t *v) \
+static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \
{ \
return __atomic64_##op##_barrier(i, (long *)&v->counter); \
}
@@ -145,8 +136,15 @@ ATOMIC64_OPS(xor)
#undef ATOMIC64_OPS
-#define atomic64_sub_return(_i, _v) atomic64_add_return(-(s64)(_i), _v)
-#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(s64)(_i), _v)
-#define atomic64_sub(_i, _v) atomic64_add(-(s64)(_i), _v)
+#define arch_atomic64_and arch_atomic64_and
+#define arch_atomic64_or arch_atomic64_or
+#define arch_atomic64_xor arch_atomic64_xor
+#define arch_atomic64_fetch_and arch_atomic64_fetch_and
+#define arch_atomic64_fetch_or arch_atomic64_fetch_or
+#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
+
+#define arch_atomic64_sub_return(_i, _v) arch_atomic64_add_return(-(s64)(_i), _v)
+#define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v)
+#define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v)
#endif /* __ARCH_S390_ATOMIC__ */
diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
index 61467b9eecc7..50510e08b893 100644
--- a/arch/s390/include/asm/atomic_ops.h
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -8,6 +8,40 @@
#ifndef __ARCH_S390_ATOMIC_OPS__
#define __ARCH_S390_ATOMIC_OPS__
+static inline int __atomic_read(const atomic_t *v)
+{
+ int c;
+
+ asm volatile(
+ " l %0,%1\n"
+ : "=d" (c) : "R" (v->counter));
+ return c;
+}
+
+static inline void __atomic_set(atomic_t *v, int i)
+{
+ asm volatile(
+ " st %1,%0\n"
+ : "=R" (v->counter) : "d" (i));
+}
+
+static inline s64 __atomic64_read(const atomic64_t *v)
+{
+ s64 c;
+
+ asm volatile(
+ " lg %0,%1\n"
+ : "=d" (c) : "RT" (v->counter));
+ return c;
+}
+
+static inline void __atomic64_set(atomic64_t *v, s64 i)
+{
+ asm volatile(
+ " stg %1,%0\n"
+ : "=RT" (v->counter) : "d" (i));
+}
+
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \
@@ -18,7 +52,7 @@ static inline op_type op_name(op_type val, op_type *ptr) \
asm volatile( \
op_string " %[old],%[val],%[ptr]\n" \
op_barrier \
- : [old] "=d" (old), [ptr] "+Q" (*ptr) \
+ : [old] "=d" (old), [ptr] "+QS" (*ptr) \
: [val] "d" (val) : "cc", "memory"); \
return old; \
} \
@@ -46,7 +80,7 @@ static __always_inline void op_name(op_type val, op_type *ptr) \
asm volatile( \
op_string " %[ptr],%[val]\n" \
op_barrier \
- : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc", "memory");\
+ : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\
}
#define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \
@@ -97,7 +131,7 @@ static inline long op_name(long val, long *ptr) \
op_string " %[new],%[val]\n" \
" csg %[old],%[new],%[ptr]\n" \
" jl 0b" \
- : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\
+ : [old] "=d" (old), [new] "=&d" (new), [ptr] "+QS" (*ptr)\
: [val] "d" (val), "0" (*ptr) : "cc", "memory"); \
return old; \
}
@@ -122,22 +156,46 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr")
static inline int __atomic_cmpxchg(int *ptr, int old, int new)
{
- return __sync_val_compare_and_swap(ptr, old, new);
+ asm volatile(
+ " cs %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+Q" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old;
}
-static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new)
+static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new)
{
- return __sync_bool_compare_and_swap(ptr, old, new);
+ int old_expected = old;
+
+ asm volatile(
+ " cs %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+Q" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old == old_expected;
}
static inline long __atomic64_cmpxchg(long *ptr, long old, long new)
{
- return __sync_val_compare_and_swap(ptr, old, new);
+ asm volatile(
+ " csg %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old;
}
-static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new)
+static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new)
{
- return __sync_bool_compare_and_swap(ptr, old, new);
+ long old_expected = old;
+
+ asm volatile(
+ " csg %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old == old_expected;
}
#endif /* __ARCH_S390_ATOMIC_OPS__ */
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index f9eddbca79d2..82de2a7c4160 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -16,20 +16,24 @@
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
/* Fast-BCR without checkpoint synchronization */
-#define __ASM_BARRIER "bcr 14,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 14,0\n"
#else
-#define __ASM_BARRIER "bcr 15,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 15,0\n"
#endif
-#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+static __always_inline void bcr_serialize(void)
+{
+ asm volatile(__ASM_BCR_SERIALIZE : : : "memory");
+}
-#define rmb() barrier()
-#define wmb() barrier()
-#define dma_rmb() mb()
-#define dma_wmb() mb()
-#define __smp_mb() mb()
-#define __smp_rmb() rmb()
-#define __smp_wmb() wmb()
+#define __mb() bcr_serialize()
+#define __rmb() barrier()
+#define __wmb() barrier()
+#define __dma_rmb() __mb()
+#define __dma_wmb() __mb()
+#define __smp_mb() __mb()
+#define __smp_rmb() __rmb()
+#define __smp_wmb() __wmb()
#define __smp_store_release(p, v) \
do { \
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 431e208a5ea4..2de74fcd0578 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -42,7 +42,7 @@
#define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
static inline unsigned long *
-__bitops_word(unsigned long nr, volatile unsigned long *ptr)
+__bitops_word(unsigned long nr, const volatile unsigned long *ptr)
{
unsigned long addr;
@@ -50,73 +50,33 @@ __bitops_word(unsigned long nr, volatile unsigned long *ptr)
return (unsigned long *)addr;
}
-static inline unsigned char *
-__bitops_byte(unsigned long nr, volatile unsigned long *ptr)
+static inline unsigned long __bitops_mask(unsigned long nr)
{
- return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3);
+ return 1UL << (nr & (BITS_PER_LONG - 1));
}
static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
+ unsigned long mask = __bitops_mask(nr);
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
-
- asm volatile(
- "oi %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (1 << (nr & 7))
- : "cc", "memory");
- return;
- }
-#endif
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
__atomic64_or(mask, (long *)addr);
}
static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
-
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
+ unsigned long mask = __bitops_mask(nr);
- asm volatile(
- "ni %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (~(1 << (nr & 7)))
- : "cc", "memory");
- return;
- }
-#endif
- mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
- __atomic64_and(mask, (long *)addr);
+ __atomic64_and(~mask, (long *)addr);
}
static __always_inline void arch_change_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
-
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
+ unsigned long mask = __bitops_mask(nr);
- asm volatile(
- "xi %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (1 << (nr & 7))
- : "cc", "memory");
- return;
- }
-#endif
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
__atomic64_xor(mask, (long *)addr);
}
@@ -124,106 +84,106 @@ static inline bool arch_test_and_set_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
old = __atomic64_or_barrier(mask, (long *)addr);
- return (old & mask) != 0;
+ return old & mask;
}
static inline bool arch_test_and_clear_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
- old = __atomic64_and_barrier(mask, (long *)addr);
- return (old & ~mask) != 0;
+ old = __atomic64_and_barrier(~mask, (long *)addr);
+ return old & mask;
}
static inline bool arch_test_and_change_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
old = __atomic64_xor_barrier(mask, (long *)addr);
- return (old & mask) != 0;
+ return old & mask;
}
-static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr |= 1 << (nr & 7);
+ *p |= mask;
}
-static inline void arch___clear_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr &= ~(1 << (nr & 7));
+ *p &= ~mask;
}
-static inline void arch___change_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr ^= 1 << (nr & 7);
+ *p ^= mask;
}
-static inline bool arch___test_and_set_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr |= 1 << (nr & 7);
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p |= mask;
+ return old & mask;
}
-static inline bool arch___test_and_clear_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr &= ~(1 << (nr & 7));
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p &= ~mask;
+ return old & mask;
}
-static inline bool arch___test_and_change_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr ^= 1 << (nr & 7);
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p ^= mask;
+ return old & mask;
}
-static inline bool arch_test_bit(unsigned long nr,
- const volatile unsigned long *ptr)
-{
- const volatile unsigned char *addr;
-
- addr = ((const volatile unsigned char *)ptr);
- addr += (nr ^ (BITS_PER_LONG - 8)) >> 3;
- return (*addr >> (nr & 7)) & 1;
-}
+#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
static inline bool arch_test_and_set_bit_lock(unsigned long nr,
volatile unsigned long *ptr)
{
if (arch_test_bit(nr, ptr))
- return 1;
+ return true;
return arch_test_and_set_bit(nr, ptr);
}
@@ -291,8 +251,6 @@ static inline bool test_bit_inv(unsigned long nr,
return test_bit(nr ^ (BITS_PER_LONG - 1), ptr);
}
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
-
/**
* __flogr - find leftmost one
* @word - The word to search
@@ -334,13 +292,13 @@ static inline unsigned char __flogr(unsigned long word)
}
return bit;
} else {
- register unsigned long bit asm("4") = word;
- register unsigned long out asm("5");
+ union register_pair rp;
+ rp.even = word;
asm volatile(
- " flogr %[bit],%[bit]\n"
- : [bit] "+d" (bit), [out] "=d" (out) : : "cc");
- return bit;
+ " flogr %[rp],%[rp]\n"
+ : [rp] "+d" (rp.pair) : : "cc");
+ return rp.even;
}
}
@@ -411,18 +369,7 @@ static inline int fls(unsigned int word)
return fls64(word);
}
-#else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */
-
-#include <asm-generic/bitops/__ffs.h>
-#include <asm-generic/bitops/ffs.h>
-#include <asm-generic/bitops/__fls.h>
-#include <asm-generic/bitops/fls.h>
-#include <asm-generic/bitops/fls64.h>
-
-#endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */
-
#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/le.h>
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index 7725f8006fdf..aebe1e22c7be 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -2,7 +2,7 @@
#ifndef _ASM_S390_BUG_H
#define _ASM_S390_BUG_H
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#ifdef CONFIG_BUG
@@ -15,7 +15,8 @@
"1: .asciz \""__FILE__"\"\n" \
".previous\n" \
".section __bug_table,\"awM\",@progbits,%2\n" \
- "2: .long 0b-2b,1b-2b\n" \
+ "2: .long 0b-.\n" \
+ " .long 1b-.\n" \
" .short %0,%1\n" \
" .org 2b+%2\n" \
".previous\n" \
@@ -30,7 +31,7 @@
asm_inline volatile( \
"0: mc 0,0\n" \
".section __bug_table,\"awM\",@progbits,%1\n" \
- "1: .long 0b-1b\n" \
+ "1: .long 0b-.\n" \
" .short %0\n" \
" .org 1b+%1\n" \
".previous\n" \
diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h
index d5e22e837416..00128174c025 100644
--- a/arch/s390/include/asm/cache.h
+++ b/arch/s390/include/asm/cache.h
@@ -14,6 +14,6 @@
#define L1_CACHE_SHIFT 8
#define NET_SKB_PAD 32
-#define __read_mostly __section(.data..read_mostly)
+#define __read_mostly __section(".data..read_mostly")
#endif
diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h
index 865ce1cb86d5..bd1596810cc1 100644
--- a/arch/s390/include/asm/ccwdev.h
+++ b/arch/s390/include/asm/ccwdev.h
@@ -11,6 +11,7 @@
#include <linux/device.h>
#include <linux/mod_devicetable.h>
+#include <asm/chsc.h>
#include <asm/fcx.h>
#include <asm/irq.h>
#include <asm/schid.h>
@@ -103,6 +104,8 @@ struct ccw_device {
was successfully verified. */
#define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had
to be established again. */
+#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has
+ * changed. */
/*
* Possible CIO actions triggered by the unit check handler.
@@ -114,7 +117,7 @@ enum uc_todo {
};
/**
- * struct ccw driver - device driver for channel attached devices
+ * struct ccw_driver - device driver for channel attached devices
* @ids: ids supported by this driver
* @probe: function called on probe
* @remove: function called on remove
@@ -123,11 +126,6 @@ enum uc_todo {
* @notify: notify driver of device state changes
* @path_event: notify driver of channel path events
* @shutdown: called at device shutdown
- * @prepare: prepare for pm state transition
- * @complete: undo work done in @prepare
- * @freeze: callback for freezing during hibernation snapshotting
- * @thaw: undo work done in @freeze
- * @restore: callback for restoring after hibernation
* @uc_handler: callback for unit check handler
* @driver: embedded device driver structure
* @int_class: interruption class to use for accounting interrupts
@@ -141,11 +139,6 @@ struct ccw_driver {
int (*notify) (struct ccw_device *, int);
void (*path_event) (struct ccw_device *, int *);
void (*shutdown) (struct ccw_device *);
- int (*prepare) (struct ccw_device *);
- void (*complete) (struct ccw_device *);
- int (*freeze)(struct ccw_device *);
- int (*thaw) (struct ccw_device *);
- int (*restore)(struct ccw_device *);
enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *);
struct device_driver driver;
enum interruption_class int_class;
@@ -159,9 +152,6 @@ extern struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv,
* when new devices for its type pop up */
extern int ccw_driver_register (struct ccw_driver *driver);
extern void ccw_driver_unregister (struct ccw_driver *driver);
-
-struct ccw1;
-
extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long);
extern int ccw_device_set_options(struct ccw_device *, unsigned long);
extern void ccw_device_clear_options(struct ccw_device *, unsigned long);
@@ -224,7 +214,6 @@ extern struct ccw_device *ccw_device_create_console(struct ccw_driver *);
extern void ccw_device_destroy_console(struct ccw_device *);
extern int ccw_device_enable_console(struct ccw_device *);
extern void ccw_device_wait_idle(struct ccw_device *);
-extern int ccw_device_force_console(struct ccw_device *);
extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size);
extern void ccw_device_dma_free(struct ccw_device *cdev,
@@ -236,4 +225,11 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *);
struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int);
u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx);
+int ccw_device_pnso(struct ccw_device *cdev,
+ struct chsc_pnso_area *pnso_area, u8 oc,
+ struct chsc_pnso_resume_token resume_token, int cnc);
+int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid);
+int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid);
+int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid);
+int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid);
#endif /* _S390_CCWDEV_H_ */
diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h
index 7293c139dd79..11d2fb3de4f5 100644
--- a/arch/s390/include/asm/ccwgroup.h
+++ b/arch/s390/include/asm/ccwgroup.h
@@ -11,8 +11,7 @@ struct ccw_driver;
* @count: number of attached slave devices
* @dev: embedded device structure
* @cdev: variable number of slave devices, allocated as needed
- * @ungroup_work: work to be done when a ccwgroup notifier has action
- * type %BUS_NOTIFY_UNBIND_DRIVER
+ * @ungroup_work: used to ungroup the ccwgroup device
*/
struct ccwgroup_device {
enum {
@@ -26,7 +25,7 @@ struct ccwgroup_device {
unsigned int count;
struct device dev;
struct work_struct ungroup_work;
- struct ccw_device *cdev[0];
+ struct ccw_device *cdev[];
};
/**
@@ -36,11 +35,6 @@ struct ccwgroup_device {
* @set_online: function called when device is set online
* @set_offline: function called when device is set offline
* @shutdown: function called when device is shut down
- * @prepare: prepare for pm state transition
- * @complete: undo work done in @prepare
- * @freeze: callback for freezing during hibernation snapshotting
- * @thaw: undo work done in @freeze
- * @restore: callback for restoring after hibernation
* @driver: embedded driver structure
* @ccw_driver: supported ccw_driver (optional)
*/
@@ -50,11 +44,6 @@ struct ccwgroup_driver {
int (*set_online) (struct ccwgroup_device *);
int (*set_offline) (struct ccwgroup_device *);
void (*shutdown)(struct ccwgroup_device *);
- int (*prepare) (struct ccwgroup_device *);
- void (*complete) (struct ccwgroup_device *);
- int (*freeze)(struct ccwgroup_device *);
- int (*thaw) (struct ccwgroup_device *);
- int (*restore)(struct ccwgroup_device *);
struct device_driver driver;
struct ccw_driver *ccw_driver;
@@ -64,11 +53,9 @@ extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver);
extern void ccwgroup_driver_unregister (struct ccwgroup_driver *cdriver);
int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv,
int num_devices, const char *buf);
-struct ccwgroup_device *get_ccwgroupdev_by_busid(struct ccwgroup_driver *gdrv,
- char *bus_id);
extern int ccwgroup_set_online(struct ccwgroup_device *gdev);
-extern int ccwgroup_set_offline(struct ccwgroup_device *gdev);
+int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv);
extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev);
extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev);
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 91e376b0d28c..cdd19d326345 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -13,127 +13,120 @@
#define _S390_CHECKSUM_H
#include <linux/uaccess.h>
+#include <linux/in6.h>
/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit).
*
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic.
*
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
+ * This function must be called with even lengths, except
+ * for the last fragment, which may be odd.
*
- * it's best to have buff aligned on a 32-bit boundary
+ * It's best to have buff aligned on a 32-bit boundary.
*/
-static inline __wsum
-csum_partial(const void *buff, int len, __wsum sum)
+static inline __wsum csum_partial(const void *buff, int len, __wsum sum)
{
- register unsigned long reg2 asm("2") = (unsigned long) buff;
- register unsigned long reg3 asm("3") = (unsigned long) len;
+ union register_pair rp = {
+ .even = (unsigned long) buff,
+ .odd = (unsigned long) len,
+ };
asm volatile(
- "0: cksm %0,%1\n" /* do checksum on longs */
+ "0: cksm %[sum],%[rp]\n"
" jo 0b\n"
- : "+d" (sum), "+d" (reg2), "+d" (reg3) : : "cc", "memory");
+ : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory");
return sum;
}
/*
- * the same as csum_partial_copy, but copies from user space.
- *
- * here even more important to align src and dst on a 32-bit (or even
- * better 64-bit) boundary
- *
- * Copy from userspace and compute checksum.
- */
-static inline __wsum
-csum_partial_copy_from_user(const void __user *src, void *dst,
- int len, __wsum sum,
- int *err_ptr)
-{
- if (unlikely(copy_from_user(dst, src, len)))
- *err_ptr = -EFAULT;
- return csum_partial(dst, len, sum);
-}
-
-
-static inline __wsum
-csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum)
-{
- memcpy(dst,src,len);
- return csum_partial(dst, len, sum);
-}
-
-/*
- * Fold a partial checksum without adding pseudo headers
+ * Fold a partial checksum without adding pseudo headers.
*/
static inline __sum16 csum_fold(__wsum sum)
{
u32 csum = (__force u32) sum;
- csum += (csum >> 16) + (csum << 16);
+ csum += (csum >> 16) | (csum << 16);
csum >>= 16;
return (__force __sum16) ~csum;
}
/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- *
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksums on 4 octet boundaries.
*/
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
- return csum_fold(csum_partial(iph, ihl*4, 0));
+ __u64 csum = 0;
+ __u32 *ptr = (u32 *)iph;
+
+ csum += *ptr++;
+ csum += *ptr++;
+ csum += *ptr++;
+ csum += *ptr++;
+ ihl -= 4;
+ while (ihl--)
+ csum += *ptr++;
+ csum += (csum >> 32) | (csum << 32);
+ return csum_fold((__force __wsum)(csum >> 32));
}
/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 32-bit checksum
+ * Computes the checksum of the TCP/UDP pseudo-header.
+ * Returns a 32-bit checksum.
*/
-static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
- __wsum sum)
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
- __u32 csum = (__force __u32)sum;
+ __u64 csum = (__force __u64)sum;
csum += (__force __u32)saddr;
- if (csum < (__force __u32)saddr)
- csum++;
-
csum += (__force __u32)daddr;
- if (csum < (__force __u32)daddr)
- csum++;
-
- csum += len + proto;
- if (csum < len + proto)
- csum++;
-
- return (__force __wsum)csum;
+ csum += len;
+ csum += proto;
+ csum += (csum >> 32) | (csum << 32);
+ return (__force __wsum)(csum >> 32);
}
/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
+ * Computes the checksum of the TCP/UDP pseudo-header.
+ * Returns a 16-bit checksum, already complemented.
*/
-
-static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
- __wsum sum)
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
- return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
+ * Used for miscellaneous IP-like checksums, mainly icmp.
*/
-
static inline __sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff, len, 0));
}
-#endif /* _S390_CHECKSUM_H */
-
+#define _HAVE_ARCH_IPV6_CSUM
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+ __u64 sum = (__force __u64)csum;
+
+ sum += (__force __u32)saddr->s6_addr32[0];
+ sum += (__force __u32)saddr->s6_addr32[1];
+ sum += (__force __u32)saddr->s6_addr32[2];
+ sum += (__force __u32)saddr->s6_addr32[3];
+ sum += (__force __u32)daddr->s6_addr32[0];
+ sum += (__force __u32)daddr->s6_addr32[1];
+ sum += (__force __u32)daddr->s6_addr32[2];
+ sum += (__force __u32)daddr->s6_addr32[3];
+ sum += len;
+ sum += proto;
+ sum += (sum >> 32) | (sum << 32);
+ return csum_fold((__force __wsum)(sum >> 32));
+}
+#endif /* _S390_CHECKSUM_H */
diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h
new file mode 100644
index 000000000000..bb48ea380c0d
--- /dev/null
+++ b/arch/s390/include/asm/chsc.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Alexandra Winter <wintera@linux.ibm.com>
+ *
+ * Interface for Channel Subsystem Call
+ */
+#ifndef _ASM_S390_CHSC_H
+#define _ASM_S390_CHSC_H
+
+#include <uapi/asm/chsc.h>
+
+/*
+ * Operation codes for CHSC PNSO:
+ * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport
+ * PNSO_OC_NET_ADDR_INFO - all addresses
+ */
+#define PNSO_OC_NET_BRIDGE_INFO 0
+#define PNSO_OC_NET_ADDR_INFO 3
+/**
+ * struct chsc_pnso_naid_l2 - network address information descriptor
+ * @nit: Network interface token
+ * @addr_lnid: network address and logical network id (VLAN ID)
+ */
+struct chsc_pnso_naid_l2 {
+ u64 nit;
+ struct { u8 mac[6]; u16 lnid; } addr_lnid;
+} __packed;
+
+struct chsc_pnso_resume_token {
+ u64 t1;
+ u64 t2;
+} __packed;
+
+struct chsc_pnso_naihdr {
+ struct chsc_pnso_resume_token resume_token;
+ u32:32;
+ u32 instance;
+ u32:24;
+ u8 naids;
+ u32 reserved[3];
+} __packed;
+
+struct chsc_pnso_area {
+ struct chsc_header request;
+ u8:2;
+ u8 m:1;
+ u8:5;
+ u8:2;
+ u8 ssid:2;
+ u8 fmt:4;
+ u16 sch;
+ u8:8;
+ u8 cssid;
+ u16:16;
+ u8 oc;
+ u32:24;
+ struct chsc_pnso_resume_token resume_token;
+ u32 n:1;
+ u32:31;
+ u32 reserved[3];
+ struct chsc_header response;
+ u32:32;
+ struct chsc_pnso_naihdr naihdr;
+ struct chsc_pnso_naid_l2 entries[];
+} __packed __aligned(PAGE_SIZE);
+
+#endif /* _ASM_S390_CHSC_H */
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index b5bfb3123cb1..1c4f585dd39b 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -5,10 +5,10 @@
#ifndef _ASM_S390_CIO_H_
#define _ASM_S390_CIO_H_
-#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/genalloc.h>
#include <asm/types.h>
+#include <asm/tpi.h>
#define LPM_ANYPATH 0xff
#define __MAX_CSSID 0
@@ -329,7 +329,7 @@ struct ccw_dev_id {
};
/**
- * ccw_device_id_is_equal() - compare two ccw_dev_ids
+ * ccw_dev_id_is_equal() - compare two ccw_dev_ids
* @dev_id1: a ccw_dev_id
* @dev_id2: another ccw_dev_id
* Returns:
@@ -356,7 +356,6 @@ static inline u8 pathmask_to_pos(u8 mask)
return 8 - ffs(mask);
}
-void channel_subsystem_reinit(void);
extern void css_schedule_reprobe(void);
extern void *cio_dma_zalloc(size_t size);
@@ -370,8 +369,10 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev);
struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages);
/* Function from drivers/s390/cio/chsc.c */
-int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
+int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta);
int chsc_sstpi(void *page, void *result, size_t size);
+int chsc_stzi(void *page, void *result, size_t size);
int chsc_sgib(u32 origin);
+int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid);
#endif
diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h
new file mode 100644
index 000000000000..03434369fce4
--- /dev/null
+++ b/arch/s390/include/asm/clocksource.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* s390-specific clocksource additions */
+
+#ifndef _ASM_S390_CLOCKSOURCE_H
+#define _ASM_S390_CLOCKSOURCE_H
+
+#endif /* _ASM_S390_CLOCKSOURCE_H */
diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h
index 3925b0f085b7..10919eeb7533 100644
--- a/arch/s390/include/asm/clp.h
+++ b/arch/s390/include/asm/clp.h
@@ -5,6 +5,9 @@
/* CLP common request & response block size */
#define CLP_BLK_SIZE PAGE_SIZE
+/* Call Logical Processor - Command Code */
+#define CLP_SLPC 0x0001
+
#define CLP_LPS_BASE 0
#define CLP_LPS_PCI 2
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index af99c1f66f12..84c3f0d576c5 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -12,55 +12,193 @@
#include <linux/types.h>
#include <linux/bug.h>
-#define cmpxchg(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __o = (o); \
- __typeof__(*(ptr)) __n = (n); \
- (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
-})
+void __xchg_called_with_bad_pointer(void);
-#define cmpxchg64 cmpxchg
-#define cmpxchg_local cmpxchg
-#define cmpxchg64_local cmpxchg
+static __always_inline unsigned long __xchg(unsigned long x,
+ unsigned long address, int size)
+{
+ unsigned long old;
+ int shift;
-#define xchg(ptr, x) \
+ switch (size) {
+ case 1:
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ asm volatile(
+ " l %0,%1\n"
+ "0: lr 0,%0\n"
+ " nr 0,%3\n"
+ " or 0,%2\n"
+ " cs %0,0,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" ((x & 0xff) << shift), "d" (~(0xff << shift))
+ : "memory", "cc", "0");
+ return old >> shift;
+ case 2:
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ asm volatile(
+ " l %0,%1\n"
+ "0: lr 0,%0\n"
+ " nr 0,%3\n"
+ " or 0,%2\n"
+ " cs %0,0,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift))
+ : "memory", "cc", "0");
+ return old >> shift;
+ case 4:
+ asm volatile(
+ " l %0,%1\n"
+ "0: cs %0,%2,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" (x)
+ : "memory", "cc");
+ return old;
+ case 8:
+ asm volatile(
+ " lg %0,%1\n"
+ "0: csg %0,%2,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+QS" (*(long *) address)
+ : "d" (x)
+ : "memory", "cc");
+ return old;
+ }
+ __xchg_called_with_bad_pointer();
+ return x;
+}
+
+#define arch_xchg(ptr, x) \
({ \
- __typeof__(ptr) __ptr = (ptr); \
- __typeof__(*(ptr)) __old; \
- do { \
- __old = *__ptr; \
- } while (!__sync_bool_compare_and_swap(__ptr, __old, x)); \
- __old; \
+ __typeof__(*(ptr)) __ret; \
+ \
+ __ret = (__typeof__(*(ptr))) \
+ __xchg((unsigned long)(x), (unsigned long)(ptr), \
+ sizeof(*(ptr))); \
+ __ret; \
})
-#define __cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+void __cmpxchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long __cmpxchg(unsigned long address,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev, tmp;
+ int shift;
+
+ switch (size) {
+ case 1:
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ asm volatile(
+ " l %0,%2\n"
+ "0: nr %0,%5\n"
+ " lr %1,%0\n"
+ " or %0,%3\n"
+ " or %1,%4\n"
+ " cs %0,%1,%2\n"
+ " jnl 1f\n"
+ " xr %1,%0\n"
+ " nr %1,%5\n"
+ " jnz 0b\n"
+ "1:"
+ : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address)
+ : "d" ((old & 0xff) << shift),
+ "d" ((new & 0xff) << shift),
+ "d" (~(0xff << shift))
+ : "memory", "cc");
+ return prev >> shift;
+ case 2:
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ asm volatile(
+ " l %0,%2\n"
+ "0: nr %0,%5\n"
+ " lr %1,%0\n"
+ " or %0,%3\n"
+ " or %1,%4\n"
+ " cs %0,%1,%2\n"
+ " jnl 1f\n"
+ " xr %1,%0\n"
+ " nr %1,%5\n"
+ " jnz 0b\n"
+ "1:"
+ : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address)
+ : "d" ((old & 0xffff) << shift),
+ "d" ((new & 0xffff) << shift),
+ "d" (~(0xffff << shift))
+ : "memory", "cc");
+ return prev >> shift;
+ case 4:
+ asm volatile(
+ " cs %0,%3,%1\n"
+ : "=&d" (prev), "+Q" (*(int *) address)
+ : "0" (old), "d" (new)
+ : "memory", "cc");
+ return prev;
+ case 8:
+ asm volatile(
+ " csg %0,%3,%1\n"
+ : "=&d" (prev), "+QS" (*(long *) address)
+ : "0" (old), "d" (new)
+ : "memory", "cc");
+ return prev;
+ }
+ __cmpxchg_called_with_bad_pointer();
+ return old;
+}
+
+#define arch_cmpxchg(ptr, o, n) \
({ \
- register __typeof__(*(p1)) __old1 asm("2") = (o1); \
- register __typeof__(*(p2)) __old2 asm("3") = (o2); \
- register __typeof__(*(p1)) __new1 asm("4") = (n1); \
- register __typeof__(*(p2)) __new2 asm("5") = (n2); \
- int cc; \
- asm volatile( \
- " cdsg %[old],%[new],%[ptr]\n" \
- " ipm %[cc]\n" \
- " srl %[cc],28" \
- : [cc] "=d" (cc), [old] "+d" (__old1), "+d" (__old2) \
- : [new] "d" (__new1), "d" (__new2), \
- [ptr] "Q" (*(p1)), "Q" (*(p2)) \
- : "memory", "cc"); \
- !cc; \
+ __typeof__(*(ptr)) __ret; \
+ \
+ __ret = (__typeof__(*(ptr))) \
+ __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \
+ (unsigned long)(n), sizeof(*(ptr))); \
+ __ret; \
})
-#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+#define arch_cmpxchg64 arch_cmpxchg
+#define arch_cmpxchg_local arch_cmpxchg
+#define arch_cmpxchg64_local arch_cmpxchg
+
+#define system_has_cmpxchg_double() 1
+
+static __always_inline int __cmpxchg_double(unsigned long p1, unsigned long p2,
+ unsigned long o1, unsigned long o2,
+ unsigned long n1, unsigned long n2)
+{
+ union register_pair old = { .even = o1, .odd = o2, };
+ union register_pair new = { .even = n1, .odd = n2, };
+ int cc;
+
+ asm volatile(
+ " cdsg %[old],%[new],%[ptr]\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [old] "+&d" (old.pair)
+ : [new] "d" (new.pair),
+ [ptr] "QS" (*(unsigned long *)p1), "Q" (*(unsigned long *)p2)
+ : "memory", "cc");
+ return !cc;
+}
+
+#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \
({ \
- __typeof__(p1) __p1 = (p1); \
- __typeof__(p2) __p2 = (p2); \
+ typeof(p1) __p1 = (p1); \
+ typeof(p2) __p2 = (p2); \
+ \
BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\
- __cmpxchg_double(__p1, __p2, o1, o2, n1, n2); \
+ __cmpxchg_double((unsigned long)__p1, (unsigned long)__p2, \
+ (unsigned long)(o1), (unsigned long)(o2), \
+ (unsigned long)(n1), (unsigned long)(n2)); \
})
-#define system_has_cmpxchg_double() 1
-
#endif /* __ASM_CMPXCHG_H */
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 9547cd5d6cdc..a386070f1d56 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -8,6 +8,22 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <asm/ptrace.h>
+
+#define compat_mode_t compat_mode_t
+typedef u16 compat_mode_t;
+
+#define __compat_uid_t __compat_uid_t
+typedef u16 __compat_uid_t;
+typedef u16 __compat_gid_t;
+
+#define compat_dev_t compat_dev_t
+typedef u16 compat_dev_t;
+
+#define compat_ipc_pid_t compat_ipc_pid_t
+typedef u16 compat_ipc_pid_t;
+
+#define compat_statfs compat_statfs
#include <asm-generic/compat.h>
@@ -19,52 +35,16 @@
(__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
})
-#define PSW32_MASK_PER 0x40000000UL
-#define PSW32_MASK_DAT 0x04000000UL
-#define PSW32_MASK_IO 0x02000000UL
-#define PSW32_MASK_EXT 0x01000000UL
-#define PSW32_MASK_KEY 0x00F00000UL
-#define PSW32_MASK_BASE 0x00080000UL /* Always one */
-#define PSW32_MASK_MCHECK 0x00040000UL
-#define PSW32_MASK_WAIT 0x00020000UL
-#define PSW32_MASK_PSTATE 0x00010000UL
-#define PSW32_MASK_ASC 0x0000C000UL
-#define PSW32_MASK_CC 0x00003000UL
-#define PSW32_MASK_PM 0x00000f00UL
-#define PSW32_MASK_RI 0x00000080UL
-
#define PSW32_MASK_USER 0x0000FF00UL
-#define PSW32_ADDR_AMODE 0x80000000UL
-#define PSW32_ADDR_INSN 0x7FFFFFFFUL
-
-#define PSW32_DEFAULT_KEY (((u32) PAGE_DEFAULT_ACC) << 20)
-
-#define PSW32_ASC_PRIMARY 0x00000000UL
-#define PSW32_ASC_ACCREG 0x00004000UL
-#define PSW32_ASC_SECONDARY 0x00008000UL
-#define PSW32_ASC_HOME 0x0000C000UL
-
#define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \
PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \
PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \
PSW32_ASC_PRIMARY)
-#define COMPAT_USER_HZ 100
#define COMPAT_UTS_MACHINE "s390\0\0\0\0"
-typedef u16 __compat_uid_t;
-typedef u16 __compat_gid_t;
-typedef u32 __compat_uid32_t;
-typedef u32 __compat_gid32_t;
-typedef u16 compat_mode_t;
-typedef u16 compat_dev_t;
typedef u16 compat_nlink_t;
-typedef u16 compat_ipc_pid_t;
-typedef u32 compat_caddr_t;
-typedef __kernel_fsid_t compat_fsid_t;
-typedef s64 compat_s64;
-typedef u64 compat_u64;
typedef struct {
u32 mask;
@@ -105,26 +85,6 @@ struct compat_stat {
u32 __unused5;
};
-struct compat_flock {
- short l_type;
- short l_whence;
- compat_off_t l_start;
- compat_off_t l_len;
- compat_pid_t l_pid;
-};
-
-#define F_GETLK64 12
-#define F_SETLK64 13
-#define F_SETLKW64 14
-
-struct compat_flock64 {
- short l_type;
- short l_whence;
- compat_loff_t l_start;
- compat_loff_t l_len;
- compat_pid_t l_pid;
-};
-
struct compat_statfs {
u32 f_type;
u32 f_bsize;
@@ -155,17 +115,6 @@ struct compat_statfs64 {
u32 f_spare[4];
};
-#define COMPAT_RLIM_INFINITY 0xffffffff
-
-typedef u32 compat_old_sigset_t; /* at least 32 bits */
-
-#define _COMPAT_NSIG 64
-#define _COMPAT_NSIG_BPW 32
-
-typedef u32 compat_sigset_word;
-
-#define COMPAT_OFF_T_MAX 0x7fffffff
-
/*
* A pointer passed in from user mode. This should not
* be used for syscall parameters, just declare them
@@ -186,73 +135,6 @@ static inline int is_compat_task(void)
return test_thread_flag(TIF_31BIT);
}
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- unsigned long stack;
-
- stack = KSTK_ESP(current);
- if (is_compat_task())
- stack &= 0x7fffffffUL;
- return (void __user *) (stack - len);
-}
-
#endif
-struct compat_ipc64_perm {
- compat_key_t key;
- __compat_uid32_t uid;
- __compat_gid32_t gid;
- __compat_uid32_t cuid;
- __compat_gid32_t cgid;
- compat_mode_t mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned short __pad2;
- unsigned int __unused1;
- unsigned int __unused2;
-};
-
-struct compat_semid64_ds {
- struct compat_ipc64_perm sem_perm;
- compat_ulong_t sem_otime;
- compat_ulong_t sem_otime_high;
- compat_ulong_t sem_ctime;
- compat_ulong_t sem_ctime_high;
- compat_ulong_t sem_nsems;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
-
-struct compat_msqid64_ds {
- struct compat_ipc64_perm msg_perm;
- compat_ulong_t msg_stime;
- compat_ulong_t msg_stime_high;
- compat_ulong_t msg_rtime;
- compat_ulong_t msg_rtime_high;
- compat_ulong_t msg_ctime;
- compat_ulong_t msg_ctime_high;
- compat_ulong_t msg_cbytes;
- compat_ulong_t msg_qnum;
- compat_ulong_t msg_qbytes;
- compat_pid_t msg_lspid;
- compat_pid_t msg_lrpid;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
-
-struct compat_shmid64_ds {
- struct compat_ipc64_perm shm_perm;
- compat_size_t shm_segsz;
- compat_ulong_t shm_atime;
- compat_ulong_t shm_atime_high;
- compat_ulong_t shm_dtime;
- compat_ulong_t shm_dtime_high;
- compat_ulong_t shm_ctime;
- compat_ulong_t shm_ctime_high;
- compat_pid_t shm_cpid;
- compat_pid_t shm_lpid;
- compat_ulong_t shm_nattch;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
#endif /* _ASM_S390X_COMPAT_H */
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index c0f3bfeddcbe..646b12981f20 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -173,17 +173,16 @@ typedef struct { unsigned char bytes[16]; } cpacf_mask_t;
*/
static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
{
- register unsigned long r0 asm("0") = 0; /* query function */
- register unsigned long r1 asm("1") = (unsigned long) mask;
-
asm volatile(
- " spm 0\n" /* pckmo doesn't change the cc */
+ " lghi 0,0\n" /* query function */
+ " lgr 1,%[mask]\n"
+ " spm 0\n" /* pckmo doesn't change the cc */
/* Parameter regs are ignored, but must be nonzero and unique */
"0: .insn rrf,%[opc] << 16,2,4,6,0\n"
" brc 1,0b\n" /* handle partial completion */
: "=m" (*mask)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode)
- : "cc");
+ : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode)
+ : "cc", "0", "1");
}
static __always_inline int __cpacf_check_opcode(unsigned int opcode)
@@ -249,20 +248,22 @@ static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int fu
static inline int cpacf_km(unsigned long func, void *param,
u8 *dest, const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KM)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KM)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -279,20 +280,22 @@ static inline int cpacf_km(unsigned long func, void *param,
static inline int cpacf_kmc(unsigned long func, void *param,
u8 *dest, const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMC)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMC)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -306,17 +309,19 @@ static inline int cpacf_kmc(unsigned long func, void *param,
static inline void cpacf_kimd(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KIMD)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
+ [opc] "i" (CPACF_KIMD)
+ : "cc", "memory", "0", "1");
}
/**
@@ -329,17 +334,19 @@ static inline void cpacf_kimd(unsigned long func, void *param,
static inline void cpacf_klmd(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KLMD)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KLMD)
+ : "cc", "memory", "0", "1");
}
/**
@@ -355,19 +362,21 @@ static inline void cpacf_klmd(unsigned long func, void *param,
static inline int cpacf_kmac(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMAC)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMAC)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -385,22 +394,24 @@ static inline int cpacf_kmac(unsigned long func, void *param,
static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest,
const u8 *src, long src_len, u8 *counter)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
- register unsigned long r6 asm("6") = (unsigned long) counter;
+ union register_pair d, s, c;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
+ c.even = (unsigned long)counter;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3),
- [dst] "+a" (r4), [ctr] "+a" (r6)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMCTR)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair),
+ [ctr] "+&d" (c.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMCTR)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -417,20 +428,21 @@ static inline void cpacf_prno(unsigned long func, void *param,
u8 *dest, unsigned long dest_len,
const u8 *seed, unsigned long seed_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) dest;
- register unsigned long r3 asm("3") = (unsigned long) dest_len;
- register unsigned long r4 asm("4") = (unsigned long) seed;
- register unsigned long r5 asm("5") = (unsigned long) seed_len;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ d.odd = (unsigned long)dest_len;
+ s.even = (unsigned long)seed;
+ s.odd = (unsigned long)seed_len;
asm volatile (
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[seed]\n"
" brc 1,0b\n" /* handle partial completion */
- : [dst] "+a" (r2), [dlen] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1),
- [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO)
- : "cc", "memory");
+ : [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO)
+ : "cc", "memory", "0", "1");
}
/**
@@ -443,19 +455,19 @@ static inline void cpacf_prno(unsigned long func, void *param,
static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
u8 *cbuf, unsigned long cbuf_len)
{
- register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG;
- register unsigned long r2 asm("2") = (unsigned long) ucbuf;
- register unsigned long r3 asm("3") = (unsigned long) ucbuf_len;
- register unsigned long r4 asm("4") = (unsigned long) cbuf;
- register unsigned long r5 asm("5") = (unsigned long) cbuf_len;
+ union register_pair u, c;
+ u.even = (unsigned long)ucbuf;
+ u.odd = (unsigned long)ucbuf_len;
+ c.even = (unsigned long)cbuf;
+ c.odd = (unsigned long)cbuf_len;
asm volatile (
+ " lghi 0,%[fc]\n"
"0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n"
" brc 1,0b\n" /* handle partial completion */
- : [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3),
- [cbuf] "+a" (r4), [cbuflen] "+d" (r5)
- : [fc] "d" (r0), [opc] "i" (CPACF_PRNO)
- : "cc", "memory");
+ : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair)
+ : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO)
+ : "cc", "memory", "0");
}
/**
@@ -466,15 +478,15 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
*/
static inline void cpacf_pcc(unsigned long func, void *param)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
-
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */
" brc 1,0b\n" /* handle partial completion */
:
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCC)
- : "cc", "memory");
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_PCC)
+ : "cc", "memory", "0", "1");
}
/**
@@ -487,14 +499,14 @@ static inline void cpacf_pcc(unsigned long func, void *param)
*/
static inline void cpacf_pckmo(long func, void *param)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
-
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
" .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */
:
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO)
- : "cc", "memory");
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_PCKMO)
+ : "cc", "memory", "0", "1");
}
/**
@@ -512,21 +524,23 @@ static inline void cpacf_kma(unsigned long func, void *param, u8 *dest,
const u8 *src, unsigned long src_len,
const u8 *aad, unsigned long aad_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) aad;
- register unsigned long r5 asm("5") = (unsigned long) aad_len;
- register unsigned long r6 asm("6") = (unsigned long) dest;
+ union register_pair d, s, a;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
+ a.even = (unsigned long)aad;
+ a.odd = (unsigned long)aad_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n"
" brc 1,0b\n" /* handle partial completion */
- : [dst] "+a" (r6), [src] "+a" (r2), [slen] "+d" (r3),
- [aad] "+a" (r4), [alen] "+d" (r5)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMA)
- : "cc", "memory");
+ : [dst] "+&d" (d.pair), [src] "+&d" (s.pair),
+ [aad] "+&d" (a.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMA)
+ : "cc", "memory", "0", "1");
}
#endif /* _ASM_S390_CPACF_H */
diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h
index 62228a884e06..26c710cd3485 100644
--- a/arch/s390/include/asm/cpu.h
+++ b/arch/s390/include/asm/cpu.h
@@ -12,6 +12,7 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
+#include <linux/jump_label.h>
struct cpuid
{
@@ -21,5 +22,7 @@ struct cpuid
unsigned int unused : 16;
} __attribute__ ((packed, aligned(8)));
+DECLARE_STATIC_KEY_FALSE(cpu_has_bear);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_CPU_H */
diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
index 649b9fc60685..f87a4788c19c 100644
--- a/arch/s390/include/asm/cpu_mcf.h
+++ b/arch/s390/include/asm/cpu_mcf.h
@@ -24,47 +24,23 @@ enum cpumf_ctr_set {
#define CPUMF_LCCTL_ENABLE_SHIFT 16
#define CPUMF_LCCTL_ACTCTL_SHIFT 0
-static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
- [CPUMF_CTR_SET_BASIC] = 0x02,
- [CPUMF_CTR_SET_USER] = 0x04,
- [CPUMF_CTR_SET_CRYPTO] = 0x08,
- [CPUMF_CTR_SET_EXT] = 0x01,
- [CPUMF_CTR_SET_MT_DIAG] = 0x20,
-};
-
-static inline void ctr_set_enable(u64 *state, int ctr_set)
-{
- *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-static inline void ctr_set_disable(u64 *state, int ctr_set)
-{
- *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-static inline void ctr_set_start(u64 *state, int ctr_set)
-{
- *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-static inline void ctr_set_stop(u64 *state, int ctr_set)
-{
- *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
{
*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
}
-static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
{
*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
}
-static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
{
*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
}
-static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
{
*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
}
@@ -92,9 +68,15 @@ struct cpu_cf_events {
struct cpumf_ctr_info info;
atomic_t ctr_set[CPUMF_CTR_SET_MAX];
atomic64_t alert;
- u64 state, tx_state;
+ u64 state; /* For perf_event_open SVC */
+ u64 dev_state; /* For /dev/hwctr */
unsigned int flags;
- unsigned int txn_flags;
+ size_t used; /* Bytes used in data */
+ size_t usedss; /* Bytes used in start/stop */
+ unsigned char start[PAGE_SIZE]; /* Counter set at event add */
+ unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */
+ unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */
+ unsigned int sets; /* # Counter set saved in memory */
};
DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
@@ -123,4 +105,8 @@ static inline int stccm_avail(void)
return test_facility(142);
}
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+ struct cpumf_ctr_info *info);
+int cfset_online_cpu(unsigned int cpu);
+int cfset_offline_cpu(unsigned int cpu);
#endif /* _ASM_S390_CPU_MCF_H */
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 0d90cbeb89b4..feaba12dbecb 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -10,6 +10,7 @@
#define _ASM_S390_CPU_MF_H
#include <linux/errno.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
asm(".include \"asm/cpu_mf-insn.h\"\n");
@@ -109,7 +110,9 @@ struct hws_basic_entry {
unsigned int AS:2; /* 29-30 PSW address-space control */
unsigned int I:1; /* 31 entry valid or invalid */
unsigned int CL:2; /* 32-33 Configuration Level */
- unsigned int:14;
+ unsigned int H:1; /* 34 Host Indicator */
+ unsigned int LS:1; /* 35 Limited Sampling */
+ unsigned int:12;
unsigned int prim_asn:16; /* primary ASN */
unsigned long long ia; /* Instruction Address */
unsigned long long gpp; /* Guest Program Parameter */
@@ -157,7 +160,7 @@ struct hws_trailer_entry {
/* Load program parameter */
static inline void lpp(void *pp)
{
- asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory");
+ asm volatile("lpp 0(%0)\n" :: "a" (pp) : "memory");
}
/* Query counter information */
@@ -166,7 +169,7 @@ static inline int qctri(struct cpumf_ctr_info *info)
int rc = -EINVAL;
asm volatile (
- "0: .insn s,0xb28e0000,%1\n"
+ "0: qctri %1\n"
"1: lhi %0,0\n"
"2:\n"
EX_TABLE(1b, 2b)
@@ -180,7 +183,7 @@ static inline int lcctl(u64 ctl)
int cc;
asm volatile (
- " .insn s,0xb2840000,%1\n"
+ " lcctl %1\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc) : "Q" (ctl) : "cc");
@@ -194,7 +197,7 @@ static inline int __ecctr(u64 ctr, u64 *content)
int cc;
asm volatile (
- " .insn rre,0xb2e40000,%0,%2\n"
+ " ecctr %0,%2\n"
" ipm %1\n"
" srl %1,28\n"
: "=d" (_content), "=d" (cc) : "d" (ctr) : "cc");
@@ -244,7 +247,7 @@ static inline int qsi(struct hws_qsi_info_block *info)
int cc = 1;
asm volatile(
- "0: .insn s,0xb2860000,%1\n"
+ "0: qsi %1\n"
"1: lhi %0,0\n"
"2:\n"
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
@@ -259,7 +262,7 @@ static inline int lsctl(struct hws_lsctl_request_block *req)
cc = 1;
asm volatile(
- "0: .insn s,0xb2870000,0(%1)\n"
+ "0: lsctl 0(%1)\n"
"1: ipm %0\n"
" srl %0,28\n"
"2:\n"
diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h
index 1d007c6ede95..931204613753 100644
--- a/arch/s390/include/asm/cpufeature.h
+++ b/arch/s390/include/asm/cpufeature.h
@@ -2,28 +2,21 @@
/*
* Module interface for CPU features
*
- * Copyright IBM Corp. 2015
+ * Copyright IBM Corp. 2015, 2022
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
#ifndef __ASM_S390_CPUFEATURE_H
#define __ASM_S390_CPUFEATURE_H
-#include <asm/elf.h>
+enum {
+ S390_CPU_FEATURE_MSA,
+ S390_CPU_FEATURE_VXRS,
+ S390_CPU_FEATURE_UV,
+ MAX_CPU_FEATURES
+};
-/* Hardware features on Linux on z Systems are indicated by facility bits that
- * are mapped to the so-called machine flags. Particular machine flags are
- * then used to define ELF hardware capabilities; most notably hardware flags
- * that are essential for user space / glibc.
- *
- * Restrict the set of exposed CPU features to ELF hardware capabilities for
- * now. Additional machine flags can be indicated by values larger than
- * MAX_ELF_HWCAP_FEATURES.
- */
-#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap))
-#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES
-
-#define cpu_feature(feat) ilog2(HWCAP_S390_ ## feat)
+#define cpu_feature(feature) (feature)
int cpu_have_feature(unsigned int nr);
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index cb729d111e20..1d389847b588 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -35,4 +35,6 @@ u64 arch_cpu_idle_time(int cpu);
#define arch_idle_time(cpu) arch_cpu_idle_time(cpu)
+void account_idle_time_irq(void);
+
#endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/include/asm/crw.h b/arch/s390/include/asm/crw.h
index c6ebfd31f1db..97456d98fe76 100644
--- a/arch/s390/include/asm/crw.h
+++ b/arch/s390/include/asm/crw.h
@@ -5,7 +5,6 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef _ASM_S390_CRW_H
diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h
index 480bb02ccacd..638137d46c85 100644
--- a/arch/s390/include/asm/css_chars.h
+++ b/arch/s390/include/asm/css_chars.h
@@ -36,7 +36,9 @@ struct css_general_char {
u64 alt_ssi : 1; /* bit 108 */
u64 : 1;
u64 narf : 1; /* bit 110 */
- u64 : 12;
+ u64 : 5;
+ u64 enarf: 1; /* bit 116 */
+ u64 : 6;
u64 util_str : 1;/* bit 123 */
} __packed;
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index ed5efbb531c4..adf7d8cdac7e 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -12,6 +12,8 @@
#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10)
#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35)
+#define CR0_FETCH_PROTECTION_OVERRIDE BIT(63 - 38)
+#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(63 - 39)
#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49)
#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50)
#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52)
@@ -21,8 +23,6 @@
#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57)
#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58)
-#define CR2_GUARDED_STORAGE BIT(63 - 59)
-
#define CR14_UNUSED_32 BIT(63 - 32)
#define CR14_UNUSED_33 BIT(63 - 33)
#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35)
@@ -74,8 +74,17 @@ static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
__ctl_load(reg, cr, cr);
}
-void smp_ctl_set_bit(int cr, int bit);
-void smp_ctl_clear_bit(int cr, int bit);
+void smp_ctl_set_clear_bit(int cr, int bit, bool set);
+
+static inline void ctl_set_bit(int cr, int bit)
+{
+ smp_ctl_set_clear_bit(cr, bit, true);
+}
+
+static inline void ctl_clear_bit(int cr, int bit)
+{
+ smp_ctl_set_clear_bit(cr, bit, false);
+}
union ctlreg0 {
unsigned long val;
@@ -84,7 +93,10 @@ union ctlreg0 {
unsigned long tcx : 1; /* Transactional-Execution control */
unsigned long pifo : 1; /* Transactional-Execution Program-
Interruption-Filtering Override */
- unsigned long : 22;
+ unsigned long : 3;
+ unsigned long ccc : 1; /* Cryptography counter control */
+ unsigned long pec : 1; /* PAI extension control */
+ unsigned long : 17;
unsigned long : 3;
unsigned long lap : 1; /* Low-address-protection control */
unsigned long : 4;
@@ -113,8 +125,22 @@ union ctlreg2 {
};
};
-#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit)
-#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit)
+union ctlreg5 {
+ unsigned long val;
+ struct {
+ unsigned long : 33;
+ unsigned long pasteo: 25;
+ unsigned long : 6;
+ };
+};
+
+union ctlreg15 {
+ unsigned long val;
+ struct {
+ unsigned long lsea : 61;
+ unsigned long : 3;
+ };
+};
#endif /* __ASSEMBLY__ */
#endif /* __ASM_CTL_REG_H */
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 310134015541..77f24262c25c 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -2,7 +2,7 @@
/*
* S/390 debug facility
*
- * Copyright IBM Corp. 1999, 2000
+ * Copyright IBM Corp. 1999, 2020
*/
#ifndef DEBUG_H
#define DEBUG_H
@@ -12,7 +12,8 @@
#include <linux/kernel.h>
#include <linux/time.h>
#include <linux/refcount.h>
-#include <uapi/asm/debug.h>
+#include <linux/fs.h>
+#include <linux/init.h>
#define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */
#define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */
@@ -26,6 +27,16 @@
#define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */
/* the entry information */
+#define __DEBUG_FEATURE_VERSION 3 /* version of debug feature */
+
+struct __debug_entry {
+ unsigned long clock : 60;
+ unsigned long exception : 1;
+ unsigned long level : 3;
+ void *caller;
+ unsigned short cpu;
+} __packed;
+
typedef struct __debug_entry debug_entry_t;
struct debug_view;
@@ -82,7 +93,6 @@ struct debug_view {
};
extern struct debug_view debug_hex_ascii_view;
-extern struct debug_view debug_raw_view;
extern struct debug_view debug_sprintf_view;
/* do NOT use the _common functions */
@@ -382,38 +392,99 @@ int debug_register_view(debug_info_t *id, struct debug_view *view);
int debug_unregister_view(debug_info_t *id, struct debug_view *view);
+#ifndef MODULE
+
/*
- define the debug levels:
- - 0 No debugging output to console or syslog
- - 1 Log internal errors to syslog, ignore check conditions
- - 2 Log internal errors and check conditions to syslog
- - 3 Log internal errors to console, log check conditions to syslog
- - 4 Log internal errors and check conditions to console
- - 5 panic on internal errors, log check conditions to console
- - 6 panic on both, internal errors and check conditions
+ * Note: Initial page and area numbers must be fixed to allow static
+ * initialization. This enables very early tracing. Changes to these values
+ * must be reflected in __DEFINE_STATIC_AREA.
*/
+#define EARLY_PAGES 8
+#define EARLY_AREAS 1
+
+#define VNAME(var, suffix) __##var##_##suffix
+
+/*
+ * Define static areas for early trace data. During boot debug_register_static()
+ * will replace these with dynamically allocated areas to allow custom page and
+ * area sizes, and dynamic resizing.
+ */
+#define __DEFINE_STATIC_AREA(var) \
+static char VNAME(var, data)[EARLY_PAGES][PAGE_SIZE] __initdata; \
+static debug_entry_t *VNAME(var, pages)[EARLY_PAGES] __initdata = { \
+ (debug_entry_t *)VNAME(var, data)[0], \
+ (debug_entry_t *)VNAME(var, data)[1], \
+ (debug_entry_t *)VNAME(var, data)[2], \
+ (debug_entry_t *)VNAME(var, data)[3], \
+ (debug_entry_t *)VNAME(var, data)[4], \
+ (debug_entry_t *)VNAME(var, data)[5], \
+ (debug_entry_t *)VNAME(var, data)[6], \
+ (debug_entry_t *)VNAME(var, data)[7], \
+}; \
+static debug_entry_t **VNAME(var, areas)[EARLY_AREAS] __initdata = { \
+ (debug_entry_t **)VNAME(var, pages), \
+}; \
+static int VNAME(var, active_pages)[EARLY_AREAS] __initdata; \
+static int VNAME(var, active_entries)[EARLY_AREAS] __initdata
+
+#define __DEBUG_INFO_INIT(var, _name, _buf_size) { \
+ .next = NULL, \
+ .prev = NULL, \
+ .ref_count = REFCOUNT_INIT(1), \
+ .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
+ .level = DEBUG_DEFAULT_LEVEL, \
+ .nr_areas = EARLY_AREAS, \
+ .pages_per_area = EARLY_PAGES, \
+ .buf_size = (_buf_size), \
+ .entry_size = sizeof(debug_entry_t) + (_buf_size), \
+ .areas = VNAME(var, areas), \
+ .active_area = 0, \
+ .active_pages = VNAME(var, active_pages), \
+ .active_entries = VNAME(var, active_entries), \
+ .debugfs_root_entry = NULL, \
+ .debugfs_entries = { NULL }, \
+ .views = { NULL }, \
+ .name = (_name), \
+ .mode = 0600, \
+}
+
+#define __REGISTER_STATIC_DEBUG_INFO(var, name, pages, areas, view) \
+static int __init VNAME(var, reg)(void) \
+{ \
+ debug_register_static(&var, (pages), (areas)); \
+ debug_register_view(&var, (view)); \
+ return 0; \
+} \
+arch_initcall(VNAME(var, reg))
+
+/**
+ * DEFINE_STATIC_DEBUG_INFO - Define static debug_info_t
+ *
+ * @var: Name of debug_info_t variable
+ * @name: Name of debug log (e.g. used for debugfs entry)
+ * @pages: Number of pages per area
+ * @nr_areas: Number of debug areas
+ * @buf_size: Size of data area in each debug entry
+ * @view: Pointer to debug view struct
+ *
+ * Define a static debug_info_t for early tracing. The associated debugfs log
+ * is automatically registered with the specified debug view.
+ *
+ * Important: Users of this macro must not call any of the
+ * debug_register/_unregister() functions for this debug_info_t!
+ *
+ * Note: Tracing will start with a fixed number of initial pages and areas.
+ * The debug area will be changed to use the specified numbers during
+ * arch_initcall.
+ */
+#define DEFINE_STATIC_DEBUG_INFO(var, name, pages, nr_areas, buf_size, view) \
+__DEFINE_STATIC_AREA(var); \
+static debug_info_t __refdata var = \
+ __DEBUG_INFO_INIT(var, (name), (buf_size)); \
+__REGISTER_STATIC_DEBUG_INFO(var, name, pages, nr_areas, view)
+
+void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas);
-#ifndef DEBUG_LEVEL
-#define DEBUG_LEVEL 4
-#endif
-
-#define INTERNAL_ERRMSG(x,y...) "E" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_WRNMSG(x,y...) "W" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_INFMSG(x,y...) "I" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_DEBMSG(x,y...) "D" __FILE__ "%d: " x, __LINE__, y
-
-#if DEBUG_LEVEL > 0
-#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_INFO(x...) printk(KERN_INFO PRINTK_HEADER x)
-#define PRINT_WARN(x...) printk(KERN_WARNING PRINTK_HEADER x)
-#define PRINT_ERR(x...) printk(KERN_ERR PRINTK_HEADER x)
-#define PRINT_FATAL(x...) panic(PRINTK_HEADER x)
-#else
-#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_INFO(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_WARN(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_ERR(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_FATAL(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#endif /* DASD_DEBUG */
+#endif /* MODULE */
#endif /* DEBUG_H */
diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h
index 898323fd93d2..21a8fe18fe66 100644
--- a/arch/s390/include/asm/delay.h
+++ b/arch/s390/include/asm/delay.h
@@ -13,13 +13,12 @@
#ifndef _S390_DELAY_H
#define _S390_DELAY_H
-void __ndelay(unsigned long long nsecs);
-void __udelay(unsigned long long usecs);
-void udelay_simple(unsigned long long usecs);
+void __ndelay(unsigned long nsecs);
+void __udelay(unsigned long usecs);
void __delay(unsigned long loops);
-#define ndelay(n) __ndelay((unsigned long long) (n))
-#define udelay(n) __udelay((unsigned long long) (n))
-#define mdelay(n) __udelay((unsigned long long) (n) * 1000)
+#define ndelay(n) __ndelay((unsigned long)(n))
+#define udelay(n) __udelay((unsigned long)(n))
+#define mdelay(n) __udelay((unsigned long)(n) * 1000)
#endif /* defined(_S390_DELAY_H) */
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 0036eab14391..56e99c286d12 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -11,6 +11,7 @@
#include <linux/if_ether.h>
#include <linux/percpu.h>
+#include <asm/asm-extable.h>
enum diag_stat_enum {
DIAG_STAT_X008,
@@ -47,8 +48,8 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn)
{
unsigned long start_addr, end_addr;
- start_addr = start_pfn << PAGE_SHIFT;
- end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT;
+ start_addr = pfn_to_phys(start_pfn);
+ end_addr = pfn_to_phys(start_pfn + num_pfn - 1);
diag_stat_inc(DIAG_STAT_X010);
asm volatile(
@@ -298,10 +299,8 @@ struct diag26c_mac_resp {
union diag318_info {
unsigned long val;
struct {
- unsigned int cpnc : 8;
- unsigned int cpvc_linux : 24;
- unsigned char cpvc_distro[3];
- unsigned char zero;
+ unsigned long cpnc : 8;
+ unsigned long cpvc : 56;
};
};
@@ -311,6 +310,10 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode);
struct hypfs_diag0c_entry;
+/*
+ * This structure must contain only pointers/references into
+ * the AMODE31 text section.
+ */
struct diag_ops {
int (*diag210)(struct diag210 *addr);
int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode);
@@ -319,6 +322,13 @@ struct diag_ops {
void (*diag308_reset)(void);
};
-extern struct diag_ops diag_dma_ops;
-extern struct diag210 *__diag210_tmp_dma;
+extern struct diag_ops diag_amode31_ops;
+extern struct diag210 *__diag210_tmp_amode31;
+
+int _diag210_amode31(struct diag210 *addr);
+int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode);
+int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode);
+void _diag0c_amode31(struct hypfs_diag0c_entry *entry);
+void _diag308_reset_amode31(void);
+
#endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h
index 6f26f35d4a71..dec1c4ce628c 100644
--- a/arch/s390/include/asm/dma.h
+++ b/arch/s390/include/asm/dma.h
@@ -11,10 +11,4 @@
*/
#define MAX_DMA_ADDRESS 0x80000000
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy (0)
-#endif
-
#endif /* _ASM_S390_DMA_H */
diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h
index bb63b2afdf6f..06f795855af7 100644
--- a/arch/s390/include/asm/eadm.h
+++ b/arch/s390/include/asm/eadm.h
@@ -78,7 +78,7 @@ struct aob {
struct aob_rq_header {
struct scm_device *scmdev;
- char data[0];
+ char data[];
};
struct scm_device {
@@ -105,7 +105,7 @@ enum scm_event {SCM_CHANGE, SCM_AVAIL};
struct scm_driver {
struct device_driver drv;
int (*probe) (struct scm_device *scmdev);
- int (*remove) (struct scm_device *scmdev);
+ void (*remove) (struct scm_device *scmdev);
void (*notify) (struct scm_device *scmdev, enum scm_event event);
void (*handler) (struct scm_device *scmdev, void *data,
blk_status_t error);
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 5775fc22f410..70a30ae258b7 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -91,29 +91,57 @@
/* Keep this the last entry. */
#define R_390_NUM 61
-/* Bits present in AT_HWCAP. */
-#define HWCAP_S390_ESAN3 1
-#define HWCAP_S390_ZARCH 2
-#define HWCAP_S390_STFLE 4
-#define HWCAP_S390_MSA 8
-#define HWCAP_S390_LDISP 16
-#define HWCAP_S390_EIMM 32
-#define HWCAP_S390_DFP 64
-#define HWCAP_S390_HPAGE 128
-#define HWCAP_S390_ETF3EH 256
-#define HWCAP_S390_HIGH_GPRS 512
-#define HWCAP_S390_TE 1024
-#define HWCAP_S390_VXRS 2048
-#define HWCAP_S390_VXRS_BCD 4096
-#define HWCAP_S390_VXRS_EXT 8192
-#define HWCAP_S390_GS 16384
-#define HWCAP_S390_VXRS_EXT2 32768
-#define HWCAP_S390_VXRS_PDE 65536
-#define HWCAP_S390_SORT 131072
-#define HWCAP_S390_DFLT 262144
+enum {
+ HWCAP_NR_ESAN3 = 0,
+ HWCAP_NR_ZARCH = 1,
+ HWCAP_NR_STFLE = 2,
+ HWCAP_NR_MSA = 3,
+ HWCAP_NR_LDISP = 4,
+ HWCAP_NR_EIMM = 5,
+ HWCAP_NR_DFP = 6,
+ HWCAP_NR_HPAGE = 7,
+ HWCAP_NR_ETF3EH = 8,
+ HWCAP_NR_HIGH_GPRS = 9,
+ HWCAP_NR_TE = 10,
+ HWCAP_NR_VXRS = 11,
+ HWCAP_NR_VXRS_BCD = 12,
+ HWCAP_NR_VXRS_EXT = 13,
+ HWCAP_NR_GS = 14,
+ HWCAP_NR_VXRS_EXT2 = 15,
+ HWCAP_NR_VXRS_PDE = 16,
+ HWCAP_NR_SORT = 17,
+ HWCAP_NR_DFLT = 18,
+ HWCAP_NR_VXRS_PDE2 = 19,
+ HWCAP_NR_NNPA = 20,
+ HWCAP_NR_PCI_MIO = 21,
+ HWCAP_NR_SIE = 22,
+ HWCAP_NR_MAX
+};
-/* Internal bits, not exposed via elf */
-#define HWCAP_INT_SIE 1UL
+/* Bits present in AT_HWCAP. */
+#define HWCAP_ESAN3 BIT(HWCAP_NR_ESAN3)
+#define HWCAP_ZARCH BIT(HWCAP_NR_ZARCH)
+#define HWCAP_STFLE BIT(HWCAP_NR_STFLE)
+#define HWCAP_MSA BIT(HWCAP_NR_MSA)
+#define HWCAP_LDISP BIT(HWCAP_NR_LDISP)
+#define HWCAP_EIMM BIT(HWCAP_NR_EIMM)
+#define HWCAP_DFP BIT(HWCAP_NR_DFP)
+#define HWCAP_HPAGE BIT(HWCAP_NR_HPAGE)
+#define HWCAP_ETF3EH BIT(HWCAP_NR_ETF3EH)
+#define HWCAP_HIGH_GPRS BIT(HWCAP_NR_HIGH_GPRS)
+#define HWCAP_TE BIT(HWCAP_NR_TE)
+#define HWCAP_VXRS BIT(HWCAP_NR_VXRS)
+#define HWCAP_VXRS_BCD BIT(HWCAP_NR_VXRS_BCD)
+#define HWCAP_VXRS_EXT BIT(HWCAP_NR_VXRS_EXT)
+#define HWCAP_GS BIT(HWCAP_NR_GS)
+#define HWCAP_VXRS_EXT2 BIT(HWCAP_NR_VXRS_EXT2)
+#define HWCAP_VXRS_PDE BIT(HWCAP_NR_VXRS_PDE)
+#define HWCAP_SORT BIT(HWCAP_NR_SORT)
+#define HWCAP_DFLT BIT(HWCAP_NR_DFLT)
+#define HWCAP_VXRS_PDE2 BIT(HWCAP_NR_VXRS_PDE2)
+#define HWCAP_NNPA BIT(HWCAP_NR_NNPA)
+#define HWCAP_PCI_MIO BIT(HWCAP_NR_PCI_MIO)
+#define HWCAP_SIE BIT(HWCAP_NR_SIE)
/*
* These are used to set parameters in the core dumps.
@@ -144,10 +172,6 @@ typedef s390_compat_regs compat_elf_gregset_t;
#include <linux/sched/mm.h> /* for task_struct */
#include <asm/mmu_context.h>
-#include <asm/vdso.h>
-
-extern unsigned int vdso_enabled;
-
/*
* This is used to ensure we don't load something for the wrong architecture.
*/
@@ -176,7 +200,7 @@ struct arch_elf_state {
!current->mm->context.alloc_pgste) { \
set_thread_flag(TIF_PGSTE); \
set_pt_regs_flag(task_pt_regs(current), \
- PIF_SYSCALL_RESTART); \
+ PIF_EXECVE_PGSTE_RESTART); \
_state->rc = -EAGAIN; \
} \
_state->rc; \
@@ -213,10 +237,6 @@ struct arch_elf_state {
extern unsigned long elf_hwcap;
#define ELF_HWCAP (elf_hwcap)
-/* Internal hardware capabilities, not exposed via elf */
-
-extern unsigned long int_hwcap;
-
/* This yields a string that ld.so will use to load implementation
specific libraries for optimization. This is more specific in
intent than poking at uname or /proc/cpuinfo.
@@ -233,8 +253,7 @@ extern char elf_platform[];
do { \
set_personality(PER_LINUX | \
(current->personality & (~PER_MASK))); \
- current->thread.sys_call_table = \
- (unsigned long) &sys_call_table; \
+ current->thread.sys_call_table = sys_call_table; \
} while (0)
#else /* CONFIG_COMPAT */
#define SET_PERSONALITY(ex) \
@@ -245,11 +264,11 @@ do { \
if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \
set_thread_flag(TIF_31BIT); \
current->thread.sys_call_table = \
- (unsigned long) &sys_call_table_emu; \
+ sys_call_table_emu; \
} else { \
clear_thread_flag(TIF_31BIT); \
current->thread.sys_call_table = \
- (unsigned long) &sys_call_table; \
+ sys_call_table; \
} \
} while (0)
#endif /* CONFIG_COMPAT */
@@ -269,11 +288,10 @@ do { \
#define STACK_RND_MASK MMAP_RND_MASK
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO \
-do { \
- if (vdso_enabled) \
- NEW_AUX_ENT(AT_SYSINFO_EHDR, \
- (unsigned long)current->mm->context.vdso_base); \
+#define ARCH_DLINFO \
+do { \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long)current->mm->context.vdso_base); \
} while (0)
struct linux_binprm;
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
new file mode 100644
index 000000000000..000de2b1e67a
--- /dev/null
+++ b/arch/s390/include/asm/entry-common.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_S390_ENTRY_COMMON_H
+#define ARCH_S390_ENTRY_COMMON_H
+
+#include <linux/sched.h>
+#include <linux/audit.h>
+#include <linux/randomize_kstack.h>
+#include <linux/processor.h>
+#include <linux/uaccess.h>
+#include <asm/timex.h>
+#include <asm/fpu/api.h>
+#include <asm/pai.h>
+
+#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
+
+void do_per_trap(struct pt_regs *regs);
+
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ debug_user_asce(0);
+
+ pai_kernel_enter(regs);
+}
+
+#define arch_enter_from_user_mode arch_enter_from_user_mode
+
+static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ if (ti_work & _TIF_PER_TRAP) {
+ clear_thread_flag(TIF_PER_TRAP);
+ do_per_trap(regs);
+ }
+
+ if (ti_work & _TIF_GUARDED_STORAGE)
+ gs_load_bc_cb(regs);
+}
+
+#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work
+
+static __always_inline void arch_exit_to_user_mode(void)
+{
+ if (test_cpu_flag(CIF_FPU))
+ __load_fpu_regs();
+
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ debug_user_asce(1);
+
+ pai_kernel_exit(current_pt_regs());
+}
+
+#define arch_exit_to_user_mode arch_exit_to_user_mode
+
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ choose_random_kstack_offset(get_tod_clock_fast() & 0xff);
+}
+
+#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+
+static inline bool on_thread_stack(void)
+{
+ return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1));
+}
+
+#endif
diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h
index ae27f756b409..af6ba52743e9 100644
--- a/arch/s390/include/asm/extable.h
+++ b/arch/s390/include/asm/extable.h
@@ -1,12 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __S390_EXTABLE_H
#define __S390_EXTABLE_H
+
+#include <asm/ptrace.h>
+#include <linux/compiler.h>
+
/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of three addresses:
+ *
+ * - Address of an instruction that is allowed to fault.
+ * - Address at which the program should continue.
+ * - Optional address of handler that takes pt_regs * argument and runs in
+ * interrupt context.
+ *
+ * No registers are modified, so it is entirely up to the continuation code
+ * to figure out what to do.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -17,10 +25,11 @@
struct exception_table_entry
{
int insn, fixup;
+ short type, data;
};
-extern struct exception_table_entry *__start_dma_ex_table;
-extern struct exception_table_entry *__stop_dma_ex_table;
+extern struct exception_table_entry *__start_amode31_ex_table;
+extern struct exception_table_entry *__stop_amode31_ex_table;
const struct exception_table_entry *s390_search_extables(unsigned long addr);
@@ -31,4 +40,33 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
#define ARCH_HAS_RELATIVE_EXTABLE
+static inline void swap_ex_entry_fixup(struct exception_table_entry *a,
+ struct exception_table_entry *b,
+ struct exception_table_entry tmp,
+ int delta)
+{
+ a->fixup = b->fixup + delta;
+ b->fixup = tmp.fixup - delta;
+ a->type = b->type;
+ b->type = tmp.type;
+ a->data = b->data;
+ b->data = tmp.data;
+}
+#define swap_ex_entry_fixup swap_ex_entry_fixup
+
+#ifdef CONFIG_BPF_JIT
+
+bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs);
+
+#else /* !CONFIG_BPF_JIT */
+
+static inline bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ return false;
+}
+
+#endif /* CONFIG_BPF_JIT */
+
+bool fixup_exception(struct pt_regs *regs);
+
#endif
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index 68c476b20b57..94b6919026df 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -9,11 +9,18 @@
#define __ASM_FACILITY_H
#include <asm/facility-defs.h>
+
+#include <linux/minmax.h>
#include <linux/string.h>
+#include <linux/types.h>
#include <linux/preempt.h>
+
#include <asm/lowcore.h>
-#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8)
+#define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8)
+
+extern u64 stfle_fac_list[16];
+extern u64 alt_stfle_fac_list[16];
static inline void __set_facility(unsigned long nr, void *facilities)
{
@@ -44,7 +51,7 @@ static inline int __test_facility(unsigned long nr, void *facilities)
}
/*
- * The test_facility function uses the bit odering where the MSB is bit 0.
+ * The test_facility function uses the bit ordering where the MSB is bit 0.
* That makes it easier to query facility bits with the bit number as
* documented in the Principles of Operation.
*/
@@ -56,18 +63,20 @@ static inline int test_facility(unsigned long nr)
if (__test_facility(nr, &facilities_als))
return 1;
}
- return __test_facility(nr, &S390_lowcore.stfle_fac_list);
+ return __test_facility(nr, &stfle_fac_list);
}
static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
{
- register unsigned long reg0 asm("0") = size - 1;
+ unsigned long reg0 = size - 1;
asm volatile(
- ".insn s,0xb2b00000,0(%1)" /* stfle */
- : "+d" (reg0)
- : "a" (stfle_fac_list)
- : "memory", "cc");
+ " lgr 0,%[reg0]\n"
+ " .insn s,0xb2b00000,%[list]\n" /* stfle */
+ " lgr %[reg0],0\n"
+ : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list)
+ :
+ : "memory", "cc", "0");
return reg0;
}
@@ -79,13 +88,15 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
static inline void __stfle(u64 *stfle_fac_list, int size)
{
unsigned long nr;
+ u32 stfl_fac_list;
asm volatile(
" stfl 0(0)\n"
: "=m" (S390_lowcore.stfl_fac_list));
+ stfl_fac_list = S390_lowcore.stfl_fac_list;
+ memcpy(stfle_fac_list, &stfl_fac_list, 4);
nr = 4; /* bytes stored by stfl */
- memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4);
- if (S390_lowcore.stfl_fac_list & 0x01000000) {
+ if (stfl_fac_list & 0x01000000) {
/* More facility bits available with stfle */
nr = __stfle_asm(stfle_fac_list, size);
nr = min_t(unsigned long, (nr + 1) * 8, size * 8);
diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h
index cff0749e9657..b8a028a36173 100644
--- a/arch/s390/include/asm/fcx.h
+++ b/arch/s390/include/asm/fcx.h
@@ -214,7 +214,7 @@ struct dcw_intrg_data {
u32 :32;
u64 time;
u64 prog_id;
- u8 prog_data[0];
+ u8 prog_data[];
} __attribute__ ((packed));
#define DCW_FLAGS_CC (1 << (7 - 1))
@@ -241,7 +241,7 @@ struct dcw {
u32 :8;
u32 cd_count:8;
u32 count;
- u8 cd[0];
+ u8 cd[];
} __attribute__ ((packed));
#define TCCB_FORMAT_DEFAULT 0x7f
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
index 34a7ae68485c..b714ed0ef688 100644
--- a/arch/s390/include/asm/fpu/api.h
+++ b/arch/s390/include/asm/fpu/api.h
@@ -45,8 +45,11 @@
#define _ASM_S390_FPU_API_H
#include <linux/preempt.h>
+#include <asm/asm-extable.h>
void save_fpu_regs(void);
+void load_fpu_regs(void);
+void __load_fpu_regs(void);
static inline int test_fp_ctl(u32 fpc)
{
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 68d362f8d6c1..6f80ec9c04be 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -2,16 +2,9 @@
#ifndef _ASM_S390_FTRACE_H
#define _ASM_S390_FTRACE_H
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
#define ARCH_SUPPORTS_FTRACE_OPS 1
-
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
#define MCOUNT_INSN_SIZE 6
-#else
-#define MCOUNT_INSN_SIZE 24
-#define MCOUNT_RETURN_FIXUP 18
-#endif
-
-#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
#ifndef __ASSEMBLY__
@@ -22,70 +15,61 @@
#define ftrace_return_address(n) __builtin_return_address(n)
#endif
-void _mcount(void);
void ftrace_caller(void);
-extern char ftrace_graph_caller_end;
-extern unsigned long ftrace_plt;
+extern void *ftrace_func;
struct dyn_arch_ftrace { };
-#define MCOUNT_ADDR ((unsigned long)_mcount)
+#define MCOUNT_ADDR 0
#define FTRACE_ADDR ((unsigned long)ftrace_caller)
#define KPROBE_ON_FTRACE_NOP 0
#define KPROBE_ON_FTRACE_CALL 1
+struct module;
+struct dyn_ftrace;
+
+bool ftrace_need_init_nop(void);
+#define ftrace_need_init_nop ftrace_need_init_nop
+
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
+#define ftrace_init_nop ftrace_init_nop
+
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
return addr;
}
-struct ftrace_insn {
- u16 opc;
- s32 disp;
-} __packed;
+struct ftrace_regs {
+ struct pt_regs regs;
+};
-static inline void ftrace_generate_nop_insn(struct ftrace_insn *insn)
+static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
{
-#ifdef CONFIG_FUNCTION_TRACER
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- /* brcl 0,0 */
- insn->opc = 0xc004;
- insn->disp = 0;
-#else
- /* jg .+24 */
- insn->opc = 0xc0f4;
- insn->disp = MCOUNT_INSN_SIZE / 2;
-#endif
-#endif
+ struct pt_regs *regs = &fregs->regs;
+
+ if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS))
+ return regs;
+ return NULL;
}
-static inline int is_ftrace_nop(struct ftrace_insn *insn)
+static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs,
+ unsigned long ip)
{
-#ifdef CONFIG_FUNCTION_TRACER
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- if (insn->disp == 0)
- return 1;
-#else
- if (insn->disp == MCOUNT_INSN_SIZE / 2)
- return 1;
-#endif
-#endif
- return 0;
+ fregs->regs.psw.addr = ip;
}
-static inline void ftrace_generate_call_insn(struct ftrace_insn *insn,
- unsigned long ip)
+/*
+ * When an ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this,
+ * place the direct caller in the ORIG_GPR2 part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_FUNCTION_TRACER
- unsigned long target;
-
- /* brasl r0,ftrace_caller */
- target = is_module_addr((void *) ip) ? ftrace_plt : FTRACE_ADDR;
- insn->opc = 0xc005;
- insn->disp = (target - ip) / 2;
-#endif
+ regs->orig_gpr2 = addr;
}
/*
@@ -114,4 +98,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym,
}
#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */
+
+#ifndef CC_USING_HOTPATCH
+
+#define FTRACE_GEN_MCOUNT_RECORD(name) \
+ .section __mcount_loc, "a", @progbits; \
+ .quad name; \
+ .previous;
+
+#else /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_MCOUNT_RECORD(name)
+
+#endif /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_NOP_ASM(name) \
+ FTRACE_GEN_MCOUNT_RECORD(name) \
+ FTRACE_NOP_INSN
+
+#else /* CONFIG_FUNCTION_TRACER */
+
+#define FTRACE_GEN_NOP_ASM(name)
+
+#endif /* CONFIG_FUNCTION_TRACER */
+
#endif /* _ASM_S390_FTRACE_H */
diff --git a/arch/s390/include/asm/ftrace.lds.h b/arch/s390/include/asm/ftrace.lds.h
new file mode 100644
index 000000000000..968adfd41240
--- /dev/null
+++ b/arch/s390/include/asm/ftrace.lds.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef DIV_ROUND_UP
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+
+#define SIZEOF_MCOUNT_LOC_ENTRY 8
+#define SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE 24
+#define FTRACE_HOTPATCH_TRAMPOLINES_SIZE(n) \
+ DIV_ROUND_UP(SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE * (n), \
+ SIZEOF_MCOUNT_LOC_ENTRY)
+
+#ifdef CONFIG_FUNCTION_TRACER
+#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT \
+ . = ALIGN(8); \
+ __ftrace_hotpatch_trampolines_start = .; \
+ . = . + FTRACE_HOTPATCH_TRAMPOLINES_SIZE(__stop_mcount_loc - \
+ __start_mcount_loc); \
+ __ftrace_hotpatch_trampolines_end = .;
+#else
+#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT
+#endif
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 5e97a4353147..eaeaeb3ff0be 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -4,6 +4,7 @@
#include <linux/uaccess.h>
#include <linux/futex.h>
+#include <asm/asm-extable.h>
#include <asm/mmu_context.h>
#include <asm/errno.h>
@@ -16,7 +17,8 @@
"3: jl 1b\n" \
" lhi %0,0\n" \
"4: sacf 768\n" \
- EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \
+ EX_TABLE(0b,4b) EX_TABLE(1b,4b) \
+ EX_TABLE(2b,4b) EX_TABLE(3b,4b) \
: "=d" (ret), "=&d" (oldval), "=&d" (newval), \
"=m" (*uaddr) \
: "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
@@ -26,10 +28,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
u32 __user *uaddr)
{
int oldval = 0, newval, ret;
- mm_segment_t old_fs;
- old_fs = enable_sacf_uaccess();
- pagefault_disable();
switch (op) {
case FUTEX_OP_SET:
__futex_atomic_op("lr %2,%5\n",
@@ -54,8 +53,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
default:
ret = -ENOSYS;
}
- pagefault_enable();
- disable_sacf_uaccess(old_fs);
if (!ret)
*oval = oldval;
@@ -66,10 +63,8 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
u32 oldval, u32 newval)
{
- mm_segment_t old_fs;
int ret;
- old_fs = enable_sacf_uaccess();
asm volatile(
" sacf 256\n"
"0: cs %1,%4,0(%5)\n"
@@ -79,7 +74,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: "=d" (ret), "+d" (oldval), "=m" (*uaddr)
: "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
: "cc", "memory");
- disable_sacf_uaccess(old_fs);
*uval = oldval;
return ret;
}
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 37f96b6f0e61..5cc46e0dde62 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -9,6 +9,7 @@
#ifndef _ASM_S390_GMAP_H
#define _ASM_S390_GMAP_H
+#include <linux/radix-tree.h>
#include <linux/refcount.h>
/* Generic bits for GMAP notification on DAT table entry changes. */
@@ -31,6 +32,7 @@
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
+ * @guest_handle: protected virtual machine handle for the ultravisor
* @host_to_rmap: radix tree with gmap_rmap lists
* @children: list of shadow gmap structures
* @pt_list: list of all page tables used in the shadow guest address space
@@ -54,6 +56,8 @@ struct gmap {
unsigned long asce_end;
void *private;
bool pfault_enabled;
+ /* only set for protected virtual machines */
+ unsigned long guest_handle;
/* Additional data for shadow guest address spaces */
struct radix_tree_root host_to_rmap;
struct list_head children;
@@ -136,12 +140,49 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
void gmap_register_pte_notifier(struct gmap_notifier *);
void gmap_unregister_pte_notifier(struct gmap_notifier *);
-void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
- unsigned long bits);
int gmap_mprotect_notify(struct gmap *, unsigned long start,
unsigned long len, int prot);
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
+int gmap_mark_unmergeable(void);
+void s390_unlist_old_asce(struct gmap *gmap);
+int s390_replace_asce(struct gmap *gmap);
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible);
+
+/**
+ * s390_uv_destroy_range - Destroy a range of pages in the given mm.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls, but
+ * it will otherwise only return once it has completed.
+ */
+static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ (void)__s390_uv_destroy_range(mm, start, end, false);
+}
+
+/**
+ * s390_uv_destroy_range_interruptible - Destroy a range of pages in the
+ * given mm, but stop when a fatal signal is received.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls. If
+ * a fatal signal is received, it will return with -EINTR immediately,
+ * without finishing destroying the whole range. Upon successful
+ * completion, 0 is returned.
+ */
+static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ return __s390_uv_destroy_range(mm, start, end, true);
+}
#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index dfbc3c6c0674..58668ffb5488 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,6 @@
#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x))
#define __ARCH_IRQ_STAT
-#define __ARCH_HAS_DO_SOFTIRQ
#define __ARCH_IRQ_EXIT_IRQS_DISABLED
static inline void ack_bad_irq(unsigned int irq)
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index de8f0bf5f238..ccdbccfde148 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -9,8 +9,8 @@
#ifndef _ASM_S390_HUGETLB_H
#define _ASM_S390_HUGETLB_H
+#include <linux/pgtable.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#define hugetlb_free_pgd_range free_pgd_range
#define hugepages_supported() (MACHINE_HAS_EDAT1)
@@ -21,13 +21,6 @@ pte_t huge_ptep_get(pte_t *ptep);
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
-static inline bool is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len)
-{
- return false;
-}
-
/*
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
@@ -35,9 +28,11 @@ static inline bool is_hugepage_only_range(struct mm_struct *mm,
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
- if (len & ~HPAGE_MASK)
+ struct hstate *h = hstate_file(file);
+
+ if (len & ~huge_page_mask(h))
return -EINVAL;
- if (addr & ~HPAGE_MASK)
+ if (addr & ~huge_page_mask(h))
return -EINVAL;
return 0;
}
@@ -46,20 +41,21 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_arch_1, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
{
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- pte_val(*ptep) = _REGION3_ENTRY_EMPTY;
+ set_pte(ptep, __pte(_REGION3_ENTRY_EMPTY));
else
- pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
+ set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY));
}
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
- huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
+ return huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
}
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -91,6 +87,11 @@ static inline int huge_pte_none(pte_t pte)
return pte_none(pte);
}
+static inline int huge_pte_none_mostly(pte_t pte)
+{
+ return huge_pte_none(pte);
+}
+
static inline int huge_pte_write(pte_t pte)
{
return pte_write(pte);
@@ -121,6 +122,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
+static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static inline int huge_pte_uffd_wp(pte_t pte)
+{
+ return 0;
+}
+
static inline bool gigantic_page_runtime_supported(void)
{
return true;
diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h
index adae176757ae..9078b5b6b837 100644
--- a/arch/s390/include/asm/hw_irq.h
+++ b/arch/s390/include/asm/hw_irq.h
@@ -7,6 +7,5 @@
void __init init_airq_interrupts(void);
void __init init_cio_interrupts(void);
-void __init init_ext_interrupts(void);
#endif
diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h
index 6fb7aced104a..40eae2c08d61 100644
--- a/arch/s390/include/asm/idals.h
+++ b/arch/s390/include/asm/idals.h
@@ -108,7 +108,7 @@ clear_normalized_cda(struct ccw1 * ccw)
struct idal_buffer {
size_t size;
size_t page_order;
- void *data[0];
+ void *data[];
};
/*
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 6d4226dcf42a..5cea629c548e 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -14,17 +14,19 @@
struct s390_idle_data {
seqcount_t seqcount;
- unsigned long long idle_count;
- unsigned long long idle_time;
- unsigned long long clock_idle_enter;
- unsigned long long clock_idle_exit;
- unsigned long long timer_idle_enter;
- unsigned long long timer_idle_exit;
+ unsigned long idle_count;
+ unsigned long idle_time;
+ unsigned long clock_idle_enter;
+ unsigned long clock_idle_exit;
+ unsigned long timer_idle_enter;
+ unsigned long timer_idle_exit;
+ unsigned long mt_cycles_enter[8];
};
extern struct device_attribute dev_attr_idle_count;
extern struct device_attribute dev_attr_idle_time_us;
-void psw_idle(struct s390_idle_data *, unsigned long);
+void psw_idle(struct s390_idle_data *data, unsigned long psw_mask);
+void psw_idle_exit(void);
#endif /* _S390_IDLE_H */
diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h
index 5a16f500515a..e3882b012bfa 100644
--- a/arch/s390/include/asm/io.h
+++ b/arch/s390/include/asm/io.h
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/pci_io.h>
#define xlate_dev_mem_ptr xlate_dev_mem_ptr
@@ -19,14 +20,12 @@ void *xlate_dev_mem_ptr(phys_addr_t phys);
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
#define IO_SPACE_LIMIT 0
-void __iomem *ioremap(unsigned long offset, unsigned long size);
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
+void __iomem *ioremap(phys_addr_t addr, size_t size);
+void __iomem *ioremap_wc(phys_addr_t addr, size_t size);
+void __iomem *ioremap_wt(phys_addr_t addr, size_t size);
void iounmap(volatile void __iomem *addr);
static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
@@ -52,6 +51,10 @@ static inline void ioport_unmap(void __iomem *p)
#define pci_iomap_wc pci_iomap_wc
#define pci_iomap_wc_range pci_iomap_wc_range
+#define ioremap ioremap
+#define ioremap_wt ioremap_wt
+#define ioremap_wc ioremap_wc
+
#define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count)
#define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count)
#define memset_io(dst, val, count) zpci_memset_io(dst, val, count)
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 084e71b7272a..a405b6bb89fb 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -12,6 +12,7 @@
#include <asm/types.h>
#include <asm/cio.h>
#include <asm/setup.h>
+#include <asm/page.h>
#include <uapi/asm/ipl.h>
struct ipl_parameter_block {
@@ -21,6 +22,7 @@ struct ipl_parameter_block {
struct ipl_pb0_common common;
struct ipl_pb0_fcp fcp;
struct ipl_pb0_ccw ccw;
+ struct ipl_pb0_nvme nvme;
char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)];
};
} __packed __aligned(PAGE_SIZE);
@@ -30,6 +32,11 @@ struct ipl_parameter_block {
#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \
sizeof(struct ipl_pb0_fcp))
#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp))
+
+#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \
+ sizeof(struct ipl_pb0_nvme))
+#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme))
+
#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \
sizeof(struct ipl_pb0_ccw))
#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw))
@@ -59,6 +66,8 @@ enum ipl_type {
IPL_TYPE_FCP = 4,
IPL_TYPE_FCP_DUMP = 8,
IPL_TYPE_NSS = 16,
+ IPL_TYPE_NVME = 32,
+ IPL_TYPE_NVME_DUMP = 64,
};
struct ipl_info
@@ -74,6 +83,10 @@ struct ipl_info
u64 lun;
} fcp;
struct {
+ u32 fid;
+ u32 nsid;
+ } nvme;
+ struct {
char name[NSS_NAME_SIZE + 1];
} nss;
} data;
@@ -83,6 +96,12 @@ extern struct ipl_info ipl_info;
extern void setup_ipl(void);
extern void set_os_info_reipl_block(void);
+static inline bool is_ipl_type_dump(void)
+{
+ return (ipl_info.type == IPL_TYPE_FCP_DUMP) ||
+ (ipl_info.type == IPL_TYPE_NVME_DUMP);
+}
+
struct ipl_report {
struct ipl_parameter_block *ipib;
struct list_head components;
@@ -114,11 +133,18 @@ int ipl_report_add_certificate(struct ipl_report *report, void *key,
* DIAG 308 support
*/
enum diag308_subcode {
+ DIAG308_CLEAR_RESET = 0,
+ DIAG308_LOAD_NORMAL_RESET = 1,
DIAG308_REL_HSA = 2,
DIAG308_LOAD_CLEAR = 3,
DIAG308_LOAD_NORMAL_DUMP = 4,
DIAG308_SET = 5,
DIAG308_STORE = 6,
+ DIAG308_LOAD_NORMAL = 7,
+};
+
+enum diag308_subcode_flags {
+ DIAG308_FLAG_EI = 1UL << 16,
};
enum diag308_rc {
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 9f75d67b8c20..89902f754740 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -81,8 +81,13 @@ static __always_inline void inc_irq_stat(enum interruption_class irq)
}
struct ext_code {
- unsigned short subcode;
- unsigned short code;
+ union {
+ struct {
+ unsigned short subcode;
+ unsigned short code;
+ };
+ unsigned int int_code;
+ };
};
typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long);
diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h
new file mode 100644
index 000000000000..603783766d0a
--- /dev/null
+++ b/arch/s390/include/asm/irq_work.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_IRQ_WORK_H
+#define _ASM_S390_IRQ_WORK_H
+
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return true;
+}
+
+void arch_irq_work_raise(void);
+
+#endif /* _ASM_S390_IRQ_WORK_H */
diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h
index 586df4c9e2f2..02427b205c11 100644
--- a/arch/s390/include/asm/irqflags.h
+++ b/arch/s390/include/asm/irqflags.h
@@ -32,45 +32,45 @@
})
/* set system mask. */
-static inline notrace void __arch_local_irq_ssm(unsigned long flags)
+static __always_inline void __arch_local_irq_ssm(unsigned long flags)
{
asm volatile("ssm %0" : : "Q" (flags) : "memory");
}
-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
{
return __arch_local_irq_stnsm(0xff);
}
-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
return __arch_local_irq_stnsm(0xfc);
}
-static inline notrace void arch_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
arch_local_irq_save();
}
-static inline notrace void arch_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
__arch_local_irq_stosm(0x03);
}
/* This only restores external and I/O interrupt state */
-static inline notrace void arch_local_irq_restore(unsigned long flags)
+static __always_inline void arch_local_irq_restore(unsigned long flags)
{
/* only disabled->disabled and disabled->enabled is valid */
if (flags & ARCH_IRQ_ENABLED)
arch_local_irq_enable();
}
-static inline notrace bool arch_irqs_disabled_flags(unsigned long flags)
+static __always_inline bool arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & ARCH_IRQ_ENABLED);
}
-static inline notrace bool arch_irqs_disabled(void)
+static __always_inline bool arch_irqs_disabled(void)
{
return arch_irqs_disabled_flags(arch_local_save_flags());
}
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index dcb1bba4f406..895f774bbcc5 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -2,13 +2,14 @@
#ifndef _ASM_S390_JUMP_LABEL_H
#define _ASM_S390_JUMP_LABEL_H
+#define HAVE_JUMP_LABEL_BATCH
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/stringify.h>
#define JUMP_LABEL_NOP_SIZE 6
-#define JUMP_LABEL_NOP_OFFSET 2
#ifdef CONFIG_CC_IS_CLANG
#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i"
@@ -19,12 +20,12 @@
#endif
/*
- * We use a brcl 0,2 instruction for jump labels at compile time so it
+ * We use a brcl 0,<offset> instruction for jump labels so it
* can be easily distinguished from a hotpatch generated instruction.
*/
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
- asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n"
+ asm_volatile_goto("0: brcl 0,%l[label]\n"
".pushsection __jump_table,\"aw\"\n"
".balign 8\n"
".long 0b-.,%l[label]-.\n"
diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h
index 70930fe5c496..2768d5db181f 100644
--- a/arch/s390/include/asm/kasan.h
+++ b/arch/s390/include/asm/kasan.h
@@ -7,24 +7,45 @@
#ifdef CONFIG_KASAN
#define KASAN_SHADOW_SCALE_SHIFT 3
-#ifdef CONFIG_KASAN_S390_4_LEVEL_PAGING
#define KASAN_SHADOW_SIZE \
(_AC(1, UL) << (_REGION1_SHIFT - KASAN_SHADOW_SCALE_SHIFT))
-#else
-#define KASAN_SHADOW_SIZE \
- (_AC(1, UL) << (_REGION2_SHIFT - KASAN_SHADOW_SCALE_SHIFT))
-#endif
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
#define KASAN_SHADOW_START KASAN_SHADOW_OFFSET
#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
extern void kasan_early_init(void);
-extern void kasan_copy_shadow(pgd_t *dst);
+extern void kasan_copy_shadow_mapping(void);
extern void kasan_free_early_identity(void);
+
+/*
+ * Estimate kasan memory requirements, which it will reserve
+ * at the very end of available physical memory. To estimate
+ * that, we take into account that kasan would require
+ * 1/8 of available physical memory (for shadow memory) +
+ * creating page tables for the whole memory + shadow memory
+ * region (1 + 1/8). To keep page tables estimates simple take
+ * the double of combined ptes size.
+ *
+ * physmem parameter has to be already adjusted if not entire physical memory
+ * would be used (e.g. due to effect of "mem=" option).
+ */
+static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem)
+{
+ unsigned long kasan_needs;
+ unsigned long pages;
+ /* for shadow memory */
+ kasan_needs = round_up(physmem / 8, PAGE_SIZE);
+ /* for paging structures */
+ pages = DIV_ROUND_UP(physmem + kasan_needs, PAGE_SIZE);
+ kasan_needs += DIV_ROUND_UP(pages, _PAGE_ENTRIES) * _PAGE_TABLE_SIZE * 2;
+
+ return kasan_needs;
+}
#else
static inline void kasan_early_init(void) { }
-static inline void kasan_copy_shadow(pgd_t *dst) { }
+static inline void kasan_copy_shadow_mapping(void) { }
static inline void kasan_free_early_identity(void) { }
+static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) { return 0; }
#endif
#endif
diff --git a/arch/s390/include/asm/kdebug.h b/arch/s390/include/asm/kdebug.h
index d5327f064799..4377238e4752 100644
--- a/arch/s390/include/asm/kdebug.h
+++ b/arch/s390/include/asm/kdebug.h
@@ -23,6 +23,6 @@ enum die_val {
DIE_NMI_IPI,
};
-extern void die(struct pt_regs *, const char *);
+extern void __noreturn die(struct pt_regs *, const char *);
#endif
diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index ea398a05f643..1bd08eb56d5f 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -9,6 +9,8 @@
#ifndef _S390_KEXEC_H
#define _S390_KEXEC_H
+#include <linux/module.h>
+
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/setup.h>
@@ -29,7 +31,7 @@
#define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31)
/* Allocate control page with GFP_DMA */
-#define KEXEC_CONTROL_MEMORY_GFP GFP_DMA
+#define KEXEC_CONTROL_MEMORY_GFP (GFP_DMA | __GFP_NORETRY)
/* Maximum address we can use for the crash control pages */
#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL)
@@ -74,7 +76,35 @@ void *kexec_file_add_components(struct kimage *image,
int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val,
unsigned long addr);
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+ void *ipl_buf;
+};
+
extern const struct kexec_file_ops s390_kexec_image_ops;
extern const struct kexec_file_ops s390_kexec_elf_ops;
+#ifdef CONFIG_CRASH_DUMP
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
+#define crash_free_reserved_phys_range crash_free_reserved_phys_range
+
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+#endif
+
+#ifdef CONFIG_KEXEC_FILE
+struct purgatory_info;
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+#endif
#endif /*_S390_KEXEC_H */
diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h
new file mode 100644
index 000000000000..d55ba878378b
--- /dev/null
+++ b/arch/s390/include/asm/kfence.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_KFENCE_H
+#define _ASM_S390_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/kfence.h>
+#include <asm/set_memory.h>
+#include <asm/page.h>
+
+void __kernel_map_pages(struct page *page, int numpages, int enable);
+
+static __always_inline bool arch_kfence_init_pool(void)
+{
+ return true;
+}
+
+#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK)
+
+/*
+ * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(),
+ * but earlier where page table allocations still happen with memblock.
+ * Reason is that arch_kfence_init_pool() gets called when the system
+ * is still in a limbo state - disabling and enabling bottom halves is
+ * not yet allowed, but that is what our page_table_alloc() would do.
+ */
+static __always_inline void kfence_split_mapping(void)
+{
+#ifdef CONFIG_KFENCE
+ unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+
+ set_memory_4k((unsigned long)__kfence_pool, pool_pages);
+#endif
+}
+
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+ __kernel_map_pages(virt_to_page(addr), 1, !protect);
+ return true;
+}
+
+#endif /* _ASM_S390_KFENCE_H */
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index 09cdb632a490..598095f4b924 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -70,7 +70,8 @@ struct kprobe_ctlblk {
};
void arch_remove_kprobe(struct kprobe *p);
-void kretprobe_trampoline(void);
+void __kretprobe_trampoline(void);
+void trampoline_probe_handler(struct pt_regs *regs);
int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
int kprobe_exceptions_notify(struct notifier_block *self,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 1726224e7772..b1e98a9ed152 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -19,6 +19,8 @@
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/mmu_notifier.h>
#include <asm/debug.h>
#include <asm/cpu.h>
#include <asm/fpu/api.h>
@@ -28,15 +30,14 @@
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
#define KVM_MAX_VCPUS 255
-#define KVM_USER_MEM_SLOTS 32
/*
- * These seem to be used for allocating ->chip in the routing table,
- * which we don't use. 4096 is an out-of-thin-air value. If we need
- * to look at ->chip later on, we'll need to revisit this.
+ * These seem to be used for allocating ->chip in the routing table, which we
+ * don't use. 1 is as small as we can get to reduce the needed memory. If we
+ * need to look at ->chip later on, we'll need to revisit this.
*/
#define KVM_NR_IRQCHIPS 1
-#define KVM_IRQCHIP_NUM_PINS 4096
+#define KVM_IRQCHIP_NUM_PINS 1
#define KVM_HALT_POLL_NS_DEFAULT 50000
/* s390-specific vcpu->requests bit members */
@@ -46,6 +47,8 @@
#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4)
#define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5)
+#define KVM_REQ_REFRESH_GUEST_PREFIX \
+ KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define SIGP_CTRL_C 0x80
#define SIGP_CTRL_SCN_MASK 0x3f
@@ -92,19 +95,30 @@ union ipte_control {
};
};
+union sca_utility {
+ __u16 val;
+ struct {
+ __u16 mtcr : 1;
+ __u16 reserved : 15;
+ };
+};
+
struct bsca_block {
union ipte_control ipte_control;
__u64 reserved[5];
__u64 mcn;
- __u64 reserved2;
+ union sca_utility utility;
+ __u8 reserved2[6];
struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
};
struct esca_block {
union ipte_control ipte_control;
- __u64 reserved1[7];
+ __u64 reserved1[6];
+ union sca_utility utility;
+ __u8 reserved2[6];
__u64 mcn[4];
- __u64 reserved2[20];
+ __u64 reserved3[20];
struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
};
@@ -127,6 +141,12 @@ struct mcck_volatile_info {
#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \
CR14_EXTERNAL_DAMAGE_SUBMASK)
+#define SIDAD_SIZE_MASK 0xff
+#define sida_origin(sie_block) \
+ ((sie_block)->sidad & PAGE_MASK)
+#define sida_size(sie_block) \
+ ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE)
+
#define CPUSTAT_STOPPED 0x80000000
#define CPUSTAT_WAIT 0x10000000
#define CPUSTAT_ECALL_PEND 0x08000000
@@ -160,7 +180,13 @@ struct kvm_s390_sie_block {
__u8 reserved08[4]; /* 0x0008 */
#define PROG_IN_SIE (1<<0)
__u32 prog0c; /* 0x000c */
- __u8 reserved10[16]; /* 0x0010 */
+ union {
+ __u8 reserved10[16]; /* 0x0010 */
+ struct {
+ __u64 pv_handle_cpu;
+ __u64 pv_handle_config;
+ };
+ };
#define PROG_BLOCK_SIE (1<<0)
#define PROG_REQUEST (1<<1)
atomic_t prog20; /* 0x0020 */
@@ -209,10 +235,23 @@ struct kvm_s390_sie_block {
#define ICPT_PARTEXEC 0x38
#define ICPT_IOINST 0x40
#define ICPT_KSS 0x5c
+#define ICPT_MCHKREQ 0x60
+#define ICPT_INT_ENABLE 0x64
+#define ICPT_PV_INSTR 0x68
+#define ICPT_PV_NOTIFY 0x6c
+#define ICPT_PV_PREF 0x70
__u8 icptcode; /* 0x0050 */
__u8 icptstatus; /* 0x0051 */
__u16 ihcpu; /* 0x0052 */
- __u8 reserved54[2]; /* 0x0054 */
+ __u8 reserved54; /* 0x0054 */
+#define IICTL_CODE_NONE 0x00
+#define IICTL_CODE_MCHK 0x01
+#define IICTL_CODE_EXT 0x02
+#define IICTL_CODE_IO 0x03
+#define IICTL_CODE_RESTART 0x04
+#define IICTL_CODE_SPECIFICATION 0x10
+#define IICTL_CODE_OPERAND 0x11
+ __u8 iictl; /* 0x0055 */
__u16 ipa; /* 0x0056 */
__u32 ipb; /* 0x0058 */
__u32 scaoh; /* 0x005c */
@@ -220,22 +259,28 @@ struct kvm_s390_sie_block {
__u8 fpf; /* 0x0060 */
#define ECB_GS 0x40
#define ECB_TE 0x10
+#define ECB_SPECI 0x08
#define ECB_SRSI 0x04
#define ECB_HOSTPROTINT 0x02
+#define ECB_PTF 0x01
__u8 ecb; /* 0x0061 */
#define ECB2_CMMA 0x80
#define ECB2_IEP 0x20
#define ECB2_PFMFI 0x08
#define ECB2_ESCA 0x04
+#define ECB2_ZPCI_LSI 0x02
__u8 ecb2; /* 0x0062 */
+#define ECB3_AISI 0x20
+#define ECB3_AISII 0x10
#define ECB3_DEA 0x08
#define ECB3_AES 0x04
#define ECB3_RI 0x01
__u8 ecb3; /* 0x0063 */
__u32 scaol; /* 0x0064 */
- __u8 reserved68; /* 0x0068 */
+ __u8 sdf; /* 0x0068 */
__u8 epdx; /* 0x0069 */
- __u8 reserved6a[2]; /* 0x006a */
+ __u8 cpnc; /* 0x006a */
+ __u8 reserved6b; /* 0x006b */
__u32 todpr; /* 0x006c */
#define GISA_FORMAT1 0x00000001
__u32 gd; /* 0x0070 */
@@ -249,31 +294,58 @@ struct kvm_s390_sie_block {
#define HPID_KVM 0x4
#define HPID_VSIE 0x5
__u8 hpid; /* 0x00b8 */
- __u8 reservedb9[11]; /* 0x00b9 */
- __u16 extcpuaddr; /* 0x00c4 */
- __u16 eic; /* 0x00c6 */
+ __u8 reservedb9[7]; /* 0x00b9 */
+ union {
+ struct {
+ __u32 eiparams; /* 0x00c0 */
+ __u16 extcpuaddr; /* 0x00c4 */
+ __u16 eic; /* 0x00c6 */
+ };
+ __u64 mcic; /* 0x00c0 */
+ } __packed;
__u32 reservedc8; /* 0x00c8 */
- __u16 pgmilc; /* 0x00cc */
- __u16 iprcc; /* 0x00ce */
- __u32 dxc; /* 0x00d0 */
- __u16 mcn; /* 0x00d4 */
- __u8 perc; /* 0x00d6 */
- __u8 peratmid; /* 0x00d7 */
+ union {
+ struct {
+ __u16 pgmilc; /* 0x00cc */
+ __u16 iprcc; /* 0x00ce */
+ };
+ __u32 edc; /* 0x00cc */
+ } __packed;
+ union {
+ struct {
+ __u32 dxc; /* 0x00d0 */
+ __u16 mcn; /* 0x00d4 */
+ __u8 perc; /* 0x00d6 */
+ __u8 peratmid; /* 0x00d7 */
+ };
+ __u64 faddr; /* 0x00d0 */
+ } __packed;
__u64 peraddr; /* 0x00d8 */
__u8 eai; /* 0x00e0 */
__u8 peraid; /* 0x00e1 */
__u8 oai; /* 0x00e2 */
__u8 armid; /* 0x00e3 */
__u8 reservede4[4]; /* 0x00e4 */
- __u64 tecmc; /* 0x00e8 */
- __u8 reservedf0[12]; /* 0x00f0 */
+ union {
+ __u64 tecmc; /* 0x00e8 */
+ struct {
+ __u16 subchannel_id; /* 0x00e8 */
+ __u16 subchannel_nr; /* 0x00ea */
+ __u32 io_int_parm; /* 0x00ec */
+ __u32 io_int_word; /* 0x00f0 */
+ };
+ } __packed;
+ __u8 reservedf4[8]; /* 0x00f4 */
#define CRYCB_FORMAT_MASK 0x00000003
#define CRYCB_FORMAT0 0x00000000
#define CRYCB_FORMAT1 0x00000001
#define CRYCB_FORMAT2 0x00000003
__u32 crycbd; /* 0x00fc */
__u64 gcr[16]; /* 0x0100 */
- __u64 gbea; /* 0x0180 */
+ union {
+ __u64 gbea; /* 0x0180 */
+ __u64 sidad;
+ };
__u8 reserved188[8]; /* 0x0188 */
__u64 sdnxo; /* 0x0190 */
__u8 reserved198[8]; /* 0x0198 */
@@ -292,7 +364,7 @@ struct kvm_s390_sie_block {
__u64 itdba; /* 0x01e8 */
__u64 riccbd; /* 0x01f0 */
__u64 gvrd; /* 0x01f8 */
-} __attribute__((packed));
+} __packed __aligned(512);
struct kvm_s390_itdb {
__u8 data[256];
@@ -301,12 +373,15 @@ struct kvm_s390_itdb {
struct sie_page {
struct kvm_s390_sie_block sie_block;
struct mcck_volatile_info mcck_info; /* 0x0200 */
- __u8 reserved218[1000]; /* 0x0218 */
+ __u8 reserved218[360]; /* 0x0218 */
+ __u64 pv_grregs[16]; /* 0x0380 */
+ __u8 reserved400[512]; /* 0x0400 */
struct kvm_s390_itdb itdb; /* 0x0600 */
__u8 reserved700[2304]; /* 0x0700 */
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 exit_userspace;
u64 exit_null;
u64 exit_external_request;
@@ -316,11 +391,7 @@ struct kvm_vcpu_stat {
u64 exit_validity;
u64 exit_instruction;
u64 exit_pei;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_invalid;
u64 halt_no_poll_steal;
- u64 halt_wakeup;
u64 instruction_lctl;
u64 instruction_lctlg;
u64 instruction_stctl;
@@ -394,14 +465,16 @@ struct kvm_vcpu_stat {
u64 instruction_sigp_init_cpu_reset;
u64 instruction_sigp_cpu_reset;
u64 instruction_sigp_unknown;
- u64 diagnose_10;
- u64 diagnose_44;
- u64 diagnose_9c;
- u64 diagnose_9c_ignored;
- u64 diagnose_258;
- u64 diagnose_308;
- u64 diagnose_500;
- u64 diagnose_other;
+ u64 instruction_diagnose_10;
+ u64 instruction_diagnose_44;
+ u64 instruction_diagnose_9c;
+ u64 diag_9c_ignored;
+ u64 diag_9c_forward;
+ u64 instruction_diagnose_258;
+ u64 instruction_diagnose_308;
+ u64 instruction_diagnose_500;
+ u64 instruction_diagnose_other;
+ u64 pfault_sync;
};
#define PGM_OPERATION 0x01
@@ -476,6 +549,7 @@ enum irq_types {
IRQ_PEND_PFAULT_INIT,
IRQ_PEND_EXT_HOST,
IRQ_PEND_EXT_SERVICE,
+ IRQ_PEND_EXT_SERVICE_EV,
IRQ_PEND_EXT_TIMING,
IRQ_PEND_EXT_CPU_TIMER,
IRQ_PEND_EXT_CLOCK_COMP,
@@ -520,6 +594,7 @@ enum irq_types {
(1UL << IRQ_PEND_EXT_TIMING) | \
(1UL << IRQ_PEND_EXT_HOST) | \
(1UL << IRQ_PEND_EXT_SERVICE) | \
+ (1UL << IRQ_PEND_EXT_SERVICE_EV) | \
(1UL << IRQ_PEND_VIRTIO) | \
(1UL << IRQ_PEND_PFAULT_INIT) | \
(1UL << IRQ_PEND_PFAULT_DONE))
@@ -536,6 +611,13 @@ enum irq_types {
#define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \
(1UL << IRQ_PEND_MCHK_EX))
+#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \
+ (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \
+ (1UL << IRQ_PEND_EXT_EMERGENCY) | \
+ (1UL << IRQ_PEND_EXT_EXTERNAL) | \
+ (1UL << IRQ_PEND_EXT_SERVICE) | \
+ (1UL << IRQ_PEND_EXT_SERVICE_EV))
+
struct kvm_s390_interrupt_info {
struct list_head list;
u64 type;
@@ -594,6 +676,7 @@ struct kvm_s390_local_interrupt {
struct kvm_s390_float_interrupt {
unsigned long pending_irqs;
+ unsigned long masked_irqs;
spinlock_t lock;
struct list_head lists[FIRQ_LIST_COUNT];
int counters[FIRQ_MAX_COUNT];
@@ -633,6 +716,10 @@ struct kvm_hw_bp_info_arch {
#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
(vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+ KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
struct kvm_guestdbg_info_arch {
unsigned long cr0;
unsigned long cr9;
@@ -645,6 +732,11 @@ struct kvm_guestdbg_info_arch {
unsigned long last_bp;
};
+struct kvm_s390_pv_vcpu {
+ u64 handle;
+ unsigned long stor_base;
+};
+
struct kvm_vcpu_arch {
struct kvm_s390_sie_block *sie_block;
/* if vsie is active, currently executed shadow sie control block */
@@ -673,15 +765,18 @@ struct kvm_vcpu_arch {
__u64 cputm_start;
bool gs_enabled;
bool skey_enabled;
+ struct kvm_s390_pv_vcpu pv;
+ union diag318_info diag318_info;
};
struct kvm_vm_stat {
+ struct kvm_vm_stat_generic generic;
u64 inject_io;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
- u64 remote_tlb_flush;
+ u64 aen_forward;
};
struct kvm_arch_memory_slot {
@@ -701,9 +796,6 @@ struct s390_io_adapter {
bool masked;
bool swap;
bool suppressible;
- struct rw_semaphore maps_lock;
- struct list_head maps;
- atomic_t nr_maps;
};
#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
@@ -727,14 +819,12 @@ struct kvm_s390_cpu_model {
unsigned short ibc;
};
-struct kvm_s390_module_hook {
- int (*hook)(struct kvm_vcpu *vcpu);
- struct module *owner;
-};
+typedef int (*crypto_hook)(struct kvm_vcpu *vcpu);
struct kvm_s390_crypto {
struct kvm_s390_crypto_cb *crycb;
- struct kvm_s390_module_hook *pqap_hook;
+ struct rw_semaphore pqap_hook_rwsem;
+ crypto_hook *pqap_hook;
__u32 crycbd;
__u8 aes_kw;
__u8 dea_kw;
@@ -846,6 +936,15 @@ struct kvm_s390_gisa_interrupt {
DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS);
};
+struct kvm_s390_pv {
+ u64 handle;
+ u64 guest_len;
+ unsigned long stor_base;
+ void *stor_var;
+ bool dumping;
+ struct mmu_notifier mmu_notifier;
+};
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -860,6 +959,7 @@ struct kvm_arch{
int use_cmma;
int use_pfmfi;
int use_skf;
+ int use_zpci_interp;
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
@@ -879,8 +979,12 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
+ struct kvm_s390_pv pv;
+ struct list_head kzdev_list;
+ spinlock_t kzdev_list_lock;
};
#define KVM_HVA_ERR_BAD (-1UL)
@@ -896,17 +1000,19 @@ struct kvm_arch_async_pf {
unsigned long pfault_token;
};
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
+static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {}
+
void kvm_arch_crypto_clear_masks(struct kvm *kvm);
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
unsigned long *aqm, unsigned long *adm);
@@ -921,7 +1027,7 @@ static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
- struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
+ struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -929,6 +1035,14 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
-void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu);
+#define __KVM_HAVE_ARCH_VM_FREE
+void kvm_arch_free_vm(struct kvm *kvm);
+
+struct zpci_kvm_hook {
+ int (*kvm_register)(void *opaque, struct kvm *kvm);
+ void (*kvm_unregister)(void *opaque);
+};
+
+extern struct zpci_kvm_hook zpci_kvm_hook;
#endif
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index cbc7c3a68e4d..df73a052760c 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -24,162 +24,79 @@
#include <uapi/asm/kvm_para.h>
#include <asm/diag.h>
-static inline long __kvm_hypercall0(unsigned long nr)
-{
- register unsigned long __nr asm("1") = nr;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr): "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall0(unsigned long nr)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall0(nr);
-}
-
-static inline long __kvm_hypercall1(unsigned long nr, unsigned long p1)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall1(nr, p1);
-}
-
-static inline long __kvm_hypercall2(unsigned long nr, unsigned long p1,
- unsigned long p2)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2)
- : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
- unsigned long p2)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall2(nr, p1, p2);
-}
-
-static inline long __kvm_hypercall3(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall3(nr, p1, p2, p3);
-}
-
-static inline long __kvm_hypercall4(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall4(nr, p1, p2, p3, p4);
-}
-
-static inline long __kvm_hypercall5(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register unsigned long __p5 asm("6") = p5;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4), "d" (__p5) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall5(nr, p1, p2, p3, p4, p5);
-}
-
-static inline long __kvm_hypercall6(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5,
- unsigned long p6)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register unsigned long __p5 asm("6") = p5;
- register unsigned long __p6 asm("7") = p6;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4), "d" (__p5), "d" (__p6)
- : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5,
- unsigned long p6)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall6(nr, p1, p2, p3, p4, p5, p6);
-}
+#define HYPERCALL_FMT_0
+#define HYPERCALL_FMT_1 , "0" (r2)
+#define HYPERCALL_FMT_2 , "d" (r3) HYPERCALL_FMT_1
+#define HYPERCALL_FMT_3 , "d" (r4) HYPERCALL_FMT_2
+#define HYPERCALL_FMT_4 , "d" (r5) HYPERCALL_FMT_3
+#define HYPERCALL_FMT_5 , "d" (r6) HYPERCALL_FMT_4
+#define HYPERCALL_FMT_6 , "d" (r7) HYPERCALL_FMT_5
+
+#define HYPERCALL_PARM_0
+#define HYPERCALL_PARM_1 , unsigned long arg1
+#define HYPERCALL_PARM_2 HYPERCALL_PARM_1, unsigned long arg2
+#define HYPERCALL_PARM_3 HYPERCALL_PARM_2, unsigned long arg3
+#define HYPERCALL_PARM_4 HYPERCALL_PARM_3, unsigned long arg4
+#define HYPERCALL_PARM_5 HYPERCALL_PARM_4, unsigned long arg5
+#define HYPERCALL_PARM_6 HYPERCALL_PARM_5, unsigned long arg6
+
+#define HYPERCALL_REGS_0
+#define HYPERCALL_REGS_1 \
+ register unsigned long r2 asm("2") = arg1
+#define HYPERCALL_REGS_2 \
+ HYPERCALL_REGS_1; \
+ register unsigned long r3 asm("3") = arg2
+#define HYPERCALL_REGS_3 \
+ HYPERCALL_REGS_2; \
+ register unsigned long r4 asm("4") = arg3
+#define HYPERCALL_REGS_4 \
+ HYPERCALL_REGS_3; \
+ register unsigned long r5 asm("5") = arg4
+#define HYPERCALL_REGS_5 \
+ HYPERCALL_REGS_4; \
+ register unsigned long r6 asm("6") = arg5
+#define HYPERCALL_REGS_6 \
+ HYPERCALL_REGS_5; \
+ register unsigned long r7 asm("7") = arg6
+
+#define HYPERCALL_ARGS_0
+#define HYPERCALL_ARGS_1 , arg1
+#define HYPERCALL_ARGS_2 HYPERCALL_ARGS_1, arg2
+#define HYPERCALL_ARGS_3 HYPERCALL_ARGS_2, arg3
+#define HYPERCALL_ARGS_4 HYPERCALL_ARGS_3, arg4
+#define HYPERCALL_ARGS_5 HYPERCALL_ARGS_4, arg5
+#define HYPERCALL_ARGS_6 HYPERCALL_ARGS_5, arg6
+
+#define GENERATE_KVM_HYPERCALL_FUNC(args) \
+static inline \
+long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \
+{ \
+ register unsigned long __nr asm("1") = nr; \
+ register long __rc asm("2"); \
+ HYPERCALL_REGS_##args; \
+ \
+ asm volatile ( \
+ " diag 2,4,0x500\n" \
+ : "=d" (__rc) \
+ : "d" (__nr) HYPERCALL_FMT_##args \
+ : "memory", "cc"); \
+ return __rc; \
+} \
+ \
+static inline \
+long kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \
+{ \
+ diag_stat_inc(DIAG_STAT_X500); \
+ return __kvm_hypercall##args(nr HYPERCALL_ARGS_##args); \
+}
+
+GENERATE_KVM_HYPERCALL_FUNC(0)
+GENERATE_KVM_HYPERCALL_FUNC(1)
+GENERATE_KVM_HYPERCALL_FUNC(2)
+GENERATE_KVM_HYPERCALL_FUNC(3)
+GENERATE_KVM_HYPERCALL_FUNC(4)
+GENERATE_KVM_HYPERCALL_FUNC(5)
+GENERATE_KVM_HYPERCALL_FUNC(6)
/* kvm on s390 is always paravirtualization enabled */
static inline int kvm_para_available(void)
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index 7f22262b0e46..c76777b15fec 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -4,36 +4,7 @@
#include <linux/stringify.h>
-#define __ALIGN .align 4, 0x07
+#define __ALIGN .align 16, 0x07
#define __ALIGN_STR __stringify(__ALIGN)
-#ifndef __ASSEMBLY__
-
-/*
- * Helper macro for exception table entries
- */
-#define EX_TABLE(_fault, _target) \
- ".section __ex_table,\"a\"\n" \
- ".align 4\n" \
- ".long (" #_fault ") - .\n" \
- ".long (" #_target ") - .\n" \
- ".previous\n"
-
-#else /* __ASSEMBLY__ */
-
-#define EX_TABLE(_fault, _target) \
- .section __ex_table,"a" ; \
- .align 4 ; \
- .long (_fault) - . ; \
- .long (_target) - . ; \
- .previous
-
-#define EX_TABLE_DMA(_fault, _target) \
- .section .dma.ex_table, "a" ; \
- .align 4 ; \
- .long (_fault) - . ; \
- .long (_target) - . ; \
- .previous
-
-#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h
deleted file mode 100644
index 818612b784cd..000000000000
--- a/arch/s390/include/asm/livepatch.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * livepatch.h - s390-specific Kernel Live Patching Core
- *
- * Copyright (c) 2013-2015 SUSE
- * Authors: Jiri Kosina
- * Vojtech Pavlik
- * Jiri Slaby
- */
-
-#ifndef ASM_LIVEPATCH_H
-#define ASM_LIVEPATCH_H
-
-#include <asm/ptrace.h>
-
-static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
-{
- regs->psw.addr = ip;
-}
-
-#endif
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 237ee0c4169f..8aa1f6530a3e 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -17,21 +17,39 @@
#define LC_ORDER 1
#define LC_PAGES 2
+struct pgm_tdb { /* type of the lowcore pgm_tdb member (see struct lowcore) */
+ u64 data[32]; /* 32 * 8 = 256 bytes, same size as a __u8[256] array */
+};
+
struct lowcore {
__u8 pad_0x0000[0x0014-0x0000]; /* 0x0000 */
__u32 ipl_parmblock_ptr; /* 0x0014 */
__u8 pad_0x0018[0x0080-0x0018]; /* 0x0018 */
__u32 ext_params; /* 0x0080 */
- __u16 ext_cpu_addr; /* 0x0084 */
- __u16 ext_int_code; /* 0x0086 */
- __u16 svc_ilc; /* 0x0088 */
- __u16 svc_code; /* 0x008a */
- __u16 pgm_ilc; /* 0x008c */
- __u16 pgm_code; /* 0x008e */
+ union {
+ struct {
+ __u16 ext_cpu_addr; /* 0x0084 */
+ __u16 ext_int_code; /* 0x0086 */
+ };
+ __u32 ext_int_code_addr;
+ };
+ __u32 svc_int_code; /* 0x0088 */
+ union {
+ struct {
+ __u16 pgm_ilc; /* 0x008c */
+ __u16 pgm_code; /* 0x008e */
+ };
+ __u32 pgm_int_code;
+ };
__u32 data_exc_code; /* 0x0090 */
__u16 mon_class_num; /* 0x0094 */
- __u8 per_code; /* 0x0096 */
- __u8 per_atmid; /* 0x0097 */
+ union {
+ struct {
+ __u8 per_code; /* 0x0096 */
+ __u8 per_atmid; /* 0x0097 */
+ };
+ __u16 per_code_combined;
+ };
__u64 per_address; /* 0x0098 */
__u8 exc_access_id; /* 0x00a0 */
__u8 per_access_id; /* 0x00a1 */
@@ -40,10 +58,15 @@ struct lowcore {
__u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */
__u64 trans_exc_code; /* 0x00a8 */
__u64 monitor_code; /* 0x00b0 */
- __u16 subchannel_id; /* 0x00b8 */
- __u16 subchannel_nr; /* 0x00ba */
- __u32 io_int_parm; /* 0x00bc */
- __u32 io_int_word; /* 0x00c0 */
+ union {
+ struct {
+ __u16 subchannel_id; /* 0x00b8 */
+ __u16 subchannel_nr; /* 0x00ba */
+ __u32 io_int_parm; /* 0x00bc */
+ __u32 io_int_word; /* 0x00c0 */
+ };
+ struct tpi_info tpi_info; /* 0x00b8 */
+ };
__u8 pad_0x00c4[0x00c8-0x00c4]; /* 0x00c4 */
__u32 stfl_fac_list; /* 0x00c8 */
__u8 pad_0x00cc[0x00e8-0x00cc]; /* 0x00cc */
@@ -52,7 +75,7 @@ struct lowcore {
__u32 external_damage_code; /* 0x00f4 */
__u64 failing_storage_address; /* 0x00f8 */
__u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */
- __u64 breaking_event_addr; /* 0x0110 */
+ __u64 pgm_last_break; /* 0x0110 */
__u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */
psw_t restart_old_psw; /* 0x0120 */
psw_t external_old_psw; /* 0x0130 */
@@ -80,9 +103,10 @@ struct lowcore {
psw_t return_psw; /* 0x0290 */
psw_t return_mcck_psw; /* 0x02a0 */
+ __u64 last_break; /* 0x02b0 */
+
/* CPU accounting and timing values. */
- __u64 sync_enter_timer; /* 0x02b0 */
- __u64 async_enter_timer; /* 0x02b8 */
+ __u64 sys_enter_timer; /* 0x02b8 */
__u64 mcck_enter_timer; /* 0x02c0 */
__u64 exit_timer; /* 0x02c8 */
__u64 user_timer; /* 0x02d0 */
@@ -107,16 +131,16 @@ struct lowcore {
__u64 async_stack; /* 0x0350 */
__u64 nodat_stack; /* 0x0358 */
__u64 restart_stack; /* 0x0360 */
-
+ __u64 mcck_stack; /* 0x0368 */
/* Restart function and parameter. */
- __u64 restart_fn; /* 0x0368 */
- __u64 restart_data; /* 0x0370 */
- __u64 restart_source; /* 0x0378 */
+ __u64 restart_fn; /* 0x0370 */
+ __u64 restart_data; /* 0x0378 */
+ __u32 restart_source; /* 0x0380 */
+ __u32 restart_flags; /* 0x0384 */
/* Address space pointer. */
- __u64 kernel_asce; /* 0x0380 */
- __u64 user_asce; /* 0x0388 */
- __u64 vdso_asce; /* 0x0390 */
+ __u64 kernel_asce; /* 0x0388 */
+ __u64 user_asce; /* 0x0390 */
/*
* The lpp and current_pid fields form a
@@ -134,14 +158,14 @@ struct lowcore {
__u32 spinlock_index; /* 0x03b0 */
__u32 fpu_flags; /* 0x03b4 */
__u64 percpu_offset; /* 0x03b8 */
- __u64 vdso_per_cpu_data; /* 0x03c0 */
+ __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */
__u64 machine_flags; /* 0x03c8 */
__u64 gmap; /* 0x03d0 */
__u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */
- /* br %r1 trampoline */
- __u16 br_r1_trampoline; /* 0x0400 */
- __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */
+ __u32 return_lpswe; /* 0x0400 */
+ __u32 return_mcck_lpswe; /* 0x0404 */
+ __u8 pad_0x0408[0x0e00-0x0408]; /* 0x0408 */
/*
* 0xe00 contains the address of the IPL Parameter Information
@@ -153,12 +177,7 @@ struct lowcore {
__u64 vmcore_info; /* 0x0e0c */
__u8 pad_0x0e14[0x0e18-0x0e14]; /* 0x0e14 */
__u64 os_info; /* 0x0e18 */
- __u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */
-
- /* Extended facility list */
- __u64 stfle_fac_list[16]; /* 0x0f00 */
- __u64 alt_stfle_fac_list[16]; /* 0x0f80 */
- __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */
+ __u8 pad_0x0e20[0x11b0-0x0e20]; /* 0x0e20 */
/* Pointer to the machine check extended save area */
__u64 mcesad; /* 0x11b0 */
@@ -178,13 +197,18 @@ struct lowcore {
__u32 tod_progreg_save_area; /* 0x1324 */
__u32 cpu_timer_save_area[2]; /* 0x1328 */
__u32 clock_comp_save_area[2]; /* 0x1330 */
- __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */
+ __u64 last_break_save_area; /* 0x1338 */
__u32 access_regs_save_area[16]; /* 0x1340 */
__u64 cregs_save_area[16]; /* 0x1380 */
- __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */
+ __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */
+ /* Cryptography-counter designation */
+ __u64 ccd; /* 0x1500 */
+ /* AI-extension counter designation */
+ __u64 aicd; /* 0x1508 */
+ __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */
/* Transaction abort diagnostic block */
- __u8 pgm_tdb[256]; /* 0x1800 */
+ struct pgm_tdb pgm_tdb; /* 0x1800 */
__u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */
} __packed __aligned(8192);
diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h
new file mode 100644
index 000000000000..c7fa838cf6b9
--- /dev/null
+++ b/arch/s390/include/asm/maccess.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_S390_MACCESS_H
+#define __ASM_S390_MACCESS_H
+
+#include <linux/types.h>
+
+struct iov_iter;
+
+extern unsigned long __memcpy_real_area;
+void memcpy_real_init(void);
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count);
+int memcpy_real(void *dest, unsigned long src, size_t count);
+#ifdef CONFIG_CRASH_DUMP
+int copy_oldmem_kernel(void *dst, unsigned long src, size_t count);
+#endif
+
+#endif /* __ASM_S390_MACCESS_H */
diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 2542cbf7e2d1..08a8b96606d7 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,8 +4,6 @@
#ifndef __ASSEMBLY__
-static inline bool mem_encrypt_active(void) { return false; }
-
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bcfb6371086f..829d68e2c685 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -4,6 +4,7 @@
#include <linux/cpumask.h>
#include <linux/errno.h>
+#include <asm/asm-extable.h>
typedef struct {
spinlock_t lock;
@@ -16,6 +17,8 @@ typedef struct {
unsigned long asce;
unsigned long asce_limit;
unsigned long vdso_base;
+ /* The mmu context belongs to a secure guest. */
+ atomic_t protected_count;
/*
* The following bitfields need a down_write on the mm
* semaphore when they are written to. As they are only
@@ -32,8 +35,6 @@ typedef struct {
unsigned int uses_cmm:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
- /* The mmu context is for compat task */
- unsigned int compat_mm:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
@@ -41,18 +42,4 @@ typedef struct {
.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
-static inline int tprot(unsigned long addr)
-{
- int rc = -EFAULT;
-
- asm volatile(
- " tprot 0(%1),0\n"
- "0: ipm %0\n"
- " srl %0,28\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "+d" (rc) : "a" (addr) : "cc");
- return rc;
-}
-
#endif
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 8d04e6f3f796..2a38af5a00c2 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,17 +15,20 @@
#include <asm/ctl_reg.h>
#include <asm-generic/mm_hooks.h>
+#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ unsigned long asce_type, init_entry;
+
spin_lock_init(&mm->context.lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
+ atomic_set(&mm->context.protected_count, 0);
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
- mm->context.compat_mm = test_thread_flag(TIF_31BIT);
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste ||
test_thread_flag(TIF_PGSTE) ||
@@ -36,73 +39,62 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
- case _REGION2_SIZE:
+ default:
/*
- * forked 3-level task, fall through to set new asce with new
- * mm->pgd
+ * context created by exec, the value of asce_limit can
+ * only be zero in this case
*/
- case 0:
- /* context created by exec, set asce limit to 4TB */
- mm->context.asce_limit = STACK_TOP_MAX;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+ VM_BUG_ON(mm->context.asce_limit);
+ /* continue as 3-level task */
+ mm->context.asce_limit = _REGION2_SIZE;
+ fallthrough;
+ case _REGION2_SIZE:
+ /* forked 3-level task */
+ init_entry = _REGION3_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION3;
break;
- case -PAGE_SIZE:
- /* forked 5-level task, set new asce with new_mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+ case TASK_SIZE_MAX:
+ /* forked 5-level task */
+ init_entry = _REGION1_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION1;
break;
case _REGION1_SIZE:
- /* forked 4-level task, set new asce with new mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ /* forked 4-level task */
+ init_entry = _REGION2_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION2;
break;
- case _REGION3_SIZE:
- /* forked 2-level compat task, set new asce with new mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
}
- crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | asce_type;
+ crst_table_init((unsigned long *) mm->pgd, init_entry);
return 0;
}
-#define destroy_context(mm) do { } while (0)
-
-static inline void set_user_asce(struct mm_struct *mm)
+static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
{
- S390_lowcore.user_asce = mm->context.asce;
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
-}
+ int cpu = smp_processor_id();
-static inline void clear_user_asce(void)
-{
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
+ if (next == &init_mm)
+ S390_lowcore.user_asce = s390_invalid_asce;
+ else
+ S390_lowcore.user_asce = next->context.asce;
+ cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
+ /* Clear previous user-ASCE from CR7 */
+ __ctl_load(s390_invalid_asce, 7, 7);
+ if (prev != next)
+ cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
}
-
-mm_segment_t enable_sacf_uaccess(void);
-void disable_sacf_uaccess(mm_segment_t old_fs);
+#define switch_mm_irqs_off switch_mm_irqs_off
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- int cpu = smp_processor_id();
+ unsigned long flags;
- S390_lowcore.user_asce = next->context.asce;
- cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
- /* Clear previous user-ASCE from CR1 and CR7 */
- if (!test_cpu_flag(CIF_ASCE_PRIMARY)) {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- if (test_cpu_flag(CIF_ASCE_SECONDARY)) {
- __ctl_load(S390_lowcore.vdso_asce, 7, 7);
- clear_cpu_flag(CIF_ASCE_SECONDARY);
- }
- if (prev != next)
- cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
@@ -119,18 +111,18 @@ static inline void finish_arch_post_lock_switch(void)
__tlb_flush_mm_lazy(mm);
preempt_enable();
}
- set_fs(current->thread.mm_segment);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
}
-#define enter_lazy_tlb(mm,tsk) do { } while (0)
-#define deactivate_mm(tsk,mm) do { } while (0)
-
+#define activate_mm activate_mm
static inline void activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
switch_mm(prev, next, current);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
- set_user_asce(next);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
}
+#include <asm-generic/mmu_context.h>
+
#endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h
index e0a6d29846e2..9f1eea15872c 100644
--- a/arch/s390/include/asm/module.h
+++ b/arch/s390/include/asm/module.h
@@ -8,16 +8,14 @@
* This file contains the s390 architecture specific module code.
*/
-struct mod_arch_syminfo
-{
+struct mod_arch_syminfo {
unsigned long got_offset;
unsigned long plt_offset;
int got_initialized;
int plt_initialized;
};
-struct mod_arch_specific
-{
+struct mod_arch_specific {
/* Starting offset of got in the module core memory. */
unsigned long got_offset;
/* Starting offset of plt in the module core memory. */
@@ -30,6 +28,14 @@ struct mod_arch_specific
int nsyms;
/* Additional symbol information (got and plt offsets). */
struct mod_arch_syminfo *syminfo;
+#ifdef CONFIG_FUNCTION_TRACER
+ /* Start of memory reserved for ftrace hotpatch trampolines. */
+ struct ftrace_hotpatch_trampoline *trampolines_start;
+ /* End of memory reserved for ftrace hotpatch trampolines. */
+ struct ftrace_hotpatch_trampoline *trampolines_end;
+ /* Next unused ftrace hotpatch trampoline slot. */
+ struct ftrace_hotpatch_trampoline *next_trampoline;
+#endif /* CONFIG_FUNCTION_TRACER */
};
#endif /* _ASM_S390_MODULE_H */
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index b160da8fa14b..af1cd3a6f406 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -6,7 +6,6 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef _ASM_S390_NMI_H
@@ -23,12 +22,16 @@
#define MCCK_CODE_SYSTEM_DAMAGE BIT(63)
#define MCCK_CODE_EXT_DAMAGE BIT(63 - 5)
#define MCCK_CODE_CP BIT(63 - 9)
-#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
+#define MCCK_CODE_STG_ERROR BIT(63 - 16)
+#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18)
+#define MCCK_CODE_STG_DEGRAD BIT(63 - 19)
#define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20)
#define MCCK_CODE_PSW_IA_VALID BIT(63 - 23)
+#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24)
#define MCCK_CODE_CR_VALID BIT(63 - 29)
#define MCCK_CODE_GS_VALID BIT(63 - 36)
#define MCCK_CODE_FC_VALID BIT(63 - 43)
+#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
#ifndef __ASSEMBLY__
@@ -94,12 +97,13 @@ struct mcesa {
struct pt_regs;
-void nmi_alloc_boot_cpu(struct lowcore *lc);
-int nmi_alloc_per_cpu(struct lowcore *lc);
-void nmi_free_per_cpu(struct lowcore *lc);
+void nmi_alloc_mcesa_early(u64 *mcesad);
+int nmi_alloc_mcesa(u64 *mcesad);
+void nmi_free_mcesa(u64 *mcesad);
-void s390_handle_mcck(void);
-void s390_do_machine_check(struct pt_regs *regs);
+void s390_handle_mcck(struct pt_regs *regs);
+void __s390_handle_mcck(void);
+int s390_do_machine_check(struct pt_regs *regs);
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_NMI_H */
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index b4bd8c41e9d3..82725cf783c7 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -12,6 +12,11 @@ void nospec_init_branches(void);
void nospec_auto_detect(void);
void nospec_revert(s32 *start, s32 *end);
+/*
+ * True only when CC_USING_EXPOLINE is defined for this build and
+ * nospec_disable is zero; false otherwise.
+ */
+static inline bool nospec_uses_trampoline(void)
+{
+	if (!__is_defined(CC_USING_EXPOLINE))
+		return false;
+	return !nospec_disable;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 0033dcd663b1..7e9e99523e95 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -2,23 +2,24 @@
#ifndef _ASM_S390_NOSPEC_ASM_H
#define _ASM_S390_NOSPEC_ASM_H
-#include <asm/alternative-asm.h>
-#include <asm/asm-offsets.h>
#include <asm/dwarf.h>
#ifdef __ASSEMBLY__
#ifdef CC_USING_EXPOLINE
-_LC_BR_R1 = __LC_BR_R1
-
/*
* The expoline macros are used to create thunks in the same format
* as gcc generates them. The 'comdat' section flag makes sure that
* the various thunks are merged into a single copy.
*/
.macro __THUNK_PROLOG_NAME name
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .pushsection .text,"ax",@progbits
+ .align 16,0x07
+#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
+#endif
.globl \name
.hidden \name
.type \name,@function
@@ -26,167 +27,101 @@ _LC_BR_R1 = __LC_BR_R1
CFI_STARTPROC
.endm
- .macro __THUNK_EPILOG
+ .macro __THUNK_EPILOG_NAME name
CFI_ENDPROC
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .size \name, .-\name
+#endif
.popsection
.endm
- .macro __THUNK_PROLOG_BR r1,r2
- __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1
- .endm
-
- .macro __THUNK_PROLOG_BC d0,r1,r2
- __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1
+ .macro __THUNK_PROLOG_BR r1
+ __THUNK_PROLOG_NAME __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BR r1,r2
- jg __s390_indirect_jump_r\r2\()use_r\r1
+ .macro __THUNK_EPILOG_BR r1
+ __THUNK_EPILOG_NAME __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BC d0,r1,r2
- jg __s390_indirect_branch_\d0\()_\r2\()use_\r1
+ .macro __THUNK_BR r1
+ jg __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BRASL r1,r2,r3
- brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2
+ .macro __THUNK_BRASL r1,r2
+ brasl \r1,__s390_indirect_jump_r\r2
.endm
- .macro __DECODE_RR expand,reg,ruse
- .set __decode_fail,1
+ .macro __DECODE_R expand,reg
+ .set .L__decode_fail,1
.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \reg,%r\r1
- .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r2
- \expand \r1,\r2
- .set __decode_fail,0
- .endif
- .endr
+ \expand \r1
+ .set .L__decode_fail,0
.endif
.endr
- .if __decode_fail == 1
- .error "__DECODE_RR failed"
+ .if .L__decode_fail == 1
+ .error "__DECODE_R failed"
.endif
.endm
- .macro __DECODE_RRR expand,rsave,rtarget,ruse
- .set __decode_fail,1
+ .macro __DECODE_RR expand,rsave,rtarget
+ .set .L__decode_fail,1
.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \rsave,%r\r1
.irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \rtarget,%r\r2
- .irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r3
- \expand \r1,\r2,\r3
- .set __decode_fail,0
- .endif
- .endr
- .endif
- .endr
- .endif
- .endr
- .if __decode_fail == 1
- .error "__DECODE_RRR failed"
- .endif
- .endm
-
- .macro __DECODE_DRR expand,disp,reg,ruse
- .set __decode_fail,1
- .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \reg,%r\r1
- .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r2
- \expand \disp,\r1,\r2
- .set __decode_fail,0
+ \expand \r1,\r2
+ .set .L__decode_fail,0
.endif
.endr
.endif
.endr
- .if __decode_fail == 1
- .error "__DECODE_DRR failed"
+ .if .L__decode_fail == 1
+ .error "__DECODE_RR failed"
.endif
.endm
- .macro __THUNK_EX_BR reg,ruse
- # Be very careful when adding instructions to this macro!
- # The ALTERNATIVE replacement code has a .+10 which targets
- # the "br \reg" after the code has been patched.
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+ .macro __THUNK_EX_BR reg
exrl 0,555f
j .
-#else
- .ifc \reg,%r1
- ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35
- j .
- .else
- larl \ruse,555f
- ex 0,0(\ruse)
- j .
- .endif
-#endif
555: br \reg
.endm
- .macro __THUNK_EX_BC disp,reg,ruse
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
- exrl 0,556f
- j .
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .macro GEN_BR_THUNK reg
+ .endm
+ .macro GEN_BR_THUNK_EXTERN reg
#else
- larl \ruse,556f
- ex 0,0(\ruse)
- j .
+ .macro GEN_BR_THUNK reg
#endif
-556: b \disp(\reg)
- .endm
-
- .macro GEN_BR_THUNK reg,ruse=%r1
- __DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse
- __THUNK_EX_BR \reg,\ruse
- __THUNK_EPILOG
- .endm
-
- .macro GEN_B_THUNK disp,reg,ruse=%r1
- __DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse
- __THUNK_EX_BC \disp,\reg,\ruse
- __THUNK_EPILOG
+ __DECODE_R __THUNK_PROLOG_BR,\reg
+ __THUNK_EX_BR \reg
+ __DECODE_R __THUNK_EPILOG_BR,\reg
.endm
- .macro BR_EX reg,ruse=%r1
-557: __DECODE_RR __THUNK_BR,\reg,\ruse
+ .macro BR_EX reg
+557: __DECODE_R __THUNK_BR,\reg
.pushsection .s390_indirect_branches,"a",@progbits
.long 557b-.
.popsection
.endm
- .macro B_EX disp,reg,ruse=%r1
-558: __DECODE_DRR __THUNK_BC,\disp,\reg,\ruse
- .pushsection .s390_indirect_branches,"a",@progbits
- .long 558b-.
- .popsection
- .endm
-
- .macro BASR_EX rsave,rtarget,ruse=%r1
-559: __DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse
+ .macro BASR_EX rsave,rtarget
+559: __DECODE_RR __THUNK_BRASL,\rsave,\rtarget
.pushsection .s390_indirect_branches,"a",@progbits
.long 559b-.
.popsection
.endm
#else
- .macro GEN_BR_THUNK reg,ruse=%r1
- .endm
-
- .macro GEN_B_THUNK disp,reg,ruse=%r1
+ .macro GEN_BR_THUNK reg
.endm
- .macro BR_EX reg,ruse=%r1
+ .macro BR_EX reg
br \reg
.endm
- .macro B_EX disp,reg,ruse=%r1
- b \disp(\reg)
- .endm
-
- .macro BASR_EX rsave,rtarget,ruse=%r1
+ .macro BASR_EX rsave,rtarget
basr \rsave,\rtarget
.endm
#endif /* CC_USING_EXPOLINE */
diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h
index 35f8cbe7e5bb..23cd5d1b734b 100644
--- a/arch/s390/include/asm/numa.h
+++ b/arch/s390/include/asm/numa.h
@@ -13,24 +13,13 @@
#ifdef CONFIG_NUMA
#include <linux/numa.h>
-#include <linux/cpumask.h>
void numa_setup(void);
-int numa_pfn_to_nid(unsigned long pfn);
-int __node_distance(int a, int b);
-void numa_update_cpu_topology(void);
-
-extern cpumask_t node_to_cpumask_map[MAX_NUMNODES];
-extern int numa_debug_enabled;
#else
static inline void numa_setup(void) { }
-static inline void numa_update_cpu_topology(void) { }
-static inline int numa_pfn_to_nid(unsigned long pfn)
-{
- return 0;
-}
#endif /* CONFIG_NUMA */
+
#endif /* _ASM_S390_NUMA_H */
diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h
index 3c89279d2a4b..0d1c74a7a650 100644
--- a/arch/s390/include/asm/os_info.h
+++ b/arch/s390/include/asm/os_info.h
@@ -8,6 +8,8 @@
#ifndef _ASM_S390_OS_INFO_H
#define _ASM_S390_OS_INFO_H
+#include <linux/uio.h>
+
#define OS_INFO_VERSION_MAJOR 1
#define OS_INFO_VERSION_MINOR 1
#define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */
@@ -39,7 +41,6 @@ u32 os_info_csum(struct os_info *os_info);
#ifdef CONFIG_CRASH_DUMP
void *os_info_old_entry(int nr, unsigned long *size);
-int copy_oldmem_kernel(void *dst, void *src, size_t count);
#else
static inline void *os_info_old_entry(int nr, unsigned long *size)
{
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 1019efd85b9d..61dea67bb9c7 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -20,6 +20,8 @@
#define PAGE_SIZE _PAGE_SIZE
#define PAGE_MASK _PAGE_MASK
#define PAGE_DEFAULT_ACC 0
+/* storage-protection override */
+#define PAGE_SPO_ACC 9
#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4)
#define HPAGE_SHIFT 20
@@ -55,22 +57,25 @@ static inline void storage_key_init_range(unsigned long start, unsigned long end
*/
static inline void copy_page(void *to, void *from)
{
- register void *reg2 asm ("2") = to;
- register unsigned long reg3 asm ("3") = 0x1000;
- register void *reg4 asm ("4") = from;
- register unsigned long reg5 asm ("5") = 0xb0001000;
+ union register_pair dst, src;
+
+ dst.even = (unsigned long) to;
+ dst.odd = 0x1000;
+ src.even = (unsigned long) from;
+ src.odd = 0xb0001000;
+
asm volatile(
- " mvcl 2,4"
- : "+d" (reg2), "+d" (reg3), "+d" (reg4), "+d" (reg5)
+ " mvcl %[dst],%[src]"
+ : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair)
: : "memory", "cc");
}
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
/*
* These are used to make use of C type-checking..
@@ -87,11 +92,31 @@ typedef pte_t *pgtable_t;
#define pgprot_val(x) ((x).pgprot)
#define pgste_val(x) ((x).pgste)
-#define pte_val(x) ((x).pte)
-#define pmd_val(x) ((x).pmd)
-#define pud_val(x) ((x).pud)
-#define p4d_val(x) ((x).p4d)
-#define pgd_val(x) ((x).pgd)
+
+/* Return the raw value stored in the .pte member of @pte. */
+static inline unsigned long pte_val(pte_t pte)
+{
+	unsigned long raw = pte.pte;
+
+	return raw;
+}
+
+/* Return the raw value stored in the .pmd member of @pmd. */
+static inline unsigned long pmd_val(pmd_t pmd)
+{
+	unsigned long raw = pmd.pmd;
+
+	return raw;
+}
+
+/* Return the raw value stored in the .pud member of @pud. */
+static inline unsigned long pud_val(pud_t pud)
+{
+	unsigned long raw = pud.pud;
+
+	return raw;
+}
+
+/* Return the raw value stored in the .p4d member of @p4d. */
+static inline unsigned long p4d_val(p4d_t p4d)
+{
+	unsigned long raw = p4d.p4d;
+
+	return raw;
+}
+
+/* Return the raw value stored in the .pgd member of @pgd. */
+static inline unsigned long pgd_val(pgd_t pgd)
+{
+	unsigned long raw = pgd.pgd;
+
+	return raw;
+}
#define __pgste(x) ((pgste_t) { (x) } )
#define __pte(x) ((pte_t) { (x) } )
@@ -141,9 +166,6 @@ struct page;
void arch_free_page(struct page *page, int order);
void arch_alloc_page(struct page *page, int order);
void arch_set_page_dat(struct page *page, int order);
-void arch_set_page_nodat(struct page *page, int order);
-int arch_test_page_nodat(struct page *page);
-void arch_set_page_states(int make_stable);
static inline int devmem_is_allowed(unsigned long pfn)
{
@@ -153,6 +175,11 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define HAVE_ARCH_FREE_PAGE
#define HAVE_ARCH_ALLOC_PAGE
+#if IS_ENABLED(CONFIG_PGSTE)
+int arch_make_page_accessible(struct page *page);
+#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+#endif
+
#endif /* !__ASSEMBLY__ */
#define __PAGE_OFFSET 0x0UL
@@ -161,23 +188,22 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)(unsigned long)(x))
-#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT)
+#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
+
+#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
+#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
+
+#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn))
+#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr)))
#define pfn_to_kaddr(pfn) pfn_to_virt(pfn)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
-#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT)
-#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
-
-#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr))
-#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
-
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr))
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h
new file mode 100644
index 000000000000..1a8a6b15d121
--- /dev/null
+++ b/arch/s390/include/asm/pai.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Processor Activity Instrumentation support for cryptography counters
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#ifndef _ASM_S390_PAI_H
+#define _ASM_S390_PAI_H
+
+#include <linux/jump_label.h>
+#include <asm/lowcore.h>
+#include <asm/ptrace.h>
+
+struct qpaci_info_block { /* buffer handed to qpaci() as its info argument */
+ u64 header;
+ struct { /* bit-fields below add up to 64 bits (8 + 8 + 9 + 7 + 32) */
+ u64 : 8;
+ u64 num_cc : 8; /* # of supported crypto counters */
+ u64 : 9;
+ u64 num_nnpa : 7; /* # of supported NNPA counters */
+ u64 : 32;
+ };
+};
+
+static inline int qpaci(struct qpaci_info_block *info)
+{
+ /*
+  * Size of info (in double words minus one). The value is copied into
+  * register 0 before the instruction is issued and read back from
+  * register 0 afterwards, so the instruction may have replaced it.
+  */
+ size_t size = sizeof(*info) / sizeof(u64) - 1;
+ int cc;
+
+ asm volatile(
+ " lgr 0,%[size]\n" /* r0 = size */
+ " .insn s,0xb28f0000,%[info]\n" /* opcode 0xb28f on *info; presumably QPACI - confirm against the PoP */
+ " lgr %[size],0\n" /* size = r0 as left by the instruction */
+ " ipm %[cc]\n" /* fetch the condition code ... */
+ " srl %[cc],28\n" /* ... and shift it down into the low bits of cc */
+ : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size)
+ :
+ : "0", "cc", "memory");
+ return cc ? (size + 1) * sizeof(u64) : 0; /* 0 when cc == 0, else a byte count built from the r0 value */
+}
+
+#define PAI_CRYPTO_BASE 0x1000 /* First event number */
+#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */
+#define PAI_CRYPTO_KERNEL_OFFSET 2048
+#define PAI_NNPA_BASE 0x1800 /* First event number */
+#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */
+
+DECLARE_STATIC_KEY_FALSE(pai_key);
+
+/*
+ * Set the PAI_CRYPTO_KERNEL_OFFSET bits in the lowcore ccd field.
+ *
+ * Nothing is done unless CONFIG_PERF_EVENTS is enabled, the pai_key static
+ * key is on, lowcore ccd is non-zero and @regs describes user mode.
+ */
+static __always_inline void pai_kernel_enter(struct pt_regs *regs)
+{
+	if (IS_ENABLED(CONFIG_PERF_EVENTS) && static_branch_unlikely(&pai_key)) {
+		if (S390_lowcore.ccd && user_mode(regs))
+			WRITE_ONCE(S390_lowcore.ccd,
+				   S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET);
+	}
+}
+
+/*
+ * Clear the PAI_CRYPTO_KERNEL_OFFSET bits in the lowcore ccd field.
+ *
+ * Nothing is done unless CONFIG_PERF_EVENTS is enabled, the pai_key static
+ * key is on, lowcore ccd is non-zero and @regs describes user mode.
+ */
+static __always_inline void pai_kernel_exit(struct pt_regs *regs)
+{
+	if (IS_ENABLED(CONFIG_PERF_EVENTS) && static_branch_unlikely(&pai_key)) {
+		if (S390_lowcore.ccd && user_mode(regs))
+			WRITE_ONCE(S390_lowcore.ccd,
+				   S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET);
+	}
+}
+
+#endif
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index b05187ce5dbd..108e732d7b14 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -5,9 +5,10 @@
#include <linux/pci.h>
#include <linux/mutex.h>
#include <linux/iommu.h>
-#include <asm-generic/pci.h>
+#include <linux/pci_hotplug.h>
#include <asm/pci_clp.h>
#include <asm/pci_debug.h>
+#include <asm/pci_insn.h>
#include <asm/sclp.h>
#define PCIBIOS_MIN_IO 0x1000
@@ -21,10 +22,16 @@ int pci_domain_nr(struct pci_bus *);
int pci_proc_domain(struct pci_bus *);
#define ZPCI_BUS_NR 0 /* default bus number */
-#define ZPCI_DEVFN 0 /* default device number */
#define ZPCI_NR_DMA_SPACES 1
#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS
+#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16)
+
+/*
+ * Build-time check that ZPCI_NR_DEVICES (CONFIG_PCI_NR_FUNCTIONS) fits into
+ * the domain bitmap. The guard must test the Kconfig symbol CONFIG_PCI: a
+ * bare PCI macro is not generated by Kconfig, so guarding on it would mean
+ * the check is never evaluated.
+ */
+#ifdef CONFIG_PCI
+#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE)
+# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE
+#endif
+#endif /* CONFIG_PCI */
/* PCI Function Controls */
#define ZPCI_FC_FN_ENABLED 0x80
@@ -78,7 +85,6 @@ enum zpci_state {
ZPCI_FN_STATE_STANDBY = 0,
ZPCI_FN_STATE_CONFIGURED = 1,
ZPCI_FN_STATE_RESERVED = 2,
- ZPCI_FN_STATE_ONLINE = 3,
};
struct zpci_bar_struct {
@@ -91,20 +97,48 @@ struct zpci_bar_struct {
};
struct s390_domain;
+struct kvm_zdev;
+
+#define ZPCI_FUNCTIONS_PER_BUS 256 /* number of entries in zpci_bus.function[] */
+struct zpci_bus { /* per-bus data; struct zpci_dev refers to it through its zbus member */
+ struct kref kref;
+ struct pci_bus *bus;
+ struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; /* presumably indexed by devfn - TODO confirm */
+ struct list_head resources;
+ struct list_head bus_next;
+ struct resource bus_resource;
+ int pchid;
+ int domain_nr;
+ bool multifunction;
+ enum pci_bus_speed max_bus_speed;
+};
/* Private data per function */
struct zpci_dev {
- struct pci_bus *bus;
+ struct zpci_bus *zbus;
struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */
+ struct kref kref;
+ struct hotplug_slot hotplug_slot;
enum zpci_state state;
u32 fid; /* function ID, used by sclp */
u32 fh; /* function handle, used by insn's */
+ u32 gisa; /* GISA designation for passthrough */
u16 vfn; /* virtual function number */
u16 pchid; /* physical channel ID */
+ u16 maxstbl; /* Maximum store block size */
u8 pfgid; /* function group ID */
u8 pft; /* pci function type */
- u16 domain;
+ u8 port;
+ u8 dtsm; /* Supported DT mask */
+ u8 rid_available : 1;
+ u8 has_hp_slot : 1;
+ u8 has_resources : 1;
+ u8 is_physfn : 1;
+ u8 util_str_avail : 1;
+ u8 irqs_registered : 1;
+ u8 reserved : 2;
+ unsigned int devfn; /* DEVFN part of the RID*/
struct mutex lock;
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
@@ -150,11 +184,15 @@ struct zpci_dev {
atomic64_t mapped_pages;
atomic64_t unmapped_pages;
+ u8 version;
enum pci_bus_speed max_bus_speed;
struct dentry *debugfs_dev;
+ /* IOMMU and passthrough */
struct s390_domain *s390_domain; /* s390 IOMMU domain data */
+ struct kvm_zdev *kzdev;
+ struct mutex kzdev_lock;
};
static inline bool zdev_enabled(struct zpci_dev *zdev)
@@ -164,27 +202,40 @@ static inline bool zdev_enabled(struct zpci_dev *zdev)
extern const struct attribute_group *zpci_attr_groups[];
extern unsigned int s390_pci_force_floating __initdata;
+extern unsigned int s390_pci_no_rid;
+
+extern union zpci_sic_iib *zpci_aipb;
+extern struct airq_iv *zpci_aif_sbv;
/* -----------------------------------------------------------------------------
Prototypes
----------------------------------------------------------------------------- */
/* Base stuff */
-int zpci_create_device(struct zpci_dev *);
-void zpci_remove_device(struct zpci_dev *zdev);
+struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
int zpci_enable_device(struct zpci_dev *);
int zpci_disable_device(struct zpci_dev *);
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
+int zpci_deconfigure_device(struct zpci_dev *zdev);
+void zpci_device_reserved(struct zpci_dev *zdev);
+bool zpci_is_device_configured(struct zpci_dev *zdev);
+
+int zpci_hot_reset_device(struct zpci_dev *zdev);
int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64);
int zpci_unregister_ioat(struct zpci_dev *, u8);
void zpci_remove_reserved_devices(void);
+void zpci_update_fh(struct zpci_dev *zdev, u32 fh);
/* CLP */
+int clp_setup_writeback_mio(void);
int clp_scan_pci_devices(void);
-int clp_rescan_pci_devices(void);
-int clp_rescan_pci_devices_simple(u32 *fid);
-int clp_add_pci_device(u32, u32, int);
-int clp_enable_fh(struct zpci_dev *, u8);
-int clp_disable_fh(struct zpci_dev *);
+int clp_query_pci_fn(struct zpci_dev *zdev);
+int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as);
+int clp_disable_fh(struct zpci_dev *zdev, u32 *fh);
int clp_get_state(u32 fid, enum zpci_state *state);
+int clp_refresh_fh(u32 fid, u32 *fh);
+
+/* UID */
+void update_uid_checking(bool new);
/* IOMMU Interface */
int zpci_init_iommu(struct zpci_dev *zdev);
@@ -199,12 +250,10 @@ static inline bool zpci_use_mio(struct zpci_dev *zdev)
/* Error handling and recovery */
void zpci_event_error(void *);
void zpci_event_availability(void *);
-void zpci_rescan(void);
bool zpci_is_enabled(void);
#else /* CONFIG_PCI */
static inline void zpci_event_error(void *e) {}
static inline void zpci_event_availability(void *e) {}
-static inline void zpci_rescan(void) {}
#endif /* CONFIG_PCI */
#ifdef CONFIG_HOTPLUG_PCI_S390
@@ -221,7 +270,14 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {}
/* Helpers */
static inline struct zpci_dev *to_zpci(struct pci_dev *pdev)
{
- return pdev->sysdata;
+ struct zpci_bus *zbus = pdev->sysdata;
+
+ return zbus->function[pdev->devfn];
+}
+
+static inline struct zpci_dev *to_zpci_dev(struct device *dev)
+{
+ return to_zpci(to_pci_dev(dev));
}
struct zpci_dev *get_zdev_by_fid(u32);
@@ -229,7 +285,10 @@ struct zpci_dev *get_zdev_by_fid(u32);
/* DMA */
int zpci_dma_init(void);
void zpci_dma_exit(void);
+int zpci_dma_init_device(struct zpci_dev *zdev);
+int zpci_dma_exit_device(struct zpci_dev *zdev);
+/* IRQ */
int __init zpci_irq_init(void);
void __init zpci_irq_exit(void);
@@ -242,10 +301,11 @@ int zpci_debug_init(void);
void zpci_debug_exit(void);
void zpci_debug_init_device(struct zpci_dev *, const char *);
void zpci_debug_exit_device(struct zpci_dev *);
-void zpci_debug_info(struct zpci_dev *, struct seq_file *);
-/* Error reporting */
+/* Error handling */
int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
+int zpci_clear_error_state(struct zpci_dev *zdev);
+int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
#ifdef CONFIG_NUMA
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index bd2cb4ea7d93..d6189ed14f84 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -7,6 +7,7 @@
/*
* Call Logical Processor - Command Codes
*/
+#define CLP_SLPC 0x0001
#define CLP_LIST_PCI 0x0002
#define CLP_QUERY_PCI_FN 0x0003
#define CLP_QUERY_PCI_FNGRP 0x0004
@@ -51,6 +52,19 @@ struct clp_fh_list_entry {
extern bool zpci_unique_uid;
+struct clp_rsp_slpc_pci {
+ struct clp_rsp_hdr hdr;
+ u32 reserved2[4];
+ u32 lpif[8];
+ u32 reserved3[4];
+ u32 vwb : 1;
+ u32 : 1;
+ u32 mio_wb : 6;
+ u32 : 24;
+ u32 reserved5[3];
+ u32 lpic[8];
+} __packed;
+
/* List PCI functions request */
struct clp_req_list_pci {
struct clp_req_hdr hdr;
@@ -93,7 +107,10 @@ struct clp_req_query_pci {
struct clp_rsp_query_pci {
struct clp_rsp_hdr hdr;
u16 vfn; /* virtual fn number */
- u16 : 6;
+ u16 : 3;
+ u16 rid_avail : 1;
+ u16 is_physfn : 1;
+ u16 reserved1 : 1;
u16 mio_addr_avail : 1;
u16 util_str_avail : 1; /* utility string available? */
u16 pfgid : 8; /* pci function group id */
@@ -102,12 +119,16 @@ struct clp_rsp_query_pci {
u16 pchid;
__le32 bar[PCI_STD_NUM_BARS];
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
- u32 : 16;
+ u16 : 12;
+ u16 port : 4;
u8 fmb_len;
u8 pft; /* pci function type */
u64 sdma; /* start dma as */
u64 edma; /* end dma as */
- u32 reserved[11];
+#define ZPCI_RID_MASK_DEVFN 0x00ff
+ u16 rid; /* BUS/DEVFN PCI address */
+ u16 reserved0;
+ u32 reserved[10];
u32 uid; /* user defined id */
u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */
u32 reserved2[16];
@@ -132,9 +153,11 @@ struct clp_rsp_query_pci_grp {
u8 : 6;
u8 frame : 1;
u8 refresh : 1; /* TLB refresh mode */
- u16 reserved2;
+ u16 : 3;
+ u16 maxstbl : 13; /* Maximum store block size */
u16 mui;
- u16 : 16;
+ u8 dtsm; /* Supported DT mask */
+ u8 reserved3;
u16 maxfaal;
u16 : 4;
u16 dnoi : 12;
@@ -152,7 +175,8 @@ struct clp_req_set_pci {
u16 reserved2;
u8 oc; /* operation controls */
u8 ndas; /* number of dma spaces */
- u64 reserved3;
+ u32 reserved3;
+ u32 gisa; /* GISA designation */
} __packed;
/* Set PCI function response */
@@ -165,6 +189,11 @@ struct clp_rsp_set_pci {
} __packed;
/* Combined request/response block structures used by clp insn */
+struct clp_req_rsp_slpc_pci {
+ struct clp_req_slpc request;
+ struct clp_rsp_slpc_pci response;
+} __packed;
+
struct clp_req_rsp_list_pci {
struct clp_req_list_pci request;
struct clp_rsp_list_pci response;
diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h
index 5dfe47588277..3bb4e7e33a0e 100644
--- a/arch/s390/include/asm/pci_debug.h
+++ b/arch/s390/include/asm/pci_debug.h
@@ -17,9 +17,14 @@ extern debug_info_t *pci_debug_err_id;
debug_text_event(pci_debug_err_id, 0, debug_buffer); \
} while (0)
+static inline void zpci_err_hex_level(int level, void *addr, int len)
+{
+ debug_event(pci_debug_err_id, level, addr, len);
+}
+
static inline void zpci_err_hex(void *addr, int len)
{
- debug_event(pci_debug_err_id, 0, addr, len);
+ zpci_err_hex_level(0, addr, len);
}
#endif
diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index 419fac7a62c0..91e63426bdc5 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -97,23 +97,23 @@ static inline unsigned int calc_px(dma_addr_t ptr)
return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK;
}
-static inline void set_pt_pfaa(unsigned long *entry, void *pfaa)
+static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa)
{
*entry &= ZPCI_PTE_FLAG_MASK;
- *entry |= ((unsigned long) pfaa & ZPCI_PTE_ADDR_MASK);
+ *entry |= (pfaa & ZPCI_PTE_ADDR_MASK);
}
-static inline void set_rt_sto(unsigned long *entry, void *sto)
+static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto)
{
*entry &= ZPCI_RTE_FLAG_MASK;
- *entry |= ((unsigned long) sto & ZPCI_RTE_ADDR_MASK);
+ *entry |= (sto & ZPCI_RTE_ADDR_MASK);
*entry |= ZPCI_TABLE_TYPE_RTX;
}
-static inline void set_st_pto(unsigned long *entry, void *pto)
+static inline void set_st_pto(unsigned long *entry, phys_addr_t pto)
{
*entry &= ZPCI_STE_FLAG_MASK;
- *entry |= ((unsigned long) pto & ZPCI_STE_ADDR_MASK);
+ *entry |= (pto & ZPCI_STE_ADDR_MASK);
*entry |= ZPCI_TABLE_TYPE_SX;
}
@@ -131,12 +131,6 @@ static inline void validate_st_entry(unsigned long *entry)
*entry |= ZPCI_TABLE_VALID;
}
-static inline void invalidate_table_entry(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_VALID_MASK;
- *entry |= ZPCI_TABLE_INVALID;
-}
-
static inline void invalidate_pt_entry(unsigned long *entry)
{
WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID);
@@ -173,33 +167,29 @@ static inline int pt_entry_isvalid(unsigned long entry)
return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
}
-static inline int entry_isprotected(unsigned long entry)
-{
- return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED;
-}
-
static inline unsigned long *get_rt_sto(unsigned long entry)
{
- return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
- ? (unsigned long *) (entry & ZPCI_RTE_ADDR_MASK)
- : NULL;
+ if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
+ return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
+ else
+ return NULL;
+
}
static inline unsigned long *get_st_pto(unsigned long entry)
{
- return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX)
- ? (unsigned long *) (entry & ZPCI_STE_ADDR_MASK)
- : NULL;
+ if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX)
+ return phys_to_virt(entry & ZPCI_STE_ADDR_MASK);
+ else
+ return NULL;
}
/* Prototypes */
-int zpci_dma_init_device(struct zpci_dev *);
-void zpci_dma_exit_device(struct zpci_dev *);
void dma_free_seg_table(unsigned long);
unsigned long *dma_alloc_cpu_table(void);
void dma_cleanup_tables(unsigned long *);
unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr);
-void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags);
+void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags);
extern const struct dma_map_ops s390_pci_dma_ops;
diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h
index 61cf9531f68f..e5f57cfe1d45 100644
--- a/arch/s390/include/asm/pci_insn.h
+++ b/arch/s390/include/asm/pci_insn.h
@@ -98,6 +98,15 @@ struct zpci_fib {
u32 gd;
} __packed __aligned(8);
+/* Set Interruption Controls Operation Controls */
+#define SIC_IRQ_MODE_ALL 0
+#define SIC_IRQ_MODE_SINGLE 1
+#define SIC_SET_AENI_CONTROLS 2
+#define SIC_IRQ_MODE_DIRECT 4
+#define SIC_IRQ_MODE_D_ALL 16
+#define SIC_IRQ_MODE_D_SINGLE 17
+#define SIC_IRQ_MODE_SET_CPU 18
+
/* directed interruption information block */
struct zpci_diib {
u32 : 1;
@@ -119,9 +128,20 @@ struct zpci_cdiib {
u64 : 64;
} __packed __aligned(8);
+/* adapter interruption parameters block */
+struct zpci_aipb {
+ u64 faisb;
+ u64 gait;
+ u16 : 13;
+ u16 afi : 3;
+ u32 : 32;
+ u16 faal;
+} __packed __aligned(8);
+
union zpci_sic_iib {
struct zpci_diib diib;
struct zpci_cdiib cdiib;
+ struct zpci_aipb aipb;
};
DECLARE_STATIC_KEY_FALSE(have_mio);
@@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset);
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len);
int __zpci_store_block(const u64 *data, u64 req, u64 offset);
void zpci_barrier(void);
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
-
-static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc)
-{
- union zpci_sic_iib iib = {{0}};
-
- return __zpci_set_irq_ctrl(ctl, isc, &iib);
-}
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
#endif
diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h
index cd060b5dd8fd..287bb88f7698 100644
--- a/arch/s390/include/asm/pci_io.h
+++ b/arch/s390/include/asm/pci_io.h
@@ -8,14 +8,19 @@
#include <linux/slab.h>
#include <asm/pci_insn.h>
+/* I/O size constraints */
+#define ZPCI_MAX_READ_SIZE 8
+#define ZPCI_MAX_WRITE_SIZE 128
+
/* I/O Map */
#define ZPCI_IOMAP_SHIFT 48
-#define ZPCI_IOMAP_ADDR_BASE 0x8000000000000000UL
+#define ZPCI_IOMAP_ADDR_SHIFT 62
+#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT)
#define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1)
#define ZPCI_IOMAP_MAX_ENTRIES \
- ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT))
+ (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT))
#define ZPCI_IOMAP_ADDR_IDX_MASK \
- (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE)
+ ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK)
struct zpci_iomap_entry {
u32 fh;
@@ -140,7 +145,8 @@ static inline int zpci_memcpy_fromio(void *dst,
while (n > 0) {
size = zpci_get_max_write_size((u64 __force) src,
- (u64) dst, n, 8);
+ (u64) dst, n,
+ ZPCI_MAX_READ_SIZE);
rc = zpci_read_single(dst, src, size);
if (rc)
break;
@@ -161,7 +167,8 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst,
while (n > 0) {
size = zpci_get_max_write_size((u64 __force) dst,
- (u64) src, n, 128);
+ (u64) src, n,
+ ZPCI_MAX_WRITE_SIZE);
if (size > 8) /* main path */
rc = zpci_write_block(dst, src, size);
else
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 50b4ce8cddfd..cb5fc0690435 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -29,7 +29,7 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ old__, new__, prev__; \
pcp_op_T__ *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
prev__ = *ptr__; \
do { \
@@ -37,7 +37,7 @@
new__ = old__ op (val); \
prev__ = cmpxchg(ptr__, old__, new__); \
} while (prev__ != old__); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
new__; \
})
@@ -68,7 +68,7 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
if (__builtin_constant_p(val__) && \
((szcast)val__ > -129) && ((szcast)val__ < 128)) { \
@@ -84,7 +84,7 @@
: [val__] "d" (val__) \
: "cc"); \
} \
- preempt_enable(); \
+ preempt_enable_notrace(); \
}
#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int)
@@ -95,14 +95,14 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]\n" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
old__ + val__; \
})
@@ -114,14 +114,14 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]\n" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
}
#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan")
@@ -136,10 +136,10 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ ret__; \
pcp_op_T__ *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = cmpxchg(ptr__, oval, nval); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
ret__; \
})
@@ -152,10 +152,10 @@
({ \
typeof(pcp) *ptr__; \
typeof(pcp) ret__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = xchg(ptr__, nval); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
ret__; \
})
@@ -164,19 +164,20 @@
#define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval)
-#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \
-({ \
- typeof(pcp1) o1__ = (o1), n1__ = (n1); \
- typeof(pcp2) o2__ = (o2), n2__ = (n2); \
- typeof(pcp1) *p1__; \
- typeof(pcp2) *p2__; \
- int ret__; \
- preempt_disable(); \
- p1__ = raw_cpu_ptr(&(pcp1)); \
- p2__ = raw_cpu_ptr(&(pcp2)); \
- ret__ = __cmpxchg_double(p1__, p2__, o1__, o2__, n1__, n2__); \
- preempt_enable(); \
- ret__; \
+#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \
+({ \
+ typeof(pcp1) *p1__; \
+ typeof(pcp2) *p2__; \
+ int ret__; \
+ \
+ preempt_disable_notrace(); \
+ p1__ = raw_cpu_ptr(&(pcp1)); \
+ p2__ = raw_cpu_ptr(&(pcp2)); \
+ ret__ = __cmpxchg_double((unsigned long)p1__, (unsigned long)p2__, \
+ (unsigned long)(o1), (unsigned long)(o2), \
+ (unsigned long)(n1), (unsigned long)(n2)); \
+ preempt_enable_notrace(); \
+ ret__; \
})
#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 77606c4acd58..17eb618f1348 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -34,19 +34,21 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry)
memset64((u64 *)crst, entry, _CRST_ENTRIES);
}
-static inline unsigned long pgd_entry_type(struct mm_struct *mm)
+int crst_table_upgrade(struct mm_struct *mm, unsigned long limit);
+
+static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
{
- if (mm_pmd_folded(mm))
- return _SEGMENT_ENTRY_EMPTY;
- if (mm_pud_folded(mm))
- return _REGION3_ENTRY_EMPTY;
- if (mm_p4d_folded(mm))
- return _REGION2_ENTRY_EMPTY;
- return _REGION1_ENTRY_EMPTY;
-}
+ int rc;
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit);
-void crst_table_downgrade(struct mm_struct *);
+ if (addr + len > mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
+ rc = crst_table_upgrade(mm, addr + len);
+ if (rc)
+ return (unsigned long) rc;
+ }
+ return addr;
+}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address)
{
@@ -101,53 +103,37 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
- pgd_val(*pgd) = _REGION1_ENTRY | __pa(p4d);
+ set_pgd(pgd, __pgd(_REGION1_ENTRY | __pa(p4d)));
}
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
- p4d_val(*p4d) = _REGION2_ENTRY | __pa(pud);
+ set_p4d(p4d, __p4d(_REGION2_ENTRY | __pa(pud)));
}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- pud_val(*pud) = _REGION3_ENTRY | __pa(pmd);
+ set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd)));
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- unsigned long *table = crst_table_alloc(mm);
-
- if (!table)
- return NULL;
- if (mm->context.asce_limit == _REGION3_SIZE) {
- /* Forking a compat process with 2 page table levels */
- if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
- crst_table_free(mm, table);
- return NULL;
- }
- }
- return (pgd_t *) table;
+ return (pgd_t *) crst_table_alloc(mm);
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- if (mm->context.asce_limit == _REGION3_SIZE)
- pgtable_pmd_page_dtor(virt_to_page(pgd));
crst_table_free(mm, (unsigned long *) pgd);
}
static inline void pmd_populate(struct mm_struct *mm,
pmd_t *pmd, pgtable_t pte)
{
- pmd_val(*pmd) = _SEGMENT_ENTRY + __pa(pte);
+ set_pmd(pmd, __pmd(_SEGMENT_ENTRY | __pa(pte)));
}
#define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte)
-#define pmd_pgtable(pmd) \
- (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)
-
/*
* page table entry allocation/free routines.
*/
@@ -157,8 +143,6 @@ static inline void pmd_populate(struct mm_struct *mm,
#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
-extern void rcu_table_freelist_finish(void);
-
void vmem_map_init(void);
void *vmem_crst_alloc(unsigned long val);
pte_t *vmem_pte_alloc(void);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 6d7c3b7e9281..f1cb9391190d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -17,11 +17,14 @@
#include <linux/page-flags.h>
#include <linux/radix-tree.h>
#include <linux/atomic.h>
+#include <asm/sections.h>
#include <asm/bug.h>
#include <asm/page.h>
+#include <asm/uv.h>
extern pgd_t swapper_pg_dir[];
extern void paging_init(void);
+extern unsigned long s390_invalid_asce;
enum {
PG_DIRECT_MAP_4K = 0,
@@ -63,36 +66,35 @@ extern unsigned long zero_page_mask;
/* TODO: s390 cannot support io_remap_pfn_range... */
-#define FIRST_USER_ADDRESS 0UL
-
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
+ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e))
+ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_ERROR(e) \
- printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e))
+ pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
#define p4d_ERROR(e) \
- printk("%s:%d: bad p4d %p.\n", __FILE__, __LINE__, (void *) p4d_val(e))
+ pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e))
#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e))
+ pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
/*
* The vmalloc and module area will always be on the topmost area of the
- * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules.
- * On 64 bit kernels we have a 2GB area at the top of the vmalloc area where
- * modules will reside. That makes sure that inter module branches always
- * happen without trampolines and in addition the placement within a 2GB frame
- * is branch prediction unit friendly.
+ * kernel mapping. 512GB are reserved for vmalloc by default.
+ * At the top of the vmalloc area a 2GB area is reserved where modules
+ * will reside. That makes sure that inter module branches always
+ * happen without trampolines and in addition the placement within a
+ * 2GB frame is branch prediction unit friendly.
*/
-extern unsigned long VMALLOC_START;
-extern unsigned long VMALLOC_END;
-#define VMALLOC_DEFAULT_SIZE ((128UL << 30) - MODULES_LEN)
-extern struct page *vmemmap;
+extern unsigned long __bootdata_preserved(VMALLOC_START);
+extern unsigned long __bootdata_preserved(VMALLOC_END);
+#define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN)
+extern struct page *__bootdata_preserved(vmemmap);
+extern unsigned long __bootdata_preserved(vmemmap_size);
#define VMEM_MAX_PHYS ((unsigned long) vmemmap)
-extern unsigned long MODULES_VADDR;
-extern unsigned long MODULES_END;
+extern unsigned long __bootdata_preserved(MODULES_VADDR);
+extern unsigned long __bootdata_preserved(MODULES_END);
#define MODULES_VADDR MODULES_VADDR
#define MODULES_END MODULES_END
#define MODULES_LEN (1UL << 31)
@@ -179,6 +181,8 @@ static inline int is_module_addr(void *addr)
#define _PAGE_SOFT_DIRTY 0x000
#endif
+#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */
+
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
_PAGE_YOUNG | _PAGE_SOFT_DIRTY)
@@ -341,8 +345,6 @@ static inline int is_module_addr(void *addr)
#define PTRS_PER_P4D _CRST_ENTRIES
#define PTRS_PER_PGD _CRST_ENTRIES
-#define MAX_PTRS_PER_P4D PTRS_PER_P4D
-
/*
* Segment table and region3 table entry encoding
* (R = read-only, I = invalid, y = young bit):
@@ -422,23 +424,6 @@ static inline int is_module_addr(void *addr)
* implies read permission.
*/
/*xwr*/
-#define __P000 PAGE_NONE
-#define __P001 PAGE_RO
-#define __P010 PAGE_RO
-#define __P011 PAGE_RO
-#define __P100 PAGE_RX
-#define __P101 PAGE_RX
-#define __P110 PAGE_RX
-#define __P111 PAGE_RX
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_RO
-#define __S010 PAGE_RW
-#define __S011 PAGE_RW
-#define __S100 PAGE_RX
-#define __S101 PAGE_RX
-#define __S110 PAGE_RWX
-#define __S111 PAGE_RWX
/*
* Segment entry (large page) protection definitions.
@@ -520,6 +505,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
}
+static inline int mm_is_protected(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ if (unlikely(atomic_read(&mm->context.protected_count)))
+ return 1;
+#endif
+ return 0;
+}
+
static inline int mm_alloc_pgste(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
@@ -529,6 +523,36 @@ static inline int mm_alloc_pgste(struct mm_struct *mm)
return 0;
}
+static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
+{
+ return __pte(pte_val(pte) & ~pgprot_val(prot));
+}
+
+static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
+{
+ return __pte(pte_val(pte) | pgprot_val(prot));
+}
+
+static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ return __pmd(pmd_val(pmd) & ~pgprot_val(prot));
+}
+
+static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ return __pmd(pmd_val(pmd) | pgprot_val(prot));
+}
+
+static inline pud_t clear_pud_bit(pud_t pud, pgprot_t prot)
+{
+ return __pud(pud_val(pud) & ~pgprot_val(prot));
+}
+
+static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
+{
+ return __pud(pud_val(pud) | pgprot_val(prot));
+}
+
/*
* In the case that a guest uses storage keys
* faults should no longer be backed by zero pages
@@ -545,27 +569,25 @@ static inline int mm_uses_skeys(struct mm_struct *mm)
static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
+ union register_pair r1 = { .even = old, .odd = new, };
unsigned long address = (unsigned long)ptr | 1;
asm volatile(
- " csp %0,%3"
- : "+d" (reg2), "+m" (*ptr)
- : "d" (reg3), "d" (address)
+ " csp %[r1],%[address]"
+ : [r1] "+&d" (r1.pair), "+m" (*ptr)
+ : [address] "d" (address)
: "cc");
}
static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
+ union register_pair r1 = { .even = old, .odd = new, };
unsigned long address = (unsigned long)ptr | 1;
asm volatile(
- " .insn rre,0xb98a0000,%0,%3"
- : "+d" (reg2), "+m" (*ptr)
- : "d" (reg3), "d" (address)
+ " cspg %[r1],%[address]"
+ : [r1] "+&d" (r1.pair), "+m" (*ptr)
+ : [address] "d" (address)
: "cc");
}
@@ -576,17 +598,15 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new
#define CRDTE_DTT_REGION1 0x1cUL
static inline void crdte(unsigned long old, unsigned long new,
- unsigned long table, unsigned long dtt,
+ unsigned long *table, unsigned long dtt,
unsigned long address, unsigned long asce)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
- register unsigned long reg4 asm("4") = table | dtt;
- register unsigned long reg5 asm("5") = address;
+ union register_pair r1 = { .even = old, .odd = new, };
+ union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, };
- asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
- : "+d" (reg2)
- : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
+ asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0"
+ : [r1] "+&d" (r1.pair)
+ : [r2] "d" (r2.pair), [asce] "a" (asce)
: "memory", "cc");
}
@@ -681,16 +701,6 @@ static inline int pud_large(pud_t pud)
return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
}
-static inline unsigned long pud_pfn(pud_t pud)
-{
- unsigned long origin_mask;
-
- origin_mask = _REGION_ENTRY_ORIGIN;
- if (pud_large(pud))
- origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
- return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
-}
-
#define pmd_leaf pmd_large
static inline int pmd_large(pmd_t pmd)
{
@@ -736,16 +746,6 @@ static inline int pmd_none(pmd_t pmd)
return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
}
-static inline unsigned long pmd_pfn(pmd_t pmd)
-{
- unsigned long origin_mask;
-
- origin_mask = _SEGMENT_ENTRY_ORIGIN;
- if (pmd_large(pmd))
- origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
- return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
-}
-
#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
@@ -811,6 +811,22 @@ static inline int pmd_protnone(pmd_t pmd)
}
#endif
+#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
+static inline int pte_swp_exclusive(pte_t pte)
+{
+ return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
+}
+
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
+}
+
static inline int pte_soft_dirty(pte_t pte)
{
return pte_val(pte) & _PAGE_SOFT_DIRTY;
@@ -819,15 +835,13 @@ static inline int pte_soft_dirty(pte_t pte)
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
- pte_val(pte) |= _PAGE_SOFT_DIRTY;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY));
}
#define pte_swp_mksoft_dirty pte_mksoft_dirty
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
- return pte;
+ return clear_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY));
}
#define pte_swp_clear_soft_dirty pte_clear_soft_dirty
@@ -838,14 +852,12 @@ static inline int pmd_soft_dirty(pmd_t pmd)
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY;
- return pmd;
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY));
}
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY;
- return pmd;
+ return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY));
}
/*
@@ -874,35 +886,79 @@ static inline int pte_unused(pte_t pte)
}
/*
+ * Extract the pgprot value from the given pte while at the same time making it
+ * usable for kernel address space mappings where fault driven dirty and
+ * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID
+ * must not be set.
+ */
+static inline pgprot_t pte_pgprot(pte_t pte)
+{
+ unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK;
+
+ if (pte_write(pte))
+ pte_flags |= pgprot_val(PAGE_KERNEL);
+ else
+ pte_flags |= pgprot_val(PAGE_KERNEL_RO);
+ pte_flags |= pte_val(pte) & mio_wb_bit_mask;
+
+ return __pgprot(pte_flags);
+}
+
+/*
* pgd/pmd/pte modification functions
*/
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ WRITE_ONCE(*pgdp, pgd);
+}
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ WRITE_ONCE(*p4dp, p4d);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pud)
+{
+ WRITE_ONCE(*pudp, pud);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ WRITE_ONCE(*pmdp, pmd);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ WRITE_ONCE(*ptep, pte);
+}
+
static inline void pgd_clear(pgd_t *pgd)
{
if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1)
- pgd_val(*pgd) = _REGION1_ENTRY_EMPTY;
+ set_pgd(pgd, __pgd(_REGION1_ENTRY_EMPTY));
}
static inline void p4d_clear(p4d_t *p4d)
{
if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
- p4d_val(*p4d) = _REGION2_ENTRY_EMPTY;
+ set_p4d(p4d, __p4d(_REGION2_ENTRY_EMPTY));
}
static inline void pud_clear(pud_t *pud)
{
if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- pud_val(*pud) = _REGION3_ENTRY_EMPTY;
+ set_pud(pud, __pud(_REGION3_ENTRY_EMPTY));
}
static inline void pmd_clear(pmd_t *pmdp)
{
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
}
/*
@@ -911,79 +967,74 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt
*/
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pte_val(pte) &= _PAGE_CHG_MASK;
- pte_val(pte) |= pgprot_val(newprot);
+ pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK));
+ pte = set_pte_bit(pte, newprot);
/*
* newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX
* has the invalid bit set, clear it again for readable, young pages
*/
if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ))
- pte_val(pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID));
/*
* newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page
* protection bit set, clear it again for writable, dirty pages
*/
if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_WRITE;
- pte_val(pte) |= _PAGE_PROTECT;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_WRITE));
+ return set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- pte_val(pte) |= _PAGE_WRITE;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE));
if (pte_val(pte) & _PAGE_DIRTY)
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_mkclean(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_DIRTY;
- pte_val(pte) |= _PAGE_PROTECT;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_DIRTY));
+ return set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_DIRTY | _PAGE_SOFT_DIRTY));
if (pte_val(pte) & _PAGE_WRITE)
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_mkold(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_YOUNG;
- pte_val(pte) |= _PAGE_INVALID;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_YOUNG));
+ return set_pte_bit(pte, __pgprot(_PAGE_INVALID));
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- pte_val(pte) |= _PAGE_YOUNG;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_YOUNG));
if (pte_val(pte) & _PAGE_READ)
- pte_val(pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID));
return pte;
}
static inline pte_t pte_mkspecial(pte_t pte)
{
- pte_val(pte) |= _PAGE_SPECIAL;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_SPECIAL));
}
#ifdef CONFIG_HUGETLB_PAGE
static inline pte_t pte_mkhuge(pte_t pte)
{
- pte_val(pte) |= _PAGE_LARGE;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_LARGE));
}
#endif
@@ -997,12 +1048,12 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
unsigned long opt, unsigned long asce,
int local)
{
- unsigned long pto = (unsigned long) ptep;
+ unsigned long pto = __pa(ptep);
if (__builtin_constant_p(opt) && opt == 0) {
/* Invalidation + TLB flush for the pte */
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]"
+ " ipte %[r1],%[r2],0,%[m4]"
: "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address),
[m4] "i" (local));
return;
@@ -1011,7 +1062,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
/* Invalidate ptes with options + TLB flush of the ptes */
opt = opt | (asce & _ASCE_ORIGIN);
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]"
+ " ipte %[r1],%[r2],%[r3],%[m4]"
: [r2] "+a" (address), [r3] "+a" (opt)
: [r1] "a" (pto), [m4] "i" (local) : "memory");
}
@@ -1019,12 +1070,12 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
pte_t *ptep, int local)
{
- unsigned long pto = (unsigned long) ptep;
+ unsigned long pto = __pa(ptep);
/* Invalidate a range of ptes + TLB flush of the ptes */
do {
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]"
+ " ipte %[r1],%[r2],%[r3],%[m4]"
: [r2] "+a" (address), [r3] "+a" (nr)
: [r1] "a" (pto), [m4] "i" (local) : "memory");
} while (nr != 255);
@@ -1067,7 +1118,13 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ pte_t res;
+
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* At this point the reference through the mapping is still present */
+ if (mm_is_protected(mm) && pte_present(res))
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -1079,7 +1136,13 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
- return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+ pte_t res;
+
+ res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* At this point the reference through the mapping is still present */
+ if (mm_is_protected(vma->vm_mm) && pte_present(res))
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
/*
@@ -1094,12 +1157,31 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep, int full)
{
+ pte_t res;
+
if (full) {
- pte_t pte = *ptep;
- *ptep = __pte(_PAGE_INVALID);
- return pte;
+ res = *ptep;
+ set_pte(ptep, __pte(_PAGE_INVALID));
+ } else {
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
}
- return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* Nothing to do */
+ if (!mm_is_protected(mm) || !pte_present(res))
+ return res;
+ /*
+ * At this point the reference through the mapping is still present.
+ * The notifier should have destroyed all protected vCPUs at this
+ * point, so the destroy should be successful.
+ */
+ if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK))
+ return res;
+ /*
+ * If something went wrong and the page could not be destroyed, or
+ * if this is not a mm teardown, the slower export is used as
+ * fallback instead.
+ */
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -1161,6 +1243,12 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
+#define pgprot_writecombine pgprot_writecombine
+pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#define pgprot_writethrough pgprot_writethrough
+pgprot_t pgprot_writethrough(pgprot_t prot);
+
/*
* Certain architectures need to do special things when PTEs
* within a page table are directly modified. Thus, the following
@@ -1170,11 +1258,11 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry)
{
if (pte_present(entry))
- pte_val(entry) &= ~_PAGE_UNUSED;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED));
if (mm_has_pgste(mm))
ptep_set_pte_at(mm, addr, ptep, entry);
else
- *ptep = entry;
+ set_pte(ptep, entry);
}
/*
@@ -1184,9 +1272,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
{
pte_t __pte;
- pte_val(__pte) = physpage + pgprot_val(pgprot);
+
+ __pte = __pte(physpage | pgprot_val(pgprot));
if (!MACHINE_HAS_NX)
- pte_val(__pte) &= ~_PAGE_NOEXEC;
+ __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC));
return pte_mkyoung(__pte);
}
@@ -1204,12 +1293,39 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
#define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1))
-#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN)
-#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN)
-#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN)
-#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN)
+#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN))
+#define pgd_deref(pgd) ((unsigned long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN))
+
+static inline unsigned long pmd_deref(pmd_t pmd)
+{
+ unsigned long origin_mask;
+
+ origin_mask = _SEGMENT_ENTRY_ORIGIN;
+ if (pmd_large(pmd))
+ origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
+ return (unsigned long)__va(pmd_val(pmd) & origin_mask);
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return __pa(pmd_deref(pmd)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pud_deref(pud_t pud)
+{
+ unsigned long origin_mask;
+
+ origin_mask = _REGION_ENTRY_ORIGIN;
+ if (pud_large(pud))
+ origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
+ return (unsigned long)__va(pud_val(pud) & origin_mask);
+}
+
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ return __pa(pud_deref(pud)) >> PAGE_SHIFT;
+}
/*
* The pgd_offset function *always* adds the index for the top-level
@@ -1235,38 +1351,52 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
}
#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
{
- if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
- return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
- return (p4d_t *) pgd;
+ if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+ return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+ return (p4d_t *) pgdp;
}
+#define p4d_offset_lockless p4d_offset_lockless
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
{
- if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
- return (pud_t *) p4d_deref(*p4d) + pud_index(address);
- return (pud_t *) p4d;
+ return p4d_offset_lockless(pgdp, *pgdp, address);
}
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
{
- if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
- return (pmd_t *) pud_deref(*pud) + pmd_index(address);
- return (pmd_t *) pud;
+ if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+ return (pud_t *) p4d_deref(p4d) + pud_index(address);
+ return (pud_t *) p4dp;
}
+#define pud_offset_lockless pud_offset_lockless
-static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address)
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
{
- return (pte_t *) pmd_deref(*pmd) + pte_index(address);
+ return pud_offset_lockless(p4dp, *p4dp, address);
}
+#define pud_offset pud_offset
-#define pte_offset_kernel(pmd, address) pte_offset(pmd, address)
-#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
+{
+ if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+ return (pmd_t *) pud_deref(pud) + pmd_index(address);
+ return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
-static inline void pte_unmap(pte_t *pte) { }
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
+{
+ return pmd_offset_lockless(pudp, *pudp, address);
+}
+#define pmd_offset pmd_offset
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long) pmd_deref(pmd);
+}
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
@@ -1274,7 +1404,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#define gup_fast_permitted gup_fast_permitted
-#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot))
+#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot))
#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT)
#define pte_page(x) pfn_to_page(pte_pfn(x))
@@ -1285,61 +1415,57 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_WRITE;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE));
if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
return pmd;
}
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_DIRTY;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY));
if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
return pmd;
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- pud_val(pud) &= ~_REGION3_ENTRY_WRITE;
- pud_val(pud) |= _REGION_ENTRY_PROTECT;
- return pud;
+ pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE));
+ return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
}
static inline pud_t pud_mkwrite(pud_t pud)
{
- pud_val(pud) |= _REGION3_ENTRY_WRITE;
+ pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE));
if (pud_val(pud) & _REGION3_ENTRY_DIRTY)
- pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
return pud;
}
static inline pud_t pud_mkclean(pud_t pud)
{
- pud_val(pud) &= ~_REGION3_ENTRY_DIRTY;
- pud_val(pud) |= _REGION_ENTRY_PROTECT;
- return pud;
+ pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY));
+ return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- pud_val(pud) |= _REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY;
+ pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY));
if (pud_val(pud) & _REGION3_ENTRY_WRITE)
- pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
return pud;
}
@@ -1363,37 +1489,39 @@ static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
if (pmd_val(pmd) & _SEGMENT_ENTRY_READ)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
return pmd;
}
static inline pmd_t pmd_mkold(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG;
- pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
- pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
- _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
- _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
- pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+ unsigned long mask;
+
+ mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
+ mask |= _SEGMENT_ENTRY_DIRTY;
+ mask |= _SEGMENT_ENTRY_YOUNG;
+ mask |= _SEGMENT_ENTRY_LARGE;
+ mask |= _SEGMENT_ENTRY_SOFT_DIRTY;
+ pmd = __pmd(pmd_val(pmd) & mask);
+ pmd = set_pmd_bit(pmd, __pgprot(massage_pgprot_pmd(newprot)));
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG))
- pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
return pmd;
}
static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
{
- pmd_t __pmd;
- pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
- return __pmd;
+ return __pmd(physpage + massage_pgprot_pmd(pgprot));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
@@ -1417,11 +1545,11 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp,
{
unsigned long sto;
- sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t);
+ sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t);
if (__builtin_constant_p(opt) && opt == 0) {
/* flush without guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]"
+ " idte %[r1],0,%[r2],%[m4]"
: "+m" (*pmdp)
: [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)),
[m4] "i" (local)
@@ -1429,7 +1557,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp,
} else {
/* flush with guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]"
+ " idte %[r1],%[r3],%[r2],%[m4]"
: "+m" (*pmdp)
: [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt),
[r3] "a" (asce), [m4] "i" (local)
@@ -1443,12 +1571,12 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp,
{
unsigned long r3o;
- r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t);
+ r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t);
r3o |= _ASCE_TYPE_REGION3;
if (__builtin_constant_p(opt) && opt == 0) {
/* flush without guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]"
+ " idte %[r1],0,%[r2],%[m4]"
: "+m" (*pudp)
: [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)),
[m4] "i" (local)
@@ -1456,7 +1584,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp,
} else {
/* flush with guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]"
+ " idte %[r1],%[r3],%[r2],%[m4]"
: "+m" (*pudp)
: [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt),
[r3] "a" (asce), [m4] "i" (local)
@@ -1515,16 +1643,15 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
if (!MACHINE_HAS_NX)
- pmd_val(entry) &= ~_SEGMENT_ENTRY_NOEXEC;
- *pmdp = entry;
+ entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC));
+ set_pmd(pmdp, entry);
}
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
- pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_LARGE));
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
@@ -1535,16 +1662,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmdp, int full)
{
if (full) {
pmd_t pmd = *pmdp;
- *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
return pmd;
}
- return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
@@ -1581,7 +1708,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
}
#define pmdp_collapse_flush pmdp_collapse_flush
-#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
+#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot))
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
static inline int pmd_trans_huge(pmd_t pmd)
@@ -1599,18 +1726,18 @@ static inline int has_transparent_hugepage(void)
/*
* 64 bit swap entry format:
* A page-table entry has some bits we have to treat in a special way.
- * Bits 52 and bit 55 have to be zero, otherwise a specification
- * exception will occur instead of a page translation exception. The
- * specification exception has the bad habit not to store necessary
- * information in the lowcore.
- * Bits 54 and 63 are used to indicate the page type.
+ * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte
+ * as invalid.
* A swap pte is indicated by bit pattern (pte & 0x201) == 0x200
- * This leaves the bits 0-51 and bits 56-62 to store type and offset.
- * We use the 5 bits from 57-61 for the type and the 52 bits from 0-51
- * for the offset.
- * | offset |01100|type |00|
+ * | offset |E11XX|type |S0|
* |0000000000111111111122222222223333333333444444444455|55555|55566|66|
* |0123456789012345678901234567890123456789012345678901|23456|78901|23|
+ *
+ * Bits 0-51 store the offset.
+ * Bit 52 (E) is used to remember PG_anon_exclusive.
+ * Bits 57-61 store the type.
+ * Bit 62 (S) is used for softdirty tracking.
+ * Bits 55 and 56 (X) are unused.
*/
#define __SWP_OFFSET_MASK ((1UL << 52) - 1)
@@ -1620,12 +1747,12 @@ static inline int has_transparent_hugepage(void)
static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
{
- pte_t pte;
+ unsigned long pteval;
- pte_val(pte) = _PAGE_INVALID | _PAGE_PROTECT;
- pte_val(pte) |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT;
- pte_val(pte) |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT;
- return pte;
+ pteval = _PAGE_INVALID | _PAGE_PROTECT;
+ pteval |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT;
+ pteval |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT;
+ return __pte(pteval);
}
static inline unsigned long __swp_type(swp_entry_t entry)
@@ -1649,7 +1776,11 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
#define kern_addr_valid(addr) (1)
extern int vmem_add_mapping(unsigned long start, unsigned long size);
-extern int vmem_remove_mapping(unsigned long start, unsigned long size);
+extern void vmem_remove_mapping(unsigned long start, unsigned long size);
+extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
+extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot);
+extern void vmem_unmap_4k_page(unsigned long addr);
+extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc);
extern int s390_enable_sie(void);
extern int s390_enable_skey(void);
extern void s390_reset_cmma(struct mm_struct *mm);
@@ -1658,6 +1789,7 @@ extern void s390_reset_cmma(struct mm_struct *mm);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#include <asm-generic/pgtable.h>
+#define pmd_pgtable(pmd) \
+ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index 6ede29907fbf..bf15da0fedbc 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc)
old, new) != old);
}
-#define init_task_preempt_count(p) do { } while (0)
-
-#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
-} while (0)
-
static inline void set_preempt_need_resched(void)
{
__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
@@ -52,10 +46,17 @@ static inline bool test_preempt_need_resched(void)
static inline void __preempt_count_add(int val)
{
- if (__builtin_constant_p(val) && (val >= -128) && (val <= 127))
- __atomic_add_const(val, &S390_lowcore.preempt_count);
- else
- __atomic_add(val, &S390_lowcore.preempt_count);
+ /*
+ * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES
+ * enabled, gcc 12 fails to handle __builtin_constant_p().
+ */
+ if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) {
+ if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) {
+ __atomic_add_const(val, &S390_lowcore.preempt_count);
+ return;
+ }
+ }
+ __atomic_add(val, &S390_lowcore.preempt_count);
}
static inline void __preempt_count_sub(int val)
@@ -88,12 +89,6 @@ static inline void preempt_count_set(int pc)
S390_lowcore.preempt_count = pc;
}
-#define init_task_preempt_count(p) do { } while (0)
-
-#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
-} while (0)
-
static inline void set_preempt_need_resched(void)
{
}
@@ -130,10 +125,14 @@ static inline bool should_resched(int preempt_offset)
#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+#define init_task_preempt_count(p) do { } while (0)
+/* Deferred to CPU bringup time */
+#define init_idle_preempt_count(p, cpu) do { } while (0)
+
#ifdef CONFIG_PREEMPTION
-extern asmlinkage void preempt_schedule(void);
+extern void preempt_schedule(void);
#define __preempt_schedule() preempt_schedule()
-extern asmlinkage void preempt_schedule_notrace(void);
+extern void preempt_schedule_notrace(void);
#define __preempt_schedule_notrace() preempt_schedule_notrace()
#endif /* CONFIG_PREEMPTION */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index aadb3d0e2adc..87be3e855bf7 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -14,26 +14,20 @@
#include <linux/bits.h>
-#define CIF_MCCK_PENDING 0 /* machine check handling is pending */
-#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */
-#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */
-#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */
-#define CIF_FPU 4 /* restore FPU registers */
-#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */
-#define CIF_ENABLED_WAIT 6 /* in enabled wait state */
-#define CIF_MCCK_GUEST 7 /* machine check happening in guest */
-#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */
-
-#define _CIF_MCCK_PENDING BIT(CIF_MCCK_PENDING)
-#define _CIF_ASCE_PRIMARY BIT(CIF_ASCE_PRIMARY)
-#define _CIF_ASCE_SECONDARY BIT(CIF_ASCE_SECONDARY)
+#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */
+#define CIF_FPU 3 /* restore FPU registers */
+#define CIF_ENABLED_WAIT 5 /* in enabled wait state */
+#define CIF_MCCK_GUEST 6 /* machine check happening in guest */
+#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */
+
#define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY)
#define _CIF_FPU BIT(CIF_FPU)
-#define _CIF_IGNORE_IRQ BIT(CIF_IGNORE_IRQ)
#define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT)
#define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST)
#define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU)
+#define RESTART_FLAG_CTLREGS _AC(1 << 0, U)
+
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
@@ -46,6 +40,9 @@
#include <asm/runtime_instr.h>
#include <asm/fpu/types.h>
#include <asm/fpu/internal.h>
+#include <asm/irqflags.h>
+
+typedef long (*sys_call_ptr_t)(struct pt_regs *regs);
static inline void set_cpu_flag(int flag)
{
@@ -86,57 +83,56 @@ void cpu_detect_mhz_feature(void);
extern const struct seq_operations cpuinfo_op;
extern void execve_tail(void);
extern void __bpon(void);
+unsigned long vdso_size(void);
/*
* User space process size: 2GB for 31 bit, 4TB or 8PB for 64 bit.
*/
-#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \
- (1UL << 31) : -PAGE_SIZE)
+#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \
+ _REGION3_SIZE : TASK_SIZE_MAX)
#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
- (1UL << 30) : (1UL << 41))
-#define TASK_SIZE TASK_SIZE_OF(current)
+ (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1))
#define TASK_SIZE_MAX (-PAGE_SIZE)
-#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \
- (1UL << 31) : (1UL << 42))
-#define STACK_TOP_MAX (1UL << 42)
+#define VDSO_BASE (STACK_TOP + PAGE_SIZE)
+#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE)
+#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE)
+#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE)
#define HAVE_ARCH_PICK_MMAP_LAYOUT
-typedef unsigned int mm_segment_t;
-
/*
* Thread structure
*/
struct thread_struct {
unsigned int acrs[NUM_ACRS];
- unsigned long ksp; /* kernel stack pointer */
- unsigned long user_timer; /* task cputime in user space */
- unsigned long guest_timer; /* task cputime in kvm guest */
- unsigned long system_timer; /* task cputime in kernel space */
- unsigned long hardirq_timer; /* task cputime in hardirq context */
- unsigned long softirq_timer; /* task cputime in softirq context */
- unsigned long sys_call_table; /* system call table address */
- mm_segment_t mm_segment;
- unsigned long gmap_addr; /* address of last gmap fault. */
- unsigned int gmap_write_flag; /* gmap fault write indication */
- unsigned int gmap_int_code; /* int code of last gmap fault */
- unsigned int gmap_pfault; /* signal of a pending guest pfault */
+ unsigned long ksp; /* kernel stack pointer */
+ unsigned long user_timer; /* task cputime in user space */
+ unsigned long guest_timer; /* task cputime in kvm guest */
+ unsigned long system_timer; /* task cputime in kernel space */
+ unsigned long hardirq_timer; /* task cputime in hardirq context */
+ unsigned long softirq_timer; /* task cputime in softirq context */
+ const sys_call_ptr_t *sys_call_table; /* system call table address */
+ unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_write_flag; /* gmap fault write indication */
+ unsigned int gmap_int_code; /* int code of last gmap fault */
+ unsigned int gmap_pfault; /* signal of a pending guest pfault */
+
/* Per-thread information related to debugging */
- struct per_regs per_user; /* User specified PER registers */
- struct per_event per_event; /* Cause of the last PER trap */
- unsigned long per_flags; /* Flags to control debug behavior */
- unsigned int system_call; /* system call number in signal */
- unsigned long last_break; /* last breaking-event-address. */
- /* pfault_wait is used to block the process on a pfault event */
+ struct per_regs per_user; /* User specified PER registers */
+ struct per_event per_event; /* Cause of the last PER trap */
+ unsigned long per_flags; /* Flags to control debug behavior */
+ unsigned int system_call; /* system call number in signal */
+ unsigned long last_break; /* last breaking-event-address. */
+ /* pfault_wait is used to block the process on a pfault event */
unsigned long pfault_wait;
struct list_head list;
/* cpu runtime instrumentation */
struct runtime_instr_cb *ri_cb;
- struct gs_cb *gs_cb; /* Current guarded storage cb */
- struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */
- unsigned char trap_tdb[256]; /* Transaction abort diagnose block */
+ struct gs_cb *gs_cb; /* Current guarded storage cb */
+ struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */
+ struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */
/*
* Warning: 'fpu' is dynamically-sized. It *MUST* be at
* the end.
@@ -161,6 +157,7 @@ typedef struct thread_struct thread_struct;
#define INIT_THREAD { \
.ksp = sizeof(init_stack) + (unsigned long) &init_stack, \
.fpu.regs = (void *) init_task.thread.fpu.fprs, \
+ .last_break = 1, \
}
/*
@@ -177,7 +174,6 @@ typedef struct thread_struct thread_struct;
regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \
regs->psw.addr = new_psw; \
regs->gprs[15] = new_stackp; \
- crst_table_downgrade(current->mm); \
execve_tail(); \
} while (0)
@@ -190,13 +186,11 @@ struct pt_regs;
void show_registers(struct pt_regs *regs);
void show_cacheinfo(struct seq_file *m);
-/* Free all resources held by a thread. */
-static inline void release_thread(struct task_struct *tsk) { }
-
/* Free guarded storage control block */
void guarded_storage_release(struct task_struct *tsk);
+void gs_load_bc_cb(struct pt_regs *regs);
-unsigned long get_wchan(struct task_struct *p);
+unsigned long __get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
(task_stack_page(tsk) + THREAD_SIZE) - 1)
#define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr)
@@ -205,15 +199,9 @@ unsigned long get_wchan(struct task_struct *p);
/* Has task runtime instrumentation enabled ? */
#define is_ri_task(tsk) (!!(tsk)->thread.ri_cb)
-static __always_inline unsigned long current_stack_pointer(void)
-{
- unsigned long sp;
-
- asm volatile("la %0,0(15)" : "=a" (sp));
- return sp;
-}
+register unsigned long current_stack_pointer asm("r15");
-static __no_kasan_or_inline unsigned short stap(void)
+static __always_inline unsigned short stap(void)
{
unsigned short cpu_address;
@@ -230,8 +218,7 @@ static inline unsigned long __ecag(unsigned int asi, unsigned char parm)
{
unsigned long val;
- asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
- : "=d" (val) : "a" (asi << 8 | parm));
+ asm volatile("ecag %0,0,0(%1)" : "=d" (val) : "a" (asi << 8 | parm));
return val;
}
@@ -252,7 +239,7 @@ static inline void __load_psw(psw_t psw)
* Set PSW mask to specified value, while leaving the
* PSW addr pointing to the next instruction.
*/
-static __no_kasan_or_inline void __load_psw_mask(unsigned long mask)
+static __always_inline void __load_psw_mask(unsigned long mask)
{
unsigned long addr;
psw_t psw;
@@ -302,11 +289,6 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc)
}
/*
- * Function to stop a processor until the next interrupt occurs
- */
-void enabled_wait(void);
-
-/*
* Function to drop a processor into disabled wait state
*/
static __always_inline void __noreturn disabled_wait(void)
@@ -319,31 +301,16 @@ static __always_inline void __noreturn disabled_wait(void)
while (1);
}
-/*
- * Basic Machine Check/Program Check Handler.
- */
-
-extern void s390_base_pgm_handler(void);
-extern void s390_base_ext_handler(void);
-
-extern void (*s390_base_pgm_handler_fn)(void);
-extern void (*s390_base_ext_handler_fn)(void);
-
#define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL
-extern int memcpy_real(void *, void *, size_t);
-extern void memcpy_absolute(void *, void *, size_t);
-
-#define mem_assign_absolute(dest, val) do { \
- __typeof__(dest) __tmp = (val); \
- \
- BUILD_BUG_ON(sizeof(__tmp) != sizeof(val)); \
- memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \
-} while (0)
-
extern int s390_isolate_bp(void);
extern int s390_isolate_bp_guest(void);
+static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
+{
+ return arch_irqs_disabled_flags(regs->psw.mask);
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_S390_PROCESSOR_H */
diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h
new file mode 100644
index 000000000000..f960b2896606
--- /dev/null
+++ b/arch/s390/include/asm/ptdump.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_PTDUMP_H
+#define _ASM_S390_PTDUMP_H
+
+void ptdump_check_wx(void);
+
+static inline void debug_checkwx(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_WX))
+ ptdump_check_wx();
+}
+
+#endif /* _ASM_S390_PTDUMP_H */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index f009a13afe71..8bae33ab320a 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -9,16 +9,19 @@
#include <linux/bits.h>
#include <uapi/asm/ptrace.h>
+#include <asm/tpi.h>
-#define PIF_SYSCALL 0 /* inside a system call */
-#define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */
-#define PIF_SYSCALL_RESTART 2 /* restart the current system call */
-#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
+#define PIF_SYSCALL 0 /* inside a system call */
+#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */
+#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */
+#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
+#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */
-#define _PIF_SYSCALL BIT(PIF_SYSCALL)
-#define _PIF_PER_TRAP BIT(PIF_PER_TRAP)
-#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART)
-#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
+#define _PIF_SYSCALL BIT(PIF_SYSCALL)
+#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART)
+#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET)
+#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
+#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS)
#ifndef __ASSEMBLY__
@@ -68,12 +71,43 @@ enum {
&(*(struct psw_bits *)(&(__psw))); \
}))
+#define PSW32_MASK_PER 0x40000000UL
+#define PSW32_MASK_DAT 0x04000000UL
+#define PSW32_MASK_IO 0x02000000UL
+#define PSW32_MASK_EXT 0x01000000UL
+#define PSW32_MASK_KEY 0x00F00000UL
+#define PSW32_MASK_BASE 0x00080000UL /* Always one */
+#define PSW32_MASK_MCHECK 0x00040000UL
+#define PSW32_MASK_WAIT 0x00020000UL
+#define PSW32_MASK_PSTATE 0x00010000UL
+#define PSW32_MASK_ASC 0x0000C000UL
+#define PSW32_MASK_CC 0x00003000UL
+#define PSW32_MASK_PM 0x00000f00UL
+#define PSW32_MASK_RI 0x00000080UL
+
+#define PSW32_ADDR_AMODE 0x80000000UL
+#define PSW32_ADDR_INSN 0x7FFFFFFFUL
+
+#define PSW32_DEFAULT_KEY (((u32)PAGE_DEFAULT_ACC) << 20)
+
+#define PSW32_ASC_PRIMARY 0x00000000UL
+#define PSW32_ASC_ACCREG 0x00004000UL
+#define PSW32_ASC_SECONDARY 0x00008000UL
+#define PSW32_ASC_HOME 0x0000C000UL
+
+typedef struct {
+ unsigned int mask;
+ unsigned int addr;
+} psw_t32 __aligned(8);
+
+#define PGM_INT_CODE_MASK 0x7f
+#define PGM_INT_CODE_PER 0x80
+
/*
* The pt_regs struct defines the way the registers are stored on
* the stack during a system call.
*/
-struct pt_regs
-{
+struct pt_regs {
union {
user_pt_regs user_regs;
struct {
@@ -83,10 +117,17 @@ struct pt_regs
};
};
unsigned long orig_gpr2;
- unsigned int int_code;
- unsigned int int_parm;
- unsigned long int_parm_long;
+ union {
+ struct {
+ unsigned int int_code;
+ unsigned int int_parm;
+ unsigned long int_parm_long;
+ };
+ struct tpi_info tpi_info;
+ };
unsigned long flags;
+ unsigned long cr1;
+ unsigned long last_break;
};
/*
@@ -152,6 +193,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
return !!(regs->flags & (1UL << flag));
}
+static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
+{
+ int ret = test_pt_regs_flag(regs, flag);
+
+ clear_pt_regs_flag(regs, flag);
+ return ret;
+}
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
@@ -179,10 +228,34 @@ const char *regs_query_register_name(unsigned int offset);
unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
+/**
+ * regs_get_kernel_argument() - get Nth function argument in kernel
+ * @regs: pt_regs of that context
+ * @n: function argument number (start from 0)
+ *
+ * regs_get_kernel_argument() returns the @n th argument of the function call.
+ */
+static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long);
+
+#define NR_REG_ARGUMENTS 5
+ if (n < NR_REG_ARGUMENTS)
+ return regs_get_register(regs, 2 + n);
+ n -= NR_REG_ARGUMENTS;
+ return regs_get_kernel_stack_nth(regs, argoffset + n);
+}
+
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->gprs[15];
}
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+ regs->gprs[2] = rc;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* _S390_PTRACE_H */
diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index 1e3517b0518b..2f983e0b95e0 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -18,7 +18,6 @@
#define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1)
#define QDIO_BUFNR(num) ((num) & QDIO_MAX_BUFFERS_MASK)
#define QDIO_MAX_ELEMENTS_PER_BUFFER 16
-#define QDIO_SBAL_SIZE 256
#define QDIO_QETH_QFMT 0
#define QDIO_ZFCP_QFMT 1
@@ -26,9 +25,9 @@
/**
* struct qdesfmt0 - queue descriptor, format 0
- * @sliba: storage list information block address
- * @sla: storage list address
- * @slsba: storage list state block address
+ * @sliba: absolute address of storage list information block
+ * @sla: absolute address of storage list
+ * @slsba: absolute address of storage list state block
* @akey: access key for SLIB
* @bkey: access key for SL
* @ckey: access key for SBALs
@@ -56,7 +55,7 @@ struct qdesfmt0 {
* @oqdcnt: output queue descriptor count
* @iqdsz: input queue descriptor size
* @oqdsz: output queue descriptor size
- * @qiba: queue information block address
+ * @qiba: absolute address of queue information block
* @qkey: queue information block key
* @qdf0: queue descriptions
*/
@@ -92,8 +91,8 @@ struct qdr {
* @pfmt: implementation dependent parameter format
* @rflags: QEBSM
* @ac: adapter characteristics
- * @isliba: absolute address of first input SLIB
- * @osliba: absolute address of first output SLIB
+ * @isliba: logical address of first input SLIB
+ * @osliba: logical address of first output SLIB
* @ebcnam: adapter identifier in EBCDIC
* @parm: implementation dependent parameters
*/
@@ -134,10 +133,9 @@ struct slibe {
* @sb_count: number of storage blocks
* @sba: storage block element addresses
* @dcount: size of storage block elements
- * @user0: user defineable value
- * @res4: reserved paramater
- * @user1: user defineable value
- * @user2: user defineable value
+ * @user0: user definable value
+ * @res4: reserved parameter
+ * @user1: user definable value
*/
struct qaob {
u64 res0[6];
@@ -152,8 +150,7 @@ struct qaob {
u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER];
u64 user0;
u64 res4[2];
- u64 user1;
- u64 user2;
+ u8 user1[16];
} __attribute__ ((packed, aligned(256)));
/**
@@ -246,25 +243,8 @@ struct slsb {
u8 val[QDIO_MAX_BUFFERS_PER_Q];
} __attribute__ ((packed, aligned(256)));
-/**
- * struct qdio_outbuf_state - SBAL related asynchronous operation information
- * (for communication with upper layer programs)
- * (only required for use with completion queues)
- * @flags: flags indicating state of buffer
- * @user: pointer to upper layer program's state information related to SBAL
- * (stored in user1 data of QAOB)
- */
-struct qdio_outbuf_state {
- u8 flags;
- void *user;
-};
-
-#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01
-
-#define CHSC_AC1_INITIATE_INPUTQ 0x80
-
-
/* qdio adapter-characteristics-1 flag */
+#define CHSC_AC1_INITIATE_INPUTQ 0x80
#define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */
#define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */
#define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */
@@ -310,14 +290,14 @@ struct qdio_ssqd_desc {
typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
int, int, unsigned long);
-/* qdio errors reported to the upper-layer program */
+/* qdio errors reported through the queue handlers: */
#define QDIO_ERROR_ACTIVATE 0x0001
#define QDIO_ERROR_GET_BUF_STATE 0x0002
#define QDIO_ERROR_SET_BUF_STATE 0x0004
-#define QDIO_ERROR_SLSB_STATE 0x0100
-#define QDIO_ERROR_FATAL 0x00ff
-#define QDIO_ERROR_TEMPORARY 0xff00
+/* extra info for completed SBALs: */
+#define QDIO_ERROR_SLSB_STATE 0x0100
+#define QDIO_ERROR_SLSB_PENDING 0x0200
/* for qdio_cleanup */
#define QDIO_FLAG_CLEANUP_USING_CLEAR 0x01
@@ -325,109 +305,59 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
/**
* struct qdio_initialize - qdio initialization data
- * @cdev: associated ccw device
* @q_format: queue format
* @qdr_ac: feature flags to set
- * @adapter_name: name for the adapter
* @qib_param_field_format: format for qib_parm_field
* @qib_param_field: pointer to 128 bytes or NULL, if no param field
* @qib_rflags: rflags to set
- * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL
- * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL
* @no_input_qs: number of input queues
* @no_output_qs: number of output queues
- * @input_handler: handler to be called for input queues
+ * @input_handler: handler to be called for input queues, and device-wide errors
* @output_handler: handler to be called for output queues
- * @queue_start_poll_array: polling handlers (one per input queue or NULL)
+ * @irq_poll: Data IRQ polling handler
- * @scan_threshold: # of in-use buffers that triggers scan on output queue
* @int_parm: interruption parameter
- * @input_sbal_addr_array: address of no_input_qs * 128 pointers
- * @output_sbal_addr_array: address of no_output_qs * 128 pointers
- * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL)
+ * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs
+ * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs
*/
struct qdio_initialize {
- struct ccw_device *cdev;
unsigned char q_format;
unsigned char qdr_ac;
- unsigned char adapter_name[8];
unsigned int qib_param_field_format;
unsigned char *qib_param_field;
unsigned char qib_rflags;
- unsigned long *input_slib_elements;
- unsigned long *output_slib_elements;
unsigned int no_input_qs;
unsigned int no_output_qs;
qdio_handler_t *input_handler;
qdio_handler_t *output_handler;
- void (**queue_start_poll_array) (struct ccw_device *, int,
- unsigned long);
- unsigned int scan_threshold;
+ void (*irq_poll)(struct ccw_device *cdev, unsigned long data);
unsigned long int_parm;
- struct qdio_buffer **input_sbal_addr_array;
- struct qdio_buffer **output_sbal_addr_array;
- struct qdio_outbuf_state *output_sbal_state_array;
+ struct qdio_buffer ***input_sbal_addr_array;
+ struct qdio_buffer ***output_sbal_addr_array;
};
-/**
- * enum qdio_brinfo_entry_type - type of address entry for qdio_brinfo_desc()
- * @l3_ipv6_addr: entry contains IPv6 address
- * @l3_ipv4_addr: entry contains IPv4 address
- * @l2_addr_lnid: entry contains MAC address and VLAN ID
- */
-enum qdio_brinfo_entry_type {l3_ipv6_addr, l3_ipv4_addr, l2_addr_lnid};
-
-/**
- * struct qdio_brinfo_entry_XXX - Address entry for qdio_brinfo_desc()
- * @nit: Network interface token
- * @addr: Address of one of the three types
- *
- * The struct is passed to the callback function by qdio_brinfo_desc()
- */
-struct qdio_brinfo_entry_l3_ipv6 {
- u64 nit;
- struct { unsigned char _s6_addr[16]; } addr;
-} __packed;
-struct qdio_brinfo_entry_l3_ipv4 {
- u64 nit;
- struct { uint32_t _s_addr; } addr;
-} __packed;
-struct qdio_brinfo_entry_l2 {
- u64 nit;
- struct { u8 mac[6]; u16 lnid; } addr_lnid;
-} __packed;
-
-#define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */
-#define QDIO_STATE_ESTABLISHED 0x00000004 /* after qdio_establish */
-#define QDIO_STATE_ACTIVE 0x00000008 /* after qdio_activate */
-#define QDIO_STATE_STOPPED 0x00000010 /* after queues went down */
-
-#define QDIO_FLAG_SYNC_INPUT 0x01
-#define QDIO_FLAG_SYNC_OUTPUT 0x02
-#define QDIO_FLAG_PCI_OUT 0x10
-
int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count);
-extern int qdio_allocate(struct qdio_initialize *);
-extern int qdio_establish(struct qdio_initialize *);
+extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+ unsigned int no_output_qs);
+extern int qdio_establish(struct ccw_device *cdev,
+ struct qdio_initialize *init_data);
extern int qdio_activate(struct ccw_device *);
-extern void qdio_release_aob(struct qaob *);
-extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int,
- unsigned int);
-extern int qdio_start_irq(struct ccw_device *, int);
-extern int qdio_stop_irq(struct ccw_device *, int);
-extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *);
-extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr,
- bool is_input, unsigned int *bufnr,
- unsigned int *error);
+extern int qdio_start_irq(struct ccw_device *cdev);
+extern int qdio_stop_irq(struct ccw_device *cdev);
+extern int qdio_inspect_input_queue(struct ccw_device *cdev, unsigned int nr,
+ unsigned int *bufnr, unsigned int *error);
+extern int qdio_inspect_output_queue(struct ccw_device *cdev, unsigned int nr,
+ unsigned int *bufnr, unsigned int *error);
+extern int qdio_add_bufs_to_input_queue(struct ccw_device *cdev,
+ unsigned int q_nr, unsigned int bufnr,
+ unsigned int count);
+extern int qdio_add_bufs_to_output_queue(struct ccw_device *cdev,
+ unsigned int q_nr, unsigned int bufnr,
+ unsigned int count, struct qaob *aob);
extern int qdio_shutdown(struct ccw_device *, int);
extern int qdio_free(struct ccw_device *);
extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *);
-extern int qdio_pnso_brinfo(struct subchannel_id schid,
- int cnc, u16 *response,
- void (*cb)(void *priv, enum qdio_brinfo_entry_type type,
- void *entry),
- void *priv);
#endif /* __QDIO_H__ */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index c563f8368b19..9d4c7f71e070 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -1,18 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright IBM Corp. 2007
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#ifndef _ASM_S390_SCLP_H
#define _ASM_S390_SCLP_H
#include <linux/types.h>
-#include <asm/chpid.h>
-#include <asm/cpu.h>
#define SCLP_CHP_INFO_MASK_SIZE 32
-#define SCLP_MAX_CORES 256
+#define EARLY_SCCB_SIZE PAGE_SIZE
+#define SCLP_MAX_CORES 512
+/* 144 + 16 * SCLP_MAX_CORES + 2 * (SCLP_MAX_CORES - 1) */
+#define EXT_SCCB_READ_SCP (3 * PAGE_SIZE)
+/* 24 + 16 * SCLP_MAX_CORES */
+#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE)
+
+#ifndef __ASSEMBLY__
+#include <linux/uio.h>
+#include <asm/chpid.h>
+#include <asm/cpu.h>
struct sclp_chp_info {
u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
@@ -81,6 +88,11 @@ struct sclp_info {
unsigned char has_diag318 : 1;
unsigned char has_sipl : 1;
unsigned char has_dirq : 1;
+ unsigned char has_iplcc : 1;
+ unsigned char has_zpci_lsi : 1;
+ unsigned char has_aisii : 1;
+ unsigned char has_aeni : 1;
+ unsigned char has_aisi : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
@@ -105,17 +117,20 @@ struct zpci_report_error_header {
* (OpenCrypto Successful Diagnostics Execution)
*/
u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */
- u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */
+ u8 data[]; /* Subsequent Data passed verbatim to SCLP ET 24 */
} __packed;
+extern char *sclp_early_sccb;
+
+void sclp_early_adjust_va(void);
+void sclp_early_set_buffer(void *sccb);
int sclp_early_read_info(void);
int sclp_early_read_storage_info(void);
int sclp_early_get_core_info(struct sclp_core_info *info);
void sclp_early_get_ipl_info(struct sclp_ipl_info *info);
void sclp_early_detect(void);
void sclp_early_printk(const char *s);
-void sclp_early_printk_force(const char *s);
-void __sclp_early_printk(const char *s, unsigned int len, unsigned int force);
+void __sclp_early_printk(const char *s, unsigned int len);
int sclp_early_get_memsize(unsigned long *mem);
int sclp_early_get_hsa_size(unsigned long *hsa_size);
@@ -129,9 +144,10 @@ int sclp_chp_deconfigure(struct chp_id chpid);
int sclp_chp_read_info(struct sclp_chp_info *info);
int sclp_pci_configure(u32 fid);
int sclp_pci_deconfigure(u32 fid);
+int sclp_ap_configure(u32 apid);
+int sclp_ap_deconfigure(u32 apid);
int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
-int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
-int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
+size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count);
void sclp_ocf_cpc_name_copy(char *dst);
static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
@@ -141,4 +157,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
return _sclp_get_core_info(info);
}
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h
index c00f7b031628..322bdcd4b616 100644
--- a/arch/s390/include/asm/scsw.h
+++ b/arch/s390/include/asm/scsw.h
@@ -215,6 +215,11 @@ union scsw {
#define SNS2_ENV_DATA_PRESENT 0x10
#define SNS2_INPRECISE_END 0x04
+/*
+ * architectured values for PPRC errors
+ */
+#define SNS7_INVALID_ON_SEC 0x0e
+
/**
* scsw_is_tm - check for transport mode scsw
* @scsw: pointer to scsw
@@ -508,9 +513,21 @@ static inline int scsw_cmd_is_valid_zcc(union scsw *scsw)
*/
static inline int scsw_cmd_is_valid_ectl(union scsw *scsw)
{
- return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) &&
- !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS);
+ /* Must be status pending. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Must have alert status. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS))
+ return 0;
+
+ /* Must be alone or together with primary, secondary or both,
+ * => no intermediate status.
+ */
+ if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)
+ return 0;
+
+ return 1;
}
/**
@@ -522,11 +539,25 @@ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw)
*/
static inline int scsw_cmd_is_valid_pno(union scsw *scsw)
{
- return (scsw->cmd.fctl != 0) &&
- (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) &&
- (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) ||
- ((scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->cmd.actl & SCSW_ACTL_SUSPENDED)));
+ /* Must indicate at least one I/O function. */
+ if (!scsw->cmd.fctl)
+ return 0;
+
+ /* Must be status pending. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Can be status pending alone, or with any combination of primary,
+ * secondary and alert => no intermediate status.
+ */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS))
+ return 1;
+
+ /* If intermediate, must be suspended. */
+ if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED)
+ return 1;
+
+ return 0;
}
/**
@@ -676,9 +707,21 @@ static inline int scsw_tm_is_valid_q(union scsw *scsw)
*/
static inline int scsw_tm_is_valid_ectl(union scsw *scsw)
{
- return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) &&
- !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS);
+ /* Must be status pending. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Must have alert status. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS))
+ return 0;
+
+ /* Must be alone or together with primary, secondary or both,
+ * => no intermediate status.
+ */
+ if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)
+ return 0;
+
+ return 1;
}
/**
@@ -690,11 +733,25 @@ static inline int scsw_tm_is_valid_ectl(union scsw *scsw)
*/
static inline int scsw_tm_is_valid_pno(union scsw *scsw)
{
- return (scsw->tm.fctl != 0) &&
- (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) &&
- (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) ||
- ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->tm.actl & SCSW_ACTL_SUSPENDED)));
+ /* Must indicate at least one I/O function. */
+ if (!scsw->tm.fctl)
+ return 0;
+
+ /* Must be status pending. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Can be status pending alone, or with any combination of primary,
+ * secondary and alert => no intermediate status.
+ */
+ if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS))
+ return 1;
+
+ /* If intermediate, must be suspended. */
+ if (scsw->tm.actl & SCSW_ACTL_SUSPENDED)
+ return 1;
+
+ return 0;
}
/**
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 795bbe0d7ca6..71d46f0ba97b 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -16,4 +16,13 @@
#include <asm-generic/seccomp.h>
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "s390x"
+#ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
+# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "s390"
+#endif
+
#endif /* _ASM_S390_SECCOMP_H */
diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h
index 42de04ad9c07..3fecaa4e8b74 100644
--- a/arch/s390/include/asm/sections.h
+++ b/arch/s390/include/asm/sections.h
@@ -2,20 +2,8 @@
#ifndef _S390_SECTIONS_H
#define _S390_SECTIONS_H
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
#include <asm-generic/sections.h>
-extern bool initmem_freed;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
- if (!initmem_freed)
- return 0;
- return addr >= (unsigned long)__init_begin &&
- addr < (unsigned long)__init_end;
-}
-
/*
* .boot.data section contains variables "shared" between the decompressor and
* the decompressed kernel. The decompressor will store values in them, and
@@ -26,16 +14,16 @@ static inline int arch_is_kernel_initmem_freed(unsigned long addr)
* final .boot.data section, which should be identical in the decompressor and
* the decompressed kernel (that is checked during the build).
*/
-#define __bootdata(var) __section(.boot.data.var) var
+#define __bootdata(var) __section(".boot.data." #var) var
/*
* .boot.preserved.data is similar to .boot.data, but it is not part of the
* .init section and thus will be preserved for later use in the decompressed
* kernel.
*/
-#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var
+#define __bootdata_preserved(var) __section(".boot.preserved.data." #var) var
-extern unsigned long __sdma, __edma;
-extern unsigned long __stext_dma, __etext_dma;
+extern unsigned long __samode31, __eamode31;
+extern unsigned long __stext_amode31, __etext_amode31;
#endif
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index c59a83536c70..950d87bd997a 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -2,10 +2,15 @@
#ifndef _ASMS390_SET_MEMORY_H
#define _ASMS390_SET_MEMORY_H
+#include <linux/mutex.h>
+
+extern struct mutex cpa_mutex;
+
#define SET_MEMORY_RO 1UL
#define SET_MEMORY_RW 2UL
#define SET_MEMORY_NX 4UL
#define SET_MEMORY_X 8UL
+#define SET_MEMORY_4K 16UL
int __set_memory(unsigned long addr, int numpages, unsigned long flags);
@@ -29,4 +34,9 @@ static inline int set_memory_x(unsigned long addr, int numpages)
return __set_memory(addr, numpages, SET_MEMORY_X);
}
+static inline int set_memory_4k(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, SET_MEMORY_4K);
+}
+
#endif
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index b241ddb67caf..77e6506898f5 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -8,15 +8,11 @@
#include <linux/bits.h>
#include <uapi/asm/setup.h>
+#include <linux/build_bug.h>
-#define EP_OFFSET 0x10008
-#define EP_STRING "S390EP"
#define PARMAREA 0x10400
-#define EARLY_SCCB_OFFSET 0x11000
-#define HEAD_END 0x12000
-
-#define EARLY_SCCB_SIZE PAGE_SIZE
+#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE
/*
* Machine features detected in early.c
*/
@@ -37,6 +33,7 @@
#define MACHINE_FLAG_NX BIT(15)
#define MACHINE_FLAG_GS BIT(16)
#define MACHINE_FLAG_SCC BIT(17)
+#define MACHINE_FLAG_PCI_MIO BIT(18)
#define LPP_MAGIC BIT(31)
#define LPP_PID_MASK _AC(0xffffffff, UL)
@@ -46,28 +43,13 @@
#define STARTUP_NORMAL_OFFSET 0x10000
#define STARTUP_KDUMP_OFFSET 0x10010
-/* Offsets to parameters in kernel/head.S */
-
-#define IPL_DEVICE_OFFSET 0x10400
-#define INITRD_START_OFFSET 0x10408
-#define INITRD_SIZE_OFFSET 0x10410
-#define OLDMEM_BASE_OFFSET 0x10418
-#define OLDMEM_SIZE_OFFSET 0x10420
-#define KERNEL_VERSION_OFFSET 0x10428
-#define COMMAND_LINE_OFFSET 0x10480
+#define LEGACY_COMMAND_LINE_SIZE 896
#ifndef __ASSEMBLY__
#include <asm/lowcore.h>
#include <asm/types.h>
-#define IPL_DEVICE (*(unsigned long *) (IPL_DEVICE_OFFSET))
-#define INITRD_START (*(unsigned long *) (INITRD_START_OFFSET))
-#define INITRD_SIZE (*(unsigned long *) (INITRD_SIZE_OFFSET))
-#define OLDMEM_BASE (*(unsigned long *) (OLDMEM_BASE_OFFSET))
-#define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET))
-#define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET))
-
struct parmarea {
unsigned long ipl_device; /* 0x10400 */
unsigned long initrd_start; /* 0x10408 */
@@ -75,10 +57,13 @@ struct parmarea {
unsigned long oldmem_base; /* 0x10418 */
unsigned long oldmem_size; /* 0x10420 */
unsigned long kernel_version; /* 0x10428 */
- char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */
- char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */
+ unsigned long max_command_line_size; /* 0x10430 */
+ char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */
+ char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */
};
+extern struct parmarea parmarea;
+
extern unsigned int zlib_dfltcc_support;
#define ZLIB_DFLTCC_DISABLED 0
#define ZLIB_DFLTCC_FULL 1
@@ -87,11 +72,10 @@ extern unsigned int zlib_dfltcc_support;
#define ZLIB_DFLTCC_FULL_DEBUG 4
extern int noexec_disabled;
-extern int memory_end_set;
-extern unsigned long memory_end;
-extern unsigned long vmalloc_size;
-extern unsigned long max_physmem_end;
-extern unsigned long __swsusp_reset_dma;
+extern unsigned long ident_map_size;
+
+/* The Write Back bit position in the physaddr is given by the SLPC PCI */
+extern unsigned long mio_wb_bit_mask;
#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM)
#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
@@ -110,6 +94,7 @@ extern unsigned long __swsusp_reset_dma;
#define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX)
#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS)
#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC)
+#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO)
/*
* Console mode. Override with conmode=
@@ -118,9 +103,6 @@ extern unsigned int console_mode;
extern unsigned int console_devno;
extern unsigned int console_irq;
-extern char vmhalt_cmd[];
-extern char vmpoff_cmd[];
-
#define CONSOLE_IS_UNDEFINED (console_mode == 0)
#define CONSOLE_IS_SCLP (console_mode == 1)
#define CONSOLE_IS_3215 (console_mode == 2)
@@ -162,14 +144,24 @@ static inline unsigned long kaslr_offset(void)
return __kaslr_offset;
}
-#else /* __ASSEMBLY__ */
+extern int is_full_image;
-#define IPL_DEVICE (IPL_DEVICE_OFFSET)
-#define INITRD_START (INITRD_START_OFFSET)
-#define INITRD_SIZE (INITRD_SIZE_OFFSET)
-#define OLDMEM_BASE (OLDMEM_BASE_OFFSET)
-#define OLDMEM_SIZE (OLDMEM_SIZE_OFFSET)
-#define COMMAND_LINE (COMMAND_LINE_OFFSET)
+struct initrd_data {
+ unsigned long start;
+ unsigned long size;
+};
+extern struct initrd_data initrd_data;
+
+struct oldmem_data {
+ unsigned long start;
+ unsigned long size;
+};
+extern struct oldmem_data oldmem_data;
+static inline u32 gen_lpswe(unsigned long addr)
+{
+ BUILD_BUG_ON(addr > 0xfff);
+ return 0xb2b20000 | addr;
+}
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_SETUP_H */
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 53ee795cd3d3..edee63da08e7 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -41,15 +41,17 @@
static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm,
u32 *status)
{
- register unsigned long reg1 asm ("1") = parm;
+ union register_pair r1 = { .odd = parm, };
int cc;
asm volatile(
- " sigp %1,%2,0(%3)\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc");
- *status = reg1;
+ " sigp %[r1],%[addr],0(%[order])\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [r1] "+&d" (r1.pair)
+ : [addr] "d" (addr), [order] "a" (order)
+ : "cc");
+ *status = r1.even;
return cc;
}
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index b157a81fb977..73ed2781073b 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -3,13 +3,13 @@
* Copyright IBM Corp. 1999, 2012
* Author(s): Denis Joseph Barrow,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef __ASM_SMP_H
#define __ASM_SMP_H
#include <asm/sigp.h>
#include <asm/lowcore.h>
+#include <asm/processor.h>
#define raw_smp_processor_id() (S390_lowcore.cpu_nr)
@@ -17,6 +17,7 @@ extern struct mutex smp_cpu_state_mutex;
extern unsigned int smp_cpu_mt_shift;
extern unsigned int smp_cpu_mtid;
extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
+extern cpumask_t cpu_setup_mask;
extern int __cpu_up(unsigned int cpu, struct task_struct *tidle);
@@ -29,11 +30,12 @@ extern void smp_emergency_stop(void);
extern int smp_find_processor_id(u16 address);
extern int smp_store_status(int cpu);
-extern void smp_save_dump_cpus(void);
-extern int smp_vcpu_scheduled(int cpu);
+extern void smp_save_dump_ipl_cpu(void);
+extern void smp_save_dump_secondary_cpus(void);
extern void smp_yield_cpu(int cpu);
extern void smp_cpu_set_polarization(int cpu, int val);
extern int smp_cpu_get_polarization(int cpu);
+extern int smp_cpu_get_cpu_address(int cpu);
extern void smp_fill_possible_mask(void);
extern void smp_detect_cpus(void);
@@ -53,9 +55,16 @@ static inline int smp_get_base_cpu(int cpu)
return cpu - (cpu % (smp_cpu_mtid + 1));
}
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+extern int smp_reinit_ipl_cpu(void);
extern int smp_rescan_cpus(void);
extern void __noreturn cpu_die(void);
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
+extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
new file mode 100644
index 000000000000..1ac5115d3115
--- /dev/null
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_S390_SOFTIRQ_STACK_H
+#define __ASM_S390_SOFTIRQ_STACK_H
+
+#include <asm/lowcore.h>
+#include <asm/stacktrace.h>
+
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
+static inline void do_softirq_own_stack(void)
+{
+ call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
+}
+#endif
+#endif /* __ASM_S390_SOFTIRQ_STACK_H */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 3a37172d5398..37127cd7749e 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp)
arch_spin_lock_wait(lp);
}
-static inline void arch_spin_lock_flags(arch_spinlock_t *lp,
- unsigned long flags)
-{
- if (!arch_spin_trylock_once(lp))
- arch_spin_lock_wait(lp);
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
static inline int arch_spin_trylock(arch_spinlock_t *lp)
{
if (!arch_spin_trylock_once(lp))
@@ -85,10 +77,11 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp)
static inline void arch_spin_unlock(arch_spinlock_t *lp)
{
typecheck(int, lp->lock);
+ kcsan_release();
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0070", 49) /* NIAI 7 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */
" sth %1,%0\n"
- : "=Q" (((unsigned short *) &lp->lock)[1])
+ : "=R" (((unsigned short *) &lp->lock)[1])
: "d" (0) : "cc", "memory");
}
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index cfed272e4fd5..b69695e39957 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -2,13 +2,13 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
# error "please don't include this file directly"
#endif
typedef struct {
int lock;
-} __attribute__ ((aligned (4))) arch_spinlock_t;
+} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, }
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index ee056f4a4fa3..b23c658dce77 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -12,6 +12,7 @@ enum stack_type {
STACK_TYPE_IRQ,
STACK_TYPE_NODAT,
STACK_TYPE_RESTART,
+ STACK_TYPE_MCCK,
};
struct stack_info {
@@ -33,37 +34,26 @@ static inline bool on_stack(struct stack_info *info,
return addr >= info->begin && addr + len <= info->end;
}
-static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
- struct pt_regs *regs)
-{
- if (regs)
- return (unsigned long) kernel_stack_pointer(regs);
- if (task == current)
- return current_stack_pointer();
- return (unsigned long) task->thread.ksp;
-}
-
/*
* Stack layout of a C stack frame.
+ * Kernel uses the packed stack layout (-mpacked-stack).
*/
-#ifndef __PACK_STACK
struct stack_frame {
- unsigned long back_chain;
- unsigned long empty1[5];
- unsigned long gprs[10];
- unsigned int empty2[8];
-};
-#else
-struct stack_frame {
- unsigned long empty1[5];
- unsigned int empty2[8];
+ union {
+ unsigned long empty[9];
+ struct {
+ unsigned long sie_control_block;
+ unsigned long sie_savearea;
+ unsigned long sie_reason;
+ unsigned long sie_flags;
+ };
+ };
unsigned long gprs[10];
unsigned long back_chain;
};
-#endif
/*
- * Unlike current_stack_pointer() which simply returns current value of %r15
+ * Unlike current_stack_pointer which simply contains the current value of %r15
* current_frame_address() returns function stack frame address, which matches
* %r15 upon function invocation. It may differ from %r15 later if function
* allocates stack for local variables or new stack frame to call other
@@ -73,29 +63,26 @@ struct stack_frame {
((unsigned long)__builtin_frame_address(0) - \
offsetof(struct stack_frame, back_chain))
-#define CALL_ARGS_0() \
- register unsigned long r2 asm("2")
-#define CALL_ARGS_1(arg1) \
- register unsigned long r2 asm("2") = (unsigned long)(arg1)
-#define CALL_ARGS_2(arg1, arg2) \
- CALL_ARGS_1(arg1); \
- register unsigned long r3 asm("3") = (unsigned long)(arg2)
-#define CALL_ARGS_3(arg1, arg2, arg3) \
- CALL_ARGS_2(arg1, arg2); \
- register unsigned long r4 asm("4") = (unsigned long)(arg3)
-#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \
- CALL_ARGS_3(arg1, arg2, arg3); \
- register unsigned long r4 asm("5") = (unsigned long)(arg4)
-#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \
- CALL_ARGS_4(arg1, arg2, arg3, arg4); \
- register unsigned long r4 asm("6") = (unsigned long)(arg5)
-
-#define CALL_FMT_0 "=&d" (r2) :
-#define CALL_FMT_1 "+&d" (r2) :
-#define CALL_FMT_2 CALL_FMT_1 "d" (r3),
-#define CALL_FMT_3 CALL_FMT_2 "d" (r4),
-#define CALL_FMT_4 CALL_FMT_3 "d" (r5),
-#define CALL_FMT_5 CALL_FMT_4 "d" (r6),
+static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long)kernel_stack_pointer(regs);
+ if (task == current)
+ return current_frame_address();
+ return (unsigned long)task->thread.ksp;
+}
+
+/*
+ * To keep this simple mark register 2-6 as being changed (volatile)
+ * by the called function, even though register 6 is saved/nonvolatile.
+ */
+#define CALL_FMT_0 "=&d" (r2)
+#define CALL_FMT_1 "+&d" (r2)
+#define CALL_FMT_2 CALL_FMT_1, "+&d" (r3)
+#define CALL_FMT_3 CALL_FMT_2, "+&d" (r4)
+#define CALL_FMT_4 CALL_FMT_3, "+&d" (r5)
+#define CALL_FMT_5 CALL_FMT_4, "+&d" (r6)
#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory"
#define CALL_CLOBBER_4 CALL_CLOBBER_5
@@ -104,34 +91,113 @@ struct stack_frame {
#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3"
#define CALL_CLOBBER_0 CALL_CLOBBER_1
-#define CALL_ON_STACK(fn, stack, nr, args...) \
+#define CALL_LARGS_0(...) \
+ long dummy = 0
+#define CALL_LARGS_1(t1, a1) \
+ long arg1 = (long)(t1)(a1)
+#define CALL_LARGS_2(t1, a1, t2, a2) \
+ CALL_LARGS_1(t1, a1); \
+ long arg2 = (long)(t2)(a2)
+#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \
+ CALL_LARGS_2(t1, a1, t2, a2); \
+ long arg3 = (long)(t3)(a3)
+#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \
+ CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \
+ long arg4 = (long)(t4)(a4)
+#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \
+ CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \
+ long arg5 = (long)(t5)(a5)
+
+#define CALL_REGS_0 \
+ register long r2 asm("2") = dummy
+#define CALL_REGS_1 \
+ register long r2 asm("2") = arg1
+#define CALL_REGS_2 \
+ CALL_REGS_1; \
+ register long r3 asm("3") = arg2
+#define CALL_REGS_3 \
+ CALL_REGS_2; \
+ register long r4 asm("4") = arg3
+#define CALL_REGS_4 \
+ CALL_REGS_3; \
+ register long r5 asm("5") = arg4
+#define CALL_REGS_5 \
+ CALL_REGS_4; \
+ register long r6 asm("6") = arg5
+
+#define CALL_TYPECHECK_0(...)
+#define CALL_TYPECHECK_1(t, a, ...) \
+ typecheck(t, a)
+#define CALL_TYPECHECK_2(t, a, ...) \
+ CALL_TYPECHECK_1(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_3(t, a, ...) \
+ CALL_TYPECHECK_2(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_4(t, a, ...) \
+ CALL_TYPECHECK_3(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_5(t, a, ...) \
+ CALL_TYPECHECK_4(__VA_ARGS__); \
+ typecheck(t, a)
+
+#define CALL_PARM_0(...) void
+#define CALL_PARM_1(t, a, ...) t
+#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__)
+#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__)
+#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__)
+#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__)
+#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__)
+
+/*
+ * Use call_on_stack() to call a function switching to a specified
+ * stack. Proper sign and zero extension of function arguments is
+ * done. Usage:
+ *
+ * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...)
+ *
+ * - nr specifies the number of function arguments of fn.
+ * - stack specifies the stack to be used.
+ * - fn is the function to be called.
+ * - rettype is the return type of fn.
+ * - t1, a1, ... are pairs, where t1 must match the type of the first
+ * argument of fn, t2 the second, etc. a1 is the corresponding
+ * first function argument (not name), etc.
+ */
+#define call_on_stack(nr, stack, rettype, fn, ...) \
({ \
+ rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \
unsigned long frame = current_frame_address(); \
- CALL_ARGS_##nr(args); \
+ unsigned long __stack = stack; \
unsigned long prev; \
+ CALL_LARGS_##nr(__VA_ARGS__); \
+ CALL_REGS_##nr; \
\
+ CALL_TYPECHECK_##nr(__VA_ARGS__); \
asm volatile( \
- " la %[_prev],0(15)\n" \
+ " lgr %[_prev],15\n" \
" lg 15,%[_stack]\n" \
" stg %[_frame],%[_bc](15)\n" \
" brasl 14,%[_fn]\n" \
- " la 15,0(%[_prev])\n" \
- : [_prev] "=&a" (prev), CALL_FMT_##nr \
- [_stack] "R" (stack), \
+ " lgr 15,%[_prev]\n" \
+ : [_prev] "=&d" (prev), CALL_FMT_##nr \
+ : [_stack] "R" (__stack), \
[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
[_frame] "d" (frame), \
- [_fn] "X" (fn) : CALL_CLOBBER_##nr); \
- r2; \
+ [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \
+ (rettype)r2; \
})
-#define CALL_ON_STACK_NORETURN(fn, stack) \
+#define call_on_stack_noreturn(fn, stack) \
({ \
+ void (*__fn)(void) = fn; \
+ \
asm volatile( \
" la 15,0(%[_stack])\n" \
" xc %[_bc](8,15),%[_bc](15)\n" \
" brasl 14,%[_fn]\n" \
::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
- [_stack] "a" (stack), [_fn] "X" (fn)); \
+ [_stack] "a" (stack), [_fn] "X" (__fn)); \
BUG(); \
})
diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h
index f0ddefb06ec8..4d74d7e33340 100644
--- a/arch/s390/include/asm/stp.h
+++ b/arch/s390/include/asm/stp.h
@@ -6,43 +6,89 @@
#ifndef __S390_STP_H
#define __S390_STP_H
+#include <linux/compiler.h>
+
/* notifier for syncs */
extern struct atomic_notifier_head s390_epoch_delta_notifier;
/* STP interruption parameter */
struct stp_irq_parm {
- unsigned int _pad0 : 14;
- unsigned int tsc : 1; /* Timing status change */
- unsigned int lac : 1; /* Link availability change */
- unsigned int tcpc : 1; /* Time control parameter change */
- unsigned int _pad2 : 15;
-} __attribute__ ((packed));
+ u32 : 14;
+ u32 tsc : 1; /* Timing status change */
+ u32 lac : 1; /* Link availability change */
+ u32 tcpc : 1; /* Time control parameter change */
+ u32 : 15;
+} __packed;
#define STP_OP_SYNC 1
#define STP_OP_CTRL 3
struct stp_sstpi {
- unsigned int rsvd0;
- unsigned int rsvd1 : 8;
- unsigned int stratum : 8;
- unsigned int vbits : 16;
- unsigned int leaps : 16;
- unsigned int tmd : 4;
- unsigned int ctn : 4;
- unsigned int rsvd2 : 3;
- unsigned int c : 1;
- unsigned int tst : 4;
- unsigned int tzo : 16;
- unsigned int dsto : 16;
- unsigned int ctrl : 16;
- unsigned int rsvd3 : 16;
- unsigned int tto;
- unsigned int rsvd4;
- unsigned int ctnid[3];
- unsigned int rsvd5;
- unsigned int todoff[4];
- unsigned int rsvd6[48];
-} __attribute__ ((packed));
+ u32 : 32;
+ u32 tu : 1;
+ u32 lu : 1;
+ u32 : 6;
+ u32 stratum : 8;
+ u32 vbits : 16;
+ u32 leaps : 16;
+ u32 tmd : 4;
+ u32 ctn : 4;
+ u32 : 3;
+ u32 c : 1;
+ u32 tst : 4;
+ u32 tzo : 16;
+ u32 dsto : 16;
+ u32 ctrl : 16;
+ u32 : 16;
+ u32 tto;
+ u32 : 32;
+ u32 ctnid[3];
+ u32 : 32;
+ u64 todoff;
+ u32 rsvd[50];
+} __packed;
+
+struct stp_tzib {
+ u32 tzan : 16;
+ u32 : 16;
+ u32 tzo : 16;
+ u32 dsto : 16;
+ u32 stn;
+ u32 dstn;
+ u64 dst_on_alg;
+ u64 dst_off_alg;
+} __packed;
+
+struct stp_tcpib {
+ u32 atcode : 4;
+ u32 ntcode : 4;
+ u32 d : 1;
+ u32 : 23;
+ s32 tto;
+ struct stp_tzib atzib;
+ struct stp_tzib ntzib;
+ s32 adst_offset : 16;
+ s32 ndst_offset : 16;
+ u32 rsvd1;
+ u64 ntzib_update;
+ u64 ndsto_update;
+} __packed;
+
+struct stp_lsoib {
+ u32 p : 1;
+ u32 : 31;
+ s32 also : 16;
+ s32 nlso : 16;
+ u64 nlsout;
+} __packed;
+
+struct stp_stzi {
+ u32 rsvd0[3];
+ u64 data_ts;
+ u32 rsvd1[22];
+ struct stp_tcpib tcpib;
+ struct stp_lsoib lsoib;
+} __packed;
/* Functions needed by the machine check handler */
int stp_sync_check(void);
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 4c0690fc5167..3fae93ddb322 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n);
#define __HAVE_ARCH_STRCMP /* arch function */
#define __HAVE_ARCH_STRCPY /* inline & arch function */
#define __HAVE_ARCH_STRLCAT /* arch function */
-#define __HAVE_ARCH_STRLCPY /* arch function */
#define __HAVE_ARCH_STRLEN /* inline & arch function */
#define __HAVE_ARCH_STRNCAT /* arch function */
#define __HAVE_ARCH_STRNCPY /* arch function */
#define __HAVE_ARCH_STRNLEN /* inline & arch function */
-#define __HAVE_ARCH_STRRCHR /* arch function */
#define __HAVE_ARCH_STRSTR /* arch function */
/* Prototypes for non-inlined arch strings functions. */
int memcmp(const void *s1, const void *s2, size_t n);
int strcmp(const char *s1, const char *s2);
size_t strlcat(char *dest, const char *src, size_t n);
-size_t strlcpy(char *dest, const char *src, size_t size);
char *strncat(char *dest, const char *src, size_t n);
char *strncpy(char *dest, const char *src, size_t n);
-char *strrchr(const char *s, int c);
char *strstr(const char *s1, const char *s2);
#endif /* !CONFIG_KASAN */
@@ -107,16 +103,18 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t count)
#ifdef __HAVE_ARCH_MEMCHR
static inline void *memchr(const void * s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
" jo 0b\n"
" jl 1f\n"
- " la %0,0\n"
+ " la %[ret],0\n"
"1:"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
return (void *) ret;
}
#endif
@@ -124,13 +122,15 @@ static inline void *memchr(const void * s, int c, size_t n)
#ifdef __HAVE_ARCH_MEMSCAN
static inline void *memscan(void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
" jo 0b\n"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
return (void *) ret;
}
#endif
@@ -138,17 +138,18 @@ static inline void *memscan(void *s, int c, size_t n)
#ifdef __HAVE_ARCH_STRCAT
static inline char *strcat(char *dst, const char *src)
{
- register int r0 asm("0") = 0;
- unsigned long dummy;
+ unsigned long dummy = 0;
char *ret = dst;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[dummy],%[dst]\n"
" jo 0b\n"
- "1: mvst %0,%2\n"
+ "1: mvst %[dummy],%[src]\n"
" jo 1b"
- : "=&a" (dummy), "+a" (dst), "+a" (src)
- : "d" (r0), "0" (0) : "cc", "memory" );
+ : [dummy] "+&a" (dummy), [dst] "+&a" (dst), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
#endif
@@ -156,14 +157,15 @@ static inline char *strcat(char *dst, const char *src)
#ifdef __HAVE_ARCH_STRCPY
static inline char *strcpy(char *dst, const char *src)
{
- register int r0 asm("0") = 0;
char *ret = dst;
asm volatile(
- "0: mvst %0,%1\n"
+ " lghi 0,0\n"
+ "0: mvst %[dst],%[src]\n"
" jo 0b"
- : "+&a" (dst), "+&a" (src) : "d" (r0)
- : "cc", "memory");
+ : [dst] "+&a" (dst), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
#endif
@@ -171,28 +173,33 @@ static inline char *strcpy(char *dst, const char *src)
#if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__))
static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s)
{
- register unsigned long r0 asm("0") = 0;
+ unsigned long end = 0;
const char *tmp = s;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[end],%[tmp]\n"
" jo 0b"
- : "+d" (r0), "+a" (tmp) : : "cc", "memory");
- return r0 - (unsigned long) s;
+ : [end] "+&a" (end), [tmp] "+&a" (tmp)
+ :
+ : "cc", "memory", "0");
+ return end - (unsigned long)s;
}
#endif
#ifdef __HAVE_ARCH_STRNLEN
static inline size_t strnlen(const char * s, size_t n)
{
- register int r0 asm("0") = 0;
const char *tmp = s;
const char *end = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[end],%[tmp]\n"
" jo 0b"
- : "+a" (end), "+a" (tmp) : "d" (r0) : "cc", "memory");
+ : [end] "+&a" (end), [tmp] "+&a" (tmp)
+ :
+ : "cc", "memory", "0");
return end - s;
}
#endif
diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h
index f073292e9fdb..27e3d804b311 100644
--- a/arch/s390/include/asm/syscall.h
+++ b/arch/s390/include/asm/syscall.h
@@ -14,8 +14,8 @@
#include <linux/err.h>
#include <asm/ptrace.h>
-extern const unsigned long sys_call_table[];
-extern const unsigned long sys_call_table_emu[];
+extern const sys_call_ptr_t sys_call_table[];
+extern const sys_call_ptr_t sys_call_table_emu[];
static inline long syscall_get_nr(struct task_struct *task,
struct pt_regs *regs)
@@ -33,7 +33,17 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
- return IS_ERR_VALUE(regs->gprs[2]) ? regs->gprs[2] : 0;
+ unsigned long error = regs->gprs[2];
+#ifdef CONFIG_COMPAT
+ if (test_tsk_thread_flag(task, TIF_31BIT)) {
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ error = (long)(int)error;
+ }
+#endif
+ return IS_ERR_VALUE(error) ? error : 0;
}
static inline long syscall_get_return_value(struct task_struct *task,
@@ -46,6 +56,7 @@ static inline void syscall_set_return_value(struct task_struct *task,
struct pt_regs *regs,
int error, long val)
{
+ set_pt_regs_flag(regs, PIF_SYSCALL_RET_SET);
regs->gprs[2] = error ? error : val;
}
@@ -67,18 +78,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[0] = regs->orig_gpr2 & mask;
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- const unsigned long *args)
-{
- unsigned int n = 6;
-
- while (n-- > 0)
- if (n > 0)
- regs->gprs[2 + n] = args[n];
- regs->orig_gpr2 = args[0];
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_COMPAT
@@ -87,4 +86,69 @@ static inline int syscall_get_arch(struct task_struct *task)
#endif
return AUDIT_ARCH_S390X;
}
+
+static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+ return false;
+}
+
+#define SYSCALL_FMT_0
+#define SYSCALL_FMT_1 , "0" (r2)
+#define SYSCALL_FMT_2 , "d" (r3) SYSCALL_FMT_1
+#define SYSCALL_FMT_3 , "d" (r4) SYSCALL_FMT_2
+#define SYSCALL_FMT_4 , "d" (r5) SYSCALL_FMT_3
+#define SYSCALL_FMT_5 , "d" (r6) SYSCALL_FMT_4
+#define SYSCALL_FMT_6 , "d" (r7) SYSCALL_FMT_5
+
+#define SYSCALL_PARM_0
+#define SYSCALL_PARM_1 , long arg1
+#define SYSCALL_PARM_2 SYSCALL_PARM_1, long arg2
+#define SYSCALL_PARM_3 SYSCALL_PARM_2, long arg3
+#define SYSCALL_PARM_4 SYSCALL_PARM_3, long arg4
+#define SYSCALL_PARM_5 SYSCALL_PARM_4, long arg5
+#define SYSCALL_PARM_6 SYSCALL_PARM_5, long arg6
+
+#define SYSCALL_REGS_0
+#define SYSCALL_REGS_1 \
+ register long r2 asm("2") = arg1
+#define SYSCALL_REGS_2 \
+ SYSCALL_REGS_1; \
+ register long r3 asm("3") = arg2
+#define SYSCALL_REGS_3 \
+ SYSCALL_REGS_2; \
+ register long r4 asm("4") = arg3
+#define SYSCALL_REGS_4 \
+ SYSCALL_REGS_3; \
+ register long r5 asm("5") = arg4
+#define SYSCALL_REGS_5 \
+ SYSCALL_REGS_4; \
+ register long r6 asm("6") = arg5
+#define SYSCALL_REGS_6 \
+ SYSCALL_REGS_5; \
+ register long r7 asm("7") = arg6
+
+#define GENERATE_SYSCALL_FUNC(nr) \
+static __always_inline \
+long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \
+{ \
+ register unsigned long r1 asm ("1") = syscall; \
+ register long rc asm ("2"); \
+ SYSCALL_REGS_##nr; \
+ \
+ asm volatile ( \
+ " svc 0\n" \
+ : "=d" (rc) \
+ : "d" (r1) SYSCALL_FMT_##nr \
+ : "memory"); \
+ return rc; \
+}
+
+GENERATE_SYSCALL_FUNC(0)
+GENERATE_SYSCALL_FUNC(1)
+GENERATE_SYSCALL_FUNC(2)
+GENERATE_SYSCALL_FUNC(3)
+GENERATE_SYSCALL_FUNC(4)
+GENERATE_SYSCALL_FUNC(5)
+GENERATE_SYSCALL_FUNC(6)
+
#endif /* _ASM_SYSCALL_H */
diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h
index 3c3d6fe8e2f0..fde7e6b1df48 100644
--- a/arch/s390/include/asm/syscall_wrapper.h
+++ b/arch/s390/include/asm/syscall_wrapper.h
@@ -7,6 +7,33 @@
#ifndef _ASM_S390_SYSCALL_WRAPPER_H
#define _ASM_S390_SYSCALL_WRAPPER_H
+#define __SC_TYPE(t, a) t
+
+#define SYSCALL_PT_ARG6(regs, m, t1, t2, t3, t4, t5, t6)\
+ SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5), \
+ m(t6, (regs->gprs[7]))
+
+#define SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5) \
+ SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4), \
+ m(t5, (regs->gprs[6]))
+
+#define SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4) \
+ SYSCALL_PT_ARG3(regs, m, t1, t2, t3), \
+ m(t4, (regs->gprs[5]))
+
+#define SYSCALL_PT_ARG3(regs, m, t1, t2, t3) \
+ SYSCALL_PT_ARG2(regs, m, t1, t2), \
+ m(t3, (regs->gprs[4]))
+
+#define SYSCALL_PT_ARG2(regs, m, t1, t2) \
+ SYSCALL_PT_ARG1(regs, m, t1), \
+ m(t2, (regs->gprs[3]))
+
+#define SYSCALL_PT_ARG1(regs, m, t1) \
+ m(t1, (regs->orig_gpr2))
+
+#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__)
+
#ifdef CONFIG_COMPAT
#define __SC_COMPAT_TYPE(t, a) \
__typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a
@@ -29,14 +56,15 @@
(t)__ReS; \
})
-#define __S390_SYS_STUBx(x, name, ...) \
- asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
- ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \
- asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
- { \
- long ret = __s390x_sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__));\
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
+#define __S390_SYS_STUBx(x, name, ...) \
+ long __s390_sys##name(struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \
+ long __s390_sys##name(struct pt_regs *regs) \
+ { \
+ long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \
+ __SC_COMPAT_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ return ret; \
}
/*
@@ -45,17 +73,17 @@
*/
#define COMPAT_SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390_compat_sys_##sname(void); \
- ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \
- asmlinkage long __s390_compat_sys_##sname(void)
+ long __s390_compat_sys_##sname(void); \
+ ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \
+ long __s390_compat_sys_##sname(void)
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390x_sys_##sname(void); \
+ long __s390x_sys_##sname(void); \
ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \
- asmlinkage long __s390_sys_##sname(void) \
+ long __s390_sys_##sname(void) \
__attribute__((alias(__stringify(__s390x_sys_##sname)))); \
- asmlinkage long __s390x_sys_##sname(void)
+ long __s390x_sys_##sname(void)
#define COND_SYSCALL(name) \
cond_syscall(__s390x_sys_##name); \
@@ -65,23 +93,24 @@
SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \
SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers)
-#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
- __diag_push(); \
- __diag_ignore(GCC, 8, "-Wattribute-alias", \
- "Type aliasing is used to sanitize syscall arguments");\
- asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
- asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
- __attribute__((alias(__stringify(__se_compat_sys##name)))); \
- ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \
- static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
- asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
- asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
- { \
- long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
- } \
- __diag_pop(); \
+#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
+ __diag_push(); \
+ __diag_ignore(GCC, 8, "-Wattribute-alias", \
+ "Type aliasing is used to sanitize syscall arguments"); \
+ long __s390_compat_sys##name(struct pt_regs *regs); \
+ long __s390_compat_sys##name(struct pt_regs *regs) \
+ __attribute__((alias(__stringify(__se_compat_sys##name)))); \
+ ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \
+ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
+ long __se_compat_sys##name(struct pt_regs *regs); \
+ long __se_compat_sys##name(struct pt_regs *regs) \
+ { \
+ long ret = __do_compat_sys##name(SYSCALL_PT_ARGS(x, regs, __SC_DELOUSE, \
+ __MAP(x, __SC_TYPE, __VA_ARGS__))); \
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ return ret; \
+ } \
+ __diag_pop(); \
static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
/*
@@ -101,9 +130,9 @@
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390x_sys_##sname(void); \
+ long __s390x_sys_##sname(void); \
ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \
- asmlinkage long __s390x_sys_##sname(void)
+ long __s390x_sys_##sname(void)
#define COND_SYSCALL(name) \
cond_syscall(__s390x_sys_##name)
@@ -113,23 +142,24 @@
#endif /* CONFIG_COMPAT */
-#define __SYSCALL_DEFINEx(x, name, ...) \
- __diag_push(); \
- __diag_ignore(GCC, 8, "-Wattribute-alias", \
- "Type aliasing is used to sanitize syscall arguments");\
- asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
- __attribute__((alias(__stringify(__se_sys##name)))); \
- ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \
- long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
- static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
- __S390_SYS_STUBx(x, name, __VA_ARGS__) \
- asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
- { \
- long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
- } \
- __diag_pop(); \
+#define __SYSCALL_DEFINEx(x, name, ...) \
+ __diag_push(); \
+ __diag_ignore(GCC, 8, "-Wattribute-alias", \
+ "Type aliasing is used to sanitize syscall arguments"); \
+ long __s390x_sys##name(struct pt_regs *regs) \
+ __attribute__((alias(__stringify(__se_sys##name)))); \
+ ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \
+ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
+ long __se_sys##name(struct pt_regs *regs); \
+ __S390_SYS_STUBx(x, name, __VA_ARGS__) \
+ long __se_sys##name(struct pt_regs *regs) \
+ { \
+ long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \
+ __SC_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ return ret; \
+ } \
+ __diag_pop(); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
-#endif /* _ASM_X86_SYSCALL_WRAPPER_H */
+#endif /* _ASM_S390_SYSCALL_WRAPPER_H */
diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index fe7b3f8f0791..ab1c6316055c 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -67,12 +67,12 @@ struct sysinfo_1_2_2 {
unsigned short cpus_configured;
unsigned short cpus_standby;
unsigned short cpus_reserved;
- unsigned short adjustment[0];
+ unsigned short adjustment[];
};
struct sysinfo_1_2_2_extension {
unsigned int alt_capability;
- unsigned short alt_adjustment[0];
+ unsigned short alt_adjustment[];
};
struct sysinfo_2_2_1 {
@@ -181,7 +181,7 @@ struct sysinfo_15_1_x {
unsigned char reserved1;
unsigned char mnest;
unsigned char reserved2[4];
- union topology_entry tle[0];
+ union topology_entry tle[];
};
int stsi(void *sysinfo, int fc, int sel1, int sel2);
diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h
deleted file mode 100644
index 46fa3020b41e..000000000000
--- a/arch/s390/include/asm/termios.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 version
- *
- * Derived from "include/asm-i386/termios.h"
- */
-#ifndef _S390_TERMIOS_H
-#define _S390_TERMIOS_H
-
-#include <uapi/asm/termios.h>
-
-
-/* intr=^C quit=^\ erase=del kill=^U
- eof=^D vtime=\0 vmin=\1 sxtc=\0
- start=^Q stop=^S susp=^Z eol=\0
- reprint=^R discard=^U werase=^W lnext=^V
- eol2=\0
-*/
-#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
-
-#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2))
-#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2))
-
-#include <asm-generic/termios-base.h>
-
-#endif /* _S390_TERMIOS_H */
diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h
new file mode 100644
index 000000000000..b219056a8817
--- /dev/null
+++ b/arch/s390/include/asm/text-patching.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_TEXT_PATCHING_H
+#define _ASM_S390_TEXT_PATCHING_H
+
+#include <asm/barrier.h>
+
+static __always_inline void sync_core(void)
+{
+ bcr_serialize();
+}
+
+void text_poke_sync(void);
+void text_poke_sync_lock(void);
+
+#endif /* _ASM_S390_TEXT_PATCHING_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index e582fbe59e20..b2ffcb4fe000 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -18,13 +18,12 @@
#else
#define THREAD_SIZE_ORDER 2
#endif
-#define BOOT_STACK_ORDER 2
+#define BOOT_STACK_SIZE (PAGE_SIZE << 2)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#ifndef __ASSEMBLY__
#include <asm/lowcore.h>
#include <asm/page.h>
-#include <asm/processor.h>
#define STACK_INIT_OFFSET \
(THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs))
@@ -37,6 +36,8 @@
*/
struct thread_info {
unsigned long flags; /* low level flags */
+ unsigned long syscall_work; /* SYSCALL_WORK_ flags */
+ unsigned int cpu; /* current CPU */
};
/*
@@ -47,6 +48,8 @@ struct thread_info {
.flags = 0, \
}
+struct task_struct;
+
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
@@ -66,8 +69,10 @@ void arch_setup_new_exec(void);
#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */
#define TIF_PATCH_PENDING 5 /* pending live patching update */
#define TIF_PGSTE 6 /* New mm's will use 4K page tables */
+#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */
#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */
#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
+#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */
#define TIF_31BIT 16 /* 32bit process */
#define TIF_MEMDIE 17 /* is terminating due to OOM killer */
@@ -83,6 +88,7 @@ void arch_setup_new_exec(void);
#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */
#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
+#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
#define _TIF_UPROBE BIT(TIF_UPROBE)
@@ -90,6 +96,7 @@ void arch_setup_new_exec(void);
#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
#define _TIF_ISOLATE_BP BIT(TIF_ISOLATE_BP)
#define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST)
+#define _TIF_PER_TRAP BIT(TIF_PER_TRAP)
#define _TIF_31BIT BIT(TIF_31BIT)
#define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP)
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 6bf3a45ccfec..ce878e85b6e4 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -19,6 +19,25 @@
extern u64 clock_comparator_max;
+union tod_clock {
+ __uint128_t val;
+ struct {
+ __uint128_t ei : 8; /* epoch index */
+ __uint128_t tod : 64; /* bits 0-63 of tod clock */
+ __uint128_t : 40;
+ __uint128_t pf : 16; /* programmable field */
+ };
+ struct {
+ __uint128_t eitod : 72; /* epoch index + bits 0-63 tod clock */
+ __uint128_t : 56;
+ };
+ struct {
+ __uint128_t us : 60; /* micro-seconds */
+ __uint128_t sus : 12; /* sub-microseconds */
+ __uint128_t : 56;
+ };
+} __packed;
+
/* Inline functions for clock register access. */
static inline int set_tod_clock(__u64 time)
{
@@ -32,26 +51,36 @@ static inline int set_tod_clock(__u64 time)
return cc;
}
-static inline int store_tod_clock(__u64 *time)
+static inline int store_tod_clock_ext_cc(union tod_clock *clk)
{
int cc;
asm volatile(
- " stck %1\n"
+ " stcke %1\n"
" ipm %0\n"
" srl %0,28\n"
- : "=d" (cc), "=Q" (*time) : : "cc");
+ : "=d" (cc), "=Q" (*clk) : : "cc");
return cc;
}
+static inline void store_tod_clock_ext(union tod_clock *tod)
+{
+ asm volatile("stcke %0" : "=Q" (*tod) : : "cc");
+}
+
static inline void set_clock_comparator(__u64 time)
{
asm volatile("sckc %0" : : "Q" (time));
}
-static inline void store_clock_comparator(__u64 *time)
+static inline void set_tod_programmable_field(u16 val)
{
- asm volatile("stckc %0" : "=Q" (*time));
+ asm volatile(
+ " lgr 0,%[val]\n"
+ " sckpf\n"
+ :
+ : [val] "d" ((unsigned long)val)
+ : "0");
}
void clock_comparator_work(void);
@@ -72,10 +101,10 @@ extern unsigned char ptff_function_mask[16];
/* Query TOD offset result */
struct ptff_qto {
- unsigned long long physical_clock;
- unsigned long long tod_offset;
- unsigned long long logical_tod_offset;
- unsigned long long tod_epoch_difference;
+ unsigned long physical_clock;
+ unsigned long tod_offset;
+ unsigned long logical_tod_offset;
+ unsigned long tod_epoch_difference;
} __packed;
static inline int ptff_query(unsigned int nr)
@@ -112,22 +141,25 @@ struct ptff_qui {
#define ptff(ptff_block, len, func) \
({ \
struct addrtype { char _[len]; }; \
- register unsigned int reg0 asm("0") = func; \
- register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\
+ unsigned int reg0 = func; \
+ unsigned long reg1 = (unsigned long)(ptff_block); \
int rc; \
\
asm volatile( \
- " .word 0x0104\n" \
- " ipm %0\n" \
- " srl %0,28\n" \
- : "=d" (rc), "+m" (*(struct addrtype *) reg1) \
- : "d" (reg0), "d" (reg1) : "cc"); \
+ " lgr 0,%[reg0]\n" \
+ " lgr 1,%[reg1]\n" \
+ " ptff\n" \
+ " ipm %[rc]\n" \
+ " srl %[rc],28\n" \
+ : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \
+ : [reg0] "d" (reg0), [reg1] "d" (reg1) \
+ : "cc", "0", "1"); \
rc; \
})
-static inline unsigned long long local_tick_disable(void)
+static inline unsigned long local_tick_disable(void)
{
- unsigned long long old;
+ unsigned long old;
old = S390_lowcore.clock_comparator;
S390_lowcore.clock_comparator = clock_comparator_max;
@@ -135,53 +167,42 @@ static inline unsigned long long local_tick_disable(void)
return old;
}
-static inline void local_tick_enable(unsigned long long comp)
+static inline void local_tick_enable(unsigned long comp)
{
S390_lowcore.clock_comparator = comp;
set_clock_comparator(S390_lowcore.clock_comparator);
}
#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */
-#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */
-typedef unsigned long long cycles_t;
+typedef unsigned long cycles_t;
-static inline void get_tod_clock_ext(char *clk)
+static inline unsigned long get_tod_clock(void)
{
- typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype;
+ union tod_clock clk;
- asm volatile("stcke %0" : "=Q" (*(addrtype *) clk) : : "cc");
+ store_tod_clock_ext(&clk);
+ return clk.tod;
}
-static inline unsigned long long get_tod_clock(void)
+static inline unsigned long get_tod_clock_fast(void)
{
- char clk[STORE_CLOCK_EXT_SIZE];
-
- get_tod_clock_ext(clk);
- return *((unsigned long long *)&clk[1]);
-}
-
-static inline unsigned long long get_tod_clock_fast(void)
-{
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
- unsigned long long clk;
+ unsigned long clk;
asm volatile("stckf %0" : "=Q" (clk) : : "cc");
return clk;
-#else
- return get_tod_clock();
-#endif
}
static inline cycles_t get_cycles(void)
{
return (cycles_t) get_tod_clock() >> 2;
}
+#define get_cycles get_cycles
int get_phys_clock(unsigned long *clock);
void init_cpu_timer(void);
-extern unsigned char tod_clock_base[16] __aligned(8);
+extern union tod_clock tod_clock_base;
/**
* get_clock_monotonic - returns current time in clock rate units
@@ -190,12 +211,12 @@ extern unsigned char tod_clock_base[16] __aligned(8);
* Therefore preemption must be disabled, otherwise the returned
* value is not guaranteed to be monotonic.
*/
-static inline unsigned long long get_tod_clock_monotonic(void)
+static inline unsigned long get_tod_clock_monotonic(void)
{
- unsigned long long tod;
+ unsigned long tod;
preempt_disable_notrace();
- tod = get_tod_clock() - *(unsigned long long *) &tod_clock_base[1];
+ tod = get_tod_clock() - tod_clock_base.tod;
preempt_enable_notrace();
return tod;
}
@@ -219,7 +240,7 @@ static inline unsigned long long get_tod_clock_monotonic(void)
* -> ns = (th * 125) + ((tl * 125) >> 9);
*
*/
-static inline unsigned long long tod_to_ns(unsigned long long todval)
+static inline unsigned long tod_to_ns(unsigned long todval)
{
return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
}
@@ -231,10 +252,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval)
*
* Returns: true if a is later than b
*/
-static inline int tod_after(unsigned long long a, unsigned long long b)
+static inline int tod_after(unsigned long a, unsigned long b)
{
if (MACHINE_HAS_SCC)
- return (long long) a > (long long) b;
+ return (long) a > (long) b;
return a > b;
}
@@ -245,10 +266,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b)
*
* Returns: true if a is later than b
*/
-static inline int tod_after_eq(unsigned long long a, unsigned long long b)
+static inline int tod_after_eq(unsigned long a, unsigned long b)
{
if (MACHINE_HAS_SCC)
- return (long long) a >= (long long) b;
+ return (long) a >= (long) b;
return a >= b;
}
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index aa406c05a350..3a5c8fb590e5 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -27,16 +27,12 @@ static inline void tlb_flush(struct mmu_gather *tlb);
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size);
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
#define tlb_flush tlb_flush
#define pte_free_tlb pte_free_tlb
#define pmd_free_tlb pmd_free_tlb
#define p4d_free_tlb p4d_free_tlb
#define pud_free_tlb pud_free_tlb
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
@@ -67,7 +63,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_ptes = 1;
+ tlb->cleared_pmds = 1;
/*
* page_table_free_rcu takes care of the allocation bit masks
* of the 2K table fragments in the 4K page table page,
@@ -111,7 +107,6 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_p4ds = 1;
tlb_remove_table(tlb, p4d);
}
@@ -129,7 +124,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
return;
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_puds = 1;
+ tlb->cleared_p4ds = 1;
tlb_remove_table(tlb, pud);
}
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 82703e03f35d..a6e2cd89b609 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -5,8 +5,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/processor.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Flush all TLB entries on the local CPU.
@@ -27,13 +25,9 @@ static inline void __tlb_flush_idte(unsigned long asce)
if (MACHINE_HAS_TLB_GUEST)
opt |= IDTE_GUEST_ASCE;
/* Global TLB flush for the mm */
- asm volatile(
- " .insn rrf,0xb98e0000,0,%0,%1,0"
- : : "a" (opt), "a" (asce) : "cc");
+ asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc");
}
-void smp_ptlb_all(void);
-
/*
* Flush all TLB entries on all CPUs.
*/
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index cca406fdbe51..3a0ac0c7a9a3 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -16,8 +16,8 @@ struct cpu_topology_s390 {
unsigned short socket_id;
unsigned short book_id;
unsigned short drawer_id;
- unsigned short node_id;
unsigned short dedicated : 1;
+ int booted_cores;
cpumask_t thread_mask;
cpumask_t core_mask;
cpumask_t book_mask;
@@ -25,7 +25,6 @@ struct cpu_topology_s390 {
};
extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
-extern cpumask_t cpus_with_topology;
#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id)
@@ -37,6 +36,7 @@ extern cpumask_t cpus_with_topology;
#define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id)
#define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask)
#define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated)
+#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores)
#define mc_capable() 1
@@ -45,6 +45,7 @@ int topology_cpu_init(struct cpu *);
int topology_set_cpu_management(int fc);
void topology_schedule_update(void);
void store_topology(struct sysinfo_15_1_x *info);
+void update_cpu_masks(void);
void topology_expect_change(void);
const struct cpumask *cpu_coregroup_mask(int cpu);
@@ -54,6 +55,8 @@ static inline void topology_init_early(void) { }
static inline void topology_schedule_update(void) { }
static inline int topology_cpu_init(struct cpu *cpu) { return 0; }
static inline int topology_cpu_dedicated(int cpu_nr) { return 0; }
+static inline int topology_booted_cores(int cpu_nr) { return 1; }
+static inline void update_cpu_masks(void) { }
static inline void topology_expect_change(void) { }
#endif /* CONFIG_SCHED_TOPOLOGY */
@@ -71,20 +74,18 @@ static inline void topology_expect_change(void) { }
#define cpu_to_node cpu_to_node
static inline int cpu_to_node(int cpu)
{
- return cpu_topology[cpu].node_id;
+ return 0;
}
/* Returns a pointer to the cpumask of CPUs on node 'node'. */
#define cpumask_of_node cpumask_of_node
static inline const struct cpumask *cpumask_of_node(int node)
{
- return &node_to_cpumask_map[node];
+ return cpu_possible_mask;
}
#define pcibus_to_node(bus) __pcibus_to_node(bus)
-#define node_distance(a, b) __node_distance(a, b)
-
#else /* !CONFIG_NUMA */
#define numa_node_id numa_node_id
diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h
new file mode 100644
index 000000000000..f76e5fdff23a
--- /dev/null
+++ b/arch/s390/include/asm/tpi.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_S390_TPI_H
+#define _ASM_S390_TPI_H
+
+#include <linux/types.h>
+#include <uapi/asm/schid.h>
+
+#ifndef __ASSEMBLY__
+
+/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). */
+struct tpi_info {
+ struct subchannel_id schid;
+ u32 intparm;
+ u32 adapter_IO:1;
+ u32 directed_irq:1;
+ u32 isc:3;
+ u32 :12;
+ u32 type:3;
+ u32 :12;
+} __packed __aligned(4);
+
+/* I/O-Interruption Code as stored by TPI for an Adapter I/O */
+struct tpi_adapter_info {
+ u32 aism:8;
+ u32 :22;
+ u32 error:1;
+ u32 forward:1;
+ u32 reserved;
+ u32 adapter_IO:1;
+ u32 directed_irq:1;
+ u32 isc:3;
+ u32 :27;
+} __packed __aligned(4);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_TPI_H */
diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h
new file mode 100644
index 000000000000..0b5d550a0478
--- /dev/null
+++ b/arch/s390/include/asm/types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _ASM_S390_TYPES_H
+#define _ASM_S390_TYPES_H
+
+#include <uapi/asm/types.h>
+
+#ifndef __ASSEMBLY__
+
+union register_pair {
+ unsigned __int128 pair;
+ struct {
+ unsigned long even;
+ unsigned long odd;
+ };
+};
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_S390_TYPES_H */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index a470f1fa9f2a..f7038b800cc3 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -3,7 +3,7 @@
* S390 version
* Copyright IBM Corp. 1999, 2000
* Author(s): Hartmut Penner (hp@de.ibm.com),
- * Martin Schwidefsky (schwidefsky@de.ibm.com)
+ * Martin Schwidefsky (schwidefsky@de.ibm.com)
*
* Derived from "include/asm-i386/uaccess.h"
*/
@@ -13,41 +13,14 @@
/*
* User space memory access functions
*/
+#include <asm/asm-extable.h>
#include <asm/processor.h>
#include <asm/ctl_reg.h>
#include <asm/extable.h>
#include <asm/facility.h>
+#include <asm-generic/access_ok.h>
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not. If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- */
-
-#define KERNEL_DS (0)
-#define KERNEL_DS_SACF (1)
-#define USER_DS (2)
-#define USER_DS_SACF (3)
-
-#define get_fs() (current->thread.mm_segment)
-#define segment_eq(a,b) (((a) & 2) == ((b) & 2))
-
-void set_fs(mm_segment_t fs);
-
-static inline int __range_ok(unsigned long addr, unsigned long size)
-{
- return 1;
-}
-
-#define __access_ok(addr, size) \
-({ \
- __chk_user_ptr(addr); \
- __range_ok((unsigned long)(addr), (size)); \
-})
-
-#define access_ok(addr, size) __access_ok(addr, size)
+void debug_user_asce(int exit);
unsigned long __must_check
raw_copy_from_user(void *to, const void __user *from, unsigned long n);
@@ -60,209 +33,246 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n);
#define INLINE_COPY_TO_USER
#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
-
-#define __put_get_user_asm(to, from, size, spec) \
-({ \
- register unsigned long __reg0 asm("0") = spec; \
- int __rc; \
- \
- asm volatile( \
- "0: mvcos %1,%3,%2\n" \
- "1: xr %0,%0\n" \
- "2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: lhi %0,%5\n" \
- " jg 2b\n" \
- ".popsection\n" \
- EX_TABLE(0b,3b) EX_TABLE(1b,3b) \
- : "=d" (__rc), "+Q" (*(to)) \
- : "d" (size), "Q" (*(from)), \
- "d" (__reg0), "K" (-EFAULT) \
- : "cc"); \
- __rc; \
+unsigned long __must_check
+_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key);
+
+static __always_inline unsigned long __must_check
+copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key)
+{
+ if (check_copy_size(to, n, false))
+ n = _copy_from_user_key(to, from, n, key);
+ return n;
+}
+
+unsigned long __must_check
+_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key);
+
+static __always_inline unsigned long __must_check
+copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key)
+{
+ if (check_copy_size(from, n, true))
+ n = _copy_to_user_key(to, from, n, key);
+ return n;
+}
+
+union oac {
+ unsigned int val;
+ struct {
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac1;
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac2;
+ };
+};
+
+int __noreturn __put_user_bad(void);
+
+#define __put_user_asm(to, from, size) \
+({ \
+ union oac __oac_spec = { \
+ .oac1.as = PSW_BITS_AS_SECONDARY, \
+ .oac1.a = 1, \
+ }; \
+ int __rc; \
+ \
+ asm volatile( \
+ " lr 0,%[spec]\n" \
+ "0: mvcos %[_to],%[_from],%[_size]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_STORE(0b, 2b, %[rc]) \
+ EX_TABLE_UA_STORE(1b, 2b, %[rc]) \
+ : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \
+ : [_size] "d" (size), [_from] "Q" (*(from)), \
+ [spec] "d" (__oac_spec.val) \
+ : "cc", "0"); \
+ __rc; \
})
static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
{
- unsigned long spec = 0x010000UL;
int rc;
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char __user *)ptr,
- (unsigned char *)x,
- size, spec);
+ rc = __put_user_asm((unsigned char __user *)ptr,
+ (unsigned char *)x,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short __user *)ptr,
- (unsigned short *)x,
- size, spec);
+ rc = __put_user_asm((unsigned short __user *)ptr,
+ (unsigned short *)x,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int __user *)ptr,
- (unsigned int *)x,
- size, spec);
+ rc = __put_user_asm((unsigned int __user *)ptr,
+ (unsigned int *)x,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long __user *)ptr,
- (unsigned long *)x,
- size, spec);
+ rc = __put_user_asm((unsigned long __user *)ptr,
+ (unsigned long *)x,
+ size);
+ break;
+ default:
+ __put_user_bad();
break;
}
return rc;
}
+int __noreturn __get_user_bad(void);
+
+#define __get_user_asm(to, from, size) \
+({ \
+ union oac __oac_spec = { \
+ .oac2.as = PSW_BITS_AS_SECONDARY, \
+ .oac2.a = 1, \
+ }; \
+ int __rc; \
+ \
+ asm volatile( \
+ " lr 0,%[spec]\n" \
+ "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \
+ EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \
+ : [rc] "=&d" (__rc), "=Q" (*(to)) \
+ : [_size] "d" (size), [_from] "Q" (*(from)), \
+ [spec] "d" (__oac_spec.val), [_to] "a" (to), \
+ [_ksize] "K" (size) \
+ : "cc", "0"); \
+ __rc; \
+})
+
static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
{
- unsigned long spec = 0x01UL;
int rc;
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char *)x,
- (unsigned char __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned char *)x,
+ (unsigned char __user *)ptr,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short *)x,
- (unsigned short __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned short *)x,
+ (unsigned short __user *)ptr,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int *)x,
- (unsigned int __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned int *)x,
+ (unsigned int __user *)ptr,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long *)x,
- (unsigned long __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned long *)x,
+ (unsigned long __user *)ptr,
+ size);
+ break;
+ default:
+ __get_user_bad();
break;
}
return rc;
}
-#else /* CONFIG_HAVE_MARCH_Z10_FEATURES */
-
-static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
-{
- size = raw_copy_to_user(ptr, x, size);
- return size ? -EFAULT : 0;
-}
-
-static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
-{
- size = raw_copy_from_user(x, ptr, size);
- return size ? -EFAULT : 0;
-}
-
-#endif /* CONFIG_HAVE_MARCH_Z10_FEATURES */
-
/*
* These are the main single-value transfer routines. They automatically
* use the right size if we just have the right pointer type.
*/
-#define __put_user(x, ptr) \
-({ \
- __typeof__(*(ptr)) __x = (x); \
- int __pu_err = -EFAULT; \
- __chk_user_ptr(ptr); \
- switch (sizeof (*(ptr))) { \
- case 1: \
- case 2: \
- case 4: \
- case 8: \
- __pu_err = __put_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- break; \
- default: \
- __put_user_bad(); \
- break; \
- } \
- __builtin_expect(__pu_err, 0); \
+#define __put_user(x, ptr) \
+({ \
+ __typeof__(*(ptr)) __x = (x); \
+ int __pu_err = -EFAULT; \
+ \
+ __chk_user_ptr(ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ case 2: \
+ case 4: \
+ case 8: \
+ __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ break; \
+ } \
+ __builtin_expect(__pu_err, 0); \
})
-#define put_user(x, ptr) \
-({ \
- might_fault(); \
- __put_user(x, ptr); \
+#define put_user(x, ptr) \
+({ \
+ might_fault(); \
+ __put_user(x, ptr); \
})
-
-int __put_user_bad(void) __attribute__((noreturn));
-
-#define __get_user(x, ptr) \
-({ \
- int __gu_err = -EFAULT; \
- __chk_user_ptr(ptr); \
- switch (sizeof(*(ptr))) { \
- case 1: { \
- unsigned char __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 2: { \
- unsigned short __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 4: { \
- unsigned int __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 8: { \
- unsigned long long __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- default: \
- __get_user_bad(); \
- break; \
- } \
- __builtin_expect(__gu_err, 0); \
+#define __get_user(x, ptr) \
+({ \
+ int __gu_err = -EFAULT; \
+ \
+ __chk_user_ptr(ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: { \
+ unsigned char __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 2: { \
+ unsigned short __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 4: { \
+ unsigned int __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 8: { \
+ unsigned long __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ default: \
+ __get_user_bad(); \
+ break; \
+ } \
+ __builtin_expect(__gu_err, 0); \
})
-#define get_user(x, ptr) \
-({ \
- might_fault(); \
- __get_user(x, ptr); \
+#define get_user(x, ptr) \
+({ \
+ might_fault(); \
+ __get_user(x, ptr); \
})
-int __get_user_bad(void) __attribute__((noreturn));
-
-unsigned long __must_check
-raw_copy_in_user(void __user *to, const void __user *from, unsigned long n);
-
/*
* Copy a null terminated string from userspace.
*/
+long __must_check strncpy_from_user(char *dst, const char __user *src, long count);
-long __strncpy_from_user(char *dst, const char __user *src, long count);
-
-static inline long __must_check
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
- might_fault();
- return __strncpy_from_user(dst, src, count);
-}
-
-unsigned long __must_check __strnlen_user(const char __user *src, unsigned long count);
-
-static inline unsigned long strnlen_user(const char __user *src, unsigned long n)
-{
- might_fault();
- return __strnlen_user(src, n);
-}
+long __must_check strnlen_user(const char __user *src, long count);
/*
* Zero Userspace
@@ -275,7 +285,109 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo
return __clear_user(to, n);
}
-int copy_to_user_real(void __user *dest, void *src, unsigned long count);
-void s390_kernel_write(void *dst, const void *src, size_t size);
+void *s390_kernel_write(void *dst, const void *src, size_t size);
+
+int __noreturn __put_kernel_bad(void);
+
+#define __put_kernel_asm(val, to, insn) \
+({ \
+ int __rc; \
+ \
+ asm volatile( \
+ "0: " insn " %[_val],%[_to]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_STORE(0b, 2b, %[rc]) \
+ EX_TABLE_UA_STORE(1b, 2b, %[rc]) \
+ : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \
+ : [_val] "d" (val) \
+ : "cc"); \
+ __rc; \
+})
+
+#define __put_kernel_nofault(dst, src, type, err_label) \
+do { \
+ unsigned long __x = (unsigned long)(*((type *)(src))); \
+ int __pk_err; \
+ \
+ switch (sizeof(type)) { \
+ case 1: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \
+ break; \
+ case 2: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \
+ break; \
+ case 4: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \
+ break; \
+ case 8: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \
+ break; \
+ default: \
+ __pk_err = __put_kernel_bad(); \
+ break; \
+ } \
+ if (unlikely(__pk_err)) \
+ goto err_label; \
+} while (0)
+
+int __noreturn __get_kernel_bad(void);
+
+#define __get_kernel_asm(val, from, insn) \
+({ \
+ int __rc; \
+ \
+ asm volatile( \
+ "0: " insn " %[_val],%[_from]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \
+ EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \
+ : [rc] "=d" (__rc), [_val] "=d" (val) \
+ : [_from] "Q" (*(from)) \
+ : "cc"); \
+ __rc; \
+})
+
+#define __get_kernel_nofault(dst, src, type, err_label) \
+do { \
+ int __gk_err; \
+ \
+ switch (sizeof(type)) { \
+ case 1: { \
+ unsigned char __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 2: { \
+ unsigned short __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 4: { \
+ unsigned int __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 8: { \
+ unsigned long __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ default: \
+ __gk_err = __get_kernel_bad(); \
+ break; \
+ } \
+ if (unlikely(__gk_err)) \
+ goto err_label; \
+} while (0)
#endif /* __S390_UACCESS_H */
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 9e9f75ef046a..4260bc5ce7f8 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -28,6 +28,7 @@
#define __ARCH_WANT_SYS_SIGPENDING
#define __ARCH_WANT_SYS_SIGPROCMASK
# ifdef CONFIG_COMPAT
+# define __ARCH_WANT_COMPAT_STAT
# define __ARCH_WANT_SYS_TIME32
# define __ARCH_WANT_SYS_UTIME32
# endif
diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h
index de9006b0cfeb..02462e7100c1 100644
--- a/arch/s390/include/asm/unwind.h
+++ b/arch/s390/include/asm/unwind.h
@@ -4,6 +4,8 @@
#include <linux/sched.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
+#include <linux/llist.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
@@ -36,10 +38,21 @@ struct unwind_state {
struct pt_regs *regs;
unsigned long sp, ip;
int graph_idx;
+ struct llist_node *kr_cur;
bool reliable;
bool error;
};
+/* Recover the return address modified by kretprobe and ftrace_graph. */
+static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long ip)
+{
+ ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp);
+ if (is_kretprobe_trampoline(ip))
+ ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur);
+ return ip;
+}
+
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long first_frame);
bool unwind_next_frame(struct unwind_state *state);
@@ -55,10 +68,10 @@ static inline bool unwind_error(struct unwind_state *state)
return state->error;
}
-static inline void unwind_start(struct unwind_state *state,
- struct task_struct *task,
- struct pt_regs *regs,
- unsigned long first_frame)
+static __always_inline void unwind_start(struct unwind_state *state,
+ struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long first_frame)
{
task = task ?: current;
first_frame = first_frame ?: get_stack_pointer(task, regs);
diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h
index 0ca572ced21b..8e8aaf48582e 100644
--- a/arch/s390/include/asm/user.h
+++ b/arch/s390/include/asm/user.h
@@ -67,9 +67,5 @@ struct user {
unsigned long magic; /* To uniquely identify a core file */
char u_comm[32]; /* User command that was responsible */
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _S390_USER_H */
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 4093a2856929..be3ef9dd6972 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -2,7 +2,7 @@
/*
* Ultravisor Interfaces
*
- * Copyright IBM Corp. 2019
+ * Copyright IBM Corp. 2019, 2022
*
* Author(s):
* Vasily Gorbik <gor@linux.ibm.com>
@@ -14,23 +14,83 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/bug.h>
+#include <linux/sched.h>
#include <asm/page.h>
+#include <asm/gmap.h>
+
+#define UVC_CC_OK 0
+#define UVC_CC_ERROR 1
+#define UVC_CC_BUSY 2
+#define UVC_CC_PARTIAL 3
#define UVC_RC_EXECUTED 0x0001
#define UVC_RC_INV_CMD 0x0002
#define UVC_RC_INV_STATE 0x0003
#define UVC_RC_INV_LEN 0x0005
#define UVC_RC_NO_RESUME 0x0007
+#define UVC_RC_NEED_DESTROY 0x8000
#define UVC_CMD_QUI 0x0001
+#define UVC_CMD_INIT_UV 0x000f
+#define UVC_CMD_CREATE_SEC_CONF 0x0100
+#define UVC_CMD_DESTROY_SEC_CONF 0x0101
+#define UVC_CMD_CREATE_SEC_CPU 0x0120
+#define UVC_CMD_DESTROY_SEC_CPU 0x0121
+#define UVC_CMD_CONV_TO_SEC_STOR 0x0200
+#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201
+#define UVC_CMD_DESTR_SEC_STOR 0x0202
+#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300
+#define UVC_CMD_UNPACK_IMG 0x0301
+#define UVC_CMD_VERIFY_IMG 0x0302
+#define UVC_CMD_CPU_RESET 0x0310
+#define UVC_CMD_CPU_RESET_INITIAL 0x0311
+#define UVC_CMD_PREPARE_RESET 0x0320
+#define UVC_CMD_CPU_RESET_CLEAR 0x0321
+#define UVC_CMD_CPU_SET_STATE 0x0330
+#define UVC_CMD_SET_UNSHARE_ALL 0x0340
+#define UVC_CMD_PIN_PAGE_SHARED 0x0341
+#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342
+#define UVC_CMD_DUMP_INIT 0x0400
+#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401
+#define UVC_CMD_DUMP_CPU 0x0402
+#define UVC_CMD_DUMP_COMPLETE 0x0403
#define UVC_CMD_SET_SHARED_ACCESS 0x1000
#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001
+#define UVC_CMD_RETR_ATTEST 0x1020
/* Bits in installed uv calls */
enum uv_cmds_inst {
BIT_UVC_CMD_QUI = 0,
+ BIT_UVC_CMD_INIT_UV = 1,
+ BIT_UVC_CMD_CREATE_SEC_CONF = 2,
+ BIT_UVC_CMD_DESTROY_SEC_CONF = 3,
+ BIT_UVC_CMD_CREATE_SEC_CPU = 4,
+ BIT_UVC_CMD_DESTROY_SEC_CPU = 5,
+ BIT_UVC_CMD_CONV_TO_SEC_STOR = 6,
+ BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7,
BIT_UVC_CMD_SET_SHARED_ACCESS = 8,
BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9,
+ BIT_UVC_CMD_SET_SEC_PARMS = 11,
+ BIT_UVC_CMD_UNPACK_IMG = 13,
+ BIT_UVC_CMD_VERIFY_IMG = 14,
+ BIT_UVC_CMD_CPU_RESET = 15,
+ BIT_UVC_CMD_CPU_RESET_INITIAL = 16,
+ BIT_UVC_CMD_CPU_SET_STATE = 17,
+ BIT_UVC_CMD_PREPARE_RESET = 18,
+ BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19,
+ BIT_UVC_CMD_UNSHARE_ALL = 20,
+ BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
+ BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
+ BIT_UVC_CMD_DUMP_INIT = 24,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
+ BIT_UVC_CMD_DUMP_CPU = 26,
+ BIT_UVC_CMD_DUMP_COMPLETE = 27,
+ BIT_UVC_CMD_RETR_ATTEST = 28,
+};
+
+enum uv_feat_ind {
+ BIT_UV_FEAT_MISC = 0,
+ BIT_UV_FEAT_AIV = 1,
};
struct uv_cb_header {
@@ -40,13 +100,137 @@ struct uv_cb_header {
u16 rrc; /* Return Reason Code */
} __packed __aligned(8);
+/* Query Ultravisor Information */
struct uv_cb_qui {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08; /* 0x0008 */
+ u64 inst_calls_list[4]; /* 0x0010 */
+ u64 reserved30[2]; /* 0x0030 */
+ u64 uv_base_stor_len; /* 0x0040 */
+ u64 reserved48; /* 0x0048 */
+ u64 conf_base_phys_stor_len; /* 0x0050 */
+ u64 conf_base_virt_stor_len; /* 0x0058 */
+ u64 conf_virt_var_stor_len; /* 0x0060 */
+ u64 cpu_stor_len; /* 0x0068 */
+ u32 reserved70[3]; /* 0x0070 */
+ u32 max_num_sec_conf; /* 0x007c */
+ u64 max_guest_stor_addr; /* 0x0080 */
+ u8 reserved88[158 - 136]; /* 0x0088 */
+ u16 max_guest_cpu_id; /* 0x009e */
+ u64 uv_feature_indications; /* 0x00a0 */
+ u64 reserveda8; /* 0x00a8 */
+ u64 supp_se_hdr_versions; /* 0x00b0 */
+ u64 supp_se_hdr_pcf; /* 0x00b8 */
+ u64 reservedc0; /* 0x00c0 */
+ u64 conf_dump_storage_state_len; /* 0x00c8 */
+ u64 conf_dump_finalize_len; /* 0x00d0 */
+ u64 reservedd8; /* 0x00d8 */
+ u64 supp_att_req_hdr_ver; /* 0x00e0 */
+ u64 supp_att_pflags; /* 0x00e8 */
+ u8 reservedf0[256 - 240]; /* 0x00f0 */
+} __packed __aligned(8);
+
+/* Initialize Ultravisor */
+struct uv_cb_init {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 stor_origin;
+ u64 stor_len;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+/* Create Guest Configuration */
+struct uv_cb_cgc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 conf_base_stor_origin;
+ u64 conf_virt_stor_origin;
+ u64 reserved30;
+ u64 guest_stor_origin;
+ u64 guest_stor_len;
+ u64 guest_sca;
+ u64 guest_asce;
+ u64 reserved58[5];
+} __packed __aligned(8);
+
+/* Create Secure CPU */
+struct uv_cb_csc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u64 guest_handle;
+ u64 stor_origin;
+ u8 reserved30[6];
+ u16 num;
+ u64 state_origin;
+ u64 reserved40[4];
+} __packed __aligned(8);
+
+/* Convert to Secure */
+struct uv_cb_cts {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 gaddr;
+} __packed __aligned(8);
+
+/* Convert from Secure / Pin Page Shared */
+struct uv_cb_cfs {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 paddr;
+} __packed __aligned(8);
+
+/* Set Secure Config Parameter */
+struct uv_cb_ssc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 sec_header_origin;
+ u32 sec_header_len;
+ u32 reserved2c;
+ u64 reserved30[4];
+} __packed __aligned(8);
+
+/* Unpack */
+struct uv_cb_unp {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 gaddr;
+ u64 tweak[2];
+ u64 reserved38[3];
+} __packed __aligned(8);
+
+#define PV_CPU_STATE_OPR 1
+#define PV_CPU_STATE_STP 2
+#define PV_CPU_STATE_CHKSTP 3
+#define PV_CPU_STATE_OPR_LOAD 5
+
+struct uv_cb_cpu_set_state {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u8 reserved20[7];
+ u8 state;
+ u64 reserved28[5];
+};
+
+/*
+ * A common UV call struct for calls that take no payload
+ * Examples:
+ * Destroy cpu/config
+ * Verify
+ */
+struct uv_cb_nodata {
struct uv_cb_header header;
- u64 reserved08;
- u64 inst_calls_list[4];
- u64 reserved30[15];
+ u64 reserved08[2];
+ u64 handle;
+ u64 reserved20[4];
} __packed __aligned(8);
+/* Set Shared Access */
struct uv_cb_share {
struct uv_cb_header header;
u64 reserved08[3];
@@ -54,21 +238,127 @@ struct uv_cb_share {
u64 reserved28;
} __packed __aligned(8);
-static inline int uv_call(unsigned long r1, unsigned long r2)
+/* Retrieve Attestation Measurement */
+struct uv_cb_attest {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08[2]; /* 0x0008 */
+ u64 arcb_addr; /* 0x0018 */
+ u64 cont_token; /* 0x0020 */
+ u8 reserved28[6]; /* 0x0028 */
+ u16 user_data_len; /* 0x002e */
+ u8 user_data[256]; /* 0x0030 */
+ u32 reserved130[3]; /* 0x0130 */
+ u32 meas_len; /* 0x013c */
+ u64 meas_addr; /* 0x0140 */
+ u8 config_uid[16]; /* 0x0148 */
+ u32 reserved158; /* 0x0158 */
+ u32 add_data_len; /* 0x015c */
+ u64 add_data_addr; /* 0x0160 */
+ u64 reserved168[4]; /* 0x0168 */
+} __packed __aligned(8);
+
+struct uv_cb_dump_cpu {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u64 dump_area_origin;
+ u64 reserved28[5];
+} __packed __aligned(8);
+
+/*
+ * UV control block carrying a configuration handle, a dump area origin
+ * and a guest address.
+ *
+ * Reserved members are named after their byte offset; the trailing pad
+ * starts at 0x30, hence reserved30 (it was misnamed reserved28).
+ */
+struct uv_cb_dump_stor_state {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08[2]; /* 0x0008 */
+ u64 config_handle; /* 0x0018 */
+ u64 dump_area_origin; /* 0x0020 */
+ u64 gaddr; /* 0x0028 */
+ u64 reserved30[4]; /* 0x0030 */
+} __packed __aligned(8);
+
+/*
+ * UV control block carrying a configuration handle and a dump area origin.
+ *
+ * Reserved members are named after their byte offset; the trailing pad
+ * starts at 0x28, hence reserved28 (it was misnamed reserved30).
+ */
+struct uv_cb_dump_complete {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08[2]; /* 0x0008 */
+ u64 config_handle; /* 0x0018 */
+ u64 dump_area_origin; /* 0x0020 */
+ u64 reserved28[5]; /* 0x0028 */
+} __packed __aligned(8);
+
+static inline int __uv_call(unsigned long r1, unsigned long r2)
{
int cc;
asm volatile(
- "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
- " brc 3,0b\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
+ " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
: [cc] "=d" (cc)
: [r1] "a" (r1), [r2] "a" (r2)
: "memory", "cc");
return cc;
}
+/*
+ * Issue an Ultravisor call via __uv_call() and repeat it for as long as
+ * the returned condition code is above 1. The first condition code that
+ * is not above 1 is handed back to the caller.
+ */
+static inline int uv_call(unsigned long r1, unsigned long r2)
+{
+ int cc;
+
+ for (;;) {
+ cc = __uv_call(r1, r2);
+ if (cc <= 1)
+ return cc;
+ }
+}
+
+/*
+ * Low level uv_call variant that avoids stalls for long running busy
+ * conditions: cond_resched() is invoked after every attempt (including
+ * the last one), and the call is repeated while the condition code is
+ * above 1.
+ */
+static inline int uv_call_sched(unsigned long r1, unsigned long r2)
+{
+ int cc;
+
+ for (;;) {
+ cc = __uv_call(r1, r2);
+ cond_resched();
+ if (cc <= 1)
+ break;
+ }
+ return cc;
+}
+
+/*
+ * Special variant of uv_call that only transports the cpu or guest
+ * handle and the command, like destroy or verify.
+ *
+ * @handle: handle stored in the control block; a zero handle triggers a
+ *          WARN but the call is still issued
+ * @cmd:    command code stored in the control block header
+ * @rc:     out: header.rc of the control block after the call; written
+ *          unconditionally, so it must not be NULL
+ * @rrc:    out: header.rrc of the control block after the call; written
+ *          unconditionally, so it must not be NULL
+ *
+ * Returns 0 if the final condition code is 0, -EINVAL otherwise.
+ */
+static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc)
+{
+ /* Members not named here are zero-initialized by the designated initializer */
+ struct uv_cb_nodata uvcb = {
+ .header.cmd = cmd,
+ .header.len = sizeof(uvcb),
+ .handle = handle,
+ };
+ int cc;
+
+ WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd);
+ /* The control block address goes in the second operand, the first is 0 */
+ cc = uv_call_sched(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ return cc ? -EINVAL : 0;
+}
+
+struct uv_info {
+ unsigned long inst_calls_list[4];
+ unsigned long uv_base_stor_len;
+ unsigned long guest_base_stor_len;
+ unsigned long guest_virt_base_stor_len;
+ unsigned long guest_virt_var_stor_len;
+ unsigned long guest_cpu_stor_len;
+ unsigned long max_sec_stor_addr;
+ unsigned int max_num_sec_conf;
+ unsigned short max_guest_cpu_id;
+ unsigned long uv_feature_indications;
+ unsigned long supp_se_hdr_ver;
+ unsigned long supp_se_hdr_pcf;
+ unsigned long conf_dump_storage_state_len;
+ unsigned long conf_dump_finalize_len;
+ unsigned long supp_att_req_hdr_ver;
+ unsigned long supp_att_pflags;
+};
+
+extern struct uv_info uv_info;
+
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
extern int prot_virt_guest;
@@ -121,12 +411,46 @@ static inline int uv_remove_shared(unsigned long addr)
return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
}
-void uv_query_info(void);
#else
#define is_prot_virt_guest() 0
static inline int uv_set_shared(unsigned long addr) { return 0; }
static inline int uv_remove_shared(unsigned long addr) { return 0; }
-static inline void uv_query_info(void) {}
+#endif
+
+#if IS_ENABLED(CONFIG_KVM)
+extern int prot_virt_host;
+
+static inline int is_prot_virt_host(void)
+{
+ return prot_virt_host;
+}
+
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
+int uv_destroy_owned_page(unsigned long paddr);
+int uv_convert_from_secure(unsigned long paddr);
+int uv_convert_owned_from_secure(unsigned long paddr);
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+
+void setup_uv(void);
+#else
+#define is_prot_virt_host() 0
+static inline void setup_uv(void) {}
+
+static inline int uv_destroy_owned_page(unsigned long paddr)
+{
+ return 0;
+}
+
+static inline int uv_convert_from_secure(unsigned long paddr)
+{
+ return 0;
+}
+
+static inline int uv_convert_owned_from_secure(unsigned long paddr)
+{
+ return 0;
+}
#endif
#endif /* _ASM_S390_UV_H */
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index 3bcfdeb01395..53165aa7813a 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -2,64 +2,33 @@
#ifndef __S390_VDSO_H__
#define __S390_VDSO_H__
-/* Default link addresses for the vDSOs */
-#define VDSO32_LBASE 0
-#define VDSO64_LBASE 0
-
-#define VDSO_VERSION_STRING LINUX_2.6.29
+#include <vdso/datapage.h>
#ifndef __ASSEMBLY__
-/*
- * Note about the vdso_data and vdso_per_cpu_data structures:
- *
- * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
- * structure is supposed to be known only to the function in the vdso
- * itself and may change without notice.
- */
+#include <generated/vdso64-offsets.h>
+#ifdef CONFIG_COMPAT
+#include <generated/vdso32-offsets.h>
+#endif
-struct vdso_data {
- __u64 tb_update_count; /* Timebase atomicity ctr 0x00 */
- __u64 xtime_tod_stamp; /* TOD clock for xtime 0x08 */
- __u64 xtime_clock_sec; /* Kernel time 0x10 */
- __u64 xtime_clock_nsec; /* 0x18 */
- __u64 xtime_coarse_sec; /* Coarse kernel time 0x20 */
- __u64 xtime_coarse_nsec; /* 0x28 */
- __u64 wtom_clock_sec; /* Wall to monotonic clock 0x30 */
- __u64 wtom_clock_nsec; /* 0x38 */
- __u64 wtom_coarse_sec; /* Coarse wall to monotonic 0x40 */
- __u64 wtom_coarse_nsec; /* 0x48 */
- __u32 tz_minuteswest; /* Minutes west of Greenwich 0x50 */
- __u32 tz_dsttime; /* Type of dst correction 0x54 */
- __u32 ectg_available; /* ECTG instruction present 0x58 */
- __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */
- __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */
- __u32 ts_dir; /* TOD steering direction 0x64 */
- __u64 ts_end; /* TOD steering end 0x68 */
-};
-
-struct vdso_per_cpu_data {
- __u64 ectg_timer_base;
- __u64 ectg_user_time;
- /*
- * Note: node_id and cpu_nr must be at adjacent memory locations.
- * VDSO userspace must read both values with a single instruction.
- */
- union {
- __u64 getcpu_val;
- struct {
- __u32 node_id;
- __u32 cpu_nr;
- };
- };
-};
+#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
+#ifdef CONFIG_COMPAT
+#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
+#else
+#define VDSO32_SYMBOL(tsk, name) (-1UL)
+#endif
extern struct vdso_data *vdso_data;
-extern struct vdso_data boot_vdso_data;
-void vdso_alloc_boot_cpu(struct lowcore *lowcore);
-int vdso_alloc_per_cpu(struct lowcore *lowcore);
-void vdso_free_per_cpu(struct lowcore *lowcore);
+int vdso_getcpu_init(void);
#endif /* __ASSEMBLY__ */
+
+/* Default link address for the vDSO */
+#define VDSO_LBASE 0
+
+#define __VVAR_PAGES 2
+
+#define VDSO_VERSION_STRING LINUX_2.6.29
+
#endif /* __S390_VDSO_H__ */
diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..a93eda0ce7bb
--- /dev/null
+++ b/arch/s390/include/asm/vdso/clocksource.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_CLOCKSOURCE_H
+#define __ASM_VDSO_CLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES \
+ VDSO_CLOCKMODE_TOD
+
+#endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h
new file mode 100644
index 000000000000..73ee89142666
--- /dev/null
+++ b/arch/s390/include/asm/vdso/data.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __S390_ASM_VDSO_DATA_H
+#define __S390_ASM_VDSO_DATA_H
+
+#include <linux/types.h>
+#include <vdso/datapage.h>
+
+struct arch_vdso_data {
+ __s64 tod_steering_delta;
+ __u64 tod_steering_end;
+};
+
+#endif /* __S390_ASM_VDSO_DATA_H */
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..db84942eb78f
--- /dev/null
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_VDSO_GETTIMEOFDAY_H
+#define ASM_VDSO_GETTIMEOFDAY_H
+
+#define VDSO_HAS_TIME 1
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+#include <asm/syscall.h>
+#include <asm/timex.h>
+#include <asm/unistd.h>
+#include <linux/compiler.h>
+
+#define vdso_calc_delta __arch_vdso_calc_delta
+static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+ return (cycles - last) * mult;
+}
+
+static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+{
+ return _vdso_data;
+}
+
+/*
+ * Return the current TOD clock value, corrected while TOD steering is
+ * still in progress.
+ *
+ * @clock_mode: not used by this implementation
+ * @vd:         vdso data page; only arch_data.tod_steering_end and
+ *              arch_data.tod_steering_delta are read
+ */
+static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd)
+{
+ u64 adj, now;
+
+ now = get_tod_clock();
+ /* Distance from now to the steering end; positive (as s64) while steering is active */
+ adj = vd->arch_data.tod_steering_end - now;
+ if (unlikely((s64) adj > 0))
+ /* Apply adj / 2^15: added for a negative steering delta, subtracted otherwise */
+ now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
+ return now;
+}
+
+static __always_inline
+long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts)
+{
+ return syscall2(__NR_clock_gettime, (long)clkid, (long)ts);
+}
+
+static __always_inline
+long gettimeofday_fallback(register struct __kernel_old_timeval *tv,
+ register struct timezone *tz)
+{
+ return syscall2(__NR_gettimeofday, (long)tv, (long)tz);
+}
+
+static __always_inline
+long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts)
+{
+ return syscall2(__NR_clock_getres, (long)clkid, (long)ts);
+}
+
+#ifdef CONFIG_TIME_NS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
+{
+ return _timens_data;
+}
+#endif
+
+#endif
diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..cfcc3e117c4c
--- /dev/null
+++ b/arch/s390/include/asm/vdso/processor.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_VDSO_PROCESSOR_H
+#define __ASM_VDSO_PROCESSOR_H
+
+#define cpu_relax() barrier()
+
+#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..6c67c08cefdd
--- /dev/null
+++ b/arch/s390/include/asm/vdso/vsyscall.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_VSYSCALL_H
+#define __ASM_VDSO_VSYSCALL_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/hrtimer.h>
+#include <linux/timekeeper_internal.h>
+#include <vdso/datapage.h>
+#include <asm/vdso.h>
+/*
+ * Update the vDSO data page to keep in sync with kernel timekeeping.
+ */
+
+static __always_inline struct vdso_data *__s390_get_k_vdso_data(void)
+{
+ return vdso_data;
+}
+#define __arch_get_k_vdso_data __s390_get_k_vdso_data
+
+/* The asm-generic header needs to be included after the definitions above */
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h
index 3622d4ebc73a..fe17e448c0c5 100644
--- a/arch/s390/include/asm/vtime.h
+++ b/arch/s390/include/asm/vtime.h
@@ -2,7 +2,20 @@
#ifndef _S390_VTIME_H
#define _S390_VTIME_H
-#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
+/*
+ * Fold the lowcore timer stamps into the accumulated system and user
+ * times, using sys_enter_timer as the new reference point.
+ *
+ * NOTE(review): each difference is taken as "earlier stamp - later stamp",
+ * which presumably reflects a down-counting CPU timer - confirm.
+ */
+static inline void update_timer_sys(void)
+{
+ /* system time: span between last_update_timer and exit_timer */
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;
+ /* user time: span between exit_timer and sys_enter_timer */
+ S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer;
+ S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
+}
+
+/*
+ * Same accounting as update_timer_sys(), but with mcck_enter_timer as the
+ * entry stamp and new reference point instead of sys_enter_timer.
+ *
+ * NOTE(review): each difference is taken as "earlier stamp - later stamp",
+ * which presumably reflects a down-counting CPU timer - confirm.
+ */
+static inline void update_timer_mcck(void)
+{
+ /* system time: span between last_update_timer and exit_timer */
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;
+ /* user time: span between exit_timer and mcck_enter_timer */
+ S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer;
+ S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer;
+}
+
#endif /* _S390_VTIME_H */
diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h
index 42f707d1c1e8..e601adaa6320 100644
--- a/arch/s390/include/asm/vtimer.h
+++ b/arch/s390/include/asm/vtimer.h
@@ -25,8 +25,6 @@ extern void add_virt_timer_periodic(struct vtimer_list *timer);
extern int mod_virt_timer(struct vtimer_list *timer, u64 expires);
extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires);
extern int del_virt_timer(struct vtimer_list *timer);
-
-extern void init_cpu_vtimer(void);
extern void vtime_init(void);
#endif /* _ASM_S390_TIMER_H */
diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h
index 0c05a673811c..95480ed9149e 100644
--- a/arch/s390/include/asm/vx-insn.h
+++ b/arch/s390/include/asm/vx-insn.h
@@ -366,17 +366,27 @@
.macro VLM vfrom, vto, disp, base, hint=3
VX_NUM v1, \vfrom
VX_NUM v3, \vto
- GR_NUM b2, \base /* Base register */
+ GR_NUM b2, \base
.word 0xE700 | ((v1&15) << 4) | (v3&15)
.word (b2 << 12) | (\disp)
MRXBOPC \hint, 0x36, v1, v3
.endm
+/*
+ * VECTOR STORE
+ *
+ * Operands:
+ *   vr1   - vector register operand, converted to a number by VX_NUM
+ *   disp  - displacement, OR-ed into the low bits of the second halfword
+ *   index - index register, defaults to %r0
+ *   base  - base register, placed in the top 4 bits of the second halfword
+ *
+ * The instruction is emitted by hand: two .word directives followed by
+ * MRXBOPC with opcode byte 0x0E and a zero mask field.
+ * NOTE(review): MRXBOPC is defined outside this hunk; presumably it emits
+ * the final halfword (mask, RXB bits, opcode) - confirm.
+ */
+.macro VST vr1, disp, index="%r0", base
+ VX_NUM v1, \vr1
+ GR_NUM x2, \index
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (x2&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC 0, 0x0E, v1
+.endm
+
/* VECTOR STORE MULTIPLE */
.macro VSTM vfrom, vto, disp, base, hint=3
VX_NUM v1, \vfrom
VX_NUM v3, \vto
- GR_NUM b2, \base /* Base register */
+ GR_NUM b2, \base
.word 0xE700 | ((v1&15) << 4) | (v3&15)
.word (b2 << 12) | (\disp)
MRXBOPC \hint, 0x3E, v1, v3
@@ -411,6 +421,81 @@
VUPLL \vr1, \vr2, 2
.endm
+/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */
+.macro VPDI vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x84, v1, v2, v3
+.endm
+
+/* VECTOR REPLICATE */
+.macro VREP vr1, vr3, imm2, m4
+ VX_NUM v1, \vr1
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word \imm2
+ MRXBOPC \m4, 0x4D, v1, v3
+.endm
+.macro VREPB vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 0
+.endm
+.macro VREPH vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 1
+.endm
+.macro VREPF vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 2
+.endm
+.macro VREPG vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 3
+.endm
+
+/* VECTOR MERGE HIGH */
+.macro VMRH vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x61, v1, v2, v3
+.endm
+.macro VMRHB vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRHH vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRHF vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRHG vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 3
+.endm
+
+/* VECTOR MERGE LOW */
+.macro VMRL vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x60, v1, v2, v3
+.endm
+.macro VMRLB vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRLH vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRLF vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRLG vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 3
+.endm
+
/* Vector integer instructions */
@@ -557,5 +642,37 @@
VESRAV \vr1, \vr2, \vr3, 3
.endm
+/* VECTOR ELEMENT ROTATE LEFT LOGICAL */
+.macro VERLL vr1, vr3, disp, base="%r0", m4
+ VX_NUM v1, \vr1
+ VX_NUM v3, \vr3
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \m4, 0x33, v1, v3
+.endm
+.macro VERLLB vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 0
+.endm
+.macro VERLLH vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 1
+.endm
+.macro VERLLF vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 2
+.endm
+.macro VERLLG vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 3
+.endm
+
+/* VECTOR SHIFT LEFT DOUBLE BY BYTE */
+.macro VSLDB vr1, vr2, vr3, imm4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12) | (\imm4)
+ MRXBOPC 0, 0x77, v1, v2, v3
+.endm
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_S390_VX_INSN_H */
diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h
index 9ec86fae9980..93d1ccd3304c 100644
--- a/arch/s390/include/uapi/asm/dasd.h
+++ b/arch/s390/include/uapi/asm/dasd.h
@@ -183,6 +183,18 @@ typedef struct format_data_t {
} format_data_t;
/*
+ * struct dasd_copypair_swap_data_t
+ * represents all data necessary to issue a swap of the copy pair relation
+ */
+struct dasd_copypair_swap_data_t {
+ char primary[20]; /* BUSID of primary */
+ char secondary[20]; /* BUSID of secondary */
+
+ /* Reserved for future updates. */
+ __u8 reserved[64];
+};
+
+/*
* values to be used for format_data_t.intensity
* 0/8: normal format
* 1/9: also write record zero
@@ -326,6 +338,8 @@ struct dasd_snid_ioctl_data {
#define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t)
/* Release Allocated Space */
#define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t)
+/* Swap copy pair relation */
+#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t)
/* Get Sense Path Group ID (SNID) data */
#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data)
diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h
deleted file mode 100644
index c7c564d9aea4..000000000000
--- a/arch/s390/include/uapi/asm/debug.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S/390 debug facility
- *
- * Copyright IBM Corp. 1999, 2000
- */
-
-#ifndef _UAPIDEBUG_H
-#define _UAPIDEBUG_H
-
-#include <linux/fs.h>
-
-/* Note:
- * struct __debug_entry must be defined outside of #ifdef __KERNEL__
- * in order to allow a user program to analyze the 'raw'-view.
- */
-
-struct __debug_entry{
- union {
- struct {
- unsigned long long clock:52;
- unsigned long long exception:1;
- unsigned long long level:3;
- unsigned long long cpuid:8;
- } fields;
-
- unsigned long long stck;
- } id;
- void* caller;
-} __attribute__((packed));
-
-
-#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */
-
-#endif /* _UAPIDEBUG_H */
diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h
new file mode 100644
index 000000000000..e56b9dd23a4b
--- /dev/null
+++ b/arch/s390/include/uapi/asm/hwctrset.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2021
+ * Interface implementation for communication with the CPU Measurement
+ * counter facility device driver.
+ *
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ *
+ * Define for ioctl() commands to communicate with the CPU Measurement
+ * counter facility device driver.
+ */
+
+#ifndef _PERF_CPUM_CF_DIAG_H
+#define _PERF_CPUM_CF_DIAG_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define S390_HWCTR_DEVICE "hwctr"
+#define S390_HWCTR_START_VERSION 1
+
+struct s390_ctrset_start { /* Set CPUs to operate on */
+ __u64 version; /* Version of interface */
+ __u64 data_bytes; /* # of bytes required */
+ __u64 cpumask_len; /* Length of CPU mask in bytes */
+ __u64 *cpumask; /* Pointer to CPU mask */
+ __u64 counter_sets; /* Bit mask of counter sets to get */
+};
+
+struct s390_ctrset_setdata { /* Counter set data */
+ __u32 set; /* Counter set number */
+ __u32 no_cnts; /* # of counters stored in cv[] */
+ __u64 cv[]; /* Counter values (variable length) */
+};
+
+struct s390_ctrset_cpudata { /* Counter set data per CPU */
+ __u32 cpu_nr; /* CPU number */
+ __u32 no_sets; /* # of counter sets in data[] */
+ struct s390_ctrset_setdata data[];
+};
+
+struct s390_ctrset_read { /* Structure to get all ctr sets */
+ __u64 no_cpus; /* Total # of CPUs data taken from */
+ struct s390_ctrset_cpudata data[];
+};
+
+#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */
+#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start)
+#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2)
+#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read)
+#endif
diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h
index 451ba7d08905..d1ecd5d722a0 100644
--- a/arch/s390/include/uapi/asm/ipl.h
+++ b/arch/s390/include/uapi/asm/ipl.h
@@ -27,6 +27,7 @@ enum ipl_pbt {
IPL_PBT_FCP = 0,
IPL_PBT_SCP_DATA = 1,
IPL_PBT_CCW = 2,
+ IPL_PBT_NVME = 4,
};
/* IPL Parameter Block 0 with common fields */
@@ -67,6 +68,30 @@ struct ipl_pb0_fcp {
#define IPL_PB0_FCP_OPT_IPL 0x10
#define IPL_PB0_FCP_OPT_DUMP 0x20
+/* IPL Parameter Block 0 for NVMe */
+struct ipl_pb0_nvme {
+ __u32 len;
+ __u8 pbt;
+ __u8 reserved1[3];
+ __u8 loadparm[8];
+ __u8 reserved2[304];
+ __u8 opt;
+ __u8 reserved3[3];
+ __u32 fid;
+ __u8 reserved4[12];
+ __u32 nsid;
+ __u8 reserved5[4];
+ __u32 bootprog;
+ __u8 reserved6[12];
+ __u64 br_lba;
+ __u32 scp_data_len;
+ __u8 reserved7[260];
+ __u8 scp_data[];
+} __packed;
+
+#define IPL_PB0_NVME_OPT_IPL 0x10
+#define IPL_PB0_NVME_OPT_DUMP 0x20
+
/* IPL Parameter Block 0 for CCW */
struct ipl_pb0_ccw {
__u32 len;
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 436ec7636927..a73cf01a1606 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
#define KVM_S390_VM_CRYPTO 2
#define KVM_S390_VM_CPU_MODEL 3
#define KVM_S390_VM_MIGRATION 4
+#define KVM_S390_VM_CPU_TOPOLOGY 5
/* kvm attributes for mem_ctrl */
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
@@ -231,11 +232,13 @@ struct kvm_guest_debug_arch {
#define KVM_SYNC_GSCB (1UL << 9)
#define KVM_SYNC_BPBC (1UL << 10)
#define KVM_SYNC_ETOKEN (1UL << 11)
+#define KVM_SYNC_DIAG318 (1UL << 12)
#define KVM_SYNC_S390_VALID_FIELDS \
(KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \
KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \
- KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN)
+ KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \
+ KVM_SYNC_DIAG318)
/* length and alignment of the sdnx as a power of two */
#define SDNXC 8
@@ -264,7 +267,8 @@ struct kvm_sync_regs {
__u8 reserved2 : 7;
__u8 padding1[51]; /* riccb needs to be 64byte aligned */
__u8 riccb[64]; /* runtime instrumentation controls block */
- __u8 padding2[192]; /* sdnx needs to be 256byte aligned */
+ __u64 diag318; /* diagnose 0x318 info */
+ __u8 padding2[184]; /* sdnx needs to be 256byte aligned */
union {
__u8 sdnx[SDNXL]; /* state description annex */
struct {
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
index d27d7d329263..924b876f992c 100644
--- a/arch/s390/include/uapi/asm/pkey.h
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -35,12 +35,16 @@
#define PKEY_KEYTYPE_AES_128 1
#define PKEY_KEYTYPE_AES_192 2
#define PKEY_KEYTYPE_AES_256 3
+#define PKEY_KEYTYPE_ECC 4
/* the newer ioctls use a pkey_key_type enum for type information */
enum pkey_key_type {
PKEY_TYPE_CCA_DATA = (__u32) 1,
PKEY_TYPE_CCA_CIPHER = (__u32) 2,
PKEY_TYPE_EP11 = (__u32) 3,
+ PKEY_TYPE_CCA_ECC = (__u32) 0x1f,
+ PKEY_TYPE_EP11_AES = (__u32) 6,
+ PKEY_TYPE_EP11_ECC = (__u32) 7,
};
/* the newer ioctls use a pkey_key_size enum for key size information */
@@ -89,6 +93,20 @@ struct pkey_clrkey {
};
/*
+ * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC
+ * are ep11 blobs prepended by this header:
+ */
+struct ep11kblob_header {
+ __u8 type; /* always 0x00 */
+ __u8 hver; /* header version, currently needs to be 0x00 */
+ __u16 len; /* total length in bytes (including this header) */
+ __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */
+ __u8 res0; /* unused */
+ __u16 bitlen; /* clear key bit len, 0 for unknown */
+ __u8 res1[8]; /* unused */
+} __packed;
+
+/*
* Generate CCA AES secure key.
*/
struct pkey_genseck {
@@ -153,7 +171,7 @@ struct pkey_skey2pkey {
#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
/*
- * Verify the given CCA AES secure key for being able to be useable with
+ * Verify the given CCA AES secure key for being able to be usable with
* the pkey module. Check for correct key type and check for having at
* least one crypto card being able to handle this key (master key
* or old master key verification pattern matches).
@@ -304,7 +322,7 @@ struct pkey_verifykey2 {
#define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2)
/*
- * Transform a key blob (of any type) into a protected key, version 2.
+ * Transform a key blob into a protected key, version 2.
* There needs to be a list of apqns given with at least one entry in there.
* All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
* is not supported. The implementation walks through the list of apqns and
@@ -313,6 +331,8 @@ struct pkey_verifykey2 {
* list is tried until success (return 0) or the end of the list is reached
* (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to
* generate a list of apqns based on the key.
+ * Deriving ECC protected keys from ECC secure keys is not supported with
+ * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose.
*/
struct pkey_kblob2pkey2 {
__u8 __user *key; /* in: pointer to key blob */
@@ -326,17 +346,17 @@ struct pkey_kblob2pkey2 {
/*
* Build a list of APQNs based on a key blob given.
* Is able to find out which type of secure key is given (CCA AES secure
- * key, CCA AES cipher key or EP11 AES key) and tries to find all matching
- * crypto cards based on the MKVP and maybe other criterias (like CCA AES
- * cipher keys need a CEX5C or higher, EP11 keys with BLOB_PKEY_EXTRACTABLE
- * need a CEX7 and EP11 api version 4). The list of APQNs is further filtered
- * by the key's mkvp which needs to match to either the current mkvp (CCA and
- * EP11) or the alternate mkvp (old mkvp, CCA adapters only) of the apqns. The
- * flags argument may be used to limit the matching apqns. If the
- * PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of each apqn is
- * compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both are given, it
- * is assumed to return apqns where either the current or the alternate mkvp
- * matches. At least one of the matching flags needs to be given.
+ * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private
+ * key) and tries to find all matching crypto cards based on the MKVP and maybe
+ * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys
+ * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of
+ * APQNs is further filtered by the key's mkvp which needs to match to either
+ * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters
+ * only) of the apqns. The flags argument may be used to limit the matching
+ * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of
+ * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both
+ * are given, it is assumed to return apqns where either the current or the
+ * alternate mkvp matches. At least one of the matching flags needs to be given.
* The flags argument for EP11 keys has no further action and is currently
* ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP) as there is only
* the wkvp from the key to match against the apqn's wkvp.
@@ -365,9 +385,10 @@ struct pkey_apqns4key {
* restrict the list by given master key verification patterns.
* For different key types there may be different ways to match the
* master key verification patterns. For CCA keys (CCA data key and CCA
- * cipher key) the first 8 bytes of cur_mkvp refer to the current mkvp value
- * of the apqn and the first 8 bytes of the alt_mkvp refer to the old mkvp.
- * The flags argument controls if the apqns current and/or alternate mkvp
+ * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value
+ * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp.
+ * For CCA ECC keys it is similar but the match is against the APKA current/old
+ * mkvp. The flags argument controls if the apqns current and/or alternate mkvp
* should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current
* mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP.
* If both are given, it is assumed to return apqns where either the
@@ -397,4 +418,30 @@ struct pkey_apqns4keytype {
};
#define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype)
+/*
+ * Transform a key blob into a protected key, version 3.
+ * The difference to version 2 of this ioctl is that the protected key
+ * buffer is now explicit and not within a struct pkey_protkey any more.
+ * So this ioctl is also able to handle EP11 and CCA ECC secure keys and
+ * provide ECC protected keys.
+ * There needs to be a list of apqns given with at least one entry in there.
+ * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
+ * is not supported. The implementation walks through the list of apqns and
+ * tries to send the request to each apqn without any further checking (like
+ * card type or online state). If an apqn fails, simply the next one in the
+ * list is tried until success (return 0) or the end of the list is reached
+ * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to
+ * generate a list of apqns based on the key.
+ */
+struct pkey_kblob2pkey3 {
+ __u8 __user *key; /* in: pointer to key blob */
+ __u32 keylen; /* in: key blob size */
+ struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */
+ __u32 apqn_entries; /* in: # of apqn target list entries */
+ __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */
+ __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */
+ __u8 __user *pkey; /* in: pkey blob buffer space ptr */
+};
+#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3)
+
#endif /* _UAPI_PKEY_H */
diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h
index 543dd70e12c8..ad64d673b5e6 100644
--- a/arch/s390/include/uapi/asm/ptrace.h
+++ b/arch/s390/include/uapi/asm/ptrace.h
@@ -179,8 +179,9 @@
#define ACR_SIZE 4
-#define PTRACE_OLDSETOPTIONS 21
-
+#define PTRACE_OLDSETOPTIONS 21
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
#ifndef __ASSEMBLY__
#include <linux/stddef.h>
#include <linux/types.h>
diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h
index 58fca6f48410..a3e1cf168553 100644
--- a/arch/s390/include/uapi/asm/schid.h
+++ b/arch/s390/include/uapi/asm/schid.h
@@ -4,6 +4,8 @@
#include <linux/types.h>
+#ifndef __ASSEMBLY__
+
struct subchannel_id {
__u32 cssid : 8;
__u32 : 4;
@@ -13,5 +15,6 @@ struct subchannel_id {
__u32 sch_no : 16;
} __attribute__ ((packed, aligned(4)));
+#endif /* __ASSEMBLY__ */
#endif /* _UAPIASM_SCHID_H */
diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h
index 1f8803a31079..598d769e76df 100644
--- a/arch/s390/include/uapi/asm/setup.h
+++ b/arch/s390/include/uapi/asm/setup.h
@@ -1,14 +1 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S390 version
- * Copyright IBM Corp. 1999, 2010
- */
-
-#ifndef _UAPI_ASM_S390_SETUP_H
-#define _UAPI_ASM_S390_SETUP_H
-
-#define COMMAND_LINE_SIZE 4096
-
-#define ARCH_COMMAND_LINE_SIZE 896
-
-#endif /* _UAPI_ASM_S390_SETUP_H */
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 6ca1e68d7103..ede318653c87 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -29,7 +29,7 @@
{ 0x13, "SIGP conditional emergency signal" }, \
{ 0x15, "SIGP sense running" }, \
{ 0x16, "SIGP set multithreading"}, \
- { 0x17, "SIGP store additional status ait address"}
+ { 0x17, "SIGP store additional status at address"}
#define icpt_prog_codes \
{ 0x0001, "Prog Operation" }, \
diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h
index 9a14a611ed82..e74d6ba1bd3b 100644
--- a/arch/s390/include/uapi/asm/signal.h
+++ b/arch/s390/include/uapi/asm/signal.h
@@ -65,30 +65,6 @@ typedef unsigned long sigset_t;
#define SIGRTMIN 32
#define SIGRTMAX _NSIG
-/*
- * SA_FLAGS values:
- *
- * SA_ONSTACK indicates that a registered stack_t will be used.
- * SA_RESTART flag to get restarting signals (which were the default long ago)
- * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
- * SA_RESETHAND clears the handler when the signal is delivered.
- * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
- * SA_NODEFER prevents the current signal from being masked in the handler.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP 0x00000001
-#define SA_NOCLDWAIT 0x00000002
-#define SA_SIGINFO 0x00000004
-#define SA_ONSTACK 0x08000000
-#define SA_RESTART 0x10000000
-#define SA_NODEFER 0x40000000
-#define SA_RESETHAND 0x80000000
-
-#define SA_NOMASK SA_NODEFER
-#define SA_ONESHOT SA_RESETHAND
-
#define SA_RESTORER 0x04000000
#define MINSIGSTKSZ 2048
@@ -132,7 +108,7 @@ struct sigaction {
typedef struct sigaltstack {
void __user *ss_sp;
int ss_flags;
- size_t ss_size;
+ __kernel_size_t ss_size;
} stack_t;
diff --git a/arch/s390/include/uapi/asm/termios.h b/arch/s390/include/uapi/asm/termios.h
deleted file mode 100644
index 54223169c806..000000000000
--- a/arch/s390/include/uapi/asm/termios.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S390 version
- *
- * Derived from "include/asm-i386/termios.h"
- */
-
-#ifndef _UAPI_S390_TERMIOS_H
-#define _UAPI_S390_TERMIOS_H
-
-#include <asm/termbits.h>
-#include <asm/ioctls.h>
-
-struct winsize {
- unsigned short ws_row;
- unsigned short ws_col;
- unsigned short ws_xpixel;
- unsigned short ws_ypixel;
-};
-
-#define NCC 8
-struct termio {
- unsigned short c_iflag; /* input mode flags */
- unsigned short c_oflag; /* output mode flags */
- unsigned short c_cflag; /* control mode flags */
- unsigned short c_lflag; /* local mode flags */
- unsigned char c_line; /* line discipline */
- unsigned char c_cc[NCC]; /* control characters */
-};
-
-/* modem lines */
-#define TIOCM_LE 0x001
-#define TIOCM_DTR 0x002
-#define TIOCM_RTS 0x004
-#define TIOCM_ST 0x008
-#define TIOCM_SR 0x010
-#define TIOCM_CTS 0x020
-#define TIOCM_CAR 0x040
-#define TIOCM_RNG 0x080
-#define TIOCM_DSR 0x100
-#define TIOCM_CD TIOCM_CAR
-#define TIOCM_RI TIOCM_RNG
-#define TIOCM_OUT1 0x2000
-#define TIOCM_OUT2 0x4000
-#define TIOCM_LOOP 0x8000
-
-/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
-
-
-#endif /* _UAPI_S390_TERMIOS_H */
diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h
new file mode 100644
index 000000000000..10a5ac918e02
--- /dev/null
+++ b/arch/s390/include/uapi/asm/uvdevice.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2022
+ * Author(s): Steffen Eiden <seiden@linux.ibm.com>
+ */
+#ifndef __S390_ASM_UVDEVICE_H
+#define __S390_ASM_UVDEVICE_H
+
+#include <linux/types.h>
+
+struct uvio_ioctl_cb {
+ __u32 flags;
+ __u16 uv_rc; /* UV header rc value */
+ __u16 uv_rrc; /* UV header rrc value */
+ __u64 argument_addr; /* Userspace address of uvio argument */
+ __u32 argument_len;
+ __u8 reserved14[0x40 - 0x14]; /* must be zero */
+};
+
+#define UVIO_ATT_USER_DATA_LEN 0x100
+#define UVIO_ATT_UID_LEN 0x10
+struct uvio_attest {
+ __u64 arcb_addr; /* 0x0000 */
+ __u64 meas_addr; /* 0x0008 */
+ __u64 add_data_addr; /* 0x0010 */
+ __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */
+ __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */
+ __u32 arcb_len; /* 0x0128 */
+ __u32 meas_len; /* 0x012c */
+ __u32 add_data_len; /* 0x0130 */
+ __u16 user_data_len; /* 0x0134 */
+ __u16 reserved136; /* 0x0136 */
+};
+
+/*
+ * The following max values define an upper length for the IOCTL in/out buffers.
+ * However, they do not represent the maximum the Ultravisor allows which is
+ * often way smaller. By allowing larger buffer sizes we hopefully do not need
+ * to update the code with every machine update. It is therefore possible for
+ * userspace to request more memory than actually used by kernel/UV.
+ */
+#define UVIO_ATT_ARCB_MAX_LEN 0x100000
+#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000
+#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000
+
+#define UVIO_DEVICE_NAME "uv"
+#define UVIO_TYPE_UVC 'u'
+
+#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb)
+
+#endif /* __S390_ASM_UVDEVICE_H */
diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h
index 5a2177e96e88..d83713f67530 100644
--- a/arch/s390/include/uapi/asm/zcrypt.h
+++ b/arch/s390/include/uapi/asm/zcrypt.h
@@ -4,7 +4,7 @@
*
* zcrypt 2.2.1 (user-visible header)
*
- * Copyright IBM Corp. 2001, 2019
+ * Copyright IBM Corp. 2001, 2022
* Author(s): Robert Burroughs
* Eric Rossman (edrossma@us.ibm.com)
*
@@ -36,12 +36,12 @@
* - length(n_modulus) = inputdatalength
*/
struct ica_rsa_modexpo {
- char __user *inputdata;
- unsigned int inputdatalength;
- char __user *outputdata;
- unsigned int outputdatalength;
- char __user *b_key;
- char __user *n_modulus;
+ __u8 __user *inputdata;
+ __u32 inputdatalength;
+ __u8 __user *outputdata;
+ __u32 outputdatalength;
+ __u8 __user *b_key;
+ __u8 __user *n_modulus;
};
/**
@@ -59,15 +59,15 @@ struct ica_rsa_modexpo {
* - length(u_mult_inv) = inputdatalength/2 + 8
*/
struct ica_rsa_modexpo_crt {
- char __user *inputdata;
- unsigned int inputdatalength;
- char __user *outputdata;
- unsigned int outputdatalength;
- char __user *bp_key;
- char __user *bq_key;
- char __user *np_prime;
- char __user *nq_prime;
- char __user *u_mult_inv;
+ __u8 __user *inputdata;
+ __u32 inputdatalength;
+ __u8 __user *outputdata;
+ __u32 outputdatalength;
+ __u8 __user *bp_key;
+ __u8 __user *bq_key;
+ __u8 __user *np_prime;
+ __u8 __user *nq_prime;
+ __u8 __user *u_mult_inv;
};
/**
@@ -83,67 +83,65 @@ struct ica_rsa_modexpo_crt {
* key block
*/
struct CPRBX {
- unsigned short cprb_len; /* CPRB length 220 */
- unsigned char cprb_ver_id; /* CPRB version id. 0x02 */
- unsigned char pad_000[3]; /* Alignment pad bytes */
- unsigned char func_id[2]; /* function id 0x5432 */
- unsigned char cprb_flags[4]; /* Flags */
- unsigned int req_parml; /* request parameter buffer len */
- unsigned int req_datal; /* request data buffer */
- unsigned int rpl_msgbl; /* reply message block length */
- unsigned int rpld_parml; /* replied parameter block len */
- unsigned int rpl_datal; /* reply data block len */
- unsigned int rpld_datal; /* replied data block len */
- unsigned int req_extbl; /* request extension block len */
- unsigned char pad_001[4]; /* reserved */
- unsigned int rpld_extbl; /* replied extension block len */
- unsigned char padx000[16 - sizeof(char *)];
- unsigned char *req_parmb; /* request parm block 'address' */
- unsigned char padx001[16 - sizeof(char *)];
- unsigned char *req_datab; /* request data block 'address' */
- unsigned char padx002[16 - sizeof(char *)];
- unsigned char *rpl_parmb; /* reply parm block 'address' */
- unsigned char padx003[16 - sizeof(char *)];
- unsigned char *rpl_datab; /* reply data block 'address' */
- unsigned char padx004[16 - sizeof(char *)];
- unsigned char *req_extb; /* request extension block 'addr'*/
- unsigned char padx005[16 - sizeof(char *)];
- unsigned char *rpl_extb; /* reply extension block 'address'*/
- unsigned short ccp_rtcode; /* server return code */
- unsigned short ccp_rscode; /* server reason code */
- unsigned int mac_data_len; /* Mac Data Length */
- unsigned char logon_id[8]; /* Logon Identifier */
- unsigned char mac_value[8]; /* Mac Value */
- unsigned char mac_content_flgs;/* Mac content flag byte */
- unsigned char pad_002; /* Alignment */
- unsigned short domain; /* Domain */
- unsigned char usage_domain[4];/* Usage domain */
- unsigned char cntrl_domain[4];/* Control domain */
- unsigned char S390enf_mask[4];/* S/390 enforcement mask */
- unsigned char pad_004[36]; /* reserved */
+ __u16 cprb_len; /* CPRB length 220 */
+ __u8 cprb_ver_id; /* CPRB version id. 0x02 */
+ __u8 _pad_000[3]; /* Alignment pad bytes */
+ __u8 func_id[2]; /* function id 0x5432 */
+ __u8 cprb_flags[4]; /* Flags */
+ __u32 req_parml; /* request parameter buffer len */
+ __u32 req_datal; /* request data buffer */
+ __u32 rpl_msgbl; /* reply message block length */
+ __u32 rpld_parml; /* replied parameter block len */
+ __u32 rpl_datal; /* reply data block len */
+ __u32 rpld_datal; /* replied data block len */
+ __u32 req_extbl; /* request extension block len */
+ __u8 _pad_001[4]; /* reserved */
+ __u32 rpld_extbl; /* replied extension block len */
+ __u8 _pad_002[16 - sizeof(__u8 *)];
+ __u8 __user *req_parmb; /* request parm block 'address' */
+ __u8 _pad_003[16 - sizeof(__u8 *)];
+ __u8 __user *req_datab; /* request data block 'address' */
+ __u8 _pad_004[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_parmb; /* reply parm block 'address' */
+ __u8 _pad_005[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_datab; /* reply data block 'address' */
+ __u8 _pad_006[16 - sizeof(__u8 *)];
+ __u8 __user *req_extb; /* request extension block 'addr'*/
+ __u8 _pad_007[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_extb; /* reply extension block 'address'*/
+ __u16 ccp_rtcode; /* server return code */
+ __u16 ccp_rscode; /* server reason code */
+ __u32 mac_data_len; /* Mac Data Length */
+ __u8 logon_id[8]; /* Logon Identifier */
+ __u8 mac_value[8]; /* Mac Value */
+ __u8 mac_content_flgs; /* Mac content flag byte */
+ __u8 _pad_008; /* Alignment */
+ __u16 domain; /* Domain */
+ __u8 _pad_009[12]; /* reserved, checked for zeros */
+ __u8 _pad_010[36]; /* reserved */
} __attribute__((packed));
/**
* xcRB
*/
struct ica_xcRB {
- unsigned short agent_ID;
- unsigned int user_defined;
- unsigned short request_ID;
- unsigned int request_control_blk_length;
- unsigned char padding1[16 - sizeof(char *)];
- char __user *request_control_blk_addr;
- unsigned int request_data_length;
- char padding2[16 - sizeof(char *)];
- char __user *request_data_address;
- unsigned int reply_control_blk_length;
- char padding3[16 - sizeof(char *)];
- char __user *reply_control_blk_addr;
- unsigned int reply_data_length;
- char padding4[16 - sizeof(char *)];
- char __user *reply_data_addr;
- unsigned short priority_window;
- unsigned int status;
+ __u16 agent_ID;
+ __u32 user_defined;
+ __u16 request_ID;
+ __u32 request_control_blk_length;
+ __u8 _padding1[16 - sizeof(__u8 *)];
+ __u8 __user *request_control_blk_addr;
+ __u32 request_data_length;
+ __u8 _padding2[16 - sizeof(__u8 *)];
+ __u8 __user *request_data_address;
+ __u32 reply_control_blk_length;
+ __u8 _padding3[16 - sizeof(__u8 *)];
+ __u8 __user *reply_control_blk_addr;
+ __u32 reply_data_length;
+ __u8 __padding4[16 - sizeof(__u8 *)];
+ __u8 __user *reply_data_addr;
+ __u16 priority_window;
+ __u32 status;
} __attribute__((packed));
/**
@@ -238,8 +236,8 @@ struct zcrypt_device_matrix_ext {
};
#define AUTOSELECT 0xFFFFFFFF
-#define AUTOSEL_AP ((__u16) 0xFFFF)
-#define AUTOSEL_DOM ((__u16) 0xFFFF)
+#define AUTOSEL_AP ((__u16)0xFFFF)
+#define AUTOSEL_DOM ((__u16)0xFFFF)
#define ZCRYPT_IOCTL_MAGIC 'z'
@@ -288,7 +286,7 @@ struct zcrypt_device_matrix_ext {
* 0x08: CEX3A
* 0x0a: CEX4
* 0x0b: CEX5
- * 0x0c: CEX6 and CEX7
+ * 0x0c: CEX6, CEX7 or CEX8
* 0x0d: device is disabled
*
* ZCRYPT_QDEPTH_MASK
@@ -305,12 +303,12 @@ struct zcrypt_device_matrix_ext {
/**
* Supported ioctl calls
*/
-#define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0)
-#define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0)
-#define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0)
-#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0)
+#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0)
+#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0)
+#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0)
+#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0)
-#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0)
+#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0)
#define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT])
#define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT])
#define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT])
@@ -352,7 +350,7 @@ struct zcrypt_device_matrix {
};
/* Deprecated: use ZCRYPT_DEVICE_STATUS */
-#define ZDEVICESTATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0)
+#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0)
/* Deprecated: use ZCRYPT_STATUS_MASK */
#define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64])
/* Deprecated: use ZCRYPT_QDEPTH_MASK */
diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore
index c5f676c3c224..bbb90f92d051 100644
--- a/arch/s390/kernel/.gitignore
+++ b/arch/s390/kernel/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vmlinux.lds
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2b1203cf7be6..5e6a23299790 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -33,36 +33,33 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls
CFLAGS_dumpstack.o += -fno-optimize-sibling-calls
CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
-#
-# Pass UTS_MACHINE for user_regset definition
-#
-CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
-
-obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
-obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o
-obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
+obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
+obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
+obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
+obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y += smp.o
+obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o
-extra-y += head64.o vmlinux.lds
+extra-y += vmlinux.lds
obj-$(CONFIG_SYSFS) += nospec-sysfs.o
CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
-obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_AUDIT) += audit.o
compat-obj-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o
obj-$(CONFIG_COMPAT) += $(compat-obj-y)
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o
+obj-$(CONFIG_KPROBES) += kprobes_insn_page.o
+obj-$(CONFIG_KPROBES) += mcount.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += mcount.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
@@ -70,14 +67,16 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o
-obj-$(CONFIG_IMA) += ima_arch.o
+obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o
+obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o
obj-$(CONFIG_TRACEPOINTS) += trace.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
# vdso
obj-y += vdso64/
+obj-$(CONFIG_COMPAT) += vdso32/
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
new file mode 100644
index 000000000000..fb92e8ed0525
--- /dev/null
+++ b/arch/s390/kernel/abs_lowcore.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pgtable.h>
+#include <asm/abs_lowcore.h>
+
+#define ABS_LOWCORE_UNMAPPED 1
+#define ABS_LOWCORE_LAP_ON 2
+#define ABS_LOWCORE_IRQS_ON 4
+
+unsigned long __bootdata_preserved(__abs_lowcore);
+bool __ro_after_init abs_lowcore_mapped;
+
+int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ unsigned long phys = __pa(lc);
+ int rc, i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc);
+ if (rc) {
+ /*
+ * Do not unmap allocated page tables in case the
+ * allocation was not requested. In such a case the
+ * request is expected to come from an atomic context,
+ * while the unmap attempt might sleep.
+ */
+ if (alloc) {
+ for (--i; i >= 0; i--) {
+ addr -= PAGE_SIZE;
+ vmem_unmap_4k_page(addr);
+ }
+ }
+ return rc;
+ }
+ addr += PAGE_SIZE;
+ phys += PAGE_SIZE;
+ }
+ return 0;
+}
+
+void abs_lowcore_unmap(int cpu)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ int i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ vmem_unmap_4k_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
+
+struct lowcore *get_abs_lowcore(unsigned long *flags)
+{
+ unsigned long irq_flags;
+ union ctlreg0 cr0;
+ int cpu;
+
+ *flags = 0;
+ cpu = get_cpu();
+ if (abs_lowcore_mapped) {
+ return ((struct lowcore *)__abs_lowcore) + cpu;
+ } else {
+ if (cpu != 0)
+ panic("Invalid unmapped absolute lowcore access\n");
+ local_irq_save(irq_flags);
+ if (!irqs_disabled_flags(irq_flags))
+ *flags |= ABS_LOWCORE_IRQS_ON;
+ __ctl_store(cr0.val, 0, 0);
+ if (cr0.lap) {
+ *flags |= ABS_LOWCORE_LAP_ON;
+ __ctl_clear_bit(0, 28);
+ }
+ *flags |= ABS_LOWCORE_UNMAPPED;
+ return lowcore_ptr[0];
+ }
+}
+
+void put_abs_lowcore(struct lowcore *lc, unsigned long flags)
+{
+ if (abs_lowcore_mapped) {
+ if (flags)
+ panic("Invalid mapped absolute lowcore release\n");
+ } else {
+ if (smp_processor_id() != 0)
+ panic("Invalid mapped absolute lowcore access\n");
+ if (!(flags & ABS_LOWCORE_UNMAPPED))
+ panic("Invalid unmapped absolute lowcore release\n");
+ if (flags & ABS_LOWCORE_LAP_ON)
+ __ctl_set_bit(0, 28);
+ if (flags & ABS_LOWCORE_IRQS_ON)
+ local_irq_enable();
+ }
+ put_cpu();
+}
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 8e1f2aee85ef..e7bca29f9c34 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/facility.h>
#include <asm/nospec-branch.h>
-#define MAX_PATCH_LEN (255 - 1)
-
static int __initdata_or_module alt_instr_disabled;
static int __init disable_alternative_instructions(char *str)
@@ -16,86 +17,30 @@ static int __init disable_alternative_instructions(char *str)
early_param("noaltinstr", disable_alternative_instructions);
-struct brcl_insn {
- u16 opc;
- s32 disp;
-} __packed;
-
-static u16 __initdata_or_module nop16 = 0x0700;
-static u32 __initdata_or_module nop32 = 0x47000000;
-static struct brcl_insn __initdata_or_module nop48 = {
- 0xc004, 0
-};
-
-static const void *nops[] __initdata_or_module = {
- &nop16,
- &nop32,
- &nop48
-};
-
-static void __init_or_module add_jump_padding(void *insns, unsigned int len)
-{
- struct brcl_insn brcl = {
- 0xc0f4,
- len / 2
- };
-
- memcpy(insns, &brcl, sizeof(brcl));
- insns += sizeof(brcl);
- len -= sizeof(brcl);
-
- while (len > 0) {
- memcpy(insns, &nop16, 2);
- insns += 2;
- len -= 2;
- }
-}
-
-static void __init_or_module add_padding(void *insns, unsigned int len)
-{
- if (len > 6)
- add_jump_padding(insns, len);
- else if (len >= 2)
- memcpy(insns, nops[len / 2 - 1], len);
-}
-
static void __init_or_module __apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
- u8 insnbuf[MAX_PATCH_LEN];
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
*/
for (a = start; a < end; a++) {
- int insnbuf_sz = 0;
-
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- if (!__test_facility(a->facility,
- S390_lowcore.alt_stfle_fac_list))
+ if (!__test_facility(a->facility, alt_stfle_fac_list))
continue;
- if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
+ if (unlikely(a->instrlen % 2)) {
WARN_ONCE(1, "cpu alternatives instructions length is "
"odd, skipping patching\n");
continue;
}
- memcpy(insnbuf, replacement, a->replacementlen);
- insnbuf_sz = a->replacementlen;
-
- if (a->instrlen > a->replacementlen) {
- add_padding(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- insnbuf_sz += a->instrlen - a->replacementlen;
- }
-
- s390_kernel_write(instr, insnbuf, insnbuf_sz);
+ s390_kernel_write(instr, replacement, a->instrlen);
}
}
@@ -111,3 +56,20 @@ void __init apply_alternative_instructions(void)
{
apply_alternatives(__alt_instructions, __alt_instructions_end);
}
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+void text_poke_sync_lock(void)
+{
+ cpus_read_lock();
+ text_poke_sync();
+ cpus_read_unlock();
+}
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ce33406cfe83..d8ce965c0a97 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -11,11 +11,9 @@
#include <linux/kvm_host.h>
#include <linux/sched.h>
#include <linux/purgatory.h>
+#include <linux/pgtable.h>
#include <asm/idle.h>
-#include <asm/vdso.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
-#include <asm/nmi.h>
#include <asm/stacktrace.h>
int main(void)
@@ -27,80 +25,54 @@ int main(void)
BLANK();
/* thread struct offsets */
OFFSET(__THREAD_ksp, thread_struct, ksp);
- OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table);
- OFFSET(__THREAD_last_break, thread_struct, last_break);
- OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc);
- OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs);
- OFFSET(__THREAD_per_cause, thread_struct, per_event.cause);
- OFFSET(__THREAD_per_address, thread_struct, per_event.address);
- OFFSET(__THREAD_per_paid, thread_struct, per_event.paid);
- OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb);
BLANK();
/* thread info offsets */
OFFSET(__TI_flags, task_struct, thread_info.flags);
BLANK();
/* pt_regs offsets */
- OFFSET(__PT_ARGS, pt_regs, args);
OFFSET(__PT_PSW, pt_regs, psw);
OFFSET(__PT_GPRS, pt_regs, gprs);
+ OFFSET(__PT_R0, pt_regs, gprs[0]);
+ OFFSET(__PT_R1, pt_regs, gprs[1]);
+ OFFSET(__PT_R2, pt_regs, gprs[2]);
+ OFFSET(__PT_R3, pt_regs, gprs[3]);
+ OFFSET(__PT_R4, pt_regs, gprs[4]);
+ OFFSET(__PT_R5, pt_regs, gprs[5]);
+ OFFSET(__PT_R6, pt_regs, gprs[6]);
+ OFFSET(__PT_R7, pt_regs, gprs[7]);
+ OFFSET(__PT_R8, pt_regs, gprs[8]);
+ OFFSET(__PT_R9, pt_regs, gprs[9]);
+ OFFSET(__PT_R10, pt_regs, gprs[10]);
+ OFFSET(__PT_R11, pt_regs, gprs[11]);
+ OFFSET(__PT_R12, pt_regs, gprs[12]);
+ OFFSET(__PT_R13, pt_regs, gprs[13]);
+ OFFSET(__PT_R14, pt_regs, gprs[14]);
+ OFFSET(__PT_R15, pt_regs, gprs[15]);
OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
- OFFSET(__PT_INT_CODE, pt_regs, int_code);
- OFFSET(__PT_INT_PARM, pt_regs, int_parm);
- OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long);
OFFSET(__PT_FLAGS, pt_regs, flags);
+ OFFSET(__PT_CR1, pt_regs, cr1);
+ OFFSET(__PT_LAST_BREAK, pt_regs, last_break);
DEFINE(__PT_SIZE, sizeof(struct pt_regs));
BLANK();
/* stack_frame offsets */
OFFSET(__SF_BACKCHAIN, stack_frame, back_chain);
OFFSET(__SF_GPRS, stack_frame, gprs);
- OFFSET(__SF_EMPTY, stack_frame, empty1);
- OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]);
- OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]);
- OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]);
- OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]);
- BLANK();
- /* timeval/timezone offsets for use by vdso */
- OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count);
- OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp);
- OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec);
- OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec);
- OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec);
- OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec);
- OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec);
- OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec);
- OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec);
- OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec);
- OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest);
- OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available);
- OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult);
- OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
- OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir);
- OFFSET(__VDSO_TS_END, vdso_data, ts_end);
- OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
- OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
- OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val);
- BLANK();
- /* constants used by the vdso */
- DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME);
- DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC);
- DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
- DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
- DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID);
- DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
- DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC);
+ OFFSET(__SF_EMPTY, stack_frame, empty[0]);
+ OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block);
+ OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea);
+ OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
+ OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
+ DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
/* idle data offsets */
OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
- OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit);
OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter);
- OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit);
+ OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter);
BLANK();
/* hardware defined lowcore locations 0x000 - 0x1ff */
OFFSET(__LC_EXT_PARAMS, lowcore, ext_params);
OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr);
OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code);
- OFFSET(__LC_SVC_ILC, lowcore, svc_ilc);
- OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code);
OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc);
OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code);
OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code);
@@ -118,12 +90,12 @@ int main(void)
OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr);
OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm);
OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word);
- OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
- OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
- OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
+ OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break);
+ OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe);
+ OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe);
OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw);
OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw);
@@ -143,39 +115,34 @@ int main(void)
OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags);
OFFSET(__LC_RETURN_PSW, lowcore, return_psw);
OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw);
- OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer);
- OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer);
+ OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer);
OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer);
OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer);
- OFFSET(__LC_USER_TIMER, lowcore, user_timer);
- OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer);
- OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer);
OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer);
OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
- OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
OFFSET(__LC_CURRENT, lowcore, current_task);
OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
OFFSET(__LC_ASYNC_STACK, lowcore, async_stack);
OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack);
OFFSET(__LC_RESTART_STACK, lowcore, restart_stack);
+ OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack);
OFFSET(__LC_RESTART_FN, lowcore, restart_fn);
OFFSET(__LC_RESTART_DATA, lowcore, restart_data);
OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source);
+ OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags);
+ OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce);
OFFSET(__LC_USER_ASCE, lowcore, user_asce);
- OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce);
OFFSET(__LC_LPP, lowcore, lpp);
OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
- OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset);
- OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data);
- OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
- OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
OFFSET(__LC_GMAP, lowcore, gmap);
- OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline);
+ OFFSET(__LC_LAST_BREAK, lowcore, last_break);
/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
+ OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info);
+ OFFSET(__LC_OS_INFO, lowcore, os_info);
/* hardware defined lowcore locations 0x1000 - 0x18ff */
OFFSET(__LC_MCESAD, lowcore, mcesad);
OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2);
@@ -187,13 +154,11 @@ int main(void)
OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area);
OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area);
OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area);
+ OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area);
OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area);
OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
BLANK();
- /* extended machine check save area */
- OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
- BLANK();
/* gmap/sie offsets */
OFFSET(__GMAP_ASCE, gmap, asce);
OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
@@ -202,5 +167,15 @@ int main(void)
OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start);
OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len);
DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region));
+ /* sizeof kernel parameter area */
+ DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea));
+ /* kernel parameter area offsets */
+ DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device));
+ DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start));
+ DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size));
+ DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base));
+ DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
+ DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
+ DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
return 0;
}
diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
index d395c6c9944c..02051a596b87 100644
--- a/arch/s390/kernel/audit.c
+++ b/arch/s390/kernel/audit.c
@@ -47,15 +47,17 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S
deleted file mode 100644
index b79e0fd571f8..000000000000
--- a/arch/s390/kernel/base.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/s390/kernel/base.S
- *
- * Copyright IBM Corp. 2006, 2007
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
- * Michael Holzheu <holzheu@de.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/ptrace.h>
-#include <asm/sigp.h>
-
- GEN_BR_THUNK %r9
- GEN_BR_THUNK %r14
-
-ENTRY(s390_base_ext_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_ext_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
-1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit
- lpswe __LC_EXT_OLD_PSW
-ENDPROC(s390_base_ext_handler)
-
- .section .bss
- .align 8
- .globl s390_base_ext_handler_fn
-s390_base_ext_handler_fn:
- .quad 0
- .previous
-
-ENTRY(s390_base_pgm_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_SYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_pgm_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
- lmg %r0,%r15,__LC_SAVE_AREA_SYNC
- lpswe __LC_PGM_OLD_PSW
-1: lpswe disabled_wait_psw-0b(%r13)
-ENDPROC(s390_base_pgm_handler)
-
- .align 8
-disabled_wait_psw:
- .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler
-
- .section .bss
- .align 8
- .globl s390_base_pgm_handler_fn
-s390_base_pgm_handler_fn:
- .quad 0
- .previous
diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c
index d66825e53fce..7ee3651d00ab 100644
--- a/arch/s390/kernel/cache.c
+++ b/arch/s390/kernel/cache.c
@@ -3,7 +3,6 @@
* Extract CPU cache information and expose them via sysfs.
*
* Copyright IBM Corp. 2012
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/seq_file.h>
@@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m)
struct cacheinfo *cache;
int idx;
- if (!test_facility(34))
- return;
this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask));
for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
cache = this_cpu_ci->info_list + idx;
@@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
if (!this_cpu_ci)
return -EINVAL;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
@@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
for (idx = 0, level = 0; level < this_cpu_ci->num_levels &&
idx < this_cpu_ci->num_leaves; idx++, level++) {
diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
index 444fb1f66944..a7c46e8310f0 100644
--- a/arch/s390/kernel/compat_audit.c
+++ b/arch/s390/kernel/compat_audit.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#undef __s390x__
+#include <linux/audit_arch.h>
#include <asm/unistd.h>
#include "audit.h"
@@ -32,14 +33,16 @@ int s390_classify_syscall(unsigned syscall)
{
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index 64509e7dbd3b..ef23739b277c 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -5,69 +5,59 @@
#include <linux/compat.h>
#include <linux/socket.h>
#include <linux/syscalls.h>
+#include <asm/ptrace.h>
-/* Macro that masks the high order bit of an 32 bit pointer and converts it*/
-/* to a 64 bit pointer */
-#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
-#define AA(__x) \
- ((unsigned long)(__x))
+/*
+ * Macro that masks the high order bit of a 32 bit pointer and
+ * converts it to a 64 bit pointer.
+ */
+#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
+#define AA(__x) ((unsigned long)(__x))
/* Now 32bit compatibility types */
struct ipc_kludge_32 {
- __u32 msgp; /* pointer */
- __s32 msgtyp;
+ __u32 msgp; /* pointer */
+ __s32 msgtyp;
};
/* asm/sigcontext.h */
-typedef union
-{
- __u64 d;
- __u32 f;
+typedef union {
+ __u64 d;
+ __u32 f;
} freg_t32;
-typedef struct
-{
+typedef struct {
unsigned int fpc;
unsigned int pad;
- freg_t32 fprs[__NUM_FPRS];
+ freg_t32 fprs[__NUM_FPRS];
} _s390_fp_regs32;
-typedef struct
-{
- __u32 mask;
- __u32 addr;
-} _psw_t32 __attribute__ ((aligned(8)));
-
-typedef struct
-{
- _psw_t32 psw;
+typedef struct {
+ psw_t32 psw;
__u32 gprs[__NUM_GPRS];
__u32 acrs[__NUM_ACRS];
} _s390_regs_common32;
-typedef struct
-{
+typedef struct {
_s390_regs_common32 regs;
- _s390_fp_regs32 fpregs;
+ _s390_fp_regs32 fpregs;
} _sigregs32;
-typedef struct
-{
- __u32 gprs_high[__NUM_GPRS];
- __u64 vxrs_low[__NUM_VXRS_LOW];
- __vector128 vxrs_high[__NUM_VXRS_HIGH];
- __u8 __reserved[128];
+typedef struct {
+ __u32 gprs_high[__NUM_GPRS];
+ __u64 vxrs_low[__NUM_VXRS_LOW];
+ __vector128 vxrs_high[__NUM_VXRS_HIGH];
+ __u8 __reserved[128];
} _sigregs_ext32;
#define _SIGCONTEXT_NSIG32 64
#define _SIGCONTEXT_NSIG_BPW32 32
#define __SIGNAL_FRAMESIZE32 96
-#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2)
+#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2)
-struct sigcontext32
-{
+struct sigcontext32 {
__u32 oldmask[_COMPAT_NSIG_WORDS];
- __u32 sregs; /* pointer */
+ __u32 sregs; /* pointer */
};
/* asm/signal.h */
@@ -75,11 +65,11 @@ struct sigcontext32
/* asm/ucontext.h */
struct ucontext32 {
__u32 uc_flags;
- __u32 uc_link; /* pointer */
+ __u32 uc_link; /* pointer */
compat_stack_t uc_stack;
_sigregs32 uc_mcontext;
compat_sigset_t uc_sigmask;
- /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
+ /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
unsigned char __unused[128 - sizeof(compat_sigset_t)];
_sigregs_ext32 uc_mcontext_ext;
};
@@ -88,25 +78,6 @@ struct stat64_emu31;
struct mmap_arg_struct_emu31;
struct fadvise64_64_args;
-long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group);
-long compat_sys_s390_setregid16(u16 rgid, u16 egid);
-long compat_sys_s390_setgid16(u16 gid);
-long compat_sys_s390_setreuid16(u16 ruid, u16 euid);
-long compat_sys_s390_setuid16(u16 uid);
-long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid);
-long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid);
-long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid);
-long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid);
-long compat_sys_s390_setfsuid16(u16 uid);
-long compat_sys_s390_setfsgid16(u16 gid);
-long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_getuid16(void);
-long compat_sys_s390_geteuid16(void);
-long compat_sys_s390_getgid16(void);
-long compat_sys_s390_getegid16(void);
long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low);
long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low);
long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low);
@@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu
long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag);
long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg);
long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg);
-long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count);
-long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count);
+long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count);
+long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count);
long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise);
long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args);
long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags);
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 38d4bdbc34b9..eee1ad3e1b29 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
#include "compat_linux.h"
#include "compat_ptrace.h"
#include "entry.h"
@@ -88,7 +89,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
_sigregs32 user_sregs;
int i;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
@@ -303,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- /* Signal frames without vectors registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, sigreturn);
}
/* Set up registers for signal handler */
@@ -370,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, rt_sigreturn);
}
/* Create siginfo on the signal stack */
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index af013b4244d3..72e106cfd8c7 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -26,31 +26,35 @@ static char cpcmd_buf[241];
static int diag8_noresponse(int cmdlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = cmdlen;
-
asm volatile(
- " diag %1,%0,0x8\n"
- : "+d" (reg3) : "d" (reg2) : "cc");
- return reg3;
+ " diag %[rx],%[ry],0x8\n"
+ : [ry] "+&d" (cmdlen)
+ : [rx] "d" (__pa(cpcmd_buf))
+ : "cc");
+ return cmdlen;
}
static int diag8_response(int cmdlen, char *response, int *rlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = (addr_t) response;
- register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L;
- register unsigned long reg5 asm ("5") = *rlen;
+ union register_pair rx, ry;
+ int cc;
+ rx.even = __pa(cpcmd_buf);
+ rx.odd = __pa(response);
+ ry.even = cmdlen | 0x40000000L;
+ ry.odd = *rlen;
asm volatile(
- " diag %2,%0,0x8\n"
- " brc 8,1f\n"
- " agr %1,%4\n"
- "1:\n"
- : "+d" (reg4), "+d" (reg5)
- : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc");
- *rlen = reg5;
- return reg4;
+ " diag %[rx],%[ry],0x8\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [ry] "+&d" (ry.pair)
+ : [rx] "d" (rx.pair)
+ : "cc");
+ if (cc)
+ *rlen += ry.odd;
+ else
+ *rlen = ry.odd;
+ return ry.even;
}
/*
diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c
new file mode 100644
index 000000000000..1b2ae42a0c15
--- /dev/null
+++ b/arch/s390/kernel/cpufeature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2022
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/bug.h>
+#include <asm/elf.h>
+
+enum {
+ TYPE_HWCAP,
+ TYPE_FACILITY,
+};
+
+struct s390_cpu_feature {
+ unsigned int type : 4;
+ unsigned int num : 28;
+};
+
+static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = {
+ [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA},
+ [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS},
+ [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158},
+};
+
+/*
+ * cpu_have_feature - Test CPU features on module initialization
+ */
+int cpu_have_feature(unsigned int num)
+{
+ struct s390_cpu_feature *feature;
+
+ if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES))
+ return 0;
+ feature = &s390_cpu_features[num];
+ switch (feature->type) {
+ case TYPE_HWCAP:
+ return !!(elf_hwcap & BIT(feature->num));
+ case TYPE_FACILITY:
+ return test_facility(feature->num);
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(cpu_have_feature);
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index f96a5857bbfd..dd74fe664ed1 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -15,11 +15,13 @@
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/elf.h>
+#include <linux/uio.h>
#include <asm/asm-offsets.h>
#include <asm/os_info.h>
#include <asm/elf.h>
#include <asm/ipl.h>
#include <asm/sclp.h>
+#include <asm/maccess.h>
#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y)))
#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y)))
@@ -60,9 +62,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)
{
struct save_area *sa;
- sa = (void *) memblock_phys_alloc(sizeof(*sa), 8);
+ sa = memblock_alloc(sizeof(*sa), 8);
if (!sa)
- panic("Failed to allocate save area\n");
+ return NULL;
if (is_boot_cpu)
list_add(&sa->list, &dump_save_areas);
@@ -113,121 +115,60 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs)
memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128));
}
-/*
- * Return physical address for virtual address
- */
-static inline void *load_real_addr(void *addr)
-{
- unsigned long real_addr;
-
- asm volatile(
- " lra %0,0(%1)\n"
- " jz 0f\n"
- " la %0,0\n"
- "0:"
- : "=a" (real_addr) : "a" (addr) : "cc");
- return (void *)real_addr;
-}
-
-/*
- * Copy memory of the old, dumped system to a kernel space virtual address
- */
-int copy_oldmem_kernel(void *dst, void *src, size_t count)
+static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long from, len;
- void *ra;
- int rc;
+ size_t len, copied, res = 0;
while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_kernel(dst, from, len);
- if (rc)
- return rc;
+ if (!oldmem_data.start && src < sclp.hsa_size) {
+ /* Copy from zfcp/nvme dump HSA area */
+ len = min(count, sclp.hsa_size - src);
+ copied = memcpy_hsa_iter(iter, src, len);
} else {
/* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
+ if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) {
+ src -= oldmem_data.start;
+ len = min(count, oldmem_data.size - src);
+ } else if (oldmem_data.start && src < oldmem_data.size) {
+ len = min(count, oldmem_data.size - src);
+ src += oldmem_data.start;
} else {
len = count;
}
- if (is_vmalloc_or_module_addr(dst)) {
- ra = load_real_addr(dst);
- len = min(PAGE_SIZE - offset_in_page(ra), len);
- } else {
- ra = dst;
- }
- if (memcpy_real(ra, (void *) from, len))
- return -EFAULT;
+ copied = memcpy_real_iter(iter, src, len);
}
- dst += len;
- src += len;
- count -= len;
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- return 0;
+ return res;
}
-/*
- * Copy memory of the old, dumped system to a user space virtual address
- */
-static int copy_oldmem_user(void __user *dst, void *src, size_t count)
+int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
{
- unsigned long from, len;
- int rc;
+ struct iov_iter iter;
+ struct kvec kvec;
- while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_user(dst, from, len);
- if (rc)
- return rc;
- } else {
- /* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
- } else {
- len = count;
- }
- rc = copy_to_user_real(dst, (void *) from, count);
- if (rc)
- return rc;
- }
- dst += len;
- src += len;
- count -= len;
- }
+ kvec.iov_base = dst;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+ if (copy_oldmem_iter(&iter, src, count) < count)
+ return -EFAULT;
return 0;
}
/*
* Copy one page from "oldmem"
*/
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
- void *src;
- int rc;
+ unsigned long src;
- if (!csize)
- return 0;
- src = (void *) (pfn << PAGE_SHIFT) + offset;
- if (userbuf)
- rc = copy_oldmem_user((void __force __user *) buf, src, csize);
- else
- rc = copy_oldmem_kernel((void *) buf, src, csize);
- return rc;
+ src = pfn_to_phys(pfn) + offset;
+ return copy_oldmem_iter(iter, src, csize);
}
/*
@@ -243,10 +184,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
unsigned long size_old;
int rc;
- if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) {
- size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT));
+ if (pfn < oldmem_data.size >> PAGE_SHIFT) {
+ size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT));
rc = remap_pfn_range(vma, from,
- pfn + (OLDMEM_BASE >> PAGE_SHIFT),
+ pfn + (oldmem_data.start >> PAGE_SHIFT),
size_old, prot);
if (rc || size == size_old)
return rc;
@@ -258,7 +199,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for zfcpdump
+ * Remap "oldmem" for zfcp/nvme dump
*
* We only map available memory above HSA size. Memory below HSA size
* is read on demand using the copy_oldmem_page() function.
@@ -283,12 +224,12 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for kdump or zfcpdump
+ * Remap "oldmem" for kdump or zfcp/nvme dump
*/
int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot);
else
return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size,
@@ -365,7 +306,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs));
memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw));
memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs));
- nt_prstatus.pr_pid = cpu;
+ nt_prstatus.common.pr_pid = cpu;
/* Prepare fpregset (floating point) note */
memset(&nt_fpregset, 0, sizeof(nt_fpregset));
memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
@@ -429,10 +370,10 @@ static void *nt_prpsinfo(void *ptr)
static void *get_vmcoreinfo_old(unsigned long *size)
{
char nt_name[11], *vmcoreinfo;
+ unsigned long addr;
Elf64_Nhdr note;
- void *addr;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr)))
return NULL;
memset(nt_name, 0, sizeof(nt_name));
if (copy_oldmem_kernel(&note, addr, sizeof(note)))
@@ -549,8 +490,7 @@ static int get_mem_chunk_cnt(void)
int cnt = 0;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, NULL, NULL, NULL)
+ for_each_physmem_range(idx, &oldmem_type, NULL, NULL)
cnt++;
return cnt;
}
@@ -563,8 +503,7 @@ static void loads_init(Elf64_Phdr *phdr, u64 loads_offset)
phys_addr_t start, end;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_physmem_range(idx, &oldmem_type, &start, &end) {
phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
phdr->p_offset = start;
@@ -634,18 +573,18 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
u32 alloc_size;
u64 hdr_off;
- /* If we are not in kdump or zfcpdump mode return */
- if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP)
+ /* If we are not in kdump or zfcp/nvme dump mode return */
+ if (!oldmem_data.start && !is_ipl_type_dump())
return 0;
- /* If we cannot get HSA size for zfcpdump return error */
- if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size)
+ /* If we cannot get HSA size for zfcp/nvme dump return error */
+ if (is_ipl_type_dump() && !sclp.hsa_size)
return -ENODEV;
/* For kdump, exclude previous crashkernel memory */
- if (OLDMEM_BASE) {
- oldmem_region.base = OLDMEM_BASE;
- oldmem_region.size = OLDMEM_SIZE;
- oldmem_type.total_size = OLDMEM_SIZE;
+ if (oldmem_data.start) {
+ oldmem_region.base = oldmem_data.start;
+ oldmem_region.size = oldmem_data.size;
+ oldmem_type.total_size = oldmem_data.size;
}
mem_chunk_cnt = get_mem_chunk_cnt();
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 6d321f5f101d..d7a82066a638 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -2,7 +2,7 @@
/*
* S/390 debug facility
*
- * Copyright IBM Corp. 1999, 2012
+ * Copyright IBM Corp. 1999, 2020
*
* Author(s): Michael Holzheu (holzheu@de.ibm.com),
* Holger Smolinski (Holger.Smolinski@de.ibm.com)
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/debugfs.h>
#include <asm/debug.h>
@@ -90,27 +91,13 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
size_t user_buf_size, loff_t *offset);
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, const char *in_buf);
-static int debug_raw_format_fn(debug_info_t *id,
- struct debug_view *view, char *out_buf,
- const char *in_buf);
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf);
-
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, debug_sprintf_entry_t *curr_event);
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b);
+static void debug_events_append(debug_info_t *dest, debug_info_t *src);
/* globals */
-struct debug_view debug_raw_view = {
- "raw",
- NULL,
- &debug_raw_header_fn,
- &debug_raw_format_fn,
- NULL,
- NULL
-};
-EXPORT_SYMBOL(debug_raw_view);
-
struct debug_view debug_hex_ascii_view = {
"hex_ascii",
NULL,
@@ -198,9 +185,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas)
if (!areas)
goto fail_malloc_areas;
for (i = 0; i < nr_areas; i++) {
+ /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */
areas[i] = kmalloc_array(pages_per_area,
sizeof(debug_entry_t *),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN);
if (!areas[i])
goto fail_malloc_areas2;
for (j = 0; j < pages_per_area; j++) {
@@ -262,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area,
rc->level = level;
rc->buf_size = buf_size;
rc->entry_size = sizeof(debug_entry_t) + buf_size;
- strlcpy(rc->name, name, sizeof(rc->name));
+ strscpy(rc->name, name, sizeof(rc->name));
memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *));
refcount_set(&(rc->ref_count), 0);
@@ -326,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area,
goto out;
rc->mode = mode & ~S_IFMT;
-
- /* create root directory */
- rc->debugfs_root_entry = debugfs_create_dir(rc->name,
- debug_debugfs_root_entry);
-
- /* append new element to linked list */
- if (!debug_area_first) {
- /* first element in list */
- debug_area_first = rc;
- rc->prev = NULL;
- } else {
- /* append element to end of list */
- debug_area_last->next = rc;
- rc->prev = debug_area_last;
- }
- debug_area_last = rc;
- rc->next = NULL;
-
refcount_set(&rc->ref_count, 1);
out:
return rc;
@@ -403,27 +373,10 @@ static void debug_info_get(debug_info_t *db_info)
*/
static void debug_info_put(debug_info_t *db_info)
{
- int i;
-
if (!db_info)
return;
- if (refcount_dec_and_test(&db_info->ref_count)) {
- for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
- if (!db_info->views[i])
- continue;
- debugfs_remove(db_info->debugfs_entries[i]);
- }
- debugfs_remove(db_info->debugfs_root_entry);
- if (db_info == debug_area_first)
- debug_area_first = db_info->next;
- if (db_info == debug_area_last)
- debug_area_last = db_info->prev;
- if (db_info->prev)
- db_info->prev->next = db_info->next;
- if (db_info->next)
- db_info->next->prev = db_info->prev;
+ if (refcount_dec_and_test(&db_info->ref_count))
debug_info_free(db_info);
- }
}
/*
@@ -448,7 +401,7 @@ static int debug_format_entry(file_private_info_t *p_info)
act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area]
[p_info->act_page] + p_info->act_entry);
- if (act_entry->id.stck == 0LL)
+ if (act_entry->clock == 0LL)
goto out; /* empty entry */
if (view->header_proc)
len += view->header_proc(id_snap, view, p_info->act_area,
@@ -647,6 +600,31 @@ static int debug_close(struct inode *inode, struct file *file)
return 0; /* success */
}
+/* Create debugfs entries and add to internal list. */
+static void _debug_register(debug_info_t *id)
+{
+ /* create root directory */
+ id->debugfs_root_entry = debugfs_create_dir(id->name,
+ debug_debugfs_root_entry);
+
+ /* append new element to linked list */
+ if (!debug_area_first) {
+ /* first element in list */
+ debug_area_first = id;
+ id->prev = NULL;
+ } else {
+ /* append element to end of list */
+ debug_area_last->next = id;
+ id->prev = debug_area_last;
+ }
+ debug_area_last = id;
+ id->next = NULL;
+
+ debug_register_view(id, &debug_level_view);
+ debug_register_view(id, &debug_flush_view);
+ debug_register_view(id, &debug_pages_view);
+}
+
/**
* debug_register_mode() - creates and initializes debug area.
*
@@ -676,19 +654,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area,
if ((uid != 0) || (gid != 0))
pr_warn("Root becomes the owner of all s390dbf files in sysfs\n");
BUG_ON(!initialized);
- mutex_lock(&debug_mutex);
/* create new debug_info */
rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode);
- if (!rc)
- goto out;
- debug_register_view(rc, &debug_level_view);
- debug_register_view(rc, &debug_flush_view);
- debug_register_view(rc, &debug_pages_view);
-out:
- if (!rc)
+ if (rc) {
+ mutex_lock(&debug_mutex);
+ _debug_register(rc);
+ mutex_unlock(&debug_mutex);
+ } else {
pr_err("Registering debug feature %s failed\n", name);
- mutex_unlock(&debug_mutex);
+ }
return rc;
}
EXPORT_SYMBOL(debug_register_mode);
@@ -718,6 +693,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area,
EXPORT_SYMBOL(debug_register);
/**
+ * debug_register_static() - registers a static debug area
+ *
+ * @id: Handle for static debug area
+ * @pages_per_area: Number of pages per area
+ * @nr_areas: Number of debug areas
+ *
+ * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO.
+ *
+ * A dynamically allocated debug_info_t is created with @id's buf_size and
+ * level. Events already present in @id are appended to it, then the debug
+ * areas of both are swapped under @id->lock so that @id ends up owning the
+ * newly allocated areas. Finally @id is registered via _debug_register()
+ * with debug_mutex held.
+ *
+ * Nothing is returned: if the debug feature is not yet initialized or the
+ * allocation fails, an error is logged and @id is not registered. On
+ * allocation failure @id's area pointers are additionally set to NULL.
+ *
+ * Note: This function is called automatically via an initcall generated by
+ * DEFINE_STATIC_DEBUG_INFO.
+ */
+void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas)
+{
+ unsigned long flags;
+ debug_info_t *copy;
+
+ if (!initialized) {
+ pr_err("Tried to register debug feature %s too early\n",
+ id->name);
+ return;
+ }
+
+ copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS); /* unnamed scratch log, same buf_size and level as id */
+ if (!copy) {
+ pr_err("Registering debug feature %s failed\n", id->name);
+
+ /* Clear pointers to prevent tracing into released initdata. */
+ spin_lock_irqsave(&id->lock, flags);
+ id->areas = NULL;
+ id->active_pages = NULL;
+ id->active_entries = NULL;
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ return;
+ }
+
+ /*
+ * Replace static trace area with dynamic copy. The existing events are
+ * appended to the copy first, so they are carried over by the swap.
+ */
+ spin_lock_irqsave(&id->lock, flags);
+ debug_events_append(copy, id);
+ debug_areas_swap(id, copy);
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ /*
+ * Clear pointers to initdata and discard copy. After the swap, copy
+ * holds the former static areas of id; presumably the pointers are
+ * cleared so that debug_info_free() leaves those alone (verify against
+ * debug_info_free()).
+ */
+ copy->areas = NULL;
+ copy->active_pages = NULL;
+ copy->active_entries = NULL;
+ debug_info_free(copy);
+
+ mutex_lock(&debug_mutex);
+ _debug_register(id);
+ mutex_unlock(&debug_mutex);
+}
+
+/*
+ * Remove debugfs entries and remove from internal list.
+ *
+ * Removes the debugfs entry of every view slot that is in use, then the
+ * per-log debugfs directory, and finally unlinks @id from the list kept in
+ * debug_area_first/debug_area_last.
+ *
+ * @id itself is not released here and its views[] array is left as is.
+ * No locking is done here; debug_unregister() calls this with debug_mutex
+ * held.
+ */
+static void _debug_unregister(debug_info_t *id)
+{
+ int i;
+
+ for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
+ if (!id->views[i]) /* slot has no view registered */
+ continue;
+ debugfs_remove(id->debugfs_entries[i]);
+ }
+ debugfs_remove(id->debugfs_root_entry);
+ if (id == debug_area_first) /* id was the head: its successor takes over */
+ debug_area_first = id->next;
+ if (id == debug_area_last) /* id was the tail: its predecessor takes over */
+ debug_area_last = id->prev;
+ if (id->prev) /* bridge the neighbours over id */
+ id->prev->next = id->next;
+ if (id->next)
+ id->next->prev = id->prev;
+}
+
+/**
* debug_unregister() - give back debug area.
*
* @id: handle for debug log
@@ -730,8 +781,10 @@ void debug_unregister(debug_info_t *id)
if (!id)
return;
mutex_lock(&debug_mutex);
- debug_info_put(id);
+ _debug_unregister(id);
mutex_unlock(&debug_mutex);
+
+ debug_info_put(id);
}
EXPORT_SYMBOL(debug_unregister);
@@ -741,35 +794,28 @@ EXPORT_SYMBOL(debug_unregister);
*/
static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area)
{
- debug_entry_t ***new_areas;
+ debug_info_t *new_id;
unsigned long flags;
- int rc = 0;
if (!id || (nr_areas <= 0) || (pages_per_area < 0))
return -EINVAL;
- if (pages_per_area > 0) {
- new_areas = debug_areas_alloc(pages_per_area, nr_areas);
- if (!new_areas) {
- pr_info("Allocating memory for %i pages failed\n",
- pages_per_area);
- rc = -ENOMEM;
- goto out;
- }
- } else {
- new_areas = NULL;
+
+ new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS);
+ if (!new_id) {
+ pr_info("Allocating memory for %i pages failed\n",
+ pages_per_area);
+ return -ENOMEM;
}
+
spin_lock_irqsave(&id->lock, flags);
- debug_areas_free(id);
- id->areas = new_areas;
- id->nr_areas = nr_areas;
- id->pages_per_area = pages_per_area;
- id->active_area = 0;
- memset(id->active_entries, 0, sizeof(int)*id->nr_areas);
- memset(id->active_pages, 0, sizeof(int)*id->nr_areas);
+ debug_events_append(new_id, id);
+ debug_areas_swap(new_id, id);
+ debug_info_free(new_id);
spin_unlock_irqrestore(&id->lock, flags);
pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area);
-out:
- return rc;
+
+ return 0;
}
/**
@@ -787,16 +833,17 @@ void debug_set_level(debug_info_t *id, int new_level)
if (!id)
return;
- spin_lock_irqsave(&id->lock, flags);
+
if (new_level == DEBUG_OFF_LEVEL) {
- id->level = DEBUG_OFF_LEVEL;
pr_info("%s: switched off\n", id->name);
} else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) {
pr_info("%s: level %i is out of range (%i - %i)\n",
id->name, new_level, 0, DEBUG_MAX_LEVEL);
- } else {
- id->level = new_level;
+ return;
}
+
+ spin_lock_irqsave(&id->lock, flags);
+ id->level = new_level;
spin_unlock_irqrestore(&id->lock, flags);
}
EXPORT_SYMBOL(debug_set_level);
@@ -836,6 +883,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
id->active_entries[id->active_area]);
}
+/*
+ * Swap debug areas of a and b.
+ *
+ * Exchanges the area geometry (nr_areas, pages_per_area), the areas pointer
+ * and the cursor state (active_area, active_pages, active_entries) between
+ * the two logs. No other member is touched by this function.
+ *
+ * No locking is done here; the callers debug_register_static() and
+ * debug_set_size() hold the spinlock of the log that stays in use.
+ */
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b)
+{
+ swap(a->nr_areas, b->nr_areas);
+ swap(a->pages_per_area, b->pages_per_area);
+ swap(a->areas, b->areas);
+ swap(a->active_area, b->active_area);
+ swap(a->active_pages, b->active_pages);
+ swap(a->active_entries, b->active_entries);
+}
+
+/*
+ * Append all debug events in active area from source to destination log.
+ *
+ * Does nothing if either log has no areas. Otherwise the walk starts at the
+ * current active entry of @src and advances with proceed_active_entry() until
+ * get_active_entry() returns the starting entry again, so the cursor of @src
+ * ends on the entry it started on. Entries whose clock is 0 are skipped.
+ *
+ * Each copied entry first has its destination slot zeroed over
+ * dest->entry_size bytes; then the smaller of the two entry sizes is copied.
+ * A larger destination entry is therefore zero padded and a smaller one
+ * receives a truncated copy.
+ *
+ * No locking is done here; the callers debug_register_static() and
+ * debug_set_size() hold the spinlock of the log that stays in use.
+ */
+static void debug_events_append(debug_info_t *dest, debug_info_t *src)
+{
+ debug_entry_t *from, *to, *last;
+
+ if (!src->areas || !dest->areas)
+ return;
+
+ /* Loop over all entries in src, starting with oldest. */
+ from = get_active_entry(src);
+ last = from; /* remember the starting entry to detect the wrap-around */
+ do {
+ if (from->clock != 0LL) { /* clock 0 marks an entry that was never written */
+ to = get_active_entry(dest);
+ memset(to, 0, dest->entry_size);
+ memcpy(to, from, min(src->entry_size,
+ dest->entry_size));
+ proceed_active_entry(dest);
+ }
+
+ proceed_active_entry(src);
+ from = get_active_entry(src);
+ } while (from != last);
+}
+
/*
* debug_finish_entry:
* - set timestamp, caller address, cpu number etc.
@@ -844,12 +927,17 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active,
int level, int exception)
{
- active->id.stck = get_tod_clock_fast() -
- *(unsigned long long *) &tod_clock_base[1];
- active->id.fields.cpuid = smp_processor_id();
+ unsigned long timestamp;
+ union tod_clock clk;
+
+ store_tod_clock_ext(&clk);
+ timestamp = clk.us;
+ timestamp -= TOD_UNIX_EPOCH >> 12;
+ active->clock = timestamp;
+ active->cpu = smp_processor_id();
active->caller = __builtin_return_address(0);
- active->id.fields.exception = exception;
- active->id.fields.level = level;
+ active->exception = exception;
+ active->level = level;
proceed_active_entry(id);
if (exception)
proceed_active_area(id);
@@ -867,7 +955,7 @@ static int debug_active = 1;
* if debug_active is already off
*/
static int s390dbf_procactive(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!write || debug_stoppable || !debug_active)
return proc_dointvec(table, write, buffer, lenp, ppos);
@@ -1121,16 +1209,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view)
break;
}
if (i == DEBUG_MAX_VIEWS) {
- pr_err("Registering view %s/%s would exceed the maximum "
- "number of views %i\n", id->name, view->name, i);
rc = -1;
} else {
id->views[i] = view;
id->debugfs_entries[i] = pde;
}
spin_unlock_irqrestore(&id->lock, flags);
- if (rc)
+ if (rc) {
+ pr_err("Registering view %s/%s would exceed the maximum "
+ "number of views %i\n", id->name, view->name, i);
debugfs_remove(pde);
+ }
out:
return rc;
}
@@ -1385,32 +1474,6 @@ out:
}
/*
- * prints debug header in raw format
- */
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf)
-{
- int rc;
-
- rc = sizeof(debug_entry_t);
- memcpy(out_buf, entry, sizeof(debug_entry_t));
- return rc;
-}
-
-/*
- * prints debug data in raw format
- */
-static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, const char *in_buf)
-{
- int rc;
-
- rc = id->buf_size;
- memcpy(out_buf, in_buf, id->buf_size);
- return rc;
-}
-
-/*
* prints debug data in hex/ascii format
*/
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
@@ -1439,25 +1502,24 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
int area, debug_entry_t *entry, char *out_buf)
{
- unsigned long base, sec, usec;
+ unsigned long sec, usec;
unsigned long caller;
unsigned int level;
char *except_str;
int rc = 0;
- level = entry->id.fields.level;
- base = (*(unsigned long *) &tod_clock_base[0]) >> 4;
- sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12);
+ level = entry->level;
+ sec = entry->clock;
usec = do_div(sec, USEC_PER_SEC);
- if (entry->id.fields.exception)
+ if (entry->exception)
except_str = "*";
else
except_str = "-";
caller = (unsigned long) entry->caller;
- rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ",
+ rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ",
area, sec, usec, level, except_str,
- entry->id.fields.cpuid, (void *)caller);
+ entry->cpu, (void *)caller);
return rc;
}
EXPORT_SYMBOL(debug_dflt_header_fn);
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index e9dac9a24d3f..a778714e4d8b 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -11,9 +11,11 @@
#include <linux/cpu.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/trace/diag.h>
#include <asm/sections.h>
+#include "entry.h"
struct diag_stat {
unsigned int counter[NR_DIAG_STAT];
@@ -50,8 +52,16 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
};
-struct diag_ops __bootdata_preserved(diag_dma_ops);
-struct diag210 *__bootdata_preserved(__diag210_tmp_dma);
+struct diag_ops __amode31_ref diag_amode31_ops = {
+ .diag210 = _diag210_amode31,
+ .diag26c = _diag26c_amode31,
+ .diag14 = _diag14_amode31,
+ .diag0c = _diag0c_amode31,
+ .diag308_reset = _diag308_reset_amode31
+};
+
+static struct diag210 _diag210_tmp_amode31 __section(".amode31.data");
+struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31;
static int show_diag_stat(struct seq_file *m, void *v)
{
@@ -59,7 +69,7 @@ static int show_diag_stat(struct seq_file *m, void *v)
unsigned long n = (unsigned long) v - 1;
int cpu, prec, tmp;
- get_online_cpus();
+ cpus_read_lock();
if (n == 0) {
seq_puts(m, " ");
@@ -78,13 +88,13 @@ static int show_diag_stat(struct seq_file *m, void *v)
}
seq_printf(m, " %s\n", diag_map[n-1].name);
}
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
static void *show_diag_stat_start(struct seq_file *m, loff_t *pos)
{
- return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL;
+ return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL;
}
static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos)
@@ -104,18 +114,7 @@ static const struct seq_operations show_diag_stat_sops = {
.show = show_diag_stat,
};
-static int show_diag_stat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &show_diag_stat_sops);
-}
-
-static const struct file_operations show_diag_stat_fops = {
- .open = show_diag_stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
+DEFINE_SEQ_ATTRIBUTE(show_diag_stat);
static int __init show_diag_stat_init(void)
{
@@ -133,7 +132,7 @@ void diag_stat_inc(enum diag_stat_enum nr)
}
EXPORT_SYMBOL(diag_stat_inc);
-void diag_stat_inc_norecursion(enum diag_stat_enum nr)
+void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr)
{
this_cpu_inc(diag_stat.counter[nr]);
trace_s390_diagnose_norecursion(diag_map[nr].code);
@@ -146,22 +145,21 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion);
int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
{
diag_stat_inc(DIAG_STAT_X014);
- return diag_dma_ops.diag14(rx, ry1, subcode);
+ return diag_amode31_ops.diag14(rx, ry1, subcode);
}
EXPORT_SYMBOL(diag14);
static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
{
- register unsigned long _subcode asm("0") = *subcode;
- register unsigned long _size asm("1") = size;
+ union register_pair rp = { .even = *subcode, .odd = size };
asm volatile(
- " diag %2,%0,0x204\n"
+ " diag %[addr],%[rp],0x204\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- *subcode = _subcode;
- return _size;
+ : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory");
+ *subcode = rp.even;
+ return rp.odd;
}
int diag204(unsigned long subcode, unsigned long size, void *addr)
@@ -184,12 +182,12 @@ int diag210(struct diag210 *addr)
int ccode;
spin_lock_irqsave(&diag210_lock, flags);
- *__diag210_tmp_dma = *addr;
+ *__diag210_tmp_amode31 = *addr;
diag_stat_inc(DIAG_STAT_X210);
- ccode = diag_dma_ops.diag210(__diag210_tmp_dma);
+ ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31);
- *addr = *__diag210_tmp_dma;
+ *addr = *__diag210_tmp_amode31;
spin_unlock_irqrestore(&diag210_lock, flags);
return ccode;
@@ -217,6 +215,6 @@ EXPORT_SYMBOL(diag224);
int diag26c(void *req, void *resp, enum diag26c_sc subcode)
{
diag_stat_inc(DIAG_STAT_X26C);
- return diag_dma_ops.diag26c(req, resp, subcode);
+ return diag_amode31_ops.diag26c(req, resp, subcode);
}
EXPORT_SYMBOL(diag26c);
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index f304802ecf7b..90bbb4ea1d08 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -278,6 +278,7 @@ static const unsigned char formats[][6] = {
[INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 },
[INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 },
[INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 },
+ [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 },
[INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 },
[INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 },
[INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 },
@@ -312,10 +313,12 @@ static const unsigned char formats[][6] = {
[INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 },
[INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 },
[INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 },
+ [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 },
[INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 },
[INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 },
[INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 },
[INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 },
+ [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 },
[INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 },
[INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 },
[INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 },
@@ -482,31 +485,37 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
return (int) (ptr - buffer);
}
+static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len)
+{ /* Copy len bytes from src to dst; returns 0 on success, -EFAULT if the copy fails. */
+ if (user_mode(regs)) { /* regs were taken in user mode: src is treated as a user pointer */
+ if (copy_from_user(dst, (char __user *)src, len))
+ return -EFAULT;
+ } else { /* otherwise src is read with the non-faulting kernel copy helper */
+ if (copy_from_kernel_nofault(dst, src, len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
void show_code(struct pt_regs *regs)
{
char *mode = user_mode(regs) ? "User" : "Krnl";
unsigned char code[64];
char buffer[128], *ptr;
- mm_segment_t old_fs;
unsigned long addr;
int start, end, opsize, hops, i;
/* Get a snapshot of the 64 bytes surrounding the fault address. */
- old_fs = get_fs();
- set_fs(user_mode(regs) ? USER_DS : KERNEL_DS);
for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) {
addr = regs->psw.addr - 34 + start;
- if (__copy_from_user(code + start - 2,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + start - 2, (void *)addr, 2))
break;
}
for (end = 32; end < 64; end += 2) {
addr = regs->psw.addr + end - 32;
- if (__copy_from_user(code + end,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + end, (void *)addr, 2))
break;
}
- set_fs(old_fs);
/* Code snapshot useable ? */
if ((regs->psw.addr & 1) || start >= end) {
printk("%s Code: Bad PSW.\n", mode);
@@ -557,7 +566,7 @@ void show_code(struct pt_regs *regs)
void print_fn_code(unsigned char *code, unsigned long len)
{
- char buffer[64], *ptr;
+ char buffer[128], *ptr;
int opsize, i;
while (len) {
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 2c122d8bab93..1e3233eb510a 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -79,6 +79,15 @@ static bool in_nodat_stack(unsigned long sp, struct stack_info *info)
return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top);
}
+static bool in_mcck_stack(unsigned long sp, struct stack_info *info)
+{ /* Ask in_stack() whether sp lies within the THREAD_SIZE bytes below the top of the mcck stack. */
+ unsigned long frame_size, top;
+
+ frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
+ top = S390_lowcore.mcck_stack + frame_size; /* NOTE(review): presumably the lowcore value points frame_size below the real top - confirm */
+ return in_stack(sp, info, STACK_TYPE_MCCK, top - THREAD_SIZE, top); /* range passed is [top - THREAD_SIZE, top] */
+}
+
static bool in_restart_stack(unsigned long sp, struct stack_info *info)
{
unsigned long frame_size, top;
@@ -108,7 +117,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task,
/* Check per-cpu stacks */
if (!in_irq_stack(sp, info) &&
!in_nodat_stack(sp, info) &&
- !in_restart_stack(sp, info))
+ !in_restart_stack(sp, info) &&
+ !in_mcck_stack(sp, info))
goto unknown;
recursion_check:
@@ -126,22 +136,23 @@ unknown:
return -EINVAL;
}
-void show_stack(struct task_struct *task, unsigned long *stack)
+void show_stack(struct task_struct *task, unsigned long *stack,
+ const char *loglvl)
{
struct unwind_state state;
- printk("Call Trace:\n");
+ printk("%sCall Trace:\n", loglvl);
unwind_for_each_frame(&state, task, NULL, (unsigned long) stack)
- printk(state.reliable ? " [<%016lx>] %pSR \n" :
- "([<%016lx>] %pSR)\n",
- state.ip, (void *) state.ip);
+ printk(state.reliable ? "%s [<%016lx>] %pSR \n" :
+ "%s([<%016lx>] %pSR)\n",
+ loglvl, state.ip, (void *) state.ip);
debug_show_held_locks(task ? : current);
}
static void show_last_breaking_event(struct pt_regs *regs)
{
printk("Last Breaking-Event-Address:\n");
- printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]);
+ printk(" [<%016lx>] %pSR\n", regs->last_break, (void *)regs->last_break);
}
void show_registers(struct pt_regs *regs)
@@ -175,13 +186,13 @@ void show_regs(struct pt_regs *regs)
show_registers(regs);
/* Show stack backtrace if pt_regs is from kernel mode */
if (!user_mode(regs))
- show_stack(NULL, (unsigned long *) regs->gprs[15]);
+ show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT);
show_last_breaking_event(regs);
}
static DEFINE_SPINLOCK(die_lock);
-void die(struct pt_regs *regs, const char *str)
+void __noreturn die(struct pt_regs *regs, const char *str)
{
static int die_counter;
@@ -213,5 +224,5 @@ void die(struct pt_regs *regs, const char *str)
if (panic_on_oops)
panic("Fatal exception: panic_on_oops");
oops_exit();
- do_exit(SIGSEGV);
+ make_task_dead(SIGSEGV);
}
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index cd241ee66eff..6030fdd6997b 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -2,7 +2,6 @@
/*
* Copyright IBM Corp. 2007, 2009
* Author(s): Hongjie Yang <hongjie@us.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "setup"
@@ -18,6 +17,7 @@
#include <linux/pfn.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
@@ -33,18 +33,20 @@
#include <asm/switch_to.h>
#include "entry.h"
+int __bootdata(is_full_image);
+
static void __init reset_tod_clock(void)
{
- u64 time;
+ union tod_clock clk;
- if (store_tod_clock(&time) == 0)
+ if (store_tod_clock_ext_cc(&clk) == 0)
return;
/* TOD clock not running. Set the clock to Unix Epoch. */
- if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0)
+ if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk))
disabled_wait();
- memset(tod_clock_base, 0, 16);
- *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH;
+ memset(&tod_clock_base, 0, sizeof(tod_clock_base));
+ tod_clock_base.tod = TOD_UNIX_EPOCH;
S390_lowcore.last_update_clock = TOD_UNIX_EPOCH;
}
@@ -147,44 +149,29 @@ static __init void setup_topology(void)
topology_max_mnest = max_mnest;
}
-static void early_pgm_check_handler(void)
+void __do_early_pgm_check(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
- unsigned long cr0, cr0_new;
- unsigned long addr;
-
- addr = S390_lowcore.program_old_psw.addr;
- fixup = s390_search_extables(addr);
- if (!fixup)
+ if (!fixup_exception(regs))
disabled_wait();
- /* Disable low address protection before storing into lowcore. */
- __ctl_store(cr0, 0, 0);
- cr0_new = cr0 & ~(1UL << 28);
- __ctl_load(cr0_new, 0, 0);
- S390_lowcore.program_old_psw.addr = extable_fixup(fixup);
- __ctl_load(cr0, 0, 0);
}
static noinline __init void setup_lowcore_early(void)
{
psw_t psw;
+ psw.addr = (unsigned long)early_pgm_check_handler;
psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA;
- psw.addr = (unsigned long) s390_base_ext_handler;
- S390_lowcore.external_new_psw = psw;
- psw.addr = (unsigned long) s390_base_pgm_handler;
+ if (IS_ENABLED(CONFIG_KASAN))
+ psw.mask |= PSW_MASK_DAT;
S390_lowcore.program_new_psw = psw;
- s390_base_pgm_handler_fn = early_pgm_check_handler;
S390_lowcore.preempt_count = INIT_PREEMPT_COUNT;
}
static noinline __init void setup_facility_list(void)
{
- memcpy(S390_lowcore.alt_stfle_fac_list,
- S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.alt_stfle_fac_list));
+ memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list));
if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
static __init void detect_diag9c(void)
@@ -230,12 +217,16 @@ static __init void detect_machine_facilities(void)
}
if (test_facility(133))
S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
- if (test_facility(139) && (tod_clock_base[1] & 0x80)) {
+ if (test_facility(139) && (tod_clock_base.tod >> 63)) {
/* Enabled signed clock comparator comparisons */
S390_lowcore.machine_flags |= MACHINE_FLAG_SCC;
clock_comparator_max = -1ULL >> 1;
__ctl_set_bit(0, 53);
}
+ if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) {
+ S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO;
+ /* the control bit is set during PCI initialization */
+ }
}
static inline void save_vector_registers(void)
@@ -272,29 +263,16 @@ static int __init disable_vector_extension(char *str)
}
early_param("novx", disable_vector_extension);
-static int __init cad_setup(char *str)
-{
- bool enabled;
- int rc;
-
- rc = kstrtobool(str, &enabled);
- if (!rc && enabled && test_facility(128))
- /* Enable problem state CAD. */
- __ctl_set_bit(2, 3);
- return rc;
-}
-early_param("cad", cad_setup);
-
char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
static void __init setup_boot_command_line(void)
{
/* copy arch command line */
- strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE);
}
static void __init check_image_bootable(void)
{
- if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
+ if (is_full_image)
return;
sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
@@ -303,13 +281,20 @@ static void __init check_image_bootable(void)
disabled_wait();
}
+static void __init sort_amode31_extable(void)
+{ /* Called from startup_init() before setup_lowcore_early(). */
+ sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); /* sorts the table delimited by these two symbols */
+}
+
void __init startup_init(void)
{
+ sclp_early_adjust_va();
reset_tod_clock();
check_image_bootable();
time_early_init();
init_kernel_storage_key();
lockdep_off();
+ sort_amode31_extable();
setup_lowcore_early();
setup_facility_list();
detect_machine_type();
diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c
index 6f24d83bc5dc..d9d53f44008a 100644
--- a/arch/s390/kernel/early_printk.c
+++ b/arch/s390/kernel/early_printk.c
@@ -10,7 +10,7 @@
static void sclp_early_write(struct console *con, const char *s, unsigned int len)
{
- __sclp_early_printk(s, len, 0);
+ __sclp_early_printk(s, len);
}
static struct console sclp_early_console = {
diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S
new file mode 100644
index 000000000000..f521c6da37b8
--- /dev/null
+++ b/arch/s390/kernel/earlypgm.S
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2006, 2007
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+ENTRY(early_pgm_check_handler) /* builds a pt_regs frame, calls __do_early_pgm_check(regs), resumes at the PSW left in pt_regs */
+ stmg %r8,%r15,__LC_SAVE_AREA_SYNC # park r8-r15 in the lowcore save area
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) # make room for a stack frame plus pt_regs
+ la %r11,STACK_FRAME_OVERHEAD(%r15) # r11 = address of the pt_regs area
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # zero the backchain field of the new frame
+ stmg %r0,%r7,__PT_R0(%r11) # r0-r7 are stored directly into pt_regs
+ mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW # copy the 16 byte program old PSW into pt_regs
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC # copy the parked r8-r15 (64 bytes) into pt_regs
+ lgr %r2,%r11 # first argument: pointer to pt_regs
+ brasl %r14,__do_early_pgm_check
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) # PSW to resume with is read back from pt_regs
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # reload all 16 gprs from pt_regs, including the saved r15
+ lpswe __LC_RETURN_PSW # continue at that PSW
+ENDPROC(early_pgm_check_handler)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 9205add8481d..d2a1f2f4f5b8 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -6,15 +6,14 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com),
* Hartmut Penner (hp@de.ibm.com),
* Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/asm-extable.h>
#include <asm/alternative-asm.h>
#include <asm/processor.h>
#include <asm/cache.h>
-#include <asm/ctl_reg.h>
#include <asm/dwarf.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -30,59 +29,26 @@
#include <asm/export.h>
#include <asm/nospec-insn.h>
-__PT_R0 = __PT_GPRS
-__PT_R1 = __PT_GPRS + 8
-__PT_R2 = __PT_GPRS + 16
-__PT_R3 = __PT_GPRS + 24
-__PT_R4 = __PT_GPRS + 32
-__PT_R5 = __PT_GPRS + 40
-__PT_R6 = __PT_GPRS + 48
-__PT_R7 = __PT_GPRS + 56
-__PT_R8 = __PT_GPRS + 64
-__PT_R9 = __PT_GPRS + 72
-__PT_R10 = __PT_GPRS + 80
-__PT_R11 = __PT_GPRS + 88
-__PT_R12 = __PT_GPRS + 96
-__PT_R13 = __PT_GPRS + 104
-__PT_R14 = __PT_GPRS + 112
-__PT_R15 = __PT_GPRS + 120
-
STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER
STACK_SIZE = 1 << STACK_SHIFT
STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
-_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
- _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING)
-_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
- _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
- _CIF_ASCE_SECONDARY | _CIF_FPU)
-_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
-
_LPP_OFFSET = __LC_LPP
-#define BASED(name) name-cleanup_critical(%r13)
+ .macro STBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193
+ .endm
- .macro TRACE_IRQS_ON
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_on_caller
-#endif
+ .macro LBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193
.endm
- .macro TRACE_IRQS_OFF
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_off_caller
-#endif
+ .macro LPSWEY address,lpswe
+ ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193
.endm
- .macro LOCKDEP_SYS_EXIT
-#ifdef CONFIG_LOCKDEP
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jz .+10
- brasl %r14,lockdep_sys_exit
-#endif
+ .macro MBEAR reg
+ ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193
.endm
.macro CHECK_STACK savearea
@@ -102,6 +68,8 @@ _LPP_OFFSET = __LC_LPP
je \oklabel
clg %r14,__LC_ASYNC_STACK
je \oklabel
+ clg %r14,__LC_MCCK_STACK
+ je \oklabel
clg %r14,__LC_NODAT_STACK
je \oklabel
clg %r14,__LC_RESTART_STACK
@@ -113,56 +81,6 @@ _LPP_OFFSET = __LC_LPP
#endif
.endm
- .macro SWITCH_ASYNC savearea,timer
- tmhh %r8,0x0001 # interrupting from user ?
- jnz 1f
- lgr %r14,%r9
- slg %r14,BASED(.Lcritical_start)
- clg %r14,BASED(.Lcritical_length)
- jhe 0f
- lghi %r11,\savearea # inside critical section, do cleanup
- brasl %r14,cleanup_critical
- tmhh %r8,0x0001 # retest problem state after cleanup
- jnz 1f
-0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack?
- slgr %r14,%r15
- srag %r14,%r14,STACK_SHIFT
- jnz 2f
- CHECK_STACK \savearea
- aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- j 3f
-1: UPDATE_VTIME %r14,%r15,\timer
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
-2: lg %r15,__LC_ASYNC_STACK # load async stack
-3: la %r11,STACK_FRAME_OVERHEAD(%r15)
- .endm
-
- .macro UPDATE_VTIME w1,w2,enter_timer
- lg \w1,__LC_EXIT_TIMER
- lg \w2,__LC_LAST_UPDATE_TIMER
- slg \w1,\enter_timer
- slg \w2,__LC_EXIT_TIMER
- alg \w1,__LC_USER_TIMER
- alg \w2,__LC_SYSTEM_TIMER
- stg \w1,__LC_USER_TIMER
- stg \w2,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer
- .endm
-
- .macro REENABLE_IRQS
- stg %r8,__LC_RETURN_PSW
- ni __LC_RETURN_PSW,0xbf
- ssm __LC_RETURN_PSW
- .endm
-
- .macro STCK savearea
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
- .insn s,0xb27c0000,\savearea # store clock fast
-#else
- .insn s,0xb2050000,\savearea # store clock
-#endif
- .endm
-
/*
* The TSTMSK macro generates a test-under-mask instruction by
* calculating the memory offset for the specified mask value.
@@ -186,36 +104,90 @@ _LPP_OFFSET = __LC_LPP
.endm
.macro BPOFF
- ALTERNATIVE "", ".long 0xb2e8c000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82
.endm
.macro BPON
- ALTERNATIVE "", ".long 0xb2e8d000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82
.endm
.macro BPENTER tif_ptr,tif_mask
- ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \
- "", 82
+ ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \
+ "j .+12; nop; nop", 82
.endm
.macro BPEXIT tif_ptr,tif_mask
TSTMSK \tif_ptr,\tif_mask
- ALTERNATIVE "jz .+8; .long 0xb2e8c000", \
- "jnz .+8; .long 0xb2e8d000", 82
+ ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \
+ "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82
+ .endm
+
+ /*
+ * The CHKSTG macro jumps to the provided label in case the
+ * machine check interruption code reports one of the following
+ * unrecoverable storage errors:
+ * - Storage error uncorrected
+ * - Storage key error uncorrected
+ * - Storage degradation with Failing-storage-address validity
+ * Otherwise execution falls through to the code following the macro.
+ * @errlabel: jump here if one of the errors above is reported
+ */
+ .macro CHKSTG errlabel
+ TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
+ jnz \errlabel # at least one of the two error bits is set
+ TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
+ jz .Loklabel\@ # no degradation reported: nothing more to check
+ TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
+ jnz \errlabel # degradation bit and failing-storage-address bit are both set
+.Loklabel\@: # reached when none of the three cases matched
+ .endm
+
+#if IS_ENABLED(CONFIG_KVM)
+ /*
+ * The OUTSIDE macro jumps to the provided label in case the value
+ * in the provided register is outside of the provided range. The
+ * macro is useful for checking whether a PSW stored in a register
+ * pair points inside or outside of a block of instructions.
+ * @reg: register to check
+ * @start: start of the range
+ * @end: end of the range
+ * @outside_label: jump here if @reg is outside of [@start..@end)
+ *
+ * Clobbers %r13 and %r14; @reg itself is only read. The subtraction
+ * and the comparison are unsigned, so a value below @start wraps
+ * around and is reported as outside as well.
+ * With CONFIG_AS_IS_LLVM the range length is compared from a literal
+ * placed in .rodata instead of being encoded as an immediate.
+ */
+ .macro OUTSIDE reg,start,end,outside_label
+ lgr %r14,\reg # work on a copy of the value
+ larl %r13,\start # r13 = address of the range start
+ slgr %r14,%r13 # r14 = value - start
+#ifdef CONFIG_AS_IS_LLVM
+ clgfrl %r14,.Lrange_size\@ # compare against the literal emitted below
+#else
+ clgfi %r14,\end - \start # compare against the range length as immediate
+#endif
+ jhe \outside_label # offset at or beyond the length: outside
+#ifdef CONFIG_AS_IS_LLVM
+ .section .rodata, "a"
+ .align 4
+.Lrange_size\@:
+ .long \end - \start # range length, one literal per macro expansion
+ .previous
+#endif
 .endm
- GEN_BR_THUNK %r9
+ .macro SIEEXIT # clobbers %r9; on exit %r9 holds the address of sie_exit
+ lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer (read from the stack frame at r15)
+ ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE (clears the lowest bit of that byte)
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce (control register 1) from lowcore
+ larl %r9,sie_exit # skip forward to sie_exit (only loads the address; the user of the macro has to consume r9)
+ .endm
+#endif
+
GEN_BR_THUNK %r14
- GEN_BR_THUNK %r14,%r11
.section .kprobes.text, "ax"
.Ldummy:
/*
- * This nop exists only in order to avoid that __switch_to starts at
+ * This nop exists only in order to avoid that __bpon starts at
* the beginning of the kprobes text section. In that case we would
* have several symbols at the same address. E.g. objdump would take
* an arbitrary symbol name when disassembling this code.
- * With the added nop in between the __switch_to symbol is unique
+ * With the added nop in between the __bpon symbol is unique
* again.
*/
nop 0
@@ -247,12 +219,10 @@ ENTRY(__switch_to)
aghi %r3,__TASK_pid
mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next
lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
BR_EX %r14
ENDPROC(__switch_to)
-.L__critical_start:
-
#if IS_ENABLED(CONFIG_KVM)
/*
* sie64a calling convention:
@@ -266,10 +236,6 @@ ENTRY(sie64a)
stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ?
- jno .Lsie_load_guest_gprs
- brasl %r14,load_fpu_regs # load guest fp/vx regs
-.Lsie_load_guest_gprs:
lmg %r0,%r13,0(%r3) # load guest gprs 0-13
lg %r14,__LC_GMAP # get gmap pointer
ltgr %r14,%r14
@@ -285,19 +251,21 @@ ENTRY(sie64a)
BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_entry:
sie 0(%r14)
-.Lsie_exit:
+# Let the next instruction be a NOP to avoid triggering a machine check
+# and handling it in a guest as a result of the instruction execution.
+ nopr 7
+.Lsie_leave:
BPOFF
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_skip:
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
.Lsie_done:
# some program checks are suppressing. C code (e.g. do_protection_exception)
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
# Other instructions between sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
.Lrewind_pad6:
nopr 7
.Lrewind_pad4:
@@ -310,7 +278,6 @@ sie_exit:
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
xgr %r0,%r0 # clear guest registers to
xgr %r1,%r1 # prevent speculative use
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -333,780 +300,228 @@ EXPORT_SYMBOL(sie_exit)
/*
* SVC interrupt handler routine. System calls are synchronous events and
- * are executed with interrupts enabled.
+ * are entered with interrupts disabled.
*/
ENTRY(system_call)
- stpt __LC_SYNC_ENTER_TIMER
-.Lsysc_stmg:
+ stpt __LC_SYS_ENTER_TIMER # store the cpu timer value at kernel entry
stmg %r8,%r15,__LC_SAVE_AREA_SYNC # park %r8-%r15 in the lowcore save area
BPOFF
- lg %r12,__LC_CURRENT
- lghi %r13,__TASK_thread
- lghi %r14,_PIF_SYSCALL
+ lghi %r14,0 # %r14 = 0 on this path; .Lpgm_svcper enters at .Lsysc_per with 1
.Lsysc_per:
+ STBEAR __LC_LAST_BREAK
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # CR1 = kernel asce
+ lg %r12,__LC_CURRENT # %r12 = current, base for the __TI_flags accesses below
lg %r15,__LC_KERNEL_STACK # continue on the kernel stack
- la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
-.Lsysc_vtime:
- UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # zero the backchain of the new frame
+ stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # save %r0-%r7 into pt_regs
BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- stmg %r0,%r7,__PT_R0(%r11)
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC
- stg %r14,__PT_FLAGS(%r11)
-.Lsysc_do_svc:
# clear user controlled register to prevent speculative use
xgr %r0,%r0
- # load address of system call table
- lg %r10,__THREAD_sysc_table(%r13,%r12)
- llgh %r8,__PT_INT_CODE+2(%r11)
- slag %r8,%r8,3 # shift and test for svc 0
- jnz .Lsysc_nr_ok
- # svc 0: system call number in %r1
- llgfr %r1,%r1 # clear high word in r1
- cghi %r1,NR_syscalls
- jnl .Lsysc_nr_ok
- sth %r1,__PT_INT_CODE+2(%r11)
- slag %r8,%r1,3
-.Lsysc_nr_ok:
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- stg %r2,__PT_ORIG_GPR2(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r9,0(%r8,%r10) # get system call add.
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jnz .Lsysc_tracesys
- BASR_EX %r14,%r9 # call sys_xxxx
- stg %r2,__PT_R2(%r11) # store return value
-
-.Lsysc_return:
-#ifdef CONFIG_DEBUG_RSEQ
- lgr %r2,%r11
- brasl %r14,rseq_syscall
-#endif
- LOCKDEP_SYS_EXIT
-.Lsysc_tif:
- TSTMSK __PT_FLAGS(%r11),_PIF_WORK
- jnz .Lsysc_work
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lsysc_work # check for work
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lsysc_work
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lsysc_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
-.Lsysc_exit_timer:
+ xgr %r1,%r1
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r8,%r8
+ xgr %r9,%r9
+ xgr %r10,%r10
+ xgr %r11,%r11
+ la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
+ mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC # copy the 64 bytes saved at entry (%r8-%r15) into pt_regs
+ MBEAR %r2
+ lgr %r3,%r14 # second argument: the 0/1 value placed in %r14 before .Lsysc_per
+ brasl %r14,__do_syscall
+ lctlg %c1,%c1,__LC_USER_ASCE # CR1 = user asce again
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) # return PSW = PSW held in pt_regs
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # reload all sixteen gprs from pt_regs
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lsysc_done:
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lsysc_work:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lsysc_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lsysc_reschedule
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
-#ifdef CONFIG_UPROBES
- TSTMSK __TI_flags(%r12),_TIF_UPROBE
- jo .Lsysc_uprobe_notify
-#endif
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lsysc_guarded_storage
- TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP
- jo .Lsysc_singlestep
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lsysc_patch_pending # handle live patching just before
- # signals and possible syscall restart
-#endif
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lsysc_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lsysc_notify_resume
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsysc_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lsysc_asce
- j .Lsysc_return # beware of critical section cleanup
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lsysc_reschedule:
- larl %r14,.Lsysc_return
- jg schedule
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lsysc_mcck_pending:
- larl %r14,.Lsysc_return
- jg s390_handle_mcck # TIF bit will be cleared by handler
-
-#
-# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lsysc_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lsysc_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lsysc_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lsysc_return
-.Lsysc_set_fs_fixup:
-#endif
- larl %r14,.Lsysc_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lsysc_vxrs:
- larl %r14,.Lsysc_return
- jg load_fpu_regs
-
-#
-# _TIF_SIGPENDING is set, call do_signal
-#
-.Lsysc_sigpending:
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jno .Lsysc_return
-.Lsysc_do_syscall:
- lghi %r13,__TASK_thread
- lmg %r2,%r7,__PT_R2(%r11) # load svc arguments
- lghi %r1,0 # svc 0 returns -ENOSYS
- j .Lsysc_do_svc
-
-#
-# _TIF_NOTIFY_RESUME is set, call do_notify_resume
-#
-.Lsysc_notify_resume:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_notify_resume
-
-#
-# _TIF_UPROBE is set, call uprobe_notify_resume
-#
-#ifdef CONFIG_UPROBES
-.Lsysc_uprobe_notify:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg uprobe_notify_resume
-#endif
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lsysc_guarded_storage:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg gs_load_bc_cb
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lsysc_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lsysc_return
- jg klp_update_patch_state
-#endif
-
-#
-# _PIF_PER_TRAP is set, call do_per_trap
-#
-.Lsysc_singlestep:
- ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_per_trap
-
-#
-# _PIF_SYSCALL_RESTART is set, repeat the current system call
-#
-.Lsysc_syscall_restart:
- ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART
- lmg %r1,%r7,__PT_R1(%r11) # load svc arguments
- lg %r2,__PT_ORIG_GPR2(%r11)
- j .Lsysc_do_svc
-
-#
-# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
-# and after the system call
-#
-.Lsysc_tracesys:
- lgr %r2,%r11 # pass pointer to pt_regs
- la %r3,0
- llgh %r0,__PT_INT_CODE+2(%r11)
- stg %r0,__PT_R2(%r11)
- brasl %r14,do_syscall_trace_enter
- lghi %r0,NR_syscalls
- clgr %r0,%r2
- jnh .Lsysc_tracenogo
- sllg %r8,%r2,3
- lg %r9,0(%r8,%r10)
-.Lsysc_tracego:
- lmg %r3,%r7,__PT_R3(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r2,__PT_ORIG_GPR2(%r11)
- BASR_EX %r14,%r9 # call sys_xxx
- stg %r2,__PT_R2(%r11) # store return value
-.Lsysc_tracenogo:
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jz .Lsysc_return
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_syscall_trace_exit
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # resume at the return PSW prepared above
ENDPROC(system_call)
#
# a new process exits the kernel with ret_from_fork
#
ENTRY(ret_from_fork)
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- lg %r12,__LC_CURRENT
- brasl %r14,schedule_tail
- TRACE_IRQS_ON
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ?
- jne .Lsysc_tracenogo
- # it's a kernel thread
- lmg %r9,%r10,__PT_R9(%r11) # load gprs
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
+ lgr %r3,%r11 # hand %r11 to __ret_from_fork in %r3; NOTE(review): %r11 is set up outside this routine, confirm its meaning
+ brasl %r14,__ret_from_fork
+ lctlg %c1,%c1,__LC_USER_ASCE # CR1 = user asce
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) # return PSW = PSW held in pt_regs
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # reload all sixteen gprs from pt_regs
+ stpt __LC_EXIT_TIMER # store the cpu timer value at kernel exit
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # resume at the return PSW prepared above
ENDPROC(ret_from_fork)
-ENTRY(kernel_thread_starter)
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
-ENDPROC(kernel_thread_starter)
-
/*
* Program check handler routine
*/
ENTRY(pgm_check_handler)
- stpt __LC_SYNC_ENTER_TIMER
+ stpt __LC_SYS_ENTER_TIMER # store the cpu timer value at kernel entry
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_SYNC # park %r8-%r15 in the lowcore save area
- lg %r10,__LC_LAST_BREAK
lg %r12,__LC_CURRENT
- lghi %r11,0
- larl %r13,cleanup_critical
+ lghi %r10,0 # %r10 = value stored to __PT_FLAGS at label 4, default 0
lmg %r8,%r9,__LC_PGM_OLD_PSW # %r8/%r9 = the two halves of the old PSW
- tmhh %r8,0x0001 # test problem state bit
- jnz 2f # -> fault in user space
+ tmhh %r8,0x0001 # coming from user space?
+ jno .Lpgm_skip_asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # from user space: CR1 = kernel asce
+ j 3f # -> fault in user space
+.Lpgm_skip_asce:
#if IS_ENABLED(CONFIG_KVM)
# cleanup critical section for program checks in sie64a
- lgr %r14,%r9
- slg %r14,BASED(.Lsie_critical_start)
- clg %r14,BASED(.Lsie_critical_length)
- jhe 0f
- lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- lghi %r11,_PIF_GUEST_FAULT
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f # old PSW address not between .Lsie_gmap and .Lsie_done -> 1f
+ SIEEXIT
+ lghi %r10,_PIF_GUEST_FAULT # value for __PT_FLAGS becomes _PIF_GUEST_FAULT
#endif
-0: tmhh %r8,0x4000 # PER bit set in old PSW ?
- jnz 1f # -> enabled, can't be a double fault
+1: tmhh %r8,0x4000 # PER bit set in old PSW ?
+ jnz 2f # -> enabled, can't be a double fault
tm __LC_PGM_ILC+3,0x80 # check for per exception
jnz .Lpgm_svcper # -> single stepped svc
-1: CHECK_STACK __LC_SAVE_AREA_SYNC
+2: CHECK_STACK __LC_SAVE_AREA_SYNC
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # make room for a stack frame plus pt_regs
# CHECK_VMAP_STACK branches to stack_overflow or 4f
CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f
-2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
+3: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
lg %r15,__LC_KERNEL_STACK # fault in user space: continue on the kernel stack
- lgr %r14,%r12
- aghi %r14,__TASK_thread # pointer to thread_struct
- lghi %r13,__LC_PGM_TDB
- tm __LC_PGM_ILC+2,0x02 # check for transaction abort
- jz 3f
- mvc __THREAD_trap_tdb(256,%r14),0(%r13)
-3: stg %r10,__THREAD_last_break(%r14)
-4: lgr %r13,%r11
- la %r11,STACK_FRAME_OVERHEAD(%r15)
+4: la %r11,STACK_FRAME_OVERHEAD(%r15) # %r11 = pointer to pt_regs
+ stg %r10,__PT_FLAGS(%r11) # 0, or _PIF_GUEST_FAULT when the sie64a path above was taken
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # zero the backchain of the new frame
stmg %r0,%r7,__PT_R0(%r11)
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC # copy the 64 bytes saved at entry (%r8-%r15) into pt_regs
+ mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK
+ stmg %r8,%r9,__PT_PSW(%r11) # PSW in pt_regs = %r8/%r9 (address is sie_exit after SIEEXIT)
+
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC
- mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE
- stg %r13,__PT_FLAGS(%r11)
- stg %r10,__PT_ARGS(%r11)
- tm __LC_PGM_ILC+3,0x80 # check for per exception
- jz 5f
- tmhh %r8,0x0001 # kernel per event ?
- jz .Lpgm_kprobe
- oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP
- mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS
- mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE
- mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
-5: REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- larl %r1,pgm_check_table
- llgh %r10,__PT_INT_CODE+2(%r11)
- nill %r10,0x007f
- sll %r10,3
- je .Lpgm_return
- lg %r9,0(%r10,%r1) # load address of handler routine
- lgr %r2,%r11 # pass pointer to pt_regs
- BASR_EX %r14,%r9 # branch to interrupt-handler
-.Lpgm_return:
- LOCKDEP_SYS_EXIT
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lsysc_restore
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jo .Lsysc_do_syscall
- j .Lsysc_tif
-
-#
-# PER event in supervisor state, must be kprobes
-#
-.Lpgm_kprobe:
- REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_per_trap
- j .Lpgm_return
+ lgr %r2,%r11 # pass pointer to pt_regs
+ brasl %r14,__do_pgm_check
+ tmhh %r8,0x0001 # returning to user space?
+ jno .Lpgm_exit_kernel
+ lctlg %c1,%c1,__LC_USER_ASCE # back to user space: CR1 = user asce
+ BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+ stpt __LC_EXIT_TIMER
+.Lpgm_exit_kernel:
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) # return PSW = PSW held in pt_regs
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # reload all sixteen gprs from pt_regs
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
#
# single stepped system call
#
.Lpgm_svcper:
mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW # first half of the return PSW comes from the svc new PSW
- lghi %r13,__TASK_thread
larl %r14,.Lsysc_per
stg %r14,__LC_RETURN_PSW+8 # second half (address) = .Lsysc_per
- lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP
- lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs
+ lghi %r14,1 # %r14 = 1; system_call copies %r14 to %r3 for __do_syscall
+ LBEAR __LC_PGM_LAST_BREAK
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per
ENDPROC(pgm_check_handler)
/*
- * IO interrupt handler routine
+ * Interrupt handler macro used for external and IO interrupts.
*/
-ENTRY(io_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
+.macro INT_HANDLER name,lc_old_psw,handler
+ENTRY(\name)
+ stckf __LC_INT_CLOCK # store the clock value at interrupt entry
+ stpt __LC_SYS_ENTER_TIMER # store the cpu timer value at kernel entry
+ STBEAR __LC_LAST_BREAK
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC # park %r8-%r15 in the lowcore save area
lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_IO_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
+ lmg %r8,%r9,\lc_old_psw # %r8/%r9 = the two halves of the old PSW
+ tmhh %r8,0x0001 # interrupting from user ?
+ jnz 1f
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f # old PSW address not between .Lsie_gmap and .Lsie_done -> 0f
+ BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ SIEEXIT
+#endif
+0: CHECK_STACK __LC_SAVE_AREA_ASYNC
+ aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # interrupted the kernel: new frame plus pt_regs below the current stack pointer
+ j 2f
+1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # from user space: CR1 = kernel asce
+ lg %r15,__LC_KERNEL_STACK # from user space: continue on the kernel stack
+2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # zero the backchain of the new frame
+ la %r11,STACK_FRAME_OVERHEAD(%r15) # %r11 = pointer to pt_regs
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
xgr %r10,%r10
+ xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) # zero the pt_regs flags
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC # copy the 64 bytes saved at entry (%r8-%r15) into pt_regs
+ MBEAR %r11
stmg %r8,%r9,__PT_PSW(%r11) # PSW in pt_regs = %r8/%r9 (address is sie_exit after SIEEXIT)
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-.Lio_loop:
lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,IO_INTERRUPT
- tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ?
- jz .Lio_call
- lghi %r3,THIN_INTERRUPT
-.Lio_call:
- brasl %r14,do_IRQ
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR
- jz .Lio_return
- tpi 0
- jz .Lio_return
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- j .Lio_loop
-.Lio_return:
- LOCKDEP_SYS_EXIT
- TRACE_IRQS_ON
-.Lio_tif:
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lio_work # there is work to do (signals etc.)
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lio_work
-.Lio_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
+ brasl %r14,\handler
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) # return PSW = PSW held in pt_regs
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lio_exit_kernel
+ tmhh %r8,0x0001 # returning to user ?
+ jno 2f
+ lctlg %c1,%c1,__LC_USER_ASCE # back to user space: CR1 = user asce
BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lio_exit_timer:
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-.Lio_exit_kernel:
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lio_done:
-
-#
-# There is work todo, find out in which context we have been interrupted:
-# 1) if we return to user space we can do all _TIF_WORK work
-# 2) if we return to kernel code and kvm is enabled check if we need to
-# modify the psw to leave SIE
-# 3) if we return to kernel code and preemptive scheduling is enabled check
-# the preemption counter and if it is zero call preempt_schedule_irq
-# Before any work can be done, a switch to the kernel stack is required.
-#
-.Lio_work:
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jo .Lio_work_user # yes -> do resched & signal
-#ifdef CONFIG_PREEMPTION
- # check for preemptive scheduling
- icm %r0,15,__LC_PREEMPT_COUNT
- jnz .Lio_restore # preemption is disabled
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jno .Lio_restore
- # switch to kernel stack
- lg %r1,__PT_R15(%r11)
- aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
- # TRACE_IRQS_ON already done at .Lio_return, call
- # TRACE_IRQS_OFF to keep things symmetrical
- TRACE_IRQS_OFF
- brasl %r14,preempt_schedule_irq
- j .Lio_return
-#else
- j .Lio_restore
-#endif
-
-#
-# Need to do work before returning to userspace, switch to kernel stack
-#
-.Lio_work_user:
- lg %r1,__LC_KERNEL_STACK
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lio_work_tif:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lio_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lio_reschedule
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lio_patch_pending
-#endif
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lio_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lio_notify_resume
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lio_guarded_storage
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lio_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lio_asce
- j .Lio_return # beware of critical section cleanup
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lio_mcck_pending:
- # TRACE_IRQS_ON already done at .Lio_return
- brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler
- TRACE_IRQS_OFF
- j .Lio_return
+2: LBEAR __PT_LAST_BREAK(%r11)
+ lmg %r0,%r15,__PT_R0(%r11) # reload all sixteen gprs from pt_regs
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # resume at the return PSW prepared above
+ENDPROC(\name)
+.endm
-#
-# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lio_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lio_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lio_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lio_return
-.Lio_set_fs_fixup:
-#endif
- larl %r14,.Lio_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lio_vxrs:
- larl %r14,.Lio_return
- jg load_fpu_regs
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lio_guarded_storage:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,gs_load_bc_cb
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lio_reschedule:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- brasl %r14,schedule # call scheduler
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lio_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lio_return
- jg klp_update_patch_state
-#endif
-
-#
-# _TIF_SIGPENDING or is set, call do_signal
-#
-.Lio_sigpending:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NOTIFY_RESUME or is set, call do_notify_resume
-#
-.Lio_notify_resume:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_notify_resume
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-ENDPROC(io_int_handler)
+INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq
+INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq
/*
- * External interrupt handler routine
- */
-ENTRY(ext_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
- BPOFF
- stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_EXT_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
- stmg %r0,%r7,__PT_R0(%r11)
- # clear user controlled registers to prevent speculative use
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- xgr %r4,%r4
- xgr %r5,%r5
- xgr %r6,%r6
- xgr %r7,%r7
- xgr %r10,%r10
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- lghi %r1,__LC_EXT_PARAMS2
- mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR
- mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
- mvc __PT_INT_PARM_LONG(8,%r11),0(%r1)
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,EXT_INTERRUPT
- brasl %r14,do_IRQ
- j .Lio_return
-ENDPROC(ext_int_handler)
-
-/*
- * Load idle PSW. The second "half" of this function is in .Lcleanup_idle.
+ * Load idle PSW.
*/
ENTRY(psw_idle)
+ stg %r14,(__SF_GPRS+8*8)(%r15) # keep the return address in the stack frame gpr save area
stg %r3,__SF_EMPTY(%r15) # first 8 bytes of the PSW loaded by lpswe below = %r3
- larl %r1,.Lpsw_idle_lpsw+4
+ larl %r1,psw_idle_exit
stg %r1,__SF_EMPTY+8(%r15) # second 8 bytes (PSW address) = psw_idle_exit
larl %r1,smp_cpu_mtid
llgf %r1,0(%r1) # %r1 = 32-bit value of smp_cpu_mtid
ltgr %r1,%r1
jz .Lpsw_idle_stcctm # zero: skip the .insn below
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15)
+ .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) # presumably STCCTM, going by the label name; stores at __MT_CYCLES_ENTER(%r2)
.Lpsw_idle_stcctm:
oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT
BPON
- STCK __CLOCK_IDLE_ENTER(%r2)
+ stckf __CLOCK_IDLE_ENTER(%r2) # clock value at idle entry
stpt __TIMER_IDLE_ENTER(%r2) # cpu timer value at idle entry
-.Lpsw_idle_lpsw:
lpswe __SF_EMPTY(%r15) # load the 16-byte PSW assembled above
+.globl psw_idle_exit
+psw_idle_exit:
BR_EX %r14
-.Lpsw_idle_end:
ENDPROC(psw_idle)
/*
- * Store floating-point controls and floating-point or vector register
- * depending whether the vector facility is available. A critical section
- * cleanup assures that the registers are stored even if interrupted for
- * some other work. The CIF_FPU flag is set to trigger a lazy restore
- * of the register contents at return from io or a system call.
- */
-ENTRY(save_fpu_regs)
- lg %r2,__LC_CURRENT
- aghi %r2,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsave_fpu_regs_exit
- stfpc __THREAD_FPU_fpc(%r2)
- lg %r3,__THREAD_FPU_regs(%r2)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jz .Lsave_fpu_regs_fp # no -> store FP regs
- VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3)
- VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3)
- j .Lsave_fpu_regs_done # -> set CIF_FPU flag
-.Lsave_fpu_regs_fp:
- std 0,0(%r3)
- std 1,8(%r3)
- std 2,16(%r3)
- std 3,24(%r3)
- std 4,32(%r3)
- std 5,40(%r3)
- std 6,48(%r3)
- std 7,56(%r3)
- std 8,64(%r3)
- std 9,72(%r3)
- std 10,80(%r3)
- std 11,88(%r3)
- std 12,96(%r3)
- std 13,104(%r3)
- std 14,112(%r3)
- std 15,120(%r3)
-.Lsave_fpu_regs_done:
- oi __LC_CPU_FLAGS+7,_CIF_FPU
-.Lsave_fpu_regs_exit:
- BR_EX %r14
-.Lsave_fpu_regs_end:
-ENDPROC(save_fpu_regs)
-EXPORT_SYMBOL(save_fpu_regs)
-
-/*
- * Load floating-point controls and floating-point or vector registers.
- * A critical section cleanup assures that the register contents are
- * loaded even if interrupted for some other work.
- *
- * There are special calling conventions to fit into sysc and io return work:
- * %r15: <kernel stack>
- * The function requires:
- * %r4
- */
-load_fpu_regs:
- lg %r4,__LC_CURRENT
- aghi %r4,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jno .Lload_fpu_regs_exit
- lfpc __THREAD_FPU_fpc(%r4)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area
- jz .Lload_fpu_regs_fp # -> no VX, load FP regs
- VLM %v0,%v15,0,%r4
- VLM %v16,%v31,256,%r4
- j .Lload_fpu_regs_done
-.Lload_fpu_regs_fp:
- ld 0,0(%r4)
- ld 1,8(%r4)
- ld 2,16(%r4)
- ld 3,24(%r4)
- ld 4,32(%r4)
- ld 5,40(%r4)
- ld 6,48(%r4)
- ld 7,56(%r4)
- ld 8,64(%r4)
- ld 9,72(%r4)
- ld 10,80(%r4)
- ld 11,88(%r4)
- ld 12,96(%r4)
- ld 13,104(%r4)
- ld 14,112(%r4)
- ld 15,120(%r4)
-.Lload_fpu_regs_done:
- ni __LC_CPU_FLAGS+7,255-_CIF_FPU
-.Lload_fpu_regs_exit:
- BR_EX %r14
-.Lload_fpu_regs_end:
-ENDPROC(load_fpu_regs)
-
-.L__critical_end:
-
-/*
* Machine check handler routines
*/
ENTRY(mcck_int_handler)
- STCK __LC_MCCK_CLOCK
+ stckf __LC_MCCK_CLOCK
BPOFF
la %r1,4095 # validate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
- sckc __LC_CLOCK_COMPARATOR # validate comparator
- lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
+ LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear
lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
lmg %r8,%r9,__LC_MCK_OLD_PSW
TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
jo .Lmcck_panic # yes -> rest of mcck code invalid
@@ -1115,49 +530,12 @@ ENTRY(mcck_int_handler)
la %r14,4095
lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
ptlb
- lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area
- nill %r11,0xfc00 # MCESA_ORIGIN_MASK
- TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
- jno 0f
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID
- jno 0f
- .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
-0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID
- jo 0f
- sr %r14,%r14
-0: sfpc %r14
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jo 0f
- lghi %r14,__LC_FPREGS_SAVE_AREA
- ld %f0,0(%r14)
- ld %f1,8(%r14)
- ld %f2,16(%r14)
- ld %f3,24(%r14)
- ld %f4,32(%r14)
- ld %f5,40(%r14)
- ld %f6,48(%r14)
- ld %f7,56(%r14)
- ld %f8,64(%r14)
- ld %f9,72(%r14)
- ld %f10,80(%r14)
- ld %f11,88(%r14)
- ld %f12,96(%r14)
- ld %f13,104(%r14)
- ld %f14,112(%r14)
- ld %f15,120(%r14)
- j 1f
-0: VLM %v0,%v15,0,%r11
- VLM %v16,%v31,256,%r11
-1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA
+ lghi %r14,__LC_CPU_TIMER_SAVE_AREA
mvc __LC_MCCK_ENTER_TIMER(8),0(%r14)
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
jo 3f
- la %r14,__LC_SYNC_ENTER_TIMER
- clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER
- jl 0f
- la %r14,__LC_ASYNC_ENTER_TIMER
-0: clc 0(8,%r14),__LC_EXIT_TIMER
+ la %r14,__LC_SYS_ENTER_TIMER
+ clc 0(8,%r14),__LC_EXIT_TIMER
jl 1f
la %r14,__LC_EXIT_TIMER
1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER
@@ -1168,18 +546,38 @@ ENTRY(mcck_int_handler)
3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
jno .Lmcck_panic
tmhh %r8,0x0001 # interrupting from user ?
- jnz 4f
+ jnz 6f
TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
jno .Lmcck_panic
-4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off
- SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER
-.Lmcck_skip:
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f
+ OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f
+ oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
+ j 5f
+4: CHKSTG .Lmcck_panic
+5: larl %r14,.Lstosm_tmp
+ stosm 0(%r14),0x04 # turn dat on, keep irqs off
+ BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ SIEEXIT
+ j .Lmcck_stack
+#endif
+6: CHKSTG .Lmcck_panic
+ larl %r14,.Lstosm_tmp
+ stosm 0(%r14),0x04 # turn dat on, keep irqs off
+ tmhh %r8,0x0001 # interrupting from user ?
+ jz .Lmcck_stack
+ BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
+.Lmcck_stack:
+ lg %r15,__LC_MCCK_STACK
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
+ stctg %c1,%c1,__PT_CR1(%r11)
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lghi %r14,__LC_GPREGS_SAVE_AREA+64
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -1192,42 +590,67 @@ ENTRY(mcck_int_handler)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
brasl %r14,s390_do_machine_check
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lmcck_return
+ cghi %r2,0
+ je .Lmcck_return
lg %r1,__LC_KERNEL_STACK # switch to kernel stack
mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
la %r11,STACK_FRAME_OVERHEAD(%r1)
+ lgr %r2,%r11
lgr %r15,%r1
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jno .Lmcck_return
- TRACE_IRQS_OFF
brasl %r14,s390_handle_mcck
- TRACE_IRQS_ON
.Lmcck_return:
- lg %r14,__LC_VDSO_PER_CPU
+ lctlg %c1,%c1,__PT_CR1(%r11)
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-0: lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_MCCK_PSW
+0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193
+ LBEAR 0(%r12)
+ lmg %r11,%r15,__PT_R11(%r11)
+ LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE
.Lmcck_panic:
- lg %r15,__LC_NODAT_STACK
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- j .Lmcck_skip
+ /*
+ * Iterate over all possible CPU addresses in the range 0..0xffff
+ * and stop each CPU using signal processor. Use compare and swap
+ * to allow just one CPU-stopper and prevent concurrent CPUs from
+ * stopping each other while leaving the others running.
+ */
+ lhi %r5,0
+ lhi %r6,1
+ larl %r7,.Lstop_lock
+ cs %r5,%r6,0(%r7) # single CPU-stopper only
+ jnz 4f
+ larl %r7,.Lthis_cpu
+ stap 0(%r7) # this CPU address
+ lh %r4,0(%r7)
+ nilh %r4,0
+ lhi %r0,1
+ sll %r0,16 # CPU counter
+ lhi %r3,0 # next CPU address
+0: cr %r3,%r4
+ je 2f
+1: sigp %r1,%r3,SIGP_STOP # stop next CPU
+ brc SIGP_CC_BUSY,1b
+2: ahi %r3,1
+ brct %r0,0b
+3: sigp %r1,%r4,SIGP_STOP # stop this CPU
+ brc SIGP_CC_BUSY,3b
+4: j 4b
ENDPROC(mcck_int_handler)
-#
-# PSW restart interrupt handler
-#
ENTRY(restart_int_handler)
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
stg %r15,__LC_SAVE_AREA_RESTART
+ TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4
+ jz 0f
+ la %r15,4095
+ lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r15)
+0: larl %r15,.Lstosm_tmp
+ stosm 0(%r15),0x04 # turn dat on, keep irqs off
lg %r15,__LC_RESTART_STACK
xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
@@ -1236,7 +659,7 @@ ENTRY(restart_int_handler)
xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15)
lg %r1,__LC_RESTART_FN # load fn, parm & source cpu
lg %r2,__LC_RESTART_DATA
- lg %r3,__LC_RESTART_SOURCE
+ lgf %r3,__LC_RESTART_SOURCE
ltgr %r3,%r3 # test source cpu address
jm 1f # negative -> skip source stop
0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu
@@ -1270,263 +693,11 @@ ENTRY(stack_overflow)
ENDPROC(stack_overflow)
#endif
-ENTRY(cleanup_critical)
-#if IS_ENABLED(CONFIG_KVM)
- clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap
- jl 0f
- clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done
- jl .Lcleanup_sie
-#endif
- clg %r9,BASED(.Lcleanup_table) # system_call
- jl 0f
- clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc
- jl .Lcleanup_system_call
- clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore
- jl .Lcleanup_sysc_tif
- clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done
- jl .Lcleanup_sysc_restore
- clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore
- jl .Lcleanup_io_tif
- clg %r9,BASED(.Lcleanup_table+56) # .Lio_done
- jl .Lcleanup_io_restore
- clg %r9,BASED(.Lcleanup_table+64) # psw_idle
- jl 0f
- clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end
- jl .Lcleanup_idle
- clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end
- jl .Lcleanup_save_fpu_regs
- clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end
- jl .Lcleanup_load_fpu_regs
-0: BR_EX %r14,%r11
-ENDPROC(cleanup_critical)
-
- .align 8
-.Lcleanup_table:
- .quad system_call
- .quad .Lsysc_do_svc
- .quad .Lsysc_tif
- .quad .Lsysc_restore
- .quad .Lsysc_done
- .quad .Lio_tif
- .quad .Lio_restore
- .quad .Lio_done
- .quad psw_idle
- .quad .Lpsw_idle_end
- .quad save_fpu_regs
- .quad .Lsave_fpu_regs_end
- .quad load_fpu_regs
- .quad .Lload_fpu_regs_end
-
-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_table_sie:
- .quad .Lsie_gmap
- .quad .Lsie_done
-
-.Lcleanup_sie:
- cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt?
- je 1f
- slg %r9,BASED(.Lsie_crit_mcck_start)
- clg %r9,BASED(.Lsie_crit_mcck_length)
- jh 1f
- oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
- lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- BR_EX %r14,%r11
-#endif
-
-.Lcleanup_system_call:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn)
- jh 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: # check if stmg has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn+8)
- jh 0f
- mvc __LC_SAVE_AREA_SYNC(64),0(%r11)
-0: # check if base register setup + TIF bit load has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+16)
- jhe 0f
- # set up saved register r12 task struct pointer
- stg %r12,32(%r11)
- # set up saved register r13 __TASK_thread offset
- mvc 40(8,%r11),BASED(.Lcleanup_system_call_const)
-0: # check if the user time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+24)
- jh 0f
- lg %r15,__LC_EXIT_TIMER
- slg %r15,__LC_SYNC_ENTER_TIMER
- alg %r15,__LC_USER_TIMER
- stg %r15,__LC_USER_TIMER
-0: # check if the system time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+32)
- jh 0f
- lg %r15,__LC_LAST_UPDATE_TIMER
- slg %r15,__LC_EXIT_TIMER
- alg %r15,__LC_SYSTEM_TIMER
- stg %r15,__LC_SYSTEM_TIMER
-0: # update accounting time stamp
- mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- # set up saved register r11
- lg %r15,__LC_KERNEL_STACK
- la %r9,STACK_FRAME_OVERHEAD(%r15)
- stg %r9,24(%r11) # r11 pt_regs pointer
- # fill pt_regs
- mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC
- stmg %r0,%r7,__PT_R0(%r9)
- mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC
- xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9)
- mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL
- # setup saved register r15
- stg %r15,56(%r11) # r15 stack pointer
- # set new psw address and exit
- larl %r9,.Lsysc_do_svc
- BR_EX %r14,%r11
-.Lcleanup_system_call_insn:
- .quad system_call
- .quad .Lsysc_stmg
- .quad .Lsysc_per
- .quad .Lsysc_vtime+36
- .quad .Lsysc_vtime+42
-.Lcleanup_system_call_const:
- .quad __TASK_thread
-
-.Lcleanup_sysc_tif:
- larl %r9,.Lsysc_tif
- BR_EX %r14,%r11
-
-.Lcleanup_sysc_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_sysc_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_sysc_restore_insn:
- .quad .Lsysc_exit_timer
- .quad .Lsysc_done - 4
-
-.Lcleanup_io_tif:
- larl %r9,.Lio_tif
- BR_EX %r14,%r11
-
-.Lcleanup_io_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_io_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_io_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved r11 pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_io_restore_insn:
- .quad .Lio_exit_timer
- .quad .Lio_done - 4
-
-.Lcleanup_idle:
- ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT
- # copy interrupt clock & cpu timer
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER
-0: # check if stck & stpt have been executed
- clg %r9,BASED(.Lcleanup_idle_insn)
- jhe 1f
- mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2)
- mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2)
-1: # calculate idle cycles
- clg %r9,BASED(.Lcleanup_idle_insn)
- jl 3f
- larl %r1,smp_cpu_mtid
- llgf %r1,0(%r1)
- ltgr %r1,%r1
- jz 3f
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15)
- larl %r3,mt_cycles
- ag %r3,__LC_PERCPU_OFFSET
- la %r4,__SF_EMPTY+16(%r15)
-2: lg %r0,0(%r3)
- slg %r0,0(%r4)
- alg %r0,64(%r4)
- stg %r0,0(%r3)
- la %r3,8(%r3)
- la %r4,8(%r4)
- brct %r1,2b
-3: # account system time going idle
- lg %r9,__LC_STEAL_TIMER
- alg %r9,__CLOCK_IDLE_ENTER(%r2)
- slg %r9,__LC_LAST_UPDATE_CLOCK
- stg %r9,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2)
- lg %r9,__LC_SYSTEM_TIMER
- alg %r9,__LC_LAST_UPDATE_TIMER
- slg %r9,__TIMER_IDLE_ENTER(%r2)
- stg %r9,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2)
- # prepare return psw
- nihh %r8,0xfcfd # clear irq & wait state bits
- lg %r9,48(%r11) # return from psw_idle
- BR_EX %r14,%r11
-.Lcleanup_idle_insn:
- .quad .Lpsw_idle_lpsw
-
-.Lcleanup_save_fpu_regs:
- larl %r9,save_fpu_regs
- BR_EX %r14,%r11
-
-.Lcleanup_load_fpu_regs:
- larl %r9,load_fpu_regs
- BR_EX %r14,%r11
-
-/*
- * Integer constants
- */
- .align 8
-.Lcritical_start:
- .quad .L__critical_start
-.Lcritical_length:
- .quad .L__critical_end - .L__critical_start
-#if IS_ENABLED(CONFIG_KVM)
-.Lsie_critical_start:
- .quad .Lsie_gmap
-.Lsie_critical_length:
- .quad .Lsie_done - .Lsie_gmap
-.Lsie_crit_mcck_start:
- .quad .Lsie_entry
-.Lsie_crit_mcck_length:
- .quad .Lsie_skip - .Lsie_entry
-#endif
+ .section .data, "aw"
+ .align 4
+.Lstop_lock: .long 0
+.Lthis_cpu: .short 0
+.Lstosm_tmp: .byte 0
.section .rodata, "a"
#define SYSCALL(esame,emu) .quad __s390x_ ## esame
.globl sys_call_table
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 1d3927e01a5f..995ec7449feb 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -5,11 +5,11 @@
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/signal.h>
+#include <asm/extable.h>
#include <asm/ptrace.h>
#include <asm/idle.h>
extern void *restart_stack;
-extern unsigned long suspend_zero_pages;
void system_call(void);
void pgm_check_handler(void);
@@ -17,54 +17,31 @@ void ext_int_handler(void);
void io_int_handler(void);
void mcck_int_handler(void);
void restart_int_handler(void);
-void restart_call_handler(void);
+void early_pgm_check_handler(void);
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs);
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs);
+void __do_pgm_check(struct pt_regs *regs);
+void __do_syscall(struct pt_regs *regs, int per_trap);
+void __do_early_pgm_check(struct pt_regs *regs);
void do_protection_exception(struct pt_regs *regs);
void do_dat_exception(struct pt_regs *regs);
-
-void addressing_exception(struct pt_regs *regs);
-void data_exception(struct pt_regs *regs);
-void default_trap_handler(struct pt_regs *regs);
-void divide_exception(struct pt_regs *regs);
-void execute_exception(struct pt_regs *regs);
-void hfp_divide_exception(struct pt_regs *regs);
-void hfp_overflow_exception(struct pt_regs *regs);
-void hfp_significance_exception(struct pt_regs *regs);
-void hfp_sqrt_exception(struct pt_regs *regs);
-void hfp_underflow_exception(struct pt_regs *regs);
-void illegal_op(struct pt_regs *regs);
-void operand_exception(struct pt_regs *regs);
-void overflow_exception(struct pt_regs *regs);
-void privileged_op(struct pt_regs *regs);
-void space_switch_exception(struct pt_regs *regs);
-void special_op_exception(struct pt_regs *regs);
-void specification_exception(struct pt_regs *regs);
-void transaction_exception(struct pt_regs *regs);
-void translation_exception(struct pt_regs *regs);
-void vector_exception(struct pt_regs *regs);
-void monitor_event_exception(struct pt_regs *regs);
-
-void do_per_trap(struct pt_regs *regs);
+void do_secure_storage_access(struct pt_regs *regs);
+void do_non_secure_storage_access(struct pt_regs *regs);
+void do_secure_storage_violation(struct pt_regs *regs);
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str);
-void syscall_trace(struct pt_regs *regs, int entryexit);
void kernel_stack_overflow(struct pt_regs * regs);
-void do_signal(struct pt_regs *regs);
void handle_signal32(struct ksignal *ksig, sigset_t *oldset,
struct pt_regs *regs);
-void do_notify_resume(struct pt_regs *regs);
void __init init_IRQ(void);
-void do_IRQ(struct pt_regs *regs, int irq);
-void do_restart(void);
-void __init startup_init_nobss(void);
+void do_io_irq(struct pt_regs *regs);
+void do_ext_irq(struct pt_regs *regs);
+void do_restart(void *arg);
void __init startup_init(void);
void die(struct pt_regs *regs, const char *str);
int setup_profiling_timer(unsigned int multiplier);
void __init time_init(void);
-void s390_early_resume(void);
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip);
struct s390_mmap_arg_struct;
@@ -83,10 +60,19 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user
DECLARE_PER_CPU(u64, mt_cycles[8]);
-void gs_load_bc_cb(struct pt_regs *regs);
-void set_fs_fixup(void);
-
unsigned long stack_alloc(void);
void stack_free(unsigned long stack);
+extern char kprobes_insn_page[];
+
+extern char _samode31[], _eamode31[];
+extern char _stext_amode31[], _etext_amode31[];
+extern struct exception_table_entry _start_amode31_ex_table[];
+extern struct exception_table_entry _stop_amode31_ex_table[];
+
+#define __amode31_data __section(".amode31.data")
+#define __amode31_ref __section(".amode31.refs")
+extern long _start_amode31_refs[], _end_amode31_refs[];
+extern unsigned long __amode31_base;
+
#endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index 0da378e2eb25..d864c9a325e2 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -175,3 +175,91 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
: "1", "cc");
}
EXPORT_SYMBOL(__kernel_fpu_end);
+
+void __load_fpu_regs(void)
+{
+ struct fpu *state = &current->thread.fpu;
+ unsigned long *regs = current->thread.fpu.regs;
+
+ asm volatile("lfpc %0" : : "Q" (state->fpc));
+ if (likely(MACHINE_HAS_VX)) {
+ asm volatile("lgr 1,%0\n"
+ "VLM 0,15,0,1\n"
+ "VLM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("ld 0,%0" : : "Q" (regs[0]));
+ asm volatile("ld 1,%0" : : "Q" (regs[1]));
+ asm volatile("ld 2,%0" : : "Q" (regs[2]));
+ asm volatile("ld 3,%0" : : "Q" (regs[3]));
+ asm volatile("ld 4,%0" : : "Q" (regs[4]));
+ asm volatile("ld 5,%0" : : "Q" (regs[5]));
+ asm volatile("ld 6,%0" : : "Q" (regs[6]));
+ asm volatile("ld 7,%0" : : "Q" (regs[7]));
+ asm volatile("ld 8,%0" : : "Q" (regs[8]));
+ asm volatile("ld 9,%0" : : "Q" (regs[9]));
+ asm volatile("ld 10,%0" : : "Q" (regs[10]));
+ asm volatile("ld 11,%0" : : "Q" (regs[11]));
+ asm volatile("ld 12,%0" : : "Q" (regs[12]));
+ asm volatile("ld 13,%0" : : "Q" (regs[13]));
+ asm volatile("ld 14,%0" : : "Q" (regs[14]));
+ asm volatile("ld 15,%0" : : "Q" (regs[15]));
+ }
+ clear_cpu_flag(CIF_FPU);
+}
+EXPORT_SYMBOL(__load_fpu_regs);
+
+void load_fpu_regs(void)
+{
+ raw_local_irq_disable();
+ __load_fpu_regs();
+ raw_local_irq_enable();
+}
+EXPORT_SYMBOL(load_fpu_regs);
+
+void save_fpu_regs(void)
+{
+ unsigned long flags, *regs;
+ struct fpu *state;
+
+ local_irq_save(flags);
+
+ if (test_cpu_flag(CIF_FPU))
+ goto out;
+
+ state = &current->thread.fpu;
+ regs = current->thread.fpu.regs;
+
+ asm volatile("stfpc %0" : "=Q" (state->fpc));
+ if (likely(MACHINE_HAS_VX)) {
+ asm volatile("lgr 1,%0\n"
+ "VSTM 0,15,0,1\n"
+ "VSTM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("std 0,%0" : "=Q" (regs[0]));
+ asm volatile("std 1,%0" : "=Q" (regs[1]));
+ asm volatile("std 2,%0" : "=Q" (regs[2]));
+ asm volatile("std 3,%0" : "=Q" (regs[3]));
+ asm volatile("std 4,%0" : "=Q" (regs[4]));
+ asm volatile("std 5,%0" : "=Q" (regs[5]));
+ asm volatile("std 6,%0" : "=Q" (regs[6]));
+ asm volatile("std 7,%0" : "=Q" (regs[7]));
+ asm volatile("std 8,%0" : "=Q" (regs[8]));
+ asm volatile("std 9,%0" : "=Q" (regs[9]));
+ asm volatile("std 10,%0" : "=Q" (regs[10]));
+ asm volatile("std 11,%0" : "=Q" (regs[11]));
+ asm volatile("std 12,%0" : "=Q" (regs[12]));
+ asm volatile("std 13,%0" : "=Q" (regs[13]));
+ asm volatile("std 14,%0" : "=Q" (regs[14]));
+ asm volatile("std 15,%0" : "=Q" (regs[15]));
+ }
+ set_cpu_flag(CIF_FPU);
+out:
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(save_fpu_regs);
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 4cd9b1ada834..416b5a94353d 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -4,8 +4,7 @@
*
* Copyright IBM Corp. 2009,2014
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
#include <linux/moduleloader.h>
@@ -17,149 +16,237 @@
#include <linux/kprobes.h>
#include <trace/syscall.h>
#include <asm/asm-offsets.h>
+#include <asm/text-patching.h>
#include <asm/cacheflush.h>
+#include <asm/ftrace.lds.h>
+#include <asm/nospec-branch.h>
#include <asm/set_memory.h>
#include "entry.h"
+#include "ftrace.h"
/*
- * The mcount code looks like this:
- * stg %r14,8(%r15) # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * Total length is 24 bytes. Only the first instruction will be patched
- * by ftrace_make_call / ftrace_make_nop.
- * The enabled ftrace code block looks like this:
+ * To generate function prologue either gcc's hotpatch feature (since gcc 4.8)
+ * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags
+ * (since gcc 9 / clang 10) is used.
+ * In both cases the original and also the disabled function prologue contains
+ * only a single six byte instruction and looks like this:
+ * > brcl 0,0 # offset 0
+ * To enable ftrace the code gets patched like above and afterwards looks
+ * like this:
* > brasl %r0,ftrace_caller # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
+ *
+ * The instruction will be patched by ftrace_make_call / ftrace_make_nop.
* The ftrace function gets called with a non-standard C function call ABI
* where r0 contains the return address. It is also expected that the called
* function only clobbers r0 and r1, but restores r2-r15.
* For module code we can't directly jump to ftrace caller, but need a
* trampoline (ftrace_plt), which clobbers also r1.
- * The return point of the ftrace function has offset 24, so execution
- * continues behind the mcount block.
- * The disabled ftrace code block looks like this:
- * > jg .+24 # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * The jg instruction branches to offset 24 to skip as many instructions
- * as possible.
- * In case we use gcc's hotpatch feature the original and also the disabled
- * function prologue contains only a single six byte instruction and looks
- * like this:
- * > brcl 0,0 # offset 0
- * To enable ftrace the code gets patched like above and afterwards looks
- * like this:
- * > brasl %r0,ftrace_caller # offset 0
*/
-unsigned long ftrace_plt;
+void *ftrace_func __read_mostly = ftrace_stub;
+struct ftrace_insn {
+ u16 opc;
+ s32 disp;
+} __packed;
+
+asm(
+ " .align 16\n"
+ "ftrace_shared_hotpatch_trampoline_br:\n"
+ " lmg %r0,%r1,2(%r1)\n"
+ " br %r1\n"
+ "ftrace_shared_hotpatch_trampoline_br_end:\n"
+);
+
+#ifdef CONFIG_EXPOLINE
+asm(
+ " .align 16\n"
+ "ftrace_shared_hotpatch_trampoline_exrl:\n"
+ " lmg %r0,%r1,2(%r1)\n"
+ " exrl %r0,0f\n"
+ " j .\n"
+ "0: br %r1\n"
+ "ftrace_shared_hotpatch_trampoline_exrl_end:\n"
+);
+#endif /* CONFIG_EXPOLINE */
-static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn)
+#ifdef CONFIG_MODULES
+static char *ftrace_plt;
+#endif /* CONFIG_MODULES */
+
+static const char *ftrace_shared_hotpatch_trampoline(const char **end)
{
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- /* brcl 0,0 */
- insn->opc = 0xc004;
- insn->disp = 0;
-#else
- /* stg r14,8(r15) */
- insn->opc = 0xe3e0;
- insn->disp = 0xf0080024;
-#endif
+ const char *tstart, *tend;
+
+ tstart = ftrace_shared_hotpatch_trampoline_br;
+ tend = ftrace_shared_hotpatch_trampoline_br_end;
+#ifdef CONFIG_EXPOLINE
+ if (!nospec_disable) {
+ tstart = ftrace_shared_hotpatch_trampoline_exrl;
+ tend = ftrace_shared_hotpatch_trampoline_exrl_end;
+ }
+#endif /* CONFIG_EXPOLINE */
+ if (end)
+ *end = tend;
+ return tstart;
}
-static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn)
+bool ftrace_need_init_nop(void)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_NOP;
-#endif
+ return true;
}
-static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn)
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_CALL;
+ static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline =
+ __ftrace_hotpatch_trampolines_start;
+ static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
+ static struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_hotpatch_trampoline **next_trampoline;
+ struct ftrace_hotpatch_trampoline *trampolines_end;
+ struct ftrace_hotpatch_trampoline tmp;
+ struct ftrace_insn *insn;
+ const char *shared;
+ s32 disp;
+
+ BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) !=
+ SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE);
+
+ next_trampoline = &next_vmlinux_trampoline;
+ trampolines_end = __ftrace_hotpatch_trampolines_end;
+ shared = ftrace_shared_hotpatch_trampoline(NULL);
+#ifdef CONFIG_MODULES
+ if (mod) {
+ next_trampoline = &mod->arch.next_trampoline;
+ trampolines_end = mod->arch.trampolines_end;
+ shared = ftrace_plt;
+ }
#endif
+
+ if (WARN_ON_ONCE(*next_trampoline >= trampolines_end))
+ return -ENOMEM;
+ trampoline = (*next_trampoline)++;
+
+ /* Check for the compiler-generated fentry nop (brcl 0, .). */
+ if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig))))
+ return -EINVAL;
+
+ /* Generate the trampoline. */
+ tmp.brasl_opc = 0xc015; /* brasl %r1, shared */
+ tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2;
+ tmp.interceptor = FTRACE_ADDR;
+ tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn);
+ s390_kernel_write(trampoline, &tmp, sizeof(tmp));
+
+ /* Generate a jump to the trampoline. */
+ disp = ((char *)trampoline - (char *)rec->ip) / 2;
+ insn = (struct ftrace_insn *)rec->ip;
+ s390_kernel_write(&insn->disp, &disp, sizeof(disp));
+
+ return 0;
}
-int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
- unsigned long addr)
+static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec)
{
- return 0;
+ struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_insn insn;
+ s64 disp;
+ u16 opc;
+
+ if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn)))
+ return ERR_PTR(-EFAULT);
+ disp = (s64)insn.disp * 2;
+ trampoline = (void *)(rec->ip + disp);
+ if (get_kernel_nofault(opc, &trampoline->brasl_opc))
+ return ERR_PTR(-EFAULT);
+ if (opc != 0xc015)
+ return ERR_PTR(-EINVAL);
+ return trampoline;
}
-int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
- unsigned long addr)
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
{
- struct ftrace_insn orig, new, old;
+ struct ftrace_hotpatch_trampoline *trampoline;
+ u64 old;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ if (get_kernel_nofault(old, &trampoline->interceptor))
return -EFAULT;
- if (addr == MCOUNT_ADDR) {
- /* Initial code replacement */
- ftrace_generate_orig_insn(&orig);
- ftrace_generate_nop_insn(&new);
- } else {
- /* Replace ftrace call with a nop. */
- ftrace_generate_call_insn(&orig, rec->ip);
- ftrace_generate_nop_insn(&new);
- }
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
+ if (old != old_addr)
return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
return 0;
}
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable)
{
- struct ftrace_insn orig, new, old;
+ u16 old;
+ u8 op;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
+ if (get_kernel_nofault(old, addr))
return -EFAULT;
- /* Replace nop with an ftrace call. */
- ftrace_generate_nop_insn(&orig);
- ftrace_generate_call_insn(&new, rec->ip);
-
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
+ if (old != expected)
return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+ /* set mask field to all ones or zeroes */
+ op = enable ? 0xf4 : 0x04;
+ s390_kernel_write((char *)addr + 1, &op, sizeof(op));
return 0;
}
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
+ unsigned long addr)
+{
+ /* Expect brcl 0xf,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ struct ftrace_hotpatch_trampoline *trampoline;
+
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
+ /* Expect brcl 0x0,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true);
+}
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
+ ftrace_func = func;
return 0;
}
-int __init ftrace_dyn_arch_init(void)
+void arch_ftrace_update_code(int command)
{
- return 0;
+ ftrace_modify_all_code(command);
+}
+
+void ftrace_arch_code_modify_post_process(void)
+{
+ /*
+ * Flush any pre-fetched instructions on all
+ * CPUs to make the new code visible.
+ */
+ text_poke_sync_lock();
}
#ifdef CONFIG_MODULES
static int __init ftrace_plt_init(void)
{
- unsigned int *ip;
+ const char *start, *end;
- ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
+ ftrace_plt = module_alloc(PAGE_SIZE);
if (!ftrace_plt)
panic("cannot allocate ftrace plt\n");
- ip = (unsigned int *) ftrace_plt;
- ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
- ip[1] = 0x100a0004;
- ip[2] = 0x07f10000;
- ip[3] = FTRACE_ADDR >> 32;
- ip[4] = FTRACE_ADDR & 0xffffffff;
- set_memory_ro(ftrace_plt, 1);
+
+ start = ftrace_shared_hotpatch_trampoline(&end);
+ memcpy(ftrace_plt, start, end - start);
+ set_memory_ro((unsigned long)ftrace_plt, 1);
return 0;
}
device_initcall(ftrace_plt_init);
@@ -196,17 +283,25 @@ NOKPROBE_SYMBOL(prepare_ftrace_return);
*/
int ftrace_enable_ftrace_graph_caller(void)
{
- u8 op = 0x04; /* set mask field to zero */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0xf,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
int ftrace_disable_ftrace_graph_caller(void)
{
- u8 op = 0xf4; /* set mask field to all ones */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0x0,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
@@ -214,17 +309,25 @@ int ftrace_disable_ftrace_graph_caller(void)
#ifdef CONFIG_KPROBES_ON_FTRACE
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct kprobe_ctlblk *kcb;
- struct kprobe *p = get_kprobe((kprobe_opcode_t *)ip);
+ struct pt_regs *regs;
+ struct kprobe *p;
+ int bit;
- if (unlikely(!p) || kprobe_disabled(p))
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
return;
+ regs = ftrace_get_regs(fregs);
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (!regs || unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
if (kprobe_running()) {
kprobes_inc_nmissed_count(p);
- return;
+ goto out;
}
__this_cpu_write(current_kprobe, p);
@@ -244,6 +347,8 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
}
}
__this_cpu_write(current_kprobe, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h
new file mode 100644
index 000000000000..7f75a9616406
--- /dev/null
+++ b/arch/s390/kernel/ftrace.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FTRACE_H
+#define _FTRACE_H
+
+#include <asm/types.h>
+
+struct ftrace_hotpatch_trampoline {
+ u16 brasl_opc;
+ s32 brasl_disp;
+ s16: 16;
+ u64 rest_of_intercepted_function;
+ u64 interceptor;
+} __packed;
+
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[];
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[];
+extern const char ftrace_shared_hotpatch_trampoline_br[];
+extern const char ftrace_shared_hotpatch_trampoline_br_end[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl_end[];
+extern const char ftrace_plt_template[];
+extern const char ftrace_plt_template_end[];
+
+#endif /* _FTRACE_H */
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 8b88dbbda7df..d7b8b6ad574d 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -5,7 +5,6 @@
* Author(s): Hartmut Penner <hp@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Rob van der Heij <rvdhei@iae.nl>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*
*/
@@ -18,14 +17,8 @@
__HEAD
ENTRY(startup_continue)
- tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ?
- jz 0f
- xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid
- mvi __LC_LPP,0x80 # and set LPP_MAGIC
- .insn s,0xb2800000,__LC_LPP # load program parameter
-0: larl %r1,tod_clock_base
+ larl %r1,tod_clock_base
mvc 0(16,%r1),__LC_BOOT_CLOCK
- larl %r13,.LPG1 # get base
#
# Setup stack
#
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 8f8456816d83..4bf1ee293f2b 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -9,51 +9,71 @@
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/kprobes.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/sched/cputime.h>
+#include <trace/events/power.h>
+#include <asm/cpu_mf.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include "entry.h"
static DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
-void enabled_wait(void)
+void account_idle_time_irq(void)
{
struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
- unsigned long long idle_time;
- unsigned long psw_mask;
+ u64 cycles_new[8];
+ int i;
+
+ clear_cpu_flag(CIF_ENABLED_WAIT);
+ if (smp_cpu_mtid) {
+ stcctm(MT_DIAG, smp_cpu_mtid, cycles_new);
+ for (i = 0; i < smp_cpu_mtid; i++)
+ this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
+ }
+
+ idle->clock_idle_exit = S390_lowcore.int_clock;
+ idle->timer_idle_exit = S390_lowcore.sys_enter_timer;
+
+ S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock;
+ S390_lowcore.last_update_clock = idle->clock_idle_exit;
- trace_hardirqs_on();
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter;
+ S390_lowcore.last_update_timer = idle->timer_idle_exit;
+}
+
+void arch_cpu_idle(void)
+{
+ struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
+ unsigned long idle_time;
+ unsigned long psw_mask;
/* Wait for external, I/O or machine check interrupt. */
psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT |
PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
clear_cpu_flag(CIF_NOHZ_DELAY);
- /* Call the assembler magic in entry.S */
+ /* psw_idle() returns with interrupts disabled. */
psw_idle(idle, psw_mask);
- trace_hardirqs_off();
-
/* Account time spent with enabled wait psw loaded as idle time. */
- write_seqcount_begin(&idle->seqcount);
+ raw_write_seqcount_begin(&idle->seqcount);
idle_time = idle->clock_idle_exit - idle->clock_idle_enter;
idle->clock_idle_enter = idle->clock_idle_exit = 0ULL;
idle->idle_time += idle_time;
idle->idle_count++;
account_idle_time(cputime_to_nsecs(idle_time));
- write_seqcount_end(&idle->seqcount);
+ raw_write_seqcount_end(&idle->seqcount);
+ raw_local_irq_enable();
}
-NOKPROBE_SYMBOL(enabled_wait);
static ssize_t show_idle_count(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
- unsigned long long idle_count;
+ unsigned long idle_count;
unsigned int seq;
do {
@@ -62,14 +82,14 @@ static ssize_t show_idle_count(struct device *dev,
if (READ_ONCE(idle->clock_idle_enter))
idle_count++;
} while (read_seqcount_retry(&idle->seqcount, seq));
- return sprintf(buf, "%llu\n", idle_count);
+ return sprintf(buf, "%lu\n", idle_count);
}
DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
static ssize_t show_idle_time(struct device *dev,
struct device_attribute *attr, char *buf)
{
- unsigned long long now, idle_time, idle_enter, idle_exit, in_idle;
+ unsigned long now, idle_time, idle_enter, idle_exit, in_idle;
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
unsigned int seq;
@@ -89,14 +109,14 @@ static ssize_t show_idle_time(struct device *dev,
}
}
idle_time += in_idle;
- return sprintf(buf, "%llu\n", idle_time >> 12);
+ return sprintf(buf, "%lu\n", idle_time >> 12);
}
DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
u64 arch_cpu_idle_time(int cpu)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, cpu);
- unsigned long long now, idle_enter, idle_exit, in_idle;
+ unsigned long now, idle_enter, idle_exit, in_idle;
unsigned int seq;
do {
@@ -118,22 +138,10 @@ u64 arch_cpu_idle_time(int cpu)
void arch_cpu_idle_enter(void)
{
- local_mcck_disable();
-}
-
-void arch_cpu_idle(void)
-{
- if (!test_cpu_flag(CIF_MCCK_PENDING))
- /* Halt the cpu and keep track of cpu time accounting. */
- enabled_wait();
- local_irq_enable();
}
void arch_cpu_idle_exit(void)
{
- local_mcck_enable();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
}
void arch_cpu_idle_dead(void)
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 6837affc19e8..325cbf69ebbd 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -4,7 +4,6 @@
*
* Copyright IBM Corp. 2005, 2012
* Author(s): Michael Holzheu <holzheu@de.ibm.com>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Volker Sameske <sameske@de.ibm.com>
*/
@@ -13,12 +12,14 @@
#include <linux/init.h>
#include <linux/device.h>
#include <linux/delay.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/crash_dump.h>
#include <linux/debug_locks.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/ipl.h>
#include <asm/smp.h>
@@ -28,6 +29,7 @@
#include <asm/sclp.h>
#include <asm/checksum.h>
#include <asm/debug.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
@@ -39,10 +41,13 @@
#define IPL_CCW_STR "ccw"
#define IPL_FCP_STR "fcp"
#define IPL_FCP_DUMP_STR "fcp_dump"
+#define IPL_NVME_STR "nvme"
+#define IPL_NVME_DUMP_STR "nvme_dump"
#define IPL_NSS_STR "nss"
#define DUMP_CCW_STR "ccw"
#define DUMP_FCP_STR "fcp"
+#define DUMP_NVME_STR "nvme"
#define DUMP_NONE_STR "none"
/*
@@ -93,6 +98,10 @@ static char *ipl_type_str(enum ipl_type type)
return IPL_FCP_DUMP_STR;
case IPL_TYPE_NSS:
return IPL_NSS_STR;
+ case IPL_TYPE_NVME:
+ return IPL_NVME_STR;
+ case IPL_TYPE_NVME_DUMP:
+ return IPL_NVME_DUMP_STR;
case IPL_TYPE_UNKNOWN:
default:
return IPL_UNKNOWN_STR;
@@ -103,6 +112,7 @@ enum dump_type {
DUMP_TYPE_NONE = 1,
DUMP_TYPE_CCW = 2,
DUMP_TYPE_FCP = 4,
+ DUMP_TYPE_NVME = 8,
};
static char *dump_type_str(enum dump_type type)
@@ -114,6 +124,8 @@ static char *dump_type_str(enum dump_type type)
return DUMP_CCW_STR;
case DUMP_TYPE_FCP:
return DUMP_FCP_STR;
+ case DUMP_TYPE_NVME:
+ return DUMP_NVME_STR;
default:
return NULL;
}
@@ -133,6 +145,7 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN;
static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN;
static struct ipl_parameter_block *reipl_block_fcp;
+static struct ipl_parameter_block *reipl_block_nvme;
static struct ipl_parameter_block *reipl_block_ccw;
static struct ipl_parameter_block *reipl_block_nss;
static struct ipl_parameter_block *reipl_block_actual;
@@ -140,28 +153,33 @@ static struct ipl_parameter_block *reipl_block_actual;
static int dump_capabilities = DUMP_TYPE_NONE;
static enum dump_type dump_type = DUMP_TYPE_NONE;
static struct ipl_parameter_block *dump_block_fcp;
+static struct ipl_parameter_block *dump_block_nvme;
static struct ipl_parameter_block *dump_block_ccw;
static struct sclp_ipl_info sclp_ipl_info;
+static bool reipl_nvme_clear;
+static bool reipl_fcp_clear;
+static bool reipl_ccw_clear;
+
static inline int __diag308(unsigned long subcode, void *addr)
{
- register unsigned long _addr asm("0") = (unsigned long) addr;
- register unsigned long _rc asm("1") = 0;
+ union register_pair r1;
+ r1.even = (unsigned long) addr;
+ r1.odd = 0;
asm volatile(
- " diag %0,%2,0x308\n"
+ " diag %[r1],%[subcode],0x308\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_addr), "+d" (_rc)
- : "d" (subcode) : "cc", "memory");
- return _rc;
+ : [r1] "+&d" (r1.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+ return r1.odd;
}
int diag308(unsigned long subcode, void *addr)
{
- if (IS_ENABLED(CONFIG_KASAN))
- __arch_local_irq_stosm(0x04); /* enable DAT */
diag_stat_inc(DIAG_STAT_X308);
return __diag308(subcode, addr);
}
@@ -174,7 +192,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
- return snprintf(page, PAGE_SIZE, _format, ##args); \
+ return scnprintf(page, PAGE_SIZE, _format, ##args); \
}
#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \
@@ -258,6 +276,11 @@ static __init enum ipl_type get_ipl_type(void)
return IPL_TYPE_FCP_DUMP;
else
return IPL_TYPE_FCP;
+ case IPL_PBT_NVME:
+ if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP)
+ return IPL_TYPE_NVME_DUMP;
+ else
+ return IPL_TYPE_NVME;
}
return IPL_TYPE_UNKNOWN;
}
@@ -314,6 +337,9 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj,
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno);
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ return sprintf(page, "%08ux\n", ipl_block.nvme.fid);
default:
return 0;
}
@@ -342,15 +368,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj,
return memory_read_from_buffer(buf, count, &off, scp_data, size);
}
+
+static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ unsigned int size = ipl_block.nvme.scp_data_len;
+ void *scp_data = &ipl_block.nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
static struct bin_attribute ipl_scp_data_attr =
__BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE);
+static struct bin_attribute ipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, S_IRUGO, ipl_nvme_scp_data_read, NULL, PAGE_SIZE);
+
static struct bin_attribute *ipl_fcp_bin_attrs[] = {
&ipl_parameter_attr,
&ipl_scp_data_attr,
NULL,
};
+static struct bin_attribute *ipl_nvme_bin_attrs[] = {
+ &ipl_parameter_attr,
+ &ipl_nvme_scp_data_attr,
+ NULL,
+};
+
/* FCP ipl device attributes */
DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n",
@@ -362,6 +408,16 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n",
DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n",
(unsigned long long)ipl_block.fcp.br_lba);
+/* NVMe ipl device attributes */
+DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.fid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.nsid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n",
+ (unsigned long long)ipl_block.nvme.bootprog);
+DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n",
+ (unsigned long long)ipl_block.nvme.br_lba);
+
static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
@@ -396,6 +452,24 @@ static struct attribute_group ipl_fcp_attr_group = {
.bin_attrs = ipl_fcp_bin_attrs,
};
+static struct attribute *ipl_nvme_attrs[] = {
+ &sys_ipl_type_attr.attr,
+ &sys_ipl_nvme_fid_attr.attr,
+ &sys_ipl_nvme_nsid_attr.attr,
+ &sys_ipl_nvme_bootprog_attr.attr,
+ &sys_ipl_nvme_br_lba_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ &sys_ipl_secure_attr.attr,
+ &sys_ipl_has_secure_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_nvme_attr_group = {
+ .attrs = ipl_nvme_attrs,
+ .bin_attrs = ipl_nvme_bin_attrs,
+};
+
+
/* CCW ipl device attributes */
static struct attribute *ipl_ccw_attrs_vm[] = {
@@ -471,6 +545,10 @@ static int __init ipl_init(void)
case IPL_TYPE_FCP_DUMP:
rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group);
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group);
+ break;
default:
rc = sysfs_create_group(&ipl_kset->kobj,
&ipl_unknown_attr_group);
@@ -691,6 +769,21 @@ static struct kobj_attribute sys_reipl_fcp_loadparm_attr =
__ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show,
reipl_fcp_loadparm_store);
+static ssize_t reipl_fcp_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_fcp_clear);
+}
+
+static ssize_t reipl_fcp_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (strtobool(buf, &reipl_fcp_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
static struct attribute *reipl_fcp_attrs[] = {
&sys_reipl_fcp_device_attr.attr,
&sys_reipl_fcp_wwpn_attr.attr,
@@ -706,6 +799,114 @@ static struct attribute_group reipl_fcp_attr_group = {
.bin_attrs = reipl_fcp_bin_attrs,
};
+static struct kobj_attribute sys_reipl_fcp_clear_attr =
+ __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store);
+
+/* NVME reipl device attributes */
+
+static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t size = reipl_block_nvme->nvme.scp_data_len;
+ void *scp_data = reipl_block_nvme->nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
+static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t scpdata_len = count;
+ size_t padding;
+
+ if (off)
+ return -EINVAL;
+
+ memcpy(reipl_block_nvme->nvme.scp_data, buf, count);
+ if (scpdata_len % 8) {
+ padding = 8 - (scpdata_len % 8);
+ memset(reipl_block_nvme->nvme.scp_data + scpdata_len,
+ 0, padding);
+ scpdata_len += padding;
+ }
+
+ reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.scp_data_len = scpdata_len;
+
+ return count;
+}
+
+static struct bin_attribute sys_reipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_nvme_scpdata_read,
+ reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE);
+
+static struct bin_attribute *reipl_nvme_bin_attrs[] = {
+ &sys_reipl_nvme_scp_data_attr,
+ NULL,
+};
+
+DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.br_lba);
+
+/* nvme wrapper */
+static ssize_t reipl_nvme_loadparm_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return reipl_generic_loadparm_show(reipl_block_nvme, page);
+}
+
+static ssize_t reipl_nvme_loadparm_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ return reipl_generic_loadparm_store(reipl_block_nvme, buf, len);
+}
+
+static struct kobj_attribute sys_reipl_nvme_loadparm_attr =
+ __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nvme_loadparm_show,
+ reipl_nvme_loadparm_store);
+
+static struct attribute *reipl_nvme_attrs[] = {
+ &sys_reipl_nvme_fid_attr.attr,
+ &sys_reipl_nvme_nsid_attr.attr,
+ &sys_reipl_nvme_bootprog_attr.attr,
+ &sys_reipl_nvme_br_lba_attr.attr,
+ &sys_reipl_nvme_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_nvme_attr_group = {
+ .attrs = reipl_nvme_attrs,
+ .bin_attrs = reipl_nvme_bin_attrs
+};
+
+static ssize_t reipl_nvme_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_nvme_clear);
+}
+
+static ssize_t reipl_nvme_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (strtobool(buf, &reipl_nvme_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_nvme_clear_attr =
+ __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store);
+
/* CCW reipl device attributes */
DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
@@ -741,16 +942,36 @@ static struct kobj_attribute sys_reipl_ccw_loadparm_attr =
__ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show,
reipl_ccw_loadparm_store);
+static ssize_t reipl_ccw_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_ccw_clear);
+}
+
+static ssize_t reipl_ccw_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (strtobool(buf, &reipl_ccw_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_ccw_clear_attr =
+ __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store);
+
static struct attribute *reipl_ccw_attrs_vm[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
&sys_reipl_ccw_vmparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
static struct attribute *reipl_ccw_attrs_lpar[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
@@ -850,6 +1071,9 @@ static int reipl_set_type(enum ipl_type type)
case IPL_TYPE_FCP:
reipl_block_actual = reipl_block_fcp;
break;
+ case IPL_TYPE_NVME:
+ reipl_block_actual = reipl_block_nvme;
+ break;
case IPL_TYPE_NSS:
reipl_block_actual = reipl_block_nss;
break;
@@ -876,6 +1100,8 @@ static ssize_t reipl_type_store(struct kobject *kobj,
rc = reipl_set_type(IPL_TYPE_CCW);
else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_FCP);
+ else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0)
+ rc = reipl_set_type(IPL_TYPE_NVME);
else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_NSS);
return (rc != 0) ? rc : len;
@@ -886,17 +1112,31 @@ static struct kobj_attribute reipl_type_attr =
static struct kset *reipl_kset;
static struct kset *reipl_fcp_kset;
+static struct kset *reipl_nvme_kset;
static void __reipl_run(void *unused)
{
switch (reipl_type) {
case IPL_TYPE_CCW:
diag308(DIAG308_SET, reipl_block_ccw);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_ccw_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
break;
case IPL_TYPE_FCP:
diag308(DIAG308_SET, reipl_block_fcp);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_fcp_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
+ break;
+ case IPL_TYPE_NVME:
+ diag308(DIAG308_SET, reipl_block_nvme);
+ if (reipl_nvme_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
break;
case IPL_TYPE_NSS:
diag308(DIAG308_SET, reipl_block_nss);
@@ -906,6 +1146,7 @@ static void __reipl_run(void *unused)
diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case IPL_TYPE_FCP_DUMP:
+ case IPL_TYPE_NVME_DUMP:
break;
}
disabled_wait();
@@ -1008,10 +1249,16 @@ static int __init reipl_fcp_init(void)
}
rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
- if (rc) {
- kset_unregister(reipl_fcp_kset);
- free_page((unsigned long) reipl_block_fcp);
- return rc;
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_fcp_kset->kobj,
+ &sys_reipl_fcp_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_fcp_clear = true;
}
if (ipl_info.type == IPL_TYPE_FCP) {
@@ -1032,6 +1279,69 @@ static int __init reipl_fcp_init(void)
}
reipl_capabilities |= IPL_TYPE_FCP;
return 0;
+
+out2:
+ sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
+out1:
+ kset_unregister(reipl_fcp_kset);
+ free_page((unsigned long) reipl_block_fcp);
+ return rc;
+}
+
+static int __init reipl_nvme_init(void)
+{
+ int rc;
+
+ reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!reipl_block_nvme)
+ return -ENOMEM;
+
+ /* sysfs: create kset for mixing attr group and bin attrs */
+ reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL,
+ &reipl_kset->kobj);
+ if (!reipl_nvme_kset) {
+ free_page((unsigned long) reipl_block_nvme);
+ return -ENOMEM;
+ }
+
+ rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_nvme_kset->kobj,
+ &sys_reipl_nvme_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_nvme_clear = true;
+ }
+
+ if (ipl_info.type == IPL_TYPE_NVME) {
+ memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block));
+ /*
+ * Fix loadparm: There are systems where the (SCSI) LOADPARM
+ * is invalid in the IPL parameter block, so take it
+ * always from sclp_ipl_info.
+ */
+ memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm,
+ LOADPARM_LEN);
+ } else {
+ reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN;
+ reipl_block_nvme->nvme.pbt = IPL_PBT_NVME;
+ reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL;
+ }
+ reipl_capabilities |= IPL_TYPE_NVME;
+ return 0;
+
+out2:
+ sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+out1:
+ kset_unregister(reipl_nvme_kset);
+ free_page((unsigned long) reipl_block_nvme);
+ return rc;
}
static int __init reipl_type_init(void)
@@ -1049,6 +1359,9 @@ static int __init reipl_type_init(void)
if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) {
memcpy(reipl_block_fcp, reipl_block, size);
reipl_type = IPL_TYPE_FCP;
+ } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) {
+ memcpy(reipl_block_nvme, reipl_block, size);
+ reipl_type = IPL_TYPE_NVME;
} else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) {
memcpy(reipl_block_ccw, reipl_block, size);
reipl_type = IPL_TYPE_CCW;
@@ -1075,6 +1388,9 @@ static int __init reipl_init(void)
rc = reipl_fcp_init();
if (rc)
return rc;
+ rc = reipl_nvme_init();
+ if (rc)
+ return rc;
rc = reipl_nss_init();
if (rc)
return rc;
@@ -1118,6 +1434,29 @@ static struct attribute_group dump_fcp_attr_group = {
.attrs = dump_fcp_attrs,
};
+/* NVME dump device attributes */
+DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.br_lba);
+
+static struct attribute *dump_nvme_attrs[] = {
+ &sys_dump_nvme_fid_attr.attr,
+ &sys_dump_nvme_nsid_attr.attr,
+ &sys_dump_nvme_bootprog_attr.attr,
+ &sys_dump_nvme_br_lba_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_nvme_attr_group = {
+ .name = IPL_NVME_STR,
+ .attrs = dump_nvme_attrs,
+};
+
/* CCW dump device attributes */
DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw);
@@ -1159,6 +1498,8 @@ static ssize_t dump_type_store(struct kobject *kobj,
rc = dump_set_type(DUMP_TYPE_CCW);
else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0)
rc = dump_set_type(DUMP_TYPE_FCP);
+ else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0)
+ rc = dump_set_type(DUMP_TYPE_NVME);
return (rc != 0) ? rc : len;
}
@@ -1173,7 +1514,7 @@ static void diag308_dump(void *dump_block)
while (1) {
if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302)
break;
- udelay_simple(USEC_PER_SEC);
+ udelay(USEC_PER_SEC);
}
}
@@ -1186,6 +1527,9 @@ static void __dump_run(void *unused)
case DUMP_TYPE_FCP:
diag308_dump(dump_block_fcp);
break;
+ case DUMP_TYPE_NVME:
+ diag308_dump(dump_block_nvme);
+ break;
default:
break;
}
@@ -1242,6 +1586,29 @@ static int __init dump_fcp_init(void)
return 0;
}
+static int __init dump_nvme_init(void)
+{
+ int rc;
+
+ if (!sclp_ipl_info.has_dump)
+ return 0; /* LDIPL DUMP is not installed */
+ dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!dump_block_nvme)
+ return -ENOMEM;
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group);
+ if (rc) {
+ free_page((unsigned long)dump_block_nvme);
+ return rc;
+ }
+ dump_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN;
+ dump_block_nvme->fcp.pbt = IPL_PBT_NVME;
+ dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP;
+ dump_capabilities |= DUMP_TYPE_NVME;
+ return 0;
+}
+
static int __init dump_init(void)
{
int rc;
@@ -1260,6 +1627,9 @@ static int __init dump_init(void)
rc = dump_fcp_init();
if (rc)
return rc;
+ rc = dump_nvme_init();
+ if (rc)
+ return rc;
dump_set_type(DUMP_TYPE_NONE);
return 0;
}
@@ -1273,12 +1643,16 @@ static struct shutdown_action __refdata dump_action = {
static void dump_reipl_run(struct shutdown_trigger *trigger)
{
unsigned long ipib = (unsigned long) reipl_block_actual;
+ struct lowcore *abs_lc;
+ unsigned long flags;
unsigned int csum;
csum = (__force unsigned int)
csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0);
- mem_assign_absolute(S390_lowcore.ipib, ipib);
- mem_assign_absolute(S390_lowcore.ipib_checksum, csum);
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->ipib = ipib;
+ abs_lc->ipib_checksum = csum;
+ put_abs_lowcore(abs_lc, flags);
dump_run(trigger);
}
@@ -1472,7 +1846,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart);
static void __do_restart(void *ignore)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
smp_send_stop();
#ifdef CONFIG_CRASH_DUMP
crash_kexec(NULL);
@@ -1481,12 +1854,12 @@ static void __do_restart(void *ignore)
stop_run(&on_restart_trigger);
}
-void do_restart(void)
+void do_restart(void *arg)
{
tracing_off();
debug_locks_off();
lgr_info_log();
- smp_call_online_cpu(__do_restart, NULL);
+ smp_call_online_cpu(__do_restart, arg);
}
/* on halt */
@@ -1691,6 +2064,11 @@ void __init setup_ipl(void)
ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn;
ipl_info.data.fcp.lun = ipl_block.fcp.lun;
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ ipl_info.data.nvme.fid = ipl_block.nvme.fid;
+ ipl_info.data.nvme.nsid = ipl_block.nvme.nsid;
+ break;
case IPL_TYPE_NSS:
case IPL_TYPE_UNKNOWN:
/* We have no info to copy */
@@ -1706,7 +2084,7 @@ void s390_reset_system(void)
/* Disable lowcore protection */
__ctl_clear_bit(0, 28);
- diag_dma_ops.diag308_reset();
+ diag_amode31_ops.diag308_reset();
}
#ifdef CONFIG_KEXEC_FILE
@@ -1783,7 +2161,7 @@ void *ipl_report_finish(struct ipl_report *report)
buf = vzalloc(report->size);
if (!buf)
- return ERR_PTR(-ENOMEM);
+ goto out;
ptr = buf;
memcpy(ptr, report->ipib, report->ipib->hdr.len);
@@ -1822,6 +2200,7 @@ void *ipl_report_finish(struct ipl_report *report)
}
BUG_ON(ptr > buf + report->size);
+out:
return buf;
}
diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c
index af43535a976d..b5245fadcfb0 100644
--- a/arch/s390/kernel/ipl_vmparm.c
+++ b/arch/s390/kernel/ipl_vmparm.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/minmax.h>
+#include <linux/string.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 8371855042dc..45393919fe61 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -21,12 +21,14 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <linux/entry-common.h>
#include <asm/irq_regs.h>
#include <asm/cputime.h>
#include <asm/lowcore.h>
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -95,27 +97,106 @@ static const struct irq_class irqclass_sub_desc[] = {
{.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"},
};
-void __init init_IRQ(void)
+static void do_IRQ(struct pt_regs *regs, int irq)
{
- BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
- init_cio_interrupts();
- init_airq_interrupts();
- init_ext_interrupts();
-}
-
-void do_IRQ(struct pt_regs *regs, int irq)
-{
- struct pt_regs *old_regs;
-
- old_regs = set_irq_regs(regs);
- irq_enter();
if (tod_after_eq(S390_lowcore.int_clock,
S390_lowcore.clock_comparator))
/* Serve timer interrupts first. */
clock_comparator_work();
generic_handle_irq(irq);
- irq_exit();
+}
+
+static int on_async_stack(void)
+{
+ unsigned long frame = current_frame_address();
+
+ return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
+}
+
+static void do_irq_async(struct pt_regs *regs, int irq)
+{
+ if (on_async_stack()) {
+ do_IRQ(regs, irq);
+ } else {
+ call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+ struct pt_regs *, regs, int, irq);
+ }
+}
+
+static int irq_pending(struct pt_regs *regs)
+{
+ int cc;
+
+ asm volatile("tpi 0\n"
+ "ipm %0" : "=d" (cc) : : "cc");
+ return cc >> 28;
+}
+
+void noinstr do_io_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ int from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit;
+ if (from_idle)
+ account_idle_time_irq();
+
+ do {
+ regs->tpi_info = S390_lowcore.tpi_info;
+ if (S390_lowcore.tpi_info.adapter_IO)
+ do_irq_async(regs, THIN_INTERRUPT);
+ else
+ do_irq_async(regs, IO_INTERRUPT);
+ } while (MACHINE_IS_LPAR && irq_pending(regs));
+
+ irq_exit_rcu();
+
set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+}
+
+void noinstr do_ext_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ int from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ regs->int_code = S390_lowcore.ext_int_code_addr;
+ regs->int_parm = S390_lowcore.ext_params;
+ regs->int_parm_long = S390_lowcore.ext_params2;
+
+ from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit;
+ if (from_idle)
+ account_idle_time_irq();
+
+ do_irq_async(regs, EXT_INTERRUPT);
+
+ irq_exit_rcu();
+ set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
}
static void show_msi_interrupt(struct seq_file *p, int irq)
@@ -124,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
unsigned long flags;
int cpu;
- irq_lock_sparse();
+ rcu_read_lock();
desc = irq_to_desc(irq);
if (!desc)
goto out;
@@ -132,7 +213,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
seq_printf(p, "%3d: ", irq);
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+ seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
if (desc->irq_data.chip)
seq_printf(p, " %8s", desc->irq_data.chip->name);
@@ -143,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
seq_putc(p, '\n');
raw_spin_unlock_irqrestore(&desc->lock, flags);
out:
- irq_unlock_sparse();
+ rcu_read_unlock();
}
/*
@@ -154,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v)
int index = *(loff_t *) v;
int cpu, irq;
- get_online_cpus();
+ cpus_read_lock();
if (index == 0) {
seq_puts(p, " ");
for_each_online_cpu(cpu)
@@ -184,7 +265,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
out:
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -194,24 +275,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
}
/*
- * Switch to the asynchronous interrupt stack for softirq execution.
- */
-void do_softirq_own_stack(void)
-{
- unsigned long old, new;
-
- old = current_stack_pointer();
- /* Check against async. stack address range. */
- new = S390_lowcore.async_stack;
- if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
- CALL_ON_STACK(__do_softirq, new, 0);
- } else {
- /* We are already on the async stack. */
- __do_softirq();
- }
-}
-
-/*
* ext_int_hash[index] is the list head for all external interrupts that hash
* to this index.
*/
@@ -279,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
struct ext_int_info *p;
int index;
- ext_code = *(struct ext_code *) &regs->int_code;
+ ext_code.int_code = regs->int_code;
if (ext_code.code != EXT_IRQ_CLK_COMP)
set_cpu_flag(CIF_NOHZ_DELAY);
@@ -294,12 +357,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
return IRQ_HANDLED;
}
-static struct irqaction external_interrupt = {
- .name = "EXT",
- .handler = do_ext_interrupt,
-};
-
-void __init init_ext_interrupts(void)
+static void __init init_ext_interrupts(void)
{
int idx;
@@ -308,7 +366,16 @@ void __init init_ext_interrupts(void)
irq_set_chip_and_handler(EXT_INTERRUPT,
&dummy_irq_chip, handle_percpu_irq);
- setup_irq(EXT_INTERRUPT, &external_interrupt);
+ if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL))
+ panic("Failed to register EXT interrupt\n");
+}
+
+void __init init_IRQ(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
+ init_cio_interrupts();
+ init_airq_interrupts();
+ init_ext_interrupts();
}
static DEFINE_SPINLOCK(irq_subclass_lock);
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index ab584e8e3527..e808bb8bc0da 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -6,8 +6,9 @@
* Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
*/
#include <linux/uaccess.h>
-#include <linux/stop_machine.h>
#include <linux/jump_label.h>
+#include <linux/module.h>
+#include <asm/text-patching.h>
#include <asm/ipl.h>
struct insn {
@@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected,
unsigned char *ipe = (unsigned char *)expected;
unsigned char *ipn = (unsigned char *)new;
- pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc);
+ pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc);
pr_emerg("Found: %6ph\n", ipc);
pr_emerg("Expected: %6ph\n", ipe);
pr_emerg("New: %6ph\n", ipn);
panic("Corrupted kernel text");
}
-static struct insn orignop = {
- .opcode = 0xc004,
- .offset = JUMP_LABEL_NOP_OFFSET >> 1,
-};
-
-static void __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
void *code = (void *)jump_entry_code(entry);
struct insn old, new;
@@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry,
jump_label_make_branch(entry, &old);
jump_label_make_nop(entry, &new);
}
- if (init) {
- if (memcmp(code, &orignop, sizeof(orignop)))
- jump_label_bug(entry, &orignop, &new);
- } else {
- if (memcmp(code, &old, sizeof(old)))
- jump_label_bug(entry, &old, &new);
- }
+ if (memcmp(code, &old, sizeof(old)))
+ jump_label_bug(entry, &old, &new);
s390_kernel_write(code, &new, sizeof(new));
}
-static void __jump_label_sync(void *dummy)
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
+ jump_label_transform(entry, type);
+ text_poke_sync();
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+ enum jump_label_type type)
{
- __jump_label_transform(entry, type, 0);
- smp_call_function(__jump_label_sync, NULL, 1);
+ jump_label_transform(entry, type);
+ return true;
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+void arch_jump_label_transform_apply(void)
{
- __jump_label_transform(entry, type, 1);
+ text_poke_sync();
}
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 548d0ea9808d..0032bdbe8e3f 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -7,6 +7,9 @@
* s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com>
*/
+#define pr_fmt(fmt) "kprobes: " fmt
+
+#include <linux/moduleloader.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
@@ -21,6 +24,7 @@
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/dis.h>
+#include "entry.h"
DEFINE_PER_CPU(struct kprobe *, current_kprobe);
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -30,19 +34,27 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { };
DEFINE_INSN_CACHE_OPS(s390_insn);
static int insn_page_in_use;
-static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * Allocate one page that is meant to hold kprobe instruction slots.
+ *
+ * The page is obtained with module_alloc() and is then switched to
+ * read-only and executable with __set_memory().
+ *
+ * Returns the page address, or NULL if module_alloc() failed.
+ */
+void *alloc_insn_page(void)
+{
+ void *page;
+
+ page = module_alloc(PAGE_SIZE);
+ if (!page)
+ return NULL;
+ /* NOTE(review): the return value of __set_memory() is not checked here. */
+ __set_memory((unsigned long) page, 1, SET_MEMORY_RO | SET_MEMORY_X);
+ return page;
+}
static void *alloc_s390_insn_page(void)
{
if (xchg(&insn_page_in_use, 1) == 1)
return NULL;
- set_memory_x((unsigned long) &insn_page, 1);
- return &insn_page;
+ return &kprobes_insn_page;
}
static void free_s390_insn_page(void *page)
{
- set_memory_nx((unsigned long) page, 1);
xchg(&insn_page_in_use, 0);
}
@@ -56,33 +68,32 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = {
static void copy_instruction(struct kprobe *p)
{
+ kprobe_opcode_t insn[MAX_INSN_SIZE];
s64 disp, new_disp;
u64 addr, new_addr;
+ unsigned int len;
- memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8));
- p->opcode = p->ainsn.insn[0];
- if (!probe_is_insn_relative_long(p->ainsn.insn))
- return;
- /*
- * For pc-relative instructions in RIL-b or RIL-c format patch the
- * RI2 displacement field. We have already made sure that the insn
- * slot for the patched instruction is within the same 2GB area
- * as the original instruction (either kernel image or module area).
- * Therefore the new displacement will always fit.
- */
- disp = *(s32 *)&p->ainsn.insn[1];
- addr = (u64)(unsigned long)p->addr;
- new_addr = (u64)(unsigned long)p->ainsn.insn;
- new_disp = ((addr + (disp * 2)) - new_addr) / 2;
- *(s32 *)&p->ainsn.insn[1] = new_disp;
+ len = insn_length(*p->addr >> 8);
+ memcpy(&insn, p->addr, len);
+ p->opcode = insn[0];
+ if (probe_is_insn_relative_long(&insn[0])) {
+ /*
+ * For pc-relative instructions in RIL-b or RIL-c format patch
+ * the RI2 displacement field. We have already made sure that
+ * the insn slot for the patched instruction is within the same
+ * 2GB area as the original instruction (either kernel image or
+ * module area). Therefore the new displacement will always fit.
+ */
+ disp = *(s32 *)&insn[1];
+ addr = (u64)(unsigned long)p->addr;
+ new_addr = (u64)(unsigned long)p->ainsn.insn;
+ new_disp = ((addr + (disp * 2)) - new_addr) / 2;
+ *(s32 *)&insn[1] = new_disp;
+ }
+ s390_kernel_write(p->ainsn.insn, &insn, len);
}
NOKPROBE_SYMBOL(copy_instruction);
-static inline int is_kernel_addr(void *addr)
-{
- return addr < (void *)_end;
-}
-
static int s390_get_insn_slot(struct kprobe *p)
{
/*
@@ -91,7 +102,7 @@ static int s390_get_insn_slot(struct kprobe *p)
* field can be patched and executed within the insn slot.
*/
p->ainsn.insn = NULL;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
p->ainsn.insn = get_s390_insn_slot();
else if (is_module_addr(p->addr))
p->ainsn.insn = get_insn_slot();
@@ -103,7 +114,7 @@ static void s390_free_insn_slot(struct kprobe *p)
{
if (!p->ainsn.insn)
return;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
free_s390_insn_slot(p->ainsn.insn, 0);
else
free_insn_slot(p->ainsn.insn, 0);
@@ -111,9 +122,55 @@ static void s390_free_insn_slot(struct kprobe *p)
}
NOKPROBE_SYMBOL(s390_free_insn_slot);
+/*
+ * Check if paddr is at an instruction boundary.
+ *
+ * The walk starts at paddr minus the offset reported by
+ * kallsyms_lookup_size_offset() and advances instruction by instruction,
+ * using insn_length(), until paddr is reached or passed.
+ *
+ * Returns true only if the walk ends exactly at paddr. Returns false if
+ * paddr is odd, if the kallsyms lookup fails, if an instruction cannot be
+ * read, or if an opcode with (insn >> 8) == 0 is found that does not belong
+ * to a registered kprobe.
+ */
+static bool can_probe(unsigned long paddr)
+{
+ unsigned long addr, offset = 0;
+ kprobe_opcode_t insn;
+ struct kprobe *kp;
+
+ /* Odd addresses are rejected right away */
+ if (paddr & 0x01)
+ return false;
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return false;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ /* Read the opcode without risking a fault on an unmapped address */
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn)))
+ return false;
+
+ if (insn >> 8 == 0) {
+ if (insn != BREAKPOINT_INSTRUCTION) {
+ /*
+ * Note that QEMU inserts opcode 0x0000 to implement
+ * software breakpoints for guests. Since the size of
+ * the original instruction is unknown, stop following
+ * instructions and prevent setting a kprobe.
+ */
+ return false;
+ }
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case the original instruction is
+ * decoded.
+ */
+ kp = get_kprobe((void *)addr);
+ if (!kp) {
+ /* not a kprobe */
+ return false;
+ }
+ /* Continue decoding with the opcode saved in the kprobe */
+ insn = kp->opcode;
+ }
+ addr += insn_length(insn >> 8);
+ }
+ /* False if the walk stepped over paddr, i.e. paddr is inside an instruction */
+ return addr == paddr;
+}
+
int arch_prepare_kprobe(struct kprobe *p)
{
- if ((unsigned long) p->addr & 0x01)
+ if (!can_probe((unsigned long)p->addr))
return -EINVAL;
/* Make sure the probe isn't going on a difficult instruction */
if (probe_is_prohibited_opcode(p->addr))
@@ -227,10 +284,11 @@ NOKPROBE_SYMBOL(pop_kprobe);
void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
- ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14];
+ ri->ret_addr = (kprobe_opcode_t *)regs->gprs[14];
+ ri->fp = (void *)regs->gprs[15];
/* Replace the return addr with trampoline addr */
- regs->gprs[14] = (unsigned long) &kretprobe_trampoline;
+ regs->gprs[14] = (unsigned long)&__kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
@@ -249,7 +307,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
* is a BUG. The code path resides in the .kprobes.text
* section and is executed with interrupts disabled.
*/
- pr_err("Invalid kprobe detected.\n");
+ pr_err("Failed to recover from reentered kprobes.\n");
dump_kprobe(p);
BUG();
}
@@ -314,109 +372,26 @@ static int kprobe_handler(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kprobe_handler);
-/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-static void __used kretprobe_trampoline_holder(void)
+void arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
{
- asm volatile(".global kretprobe_trampoline\n"
- "kretprobe_trampoline: bcr 0,0\n");
+ /* Replace fake return address with real one. */
+ regs->gprs[14] = (unsigned long)correct_ret_addr;
}
+NOKPROBE_SYMBOL(arch_kretprobe_fixup_return);
/*
- * Called when the probe at kretprobe trampoline is hit
+ * Called from __kretprobe_trampoline
*/
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+void trampoline_probe_handler(struct pt_regs *regs)
{
- struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long flags, orig_ret_address;
- unsigned long trampoline_address;
- kprobe_opcode_t *correct_ret_addr;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
-
- /*
- * It is possible to have multiple instances associated with a given
- * task either because an multiple functions in the call path
- * have a return probe installed on them, and/or more than one return
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always inserted at the head of the list
- * - when multiple return probes are registered for the same
- * function, the first instance's ret_addr will point to the
- * real return address, and all the rest will point to
- * kretprobe_trampoline
- */
- ri = NULL;
- orig_ret_address = 0;
- correct_ret_addr = NULL;
- trampoline_address = (unsigned long) &kretprobe_trampoline;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- correct_ret_addr = ri->ret_addr;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (ri->rp && ri->rp->handler) {
- ri->ret_addr = correct_ret_addr;
- ri->rp->handler(ri, regs);
- }
-
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- regs->psw.addr = orig_ret_address;
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- /*
- * By returning a non-zero value, we are telling
- * kprobe_handler() that we don't want the post_handler
- * to run (and have re-enabled preemption)
- */
- return 1;
+ kretprobe_trampoline_handler(regs, (void *)regs->gprs[15]);
}
NOKPROBE_SYMBOL(trampoline_probe_handler);
+/* assembler function that handles the kretprobes must not be probed itself */
+NOKPROBE_SYMBOL(__kretprobe_trampoline);
+
/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "breakpoint"
@@ -483,7 +458,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
struct kprobe *p = kprobe_running();
- const struct exception_table_entry *entry;
switch(kcb->kprobe_status) {
case KPROBE_HIT_SS:
@@ -502,32 +476,11 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
case KPROBE_HIT_ACTIVE:
case KPROBE_HIT_SSDONE:
/*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(p);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (p->fault_handler && p->fault_handler(p, regs, trapnr))
- return 1;
-
- /*
* In case the user-specified fault handler returned
* zero, try to fix up.
*/
- entry = s390_search_extables(regs->psw.addr);
- if (entry) {
- regs->psw.addr = extable_fixup(entry);
+ if (fixup_exception(regs))
return 1;
- }
-
/*
* fixup_exception() could not handle it,
* Let do_page_fault() fix it.
@@ -591,18 +544,13 @@ int kprobe_exceptions_notify(struct notifier_block *self,
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-static struct kprobe trampoline = {
- .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
- .pre_handler = trampoline_probe_handler
-};
-
int __init arch_init_kprobes(void)
{
- return register_kprobe(&trampoline);
+ return 0;
}
int arch_trampoline_kprobe(struct kprobe *p)
{
- return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline;
+ return 0;
}
NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S
new file mode 100644
index 000000000000..f6cb022ef8c8
--- /dev/null
+++ b/arch/s390/kernel/kprobes_insn_page.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+
+/*
+ * kprobes_insn_page is a special 4k aligned dummy function for kprobes.
+ * It will contain all kprobed instructions that are executed out-of-line.
+ * The page must be within the kernel image to guarantee that the
+ * out-of-line instructions are within 2GB distance of their original
+ * location. Using a dummy function ensures that the page is within
+ * the text section of the kernel and mapped read-only/executable from
+ * the beginning, which avoids having to split large mappings, as would
+ * be necessary if the page were in the data section instead.
+ *
+ * The body below is 2048 halfwords of 2 bytes each, i.e. exactly one
+ * 4096 byte page. Each halfword is 0x07fe, which is the encoding of
+ * "bcr 15,%r14" (br %r14).
+ */
+ .section .kprobes.text, "ax"
+ .align 4096
+ENTRY(kprobes_insn_page)
+ .rept 2048
+ .word 0x07fe
+ .endr
+ENDPROC(kprobes_insn_page)
+ .previous
diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index 452502f9a0d9..6652e54cf3db 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info)
if (stsi(si, 2, 2, 2))
return;
cpascii(lgr_info->name, si->name, sizeof(si->name));
- memcpy(&lgr_info->lpar_number, &si->lpar_number,
- sizeof(lgr_info->lpar_number));
+ lgr_info->lpar_number = si->lpar_number;
}
/*
@@ -167,7 +166,7 @@ static struct timer_list lgr_timer;
*/
static void lgr_timer_set(void)
{
- mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ);
+ mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC));
}
/*
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index cb8b1cc285c9..4579b42286d5 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -3,7 +3,6 @@
* Copyright IBM Corp. 2005, 2011
*
* Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Michael Holzheu <holzheu@linux.vnet.ibm.com>
*/
@@ -14,24 +13,24 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
-#include <linux/suspend.h>
#include <asm/cio.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/ipl.h>
#include <asm/diag.h>
#include <asm/elf.h>
#include <asm/asm-offsets.h>
#include <asm/cacheflush.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
#include <asm/switch_to.h>
#include <asm/nmi.h>
+#include <asm/sclp.h>
-typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long);
+typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long,
+ unsigned long);
extern const unsigned char relocate_kernel[];
extern const unsigned long long relocate_kernel_len;
@@ -39,36 +38,6 @@ extern const unsigned long long relocate_kernel_len;
#ifdef CONFIG_CRASH_DUMP
/*
- * PM notifier callback for kdump
- */
-static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- if (kexec_crash_image)
- arch_kexec_unprotect_crashkres();
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- if (kexec_crash_image)
- arch_kexec_protect_crashkres();
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init machine_kdump_pm_init(void)
-{
- pm_notifier(machine_kdump_pm_cb, 0);
- return 0;
-}
-arch_initcall(machine_kdump_pm_init);
-
-/*
* Reset the system, copy boot CPU registers to absolute zero,
* and jump to the kdump image
*/
@@ -88,7 +57,7 @@ static void __do_machine_kdump(void *image)
* This need to be done *after* s390_reset_system set the
* prefix register of this CPU to zero
*/
- memcpy((void *) __LC_FPREGS_SAVE_AREA,
+ memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA),
(void *)(prefix + __LC_FPREGS_SAVE_AREA), 512);
__load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA);
@@ -119,7 +88,7 @@ static noinline void __machine_kdump(void *image)
continue;
}
/* Store status of the boot CPU */
- mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
if (MACHINE_HAS_VX)
save_vx_regs((__vector128 *) mcesa->vector_save_area);
if (MACHINE_HAS_GS) {
@@ -165,7 +134,8 @@ static bool kdump_csum_valid(struct kimage *image)
int rc;
preempt_disable();
- rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
+ rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump,
+ unsigned long, (unsigned long)image);
preempt_enable();
return rc == 0;
#else
@@ -253,13 +223,18 @@ void machine_kexec_cleanup(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
+ struct lowcore *abs_lc;
+ unsigned long flags;
+
VMCOREINFO_SYMBOL(lowcore_ptr);
VMCOREINFO_SYMBOL(high_memory);
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
- vmcoreinfo_append_str("SDMA=%lx\n", __sdma);
- vmcoreinfo_append_str("EDMA=%lx\n", __edma);
+ vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31);
+ vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
- mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->vmcore_info = paddr_vmcoreinfo_note();
+ put_abs_lowcore(abs_lc, flags);
}
void machine_shutdown(void)
@@ -276,6 +251,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
*/
static void __do_machine_kexec(void *data)
{
+ unsigned long diag308_subcode;
relocate_kernel_t data_mover;
struct kimage *image = data;
@@ -284,7 +260,10 @@ static void __do_machine_kexec(void *data)
__arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */
/* Call the moving routine */
- (*data_mover)(&image->head, image->start);
+ diag308_subcode = DIAG308_CLEAR_RESET;
+ if (sclp.has_iplcc)
+ diag308_subcode |= DIAG308_FLAG_EI;
+ (*data_mover)(&image->head, image->start, diag308_subcode);
/* Die if kexec returns */
disabled_wait();
@@ -295,7 +274,6 @@ static void __do_machine_kexec(void *data)
*/
static void __machine_kexec(void *data)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
pfault_fini();
tracing_off();
debug_locks_off();
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 8415ae7d2a23..fc6d5f58debe 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -7,11 +7,14 @@
* Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/module_signature.h>
#include <linux/verification.h>
+#include <linux/vmalloc.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
#include <asm/setup.h>
@@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
struct module_signature *ms;
unsigned long sig_len;
+ int ret;
/* Skip signature verification when not secure IPLed. */
if (!ipl_secure_flag)
@@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
return -EBADMSG;
}
- return verify_pkcs7_signature(kernel, kernel_len,
- kernel + kernel_len, sig_len,
- VERIFY_USE_PLATFORM_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ return ret;
}
#endif /* CONFIG_KEXEC_SIG */
@@ -151,7 +162,7 @@ static int kexec_file_add_initrd(struct kimage *image,
buf.mem += crashk_res.start;
buf.memsz = buf.bufsz;
- data->parm->initrd_start = buf.mem;
+ data->parm->initrd_start = data->memsz;
data->parm->initrd_size = buf.memsz;
data->memsz += buf.memsz;
@@ -170,6 +181,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
struct kexec_buf buf;
unsigned long addr;
void *ptr, *end;
+ int ret;
buf.image = image;
@@ -199,9 +211,13 @@ static int kexec_file_add_ipl_report(struct kimage *image,
ptr += len;
}
+ ret = -ENOMEM;
buf.buffer = ipl_report_finish(data->report);
+ if (!buf.buffer)
+ goto out;
buf.bufsz = data->report->size;
buf.memsz = buf.bufsz;
+ image->arch.ipl_buf = buf.buffer;
data->memsz += buf.memsz;
@@ -209,14 +225,18 @@ static int kexec_file_add_ipl_report(struct kimage *image,
data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr);
*lc_ipl_parmblock_ptr = (__u32)buf.mem;
- return kexec_add_buffer(&buf);
+ ret = kexec_add_buffer(&buf);
+out:
+ return ret;
}
void *kexec_file_add_components(struct kimage *image,
int (*add_kernel)(struct kimage *image,
struct s390_load_data *data))
{
+ unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE;
struct s390_load_data data = {0};
+ unsigned long minsize;
int ret;
data.report = ipl_report_init(&ipl_block);
@@ -227,10 +247,23 @@ void *kexec_file_add_components(struct kimage *image,
if (ret)
goto out;
- if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) {
- ret = -EINVAL;
+ ret = -EINVAL;
+ minsize = PARMAREA + offsetof(struct parmarea, command_line);
+ if (image->kernel_buf_len < minsize)
goto out;
- }
+
+ if (data.parm->max_command_line_size)
+ max_command_line_size = data.parm->max_command_line_size;
+
+ if (minsize + max_command_line_size < minsize)
+ goto out;
+
+ if (image->kernel_buf_len < minsize + max_command_line_size)
+ goto out;
+
+ if (image->cmdline_buf_len >= max_command_line_size)
+ goto out;
+
memcpy(data.parm->command_line, image->cmdline_buf,
image->cmdline_buf_len);
@@ -267,8 +300,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
const Elf_Shdr *relsec,
const Elf_Shdr *symtab)
{
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
Elf_Rela *relas;
int i, r_type;
+ int ret;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
relas = (void *)pi->ehdr + relsec->sh_offset;
@@ -281,15 +322,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
sym = (void *)pi->ehdr + symtab->sh_offset;
sym += ELF64_R_SYM(relas[i].r_info);
- if (sym->st_shndx == SHN_UNDEF)
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
return -ENOEXEC;
+ }
- if (sym->st_shndx == SHN_COMMON)
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
return -ENOEXEC;
+ }
if (sym->st_shndx >= pi->ehdr->e_shnum &&
- sym->st_shndx != SHN_ABS)
+ sym->st_shndx != SHN_ABS) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
return -ENOEXEC;
+ }
loc = pi->purgatory_buf;
loc += section->sh_offset;
@@ -303,21 +356,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
addr = section->sh_addr + relas[i].r_offset;
r_type = ELF64_R_TYPE(relas[i].r_info);
- arch_kexec_do_relocs(r_type, loc, val, addr);
+
+ if (r_type == R_390_PLT32DBL)
+ r_type = R_390_PC32DBL;
+
+ ret = arch_kexec_do_relocs(r_type, loc, val, addr);
+ if (ret) {
+ pr_err("Unknown rela relocation: %d\n", r_type);
+ return -ENOEXEC;
+ }
}
return 0;
}
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- /* A kernel must be at least large enough to contain head.S. During
- * load memory in head.S will be accessed, e.g. to register the next
- * command line. If the next kernel were smaller the current kernel
- * will panic at load.
- */
- if (buf_len < HEAD_END)
- return -ENOEXEC;
-
- return kexec_image_probe_default(image, buf, buf_len);
+ vfree(image->arch.ipl_buf);
+ image->arch.ipl_buf = NULL;
+
+ return kexec_image_post_load_cleanup_default(image);
}
diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c
index d5035de9020e..b7182cec48dc 100644
--- a/arch/s390/kernel/machine_kexec_reloc.c
+++ b/arch/s390/kernel/machine_kexec_reloc.c
@@ -28,6 +28,7 @@ int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val,
break;
case R_390_64: /* Direct 64 bit. */
case R_390_GLOB_DAT:
+ case R_390_JMP_SLOT:
*(u64 *)loc = val;
break;
case R_390_PC16: /* PC relative 16 bit. */
diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index 7458dcfd6464..4786bfe02144 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -2,8 +2,6 @@
/*
* Copyright IBM Corp. 2008, 2009
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/linkage.h>
@@ -13,6 +11,18 @@
#include <asm/ptrace.h>
#include <asm/export.h>
+
+#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE)
+#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
+#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
+#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2)
+#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS)
+/* packed stack: allocate just enough for r14, r15 and backchain */
+#define TRACED_FUNC_FRAME_SIZE 24
+
+#ifdef CONFIG_FUNCTION_TRACER
+
GEN_BR_THUNK %r1
GEN_BR_THUNK %r14
@@ -22,33 +32,17 @@ ENTRY(ftrace_stub)
BR_EX %r14
ENDPROC(ftrace_stub)
-#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE)
-#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
-#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
-#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
-#ifdef __PACK_STACK
-/* allocate just enough for r14, r15 and backchain */
-#define TRACED_FUNC_FRAME_SIZE 24
-#else
-#define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD
-#endif
+ .macro ftrace_regs_entry, allregs=0
+ stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller
-ENTRY(_mcount)
- BR_EX %r14
-ENDPROC(_mcount)
-EXPORT_SYMBOL(_mcount)
+ .if \allregs == 1
+ # save psw mask
+ # don't put any instructions clobbering CC before this point
+ epsw %r1,%r14
+ risbg %r14,%r1,0,31,32
+ .endif
-ENTRY(ftrace_caller)
- .globl ftrace_regs_caller
- .set ftrace_regs_caller,ftrace_caller
- stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller
- lghi %r14,0 # save condition code
- ipm %r14 # don't put any instructions
- sllg %r14,%r14,16 # clobbering CC before this point
lgr %r1,%r15
-#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT))
- aghi %r0,MCOUNT_RETURN_FIXUP
-#endif
# allocate stack frame for ftrace_caller to contain traced function
aghi %r15,-TRACED_FUNC_FRAME_SIZE
stg %r1,__SF_BACKCHAIN(%r15)
@@ -57,23 +51,43 @@ ENTRY(ftrace_caller)
# allocate pt_regs and stack frame for ftrace_trace_function
aghi %r15,-STACK_FRAME_SIZE
stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
+ xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15)
+
+ .if \allregs == 1
stg %r14,(STACK_PTREGS_PSW)(%r15)
+ mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS
+ .else
+ xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15)
+ .endif
+
lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address
- stosm (STACK_PTREGS_PSW)(%r15),0
aghi %r1,-TRACED_FUNC_FRAME_SIZE
stg %r1,__SF_BACKCHAIN(%r15)
stg %r0,(STACK_PTREGS_PSW+8)(%r15)
stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
+ .endm
+
+/*
+ * ftrace_regs_caller: expands ftrace_regs_entry with allregs=1 and then
+ * branches to the shared code in ftrace_common. With allregs=1 the macro
+ * additionally stores the PSW mask in pt_regs and sets the pt_regs flags
+ * field to _PIF_FTRACE_FULL_REGS.
+ */
+SYM_CODE_START(ftrace_regs_caller)
+ ftrace_regs_entry 1
+ j ftrace_common
+SYM_CODE_END(ftrace_regs_caller)
+
+/*
+ * ftrace_caller: expands ftrace_regs_entry with allregs=0 and then
+ * branches to the shared code in ftrace_common. With allregs=0 the macro
+ * does not store the PSW mask and clears the pt_regs flags field.
+ */
+SYM_CODE_START(ftrace_caller)
+ ftrace_regs_entry 0
+ j ftrace_common
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_common)
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
aghik %r2,%r0,-MCOUNT_INSN_SIZE
lgrl %r4,function_trace_op
- lgrl %r1,ftrace_trace_function
+ lgrl %r1,ftrace_func
#else
lgr %r2,%r0
aghi %r2,-MCOUNT_INSN_SIZE
larl %r4,function_trace_op
lg %r4,0(%r4)
- larl %r1,ftrace_trace_function
+ larl %r1,ftrace_func
lg %r1,0(%r1)
#endif
lgr %r3,%r14
@@ -82,24 +96,31 @@ ENTRY(ftrace_caller)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
# The j instruction gets runtime patched to a nop instruction.
# See ftrace_enable_ftrace_graph_caller.
- .globl ftrace_graph_caller
-ftrace_graph_caller:
- j ftrace_graph_caller_end
+SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL)
+ j .Lftrace_graph_caller_end
lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15)
lg %r4,(STACK_PTREGS_PSW+8)(%r15)
brasl %r14,prepare_ftrace_return
stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15)
-ftrace_graph_caller_end:
- .globl ftrace_graph_caller_end
+.Lftrace_graph_caller_end:
#endif
- lg %r1,(STACK_PTREGS_PSW+8)(%r15)
- lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
+ lg %r0,(STACK_PTREGS_PSW+8)(%r15)
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+ ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+ locgrz %r1,%r0
+#else
+ lg %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+ ltgr %r1,%r1
+ jnz 0f
+ lgr %r1,%r0
+#endif
+0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
BR_EX %r1
-ENDPROC(ftrace_caller)
+SYM_CODE_END(ftrace_common)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(return_to_handler)
+SYM_FUNC_START(return_to_handler)
stmg %r2,%r5,32(%r15)
lgr %r1,%r15
aghi %r15,-STACK_FRAME_OVERHEAD
@@ -109,6 +130,38 @@ ENTRY(return_to_handler)
lgr %r14,%r2
lmg %r2,%r5,32(%r15)
BR_EX %r14
-ENDPROC(return_to_handler)
+SYM_FUNC_END(return_to_handler)
#endif
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_KPROBES
+
+/*
+ * __kretprobe_trampoline
+ *
+ * arch_prepare_kretprobe() replaces the return address in %r14 of a probed
+ * function with the address of this trampoline, so a return from that
+ * function ends up here. The trampoline builds a struct pt_regs on the
+ * stack, calls trampoline_probe_handler(regs) and finally resumes at the
+ * PSW found in pt_regs.
+ *
+ * NOTE(review): the PSW address stored below is the trampoline itself, so
+ * the handler is presumably expected to replace it with the real return
+ * address before lpswe is executed - that code is not visible in this
+ * file, confirm.
+ */
+SYM_FUNC_START(__kretprobe_trampoline)
+
+ stg %r14,(__SF_GPRS+8*8)(%r15)
+ lay %r15,-STACK_FRAME_SIZE(%r15) # room for stack frame + pt_regs
+ stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) # pt_regs gprs[0..14]
+
+ # store original stack pointer in backchain and pt_regs
+ # %r7 keeps the original stack pointer and is used again after the
+ # call below (%r7 is call-saved in the s390x ELF ABI)
+ lay %r7,STACK_FRAME_SIZE(%r15)
+ stg %r7,__SF_BACKCHAIN(%r15)
+ stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15)
+
+ # store full psw
+ epsw %r2,%r3
+ risbg %r3,%r2,0,31,32
+ stg %r3,STACK_PTREGS_PSW(%r15) # psw mask
+ larl %r1,__kretprobe_trampoline
+ stg %r1,STACK_PTREGS_PSW+8(%r15) # psw address = this trampoline
+
+ lay %r2,STACK_PTREGS(%r15) # first argument: pointer to pt_regs
+ brasl %r14,trampoline_probe_handler
+
+ # copy the 16 byte psw from pt_regs to __SF_EMPTY of the original
+ # stack frame, restore all gprs from pt_regs and load that psw
+ mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15)
+ lmg %r0,%r15,STACK_PTREGS_GPRS(%r15)
+ lpswe __SF_EMPTY(%r15)
+
+SYM_FUNC_END(__kretprobe_trampoline)
+
+#endif /* CONFIG_KPROBES */
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index ba8f19bb438b..2d159b32885b 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -14,14 +14,18 @@
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
+#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/bug.h>
+#include <linux/memory.h>
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
#include <asm/facility.h>
+#include <asm/ftrace.lds.h>
+#include <asm/set_memory.h>
#if 0
#define DEBUGP printk
@@ -29,24 +33,32 @@
#define DEBUGP(fmt , ...)
#endif
-#define PLT_ENTRY_SIZE 20
+#define PLT_ENTRY_SIZE 22
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
return p;
}
+#ifdef CONFIG_FUNCTION_TRACER
+/*
+ * Architecture hook for module cleanup, only built when
+ * CONFIG_FUNCTION_TRACER is enabled. Passes the memory referenced by
+ * mod->arch.trampolines_start to module_memfree().
+ * NOTE(review): presumably this is the ftrace trampoline area of the
+ * module - the allocation is not visible in this hunk, confirm.
+ */
+void module_arch_cleanup(struct module *mod)
+{
+ module_memfree(mod->arch.trampolines_start);
+}
+#endif
+
void module_arch_freeing_init(struct module *mod)
{
if (is_livepatch_module(mod) &&
@@ -174,10 +186,12 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
}
static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
- int sign, int bits, int shift)
+ int sign, int bits, int shift,
+ void *(*write)(void *dest, const void *src, size_t len))
{
unsigned long umax;
long min, max;
+ void *dest = (void *)loc;
if (val & ((1UL << shift) - 1))
return -ENOEXEC;
@@ -194,26 +208,33 @@ static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
return -ENOEXEC;
}
- if (bits == 8)
- *(unsigned char *) loc = val;
- else if (bits == 12)
- *(unsigned short *) loc = (val & 0xfff) |
+ if (bits == 8) {
+ unsigned char tmp = val;
+ write(dest, &tmp, 1);
+ } else if (bits == 12) {
+ unsigned short tmp = (val & 0xfff) |
(*(unsigned short *) loc & 0xf000);
- else if (bits == 16)
- *(unsigned short *) loc = val;
- else if (bits == 20)
- *(unsigned int *) loc = (val & 0xfff) << 16 |
- (val & 0xff000) >> 4 |
- (*(unsigned int *) loc & 0xf00000ff);
- else if (bits == 32)
- *(unsigned int *) loc = val;
- else if (bits == 64)
- *(unsigned long *) loc = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 16) {
+ unsigned short tmp = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 20) {
+ unsigned int tmp = (val & 0xfff) << 16 |
+ (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff);
+ write(dest, &tmp, 4);
+ } else if (bits == 32) {
+ unsigned int tmp = val;
+ write(dest, &tmp, 4);
+ } else if (bits == 64) {
+ unsigned long tmp = val;
+ write(dest, &tmp, 8);
+ }
return 0;
}
static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
- const char *strtab, struct module *me)
+ const char *strtab, struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
struct mod_arch_syminfo *info;
Elf_Addr loc, val;
@@ -241,17 +262,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_64: /* Direct 64 bit. */
val += rela->r_addend;
if (r_type == R_390_8)
- rc = apply_rela_bits(loc, val, 0, 8, 0);
+ rc = apply_rela_bits(loc, val, 0, 8, 0, write);
else if (r_type == R_390_12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_PC16: /* PC relative 16 bit. */
case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */
@@ -260,15 +281,15 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PC64: /* PC relative 64 bit. */
val += rela->r_addend - loc;
if (r_type == R_390_PC16)
- rc = apply_rela_bits(loc, val, 1, 16, 0);
+ rc = apply_rela_bits(loc, val, 1, 16, 0, write);
else if (r_type == R_390_PC16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PC32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PC32)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_PC64)
- rc = apply_rela_bits(loc, val, 1, 64, 0);
+ rc = apply_rela_bits(loc, val, 1, 64, 0, write);
break;
case R_390_GOT12: /* 12 bit GOT offset. */
case R_390_GOT16: /* 16 bit GOT offset. */
@@ -283,33 +304,33 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_GOTPLT64: /* 64 bit offset to jump slot. */
case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */
if (info->got_initialized == 0) {
- Elf_Addr *gotent;
+ Elf_Addr *gotent = me->core_layout.base +
+ me->arch.got_offset +
+ info->got_offset;
- gotent = me->core_layout.base + me->arch.got_offset +
- info->got_offset;
- *gotent = val;
+ write(gotent, &val, sizeof(*gotent));
info->got_initialized = 1;
}
val = info->got_offset + rela->r_addend;
if (r_type == R_390_GOT12 ||
r_type == R_390_GOTPLT12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_GOT16 ||
r_type == R_390_GOTPLT16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOT20 ||
r_type == R_390_GOTPLT20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_GOT32 ||
r_type == R_390_GOTPLT32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOT64 ||
r_type == R_390_GOTPLT64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
else if (r_type == R_390_GOTENT ||
r_type == R_390_GOTPLTENT) {
val += (Elf_Addr) me->core_layout.base - loc;
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
}
break;
case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */
@@ -320,25 +341,28 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */
case R_390_PLTOFF64: /* 64 bit offset from GOT to PLT. */
if (info->plt_initialized == 0) {
- unsigned int *ip;
- ip = me->core_layout.base + me->arch.plt_offset +
- info->plt_offset;
- ip[0] = 0x0d10e310; /* basr 1,0 */
- ip[1] = 0x100a0004; /* lg 1,10(1) */
+ unsigned char insn[PLT_ENTRY_SIZE];
+ char *plt_base;
+ char *ip;
+
+ plt_base = me->core_layout.base + me->arch.plt_offset;
+ ip = plt_base + info->plt_offset;
+ *(int *)insn = 0x0d10e310; /* basr 1,0 */
+ *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) {
- unsigned int *ij;
- ij = me->core_layout.base +
- me->arch.plt_offset +
- me->arch.plt_size - PLT_ENTRY_SIZE;
- ip[2] = 0xa7f40000 + /* j __jump_r1 */
- (unsigned int)(u16)
- (((unsigned long) ij - 8 -
- (unsigned long) ip) / 2);
+ char *jump_r1;
+
+ jump_r1 = plt_base + me->arch.plt_size -
+ PLT_ENTRY_SIZE;
+ /* brcl 0xf,__jump_r1 */
+ *(short *)&insn[8] = 0xc0f4;
+ *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2;
} else {
- ip[2] = 0x07f10000; /* br %r1 */
+ *(int *)&insn[8] = 0x07f10000; /* br %r1 */
}
- ip[3] = (unsigned int) (val >> 32);
- ip[4] = (unsigned int) val;
+ *(long *)&insn[14] = val;
+
+ write(ip, insn, sizeof(insn));
info->plt_initialized = 1;
}
if (r_type == R_390_PLTOFF16 ||
@@ -357,17 +381,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
val += rela->r_addend - loc;
}
if (r_type == R_390_PLT16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PLTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_PLT32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PLT32 ||
r_type == R_390_PLTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_PLT64 ||
r_type == R_390_PLTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTOFF16: /* 16 bit offset to GOT. */
case R_390_GOTOFF32: /* 32 bit offset to GOT. */
@@ -375,20 +399,20 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
val = val + rela->r_addend -
((Elf_Addr) me->core_layout.base + me->arch.got_offset);
if (r_type == R_390_GOTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */
case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */
val = (Elf_Addr) me->core_layout.base + me->arch.got_offset +
rela->r_addend - loc;
if (r_type == R_390_GOTPC)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_GOTPCDBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
break;
case R_390_COPY:
case R_390_GLOB_DAT: /* Create GOT entry. */
@@ -412,9 +436,10 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
return 0;
}
-int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symindex, unsigned int relsec,
- struct module *me)
+ struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
Elf_Addr base;
Elf_Sym *symtab;
@@ -430,13 +455,51 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
n = sechdrs[relsec].sh_size / sizeof(Elf_Rela);
for (i = 0; i < n; i++, rela++) {
- rc = apply_rela(rela, base, symtab, strtab, me);
+ rc = apply_rela(rela, base, symtab, strtab, me, write);
if (rc)
return rc;
}
return 0;
}
+int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+ unsigned int symindex, unsigned int relsec,
+ struct module *me)
+{
+ bool early = me->state == MODULE_STATE_UNFORMED;
+ void *(*write)(void *, const void *, size_t) = memcpy;
+
+ if (!early)
+ write = s390_kernel_write;
+
+ return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
+ const Elf_Shdr *s)
+{
+ char *start, *end;
+ int numpages;
+ size_t size;
+
+ size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
+ numpages = DIV_ROUND_UP(size, PAGE_SIZE);
+ start = module_alloc(numpages * PAGE_SIZE);
+ if (!start)
+ return -ENOMEM;
+ set_memory_ro((unsigned long)start, numpages);
+ end = start + size;
+
+ me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start;
+ me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end;
+ me->arch.next_trampoline = me->arch.trampolines_start;
+
+ return 0;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
@@ -444,6 +507,9 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *s;
char *secstrings, *secname;
void *aseg;
+#ifdef CONFIG_FUNCTION_TRACER
+ int ret;
+#endif
if (IS_ENABLED(CONFIG_EXPOLINE) &&
!nospec_disable && me->arch.plt_size) {
@@ -451,15 +517,9 @@ int module_finalize(const Elf_Ehdr *hdr,
ij = me->core_layout.base + me->arch.plt_offset +
me->arch.plt_size - PLT_ENTRY_SIZE;
- if (test_facility(35)) {
- ij[0] = 0xc6000000; /* exrl %r0,.+10 */
- ij[1] = 0x0005a7f4; /* j . */
- ij[2] = 0x000007f1; /* br %r1 */
- } else {
- ij[0] = 0x44000000 | (unsigned int)
- offsetof(struct lowcore, br_r1_trampoline);
- ij[1] = 0xa7f40000; /* j . */
- }
+ ij[0] = 0xc6000000; /* exrl %r0,.+10 */
+ ij[1] = 0x0005a7f4; /* j . */
+ ij[2] = 0x000007f1; /* br %r1 */
}
secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -478,8 +538,15 @@ int module_finalize(const Elf_Ehdr *hdr,
if (IS_ENABLED(CONFIG_EXPOLINE) &&
(str_has_prefix(secname, ".s390_return")))
nospec_revert(aseg, aseg + s->sh_size);
+
+#ifdef CONFIG_FUNCTION_TRACER
+ if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) {
+ ret = module_alloc_ftrace_hotpatch_trampolines(me, s);
+ if (ret < 0)
+ return ret;
+ }
+#endif /* CONFIG_FUNCTION_TRACER */
}
- jump_label_apply_nops(me);
return 0;
}
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 0a487fae763e..31cb9b00a36b 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -6,12 +6,12 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
+#include <linux/entry-common.h>
#include <linux/hardirq.h>
#include <linux/log2.h>
#include <linux/kprobes.h>
@@ -30,6 +30,8 @@
#include <asm/switch_to.h>
#include <asm/ctl_reg.h>
#include <asm/asm-offsets.h>
+#include <asm/pai.h>
+
#include <linux/kvm_host.h>
struct mcck_struct {
@@ -58,27 +60,27 @@ static inline unsigned long nmi_get_mcesa_size(void)
/*
* The initial machine check extended save area for the boot CPU.
- * It will be replaced by nmi_init() with an allocated structure.
- * The structure is required for machine check happening early in
- * the boot process.
+ * It will be replaced on the boot CPU reinit with an allocated
+ * structure. The structure is required for machine check happening
+ * early in the boot process.
*/
-static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE);
+static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE);
-void __init nmi_alloc_boot_cpu(struct lowcore *lc)
+void __init nmi_alloc_mcesa_early(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
- lc->mcesad = (unsigned long) &boot_mcesa;
+ *mcesad = __pa(&boot_mcesa);
if (MACHINE_HAS_GS)
- lc->mcesad |= ilog2(MCESA_MAX_SIZE);
+ *mcesad |= ilog2(MCESA_MAX_SIZE);
}
-static int __init nmi_init(void)
+static void __init nmi_alloc_cache(void)
{
- unsigned long origin, cr0, size;
+ unsigned long size;
if (!nmi_needs_mcesa())
- return 0;
+ return;
size = nmi_get_mcesa_size();
if (size > MCESA_MIN_SIZE)
mcesa_origin_lc = ilog2(size);
@@ -86,40 +88,31 @@ static int __init nmi_init(void)
mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL);
if (!mcesa_cache)
panic("Couldn't create nmi save area cache");
- origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
- if (!origin)
- panic("Couldn't allocate nmi save area");
- /* The pointer is stored with mcesa_bits ORed in */
- kmemleak_not_leak((void *) origin);
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- /* Replace boot_mcesa on the boot CPU */
- S390_lowcore.mcesad = origin | mcesa_origin_lc;
- __ctl_load(cr0, 0, 0);
- return 0;
}
-early_initcall(nmi_init);
-int nmi_alloc_per_cpu(struct lowcore *lc)
+int __ref nmi_alloc_mcesa(u64 *mcesad)
{
unsigned long origin;
+ *mcesad = 0;
if (!nmi_needs_mcesa())
return 0;
+ if (!mcesa_cache)
+ nmi_alloc_cache();
origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
if (!origin)
return -ENOMEM;
/* The pointer is stored with mcesa_bits ORed in */
kmemleak_not_leak((void *) origin);
- lc->mcesad = origin | mcesa_origin_lc;
+ *mcesad = __pa(origin) | mcesa_origin_lc;
return 0;
}
-void nmi_free_per_cpu(struct lowcore *lc)
+void nmi_free_mcesa(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
- kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK));
+ kmem_cache_free(mcesa_cache, __va(*mcesad & MCESA_ORIGIN_MASK));
}
static notrace void s390_handle_damage(void)
@@ -131,12 +124,11 @@ static notrace void s390_handle_damage(void)
NOKPROBE_SYMBOL(s390_handle_damage);
/*
- * Main machine check handler function. Will be called with interrupts enabled
- * or disabled and machine checks enabled or disabled.
+ * Main machine check handler function. Will be called with interrupts disabled
+ * and machine checks enabled.
*/
-void s390_handle_mcck(void)
+void __s390_handle_mcck(void)
{
- unsigned long flags;
struct mcck_struct mcck;
/*
@@ -144,13 +136,10 @@ void s390_handle_mcck(void)
* machine checks. Afterwards delete the old state and enable machine
* checks again.
*/
- local_irq_save(flags);
local_mcck_disable();
mcck = *this_cpu_ptr(&cpu_mcck);
memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
- clear_cpu_flag(CIF_MCCK_PENDING);
local_mcck_enable();
- local_irq_restore(flags);
if (mcck.channel_report)
crw_handle_channel_report();
@@ -179,21 +168,32 @@ void s390_handle_mcck(void)
"malfunction (code 0x%016lx).\n", mcck.mcck_code);
printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
current->comm, current->pid);
- do_exit(SIGSEGV);
+ make_task_dead(SIGSEGV);
}
}
-EXPORT_SYMBOL_GPL(s390_handle_mcck);
+void noinstr s390_handle_mcck(struct pt_regs *regs)
+{
+ trace_hardirqs_off();
+ pai_kernel_enter(regs);
+ __s390_handle_mcck();
+ pai_kernel_exit(regs);
+ trace_hardirqs_on();
+}
/*
* returns 0 if all required registers are available
* returns 1 otherwise
*/
-static int notrace s390_check_registers(union mci mci, int umode)
+static int notrace s390_validate_registers(union mci mci, int umode)
{
+ struct mcesa *mcesa;
+ void *fpt_save_area;
union ctlreg2 cr2;
int kill_task;
+ u64 zero;
kill_task = 0;
+ zero = 0;
if (!mci.gr) {
/*
@@ -204,14 +204,6 @@ static int notrace s390_check_registers(union mci mci, int umode)
s390_handle_damage();
kill_task = 1;
}
- /* Check control registers */
- if (!mci.cr) {
- /*
- * Control registers have unknown contents.
- * Can't recover and therefore stopping machine.
- */
- s390_handle_damage();
- }
if (!mci.fp) {
/*
* Floating point registers can't be restored. If the
@@ -224,35 +216,96 @@ static int notrace s390_check_registers(union mci mci, int umode)
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
}
+ fpt_save_area = &S390_lowcore.floating_pt_save_area;
if (!mci.fc) {
/*
* Floating point control register can't be restored.
* If the kernel currently uses the floating point
* registers and needs the FPC register the system is
* stopped. If the process has its floating point
- * registers loaded it is terminated.
+ * registers loaded it is terminated. Otherwise the
+ * FPC is just validated.
*/
if (S390_lowcore.fpu_flags & KERNEL_FPC)
s390_handle_damage();
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (zero));
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
+ } else {
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (S390_lowcore.fpt_creg_save_area));
}
- if (MACHINE_HAS_VX) {
- if (!mci.vr) {
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ if (!MACHINE_HAS_VX) {
+ /* Validate floating point registers */
+ asm volatile(
+ " ld 0,0(%0)\n"
+ " ld 1,8(%0)\n"
+ " ld 2,16(%0)\n"
+ " ld 3,24(%0)\n"
+ " ld 4,32(%0)\n"
+ " ld 5,40(%0)\n"
+ " ld 6,48(%0)\n"
+ " ld 7,56(%0)\n"
+ " ld 8,64(%0)\n"
+ " ld 9,72(%0)\n"
+ " ld 10,80(%0)\n"
+ " ld 11,88(%0)\n"
+ " ld 12,96(%0)\n"
+ " ld 13,104(%0)\n"
+ " ld 14,112(%0)\n"
+ " ld 15,120(%0)\n"
+ :
+ : "a" (fpt_save_area)
+ : "memory");
+ } else {
+ /* Validate vector registers */
+ union ctlreg0 cr0;
+
+ /*
+ * The vector validity must only be checked if not running a
+ * KVM guest. For KVM guests the machine check is forwarded by
+ * KVM and it is the responsibility of the guest to take
+ * appropriate actions. The host vector or FPU values have been
+ * saved by KVM and will be restored by KVM.
+ */
+ if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) {
/*
* Vector registers can't be restored. If the kernel
* currently uses vector registers the system is
* stopped. If the process has its vector registers
- * loaded it is terminated.
+ * loaded it is terminated. Otherwise just validate
+ * the registers.
*/
if (S390_lowcore.fpu_flags & KERNEL_VXR)
s390_handle_damage();
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
}
+ cr0.val = S390_lowcore.cregs_save_area[0];
+ cr0.afp = cr0.vx = 1;
+ __ctl_load(cr0.val, 0, 0);
+ asm volatile(
+ " la 1,%0\n"
+ " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+ " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+ :
+ : "Q" (*(struct vx_array *)mcesa->vector_save_area)
+ : "1");
+ __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
}
- /* Check if access registers are valid */
+ /* Validate access registers */
+ asm volatile(
+ " lam 0,15,0(%0)\n"
+ :
+ : "a" (&S390_lowcore.access_regs_save_area)
+ : "memory");
if (!mci.ar) {
/*
* Access registers have unknown contents.
@@ -260,41 +313,46 @@ static int notrace s390_check_registers(union mci mci, int umode)
*/
kill_task = 1;
}
- /* Check guarded storage registers */
+ /* Validate guarded storage registers */
cr2.val = S390_lowcore.cregs_save_area[2];
if (cr2.gse) {
if (!mci.gs) {
/*
- * Guarded storage register can't be restored and
- * the current processes uses guarded storage.
- * It has to be terminated.
+ * 2 cases:
+ * - machine check in kernel or userspace
+ * - machine check while running SIE (KVM guest)
+ * For kernel or userspace the userspace values of
+ * guarded storage control can not be recreated, the
+ * process must be terminated.
+ * For SIE the guest values of guarded storage can not
+ * be recreated. This is either due to a bug or due to
+ * GS being disabled in the guest. The guest will be
+ * notified by KVM code and the guest's machine check
+ * handling must take care of this. The host values
+ * are saved by KVM and are not affected.
*/
- kill_task = 1;
+ if (!test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
+ } else {
+ load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
}
- /* Check if old PSW is valid */
- if (!mci.wp) {
- /*
- * Can't tell if we come from user or kernel mode
- * -> stopping machine.
- */
- s390_handle_damage();
- }
- /* Check for invalid kernel instruction address */
- if (!mci.ia && !umode) {
- /*
- * The instruction address got lost while running
- * in the kernel -> stopping machine.
- */
- s390_handle_damage();
- }
+ /*
+ * The getcpu vdso syscall reads CPU number from the programmable
+ * field of the TOD clock. Disregard the TOD programmable register
+ * validity bit and load the CPU number into the TOD programmable
+ * field unconditionally.
+ */
+ set_tod_programmable_field(raw_smp_processor_id());
+ /* Validate clock comparator register */
+ set_clock_comparator(S390_lowcore.clock_comparator);
if (!mci.ms || !mci.pm || !mci.ia)
kill_task = 1;
return kill_task;
}
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);
/*
* Backup the guest's machine check info to its description block
@@ -333,26 +391,26 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info);
/*
* machine check handler.
*/
-void notrace s390_do_machine_check(struct pt_regs *regs)
+int notrace s390_do_machine_check(struct pt_regs *regs)
{
static int ipd_count;
static DEFINE_SPINLOCK(ipd_lock);
static unsigned long long last_ipd;
struct mcck_struct *mcck;
unsigned long long tmp;
+ irqentry_state_t irq_state;
union mci mci;
unsigned long mcck_dam_code;
+ int mcck_pending = 0;
+
+ irq_state = irqentry_nmi_enter(regs);
- nmi_enter();
+ if (user_mode(regs))
+ update_timer_mcck();
inc_irq_stat(NMI_NMI);
mci.val = S390_lowcore.mcck_interruption_code;
mcck = this_cpu_ptr(&cpu_mcck);
- if (mci.sd) {
- /* System damage -> stopping machine */
- s390_handle_damage();
- }
-
/*
* Reinject the instruction processing damages' machine checks
* including Delayed Access Exception into the guest
@@ -393,14 +451,14 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
s390_handle_damage();
}
}
- if (s390_check_registers(mci, user_mode(regs))) {
+ if (s390_validate_registers(mci, user_mode(regs))) {
/*
* Couldn't restore all register contents for the
* user space process -> mark task for termination.
*/
mcck->kill_task = 1;
mcck->mcck_code = mci.val;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -420,34 +478,18 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
mcck->stp_queue |= stp_sync_check();
if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
- if (mcck->stp_queue)
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
- /*
- * Reinject storage related machine checks into the guest if they
- * happen when the guest is running.
- */
- if (!test_cpu_flag(CIF_MCCK_GUEST)) {
- if (mci.se)
- /* Storage error uncorrected */
- s390_handle_damage();
- if (mci.ke)
- /* Storage key-error uncorrected */
- s390_handle_damage();
- if (mci.ds && mci.fa)
- /* Storage degradation */
- s390_handle_damage();
- }
if (mci.cp) {
/* Channel report word pending */
mcck->channel_report = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
if (mci.w) {
/* Warning pending */
mcck->warning = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -462,7 +504,17 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
*((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
}
clear_cpu_flag(CIF_MCCK_GUEST);
- nmi_exit();
+
+ if (user_mode(regs) && mcck_pending) {
+ irqentry_nmi_exit(regs, irq_state);
+ return 1;
+ }
+
+ if (mcck_pending)
+ schedule_mcck_handler();
+
+ irqentry_nmi_exit(regs, irq_state);
+ return 0;
}
NOKPROBE_SYMBOL(s390_do_machine_check);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 29e511f5bf06..717bbcc056e5 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -17,11 +17,11 @@ static int __init nobp_setup_early(char *str)
* The user explicitly requested nobp=1, enable it and
* disable the expoline support.
*/
- __set_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __set_facility(82, alt_stfle_fac_list);
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_disable = 1;
} else {
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
return 0;
}
@@ -29,7 +29,7 @@ early_param("nobp", nobp_setup_early);
static int __init nospec_setup_early(char *str)
{
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
return 0;
}
early_param("nospec", nospec_setup_early);
@@ -38,9 +38,9 @@ static int __init nospec_report(void)
{
if (test_facility(156))
pr_info("Spectre V2 mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
pr_info("Spectre V2 mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
pr_info("Spectre V2 mitigation: limited branch prediction\n");
return 0;
}
@@ -66,14 +66,14 @@ void __init nospec_auto_detect(void)
*/
if (__is_defined(CC_USING_EXPOLINE))
nospec_disable = 1;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
} else if (__is_defined(CC_USING_EXPOLINE)) {
/*
* The kernel has been compiled with expolines.
* Keep expolines enabled and disable nobp.
*/
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
/*
* If the kernel has not been compiled with expolines the
@@ -86,7 +86,7 @@ static int __init spectre_v2_setup_early(char *str)
{
if (str && !strncmp(str, "on", 2)) {
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
if (str && !strncmp(str, "off", 3))
nospec_disable = 1;
@@ -99,11 +99,13 @@ early_param("spectre_v2", spectre_v2_setup_early);
static void __init_or_module __nospec_revert(s32 *start, s32 *end)
{
enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type;
+ static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 };
u8 *instr, *thunk, *br;
u8 insnbuf[6];
s32 *epo;
/* Second part of the instruction replace is always a nop */
+ memcpy(insnbuf + 2, branch, sizeof(branch));
for (epo = start; epo < end; epo++) {
instr = (u8 *) epo + *epo;
if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04)
@@ -116,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
br = thunk + (*(int *)(thunk + 2)) * 2;
- else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 &&
- thunk[6] == 0x44 && thunk[7] == 0x00 &&
- (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 &&
- (thunk[1] & 0xf0) == (thunk[8] & 0xf0))
- /* larl %rx,<target br> + ex %r0,0(%rx) */
- br = thunk + (*(int *)(thunk + 2)) * 2;
else
continue;
- /* Check for unconditional branch 0x07f? or 0x47f???? */
- if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0)
+ if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
continue;
-
- memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4);
switch (type) {
case BRCL_EXPOLINE:
+ /* brcl to thunk, replace with br + nop */
insnbuf[0] = br[0];
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brcl to b, replace with bc + nopr */
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brcl to br, replace with bcr + nop */
- }
break;
case BRASL_EXPOLINE:
+ /* brasl to thunk, replace with basr + nop */
+ insnbuf[0] = 0x0d;
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brasl to b, replace with bas + nopr */
- insnbuf[0] = 0x4d;
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brasl to br, replace with basr + nop */
- insnbuf[0] = 0x0d;
- }
break;
}
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
index 48f472bf9290..52d4353188ad 100644
--- a/arch/s390/kernel/nospec-sysfs.c
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -15,9 +15,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
{
if (test_facility(156))
return sprintf(buf, "Mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
return sprintf(buf, "Mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
return sprintf(buf, "Mitigation: limited branch prediction\n");
return sprintf(buf, "Vulnerable\n");
}
diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c
new file mode 100644
index 000000000000..23ab9f02f278
--- /dev/null
+++ b/arch/s390/kernel/numa.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NUMA support for s390
+ *
+ * Implement NUMA core code.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <asm/numa.h>
+
+struct pglist_data *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+void __init numa_setup(void)
+{
+ int nid;
+
+ nodes_clear(node_possible_map);
+ node_set(0, node_possible_map);
+ node_set_online(0);
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
+ if (!NODE_DATA(nid))
+ panic("%s: Failed to allocate %zu bytes align=0x%x\n",
+ __func__, sizeof(pg_data_t), 8);
+ }
+ NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ NODE_DATA(0)->node_id = 0;
+}
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index 0a5e4bafb6ad..ec0bd9457e90 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -13,8 +13,10 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/checksum.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
+#include <asm/maccess.h>
+#include <asm/asm-offsets.h>
/*
* OS info structure has to be page aligned
@@ -45,24 +47,27 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
*/
void os_info_entry_add(int nr, void *ptr, u64 size)
{
- os_info.entry[nr].addr = (u64)(unsigned long)ptr;
+ os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0);
os_info.csum = os_info_csum(&os_info);
}
/*
- * Initialize OS info struture and set lowcore pointer
+ * Initialize OS info structure and set lowcore pointer
*/
void __init os_info_init(void)
{
- void *ptr = &os_info;
+ struct lowcore *abs_lc;
+ unsigned long flags;
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
os_info.csum = os_info_csum(&os_info);
- mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr);
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->os_info = __pa(&os_info);
+ put_abs_lowcore(abs_lc, flags);
}
#ifdef CONFIG_CRASH_DUMP
@@ -90,7 +95,7 @@ static void os_info_old_alloc(int nr, int align)
goto fail;
}
buf_align = PTR_ALIGN(buf, align);
- if (copy_oldmem_kernel(buf_align, (void *) addr, size)) {
+ if (copy_oldmem_kernel(buf_align, addr, size)) {
msg = "copy failed";
goto fail_free;
}
@@ -121,17 +126,16 @@ static void os_info_old_init(void)
if (os_info_init)
return;
- if (!OLDMEM_BASE)
+ if (!oldmem_data.start)
goto fail;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr)))
goto fail;
if (addr == 0 || addr % PAGE_SIZE)
goto fail;
os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL);
if (!os_info_old)
goto fail;
- if (copy_oldmem_kernel(os_info_old, (void *) addr,
- sizeof(*os_info_old)))
+ if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old)))
goto fail_free;
if (os_info_old->magic != OS_INFO_MAGIC)
goto fail_free;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 0eb1d1cc53a8..f043a7ff220b 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
/*
* Performance event support for s390x - CPU-measurement Counter Facility
*
- * Copyright IBM Corp. 2012, 2019
+ * Copyright IBM Corp. 2012, 2021
* Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ * Thomas Richter <tmricht@linux.ibm.com>
*/
#define KMSG_COMPONENT "cpum_cf"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
@@ -14,7 +15,231 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/miscdevice.h>
+
#include <asm/cpu_mcf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
+ /* interval in seconds */
+
+/* Counter sets are stored as data stream in a page sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a two byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a two byte reserved value (must be zero) to make the header the same
+ * size as a counter value.
+ * All counter values are eight byte in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of a:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference of both counter sets is calculated and stored in the event
+ * sample data area.
+ */
+struct cf_ctrset_entry { /* CPU-M CF counter set entry header (8 byte), counter values follow */
+ unsigned int def:16; /* 0-15 Data Entry Format, written as CF_DIAG_CTRSET_DEF */
+ unsigned int set:16; /* 16-31 Counter set identifier (CPUMF_CTR_SET_* number) */
+ unsigned int ctr:16; /* 32-47 Number of stored counters (eight byte each) */
+ unsigned int res1:16; /* 48-63 Reserved, written as zero */
+};
+
+struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
+ /* 0 - 7 */
+ union {
+ struct {
+ unsigned int clock_base:1; /* TOD clock base set, tod_base is valid */
+ unsigned int speed:1; /* CPU speed set, cpu_speed is nonzero */
+ /* Measurement alerts */
+ unsigned int mtda:1; /* Loss of MT ctr. data alert */
+ unsigned int caca:1; /* Counter auth. change alert */
+ unsigned int lcda:1; /* Loss of counter data alert */
+ };
+ unsigned long flags; /* 0-63 All indicators */
+ };
+ /* 8 - 15 */
+ unsigned int cfvn:16; /* 64-79 Ctr First Version */
+ unsigned int csvn:16; /* 80-95 Ctr Second Version */
+ unsigned int cpu_speed:32; /* 96-127 CPU speed */
+ /* 16 - 23 */
+ unsigned long timestamp; /* 128-191 Timestamp (TOD) */
+ /* 24 - 55 */
+ union {
+ struct {
+ unsigned long progusage1; /* Same storage as progusage[0] */
+ unsigned long progusage2;
+ unsigned long progusage3;
+ unsigned long tod_base; /* TOD clock base value, see clock_base */
+ };
+ unsigned long progusage[4]; /* Array view of the four fields above */
+ };
+ /* 56 - 63 */
+ unsigned int mach_type:16; /* Machine type */
+ unsigned int res1:16; /* Reserved */
+ unsigned int res2:32; /* Reserved */
+};
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+	struct cpu_cf_events *hw = this_cpu_ptr(&cpu_cf_events);
+	struct cpuid id;
+
+	/* Machine type is taken from the CPU identification */
+	get_cpu_id(&id);
+	te->mach_type = id.machine;
+
+	/* Counter facility first and second version number */
+	te->cfvn = hw->info.cfvn;
+	te->csvn = hw->info.csvn;
+
+	/* The speed indicator is only raised for a nonzero CPU speed */
+	te->cpu_speed = cfdiag_cpu_speed;
+	if (te->cpu_speed)
+		te->speed = 1;
+
+	/* The TOD clock base is always saved and flagged */
+	te->tod_base = tod_clock_base.tod;
+	te->clock_base = 1;
+
+	/* Time stamp is taken last */
+	te->timestamp = get_tod_clock_fast();
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version number determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with header containing the counter set number and
+ * the number of eight byte counters.
+ *
+ * The function returns the number of bytes occupied by this counter set
+ * including the header.
+ * If there is no counter in the counter set, this counter set is useless and
+ * zero is returned in this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on error_ok value this is ok,
+ * for example when called from cpumf_pmu_start() call back function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+			       size_t room, bool error_ok)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t ctrset_size, need = 0;
+	int rc = 3;				/* Assume write failure */
+
+	/* The header fields are written even when no counter follows */
+	ctrdata->def = CF_DIAG_CTRSET_DEF;
+	ctrdata->set = ctrset;
+	ctrdata->res1 = 0;
+	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
+
+	if (ctrset_size) {			/* Save data */
+		bool fits;
+
+		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+		fits = need <= room;
+		if (fits)
+			rc = ctr_stcctm(ctrset, ctrset_size,
+					(u64 *)(ctrdata + 1));
+		/* error_ok only excuses a failing stcctm. Without room
+		 * nothing has been stored, so never report bytes the
+		 * caller would account beyond the end of its buffer.
+		 */
+		if (fits && (rc != 3 || error_ok))
+			ctrdata->ctr = ctrset_size;
+		else
+			need = 0;
+	}
+
+	/* need is a size_t, print it unsigned */
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
+			    " need %zu rc %d\n", __func__, ctrset, ctrset_size,
+			    cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
+	return need;
+}
+
+static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { /* Bit mask per counter set, indexed by counter set number */
+ [CPUMF_CTR_SET_BASIC] = 0x02,
+ [CPUMF_CTR_SET_USER] = 0x04,
+ [CPUMF_CTR_SET_CRYPTO] = 0x08,
+ [CPUMF_CTR_SET_EXT] = 0x01,
+ [CPUMF_CTR_SET_MT_DIAG] = 0x20,
+};
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 byte host an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+			    bool error_ok)
+{
+	struct cf_trailer_entry *te;
+	size_t used = 0;
+	int set;
+
+	/* Start from a cleared buffer and keep the tail for the trailer */
+	memset(data, 0, sz);
+	sz -= sizeof(*te);		/* Always room for trailer */
+
+	/* Append every authorized counter set behind the previous one */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		if (auth & cpumf_ctr_ctl[set])
+			used += cfdiag_getctrset(data + used, set, sz - used,
+						 error_ok);
+	}
+
+	/* The trailer directly follows the last counter set */
+	te = data + used;
+	cfdiag_trailer(te);
+	return used + sizeof(*te);
+}
+
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+	/* pstart: counter values saved at start
+	 * pstop: counter values saved at stop, replaced by the increment
+	 * counters: number of eight byte counters in both arrays
+	 *
+	 * Unsigned 64 bit subtraction is done modulo 2^64 and therefore
+	 * yields the increment even when a counter wrapped around between
+	 * start and stop. No separate wrap-around branch is needed.
+	 */
+	for (; --counters >= 0; ++pstart, ++pstop)
+		*pstop -= *pstart;
+}
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+ struct cf_trailer_entry *trailer_start, *trailer_stop;
+ struct cf_ctrset_entry *ctrstart, *ctrstop;
+ size_t offset = 0; /* Same offset is used for start and stop buffer */
+
+ auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; /* Keep only bits below the enable shift */
+ do {
+ ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+ ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+ if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { /* Both headers must be identical */
+ pr_err_once("cpum_cf_diag counter set compare error "
+ "in set %i\n", ctrstart->set);
+ return 0; /* Failure, nothing more is diffed */
+ }
+ auth &= ~cpumf_ctr_ctl[ctrstart->set]; /* NOTE(review): set indexes the table before def is checked - confirm it is always in range */
+ if (ctrstart->def == CF_DIAG_CTRSET_DEF) { /* Valid header, counters follow */
+ cfdiag_diffctrset((u64 *)(ctrstart + 1),
+ (u64 *)(ctrstop + 1), ctrstart->ctr);
+ offset += ctrstart->ctr * sizeof(u64) +
+ sizeof(*ctrstart); /* Advance past header and counters */
+ }
+ } while (ctrstart->def && auth); /* Stop on empty header or when no requested set is left */
+
+ /* Save time_stamp from start of event in stop's trailer */
+ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+ trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+ trailer_stop->progusage[0] = trailer_start->timestamp;
+
+ return 1; /* Success */
+}
static enum cpumf_ctr_set get_counter_set(u64 event)
{
@@ -34,7 +259,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
return set;
}
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const struct hw_perf_event *hwc,
+ enum cpumf_ctr_set set)
{
struct cpu_cf_events *cpuhw;
int err = 0;
@@ -43,7 +269,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
cpuhw = &get_cpu_var(cpu_cf_events);
/* check required version for counter sets */
- switch (hwc->config_base) {
+ switch (set) {
case CPUMF_CTR_SET_BASIC:
case CPUMF_CTR_SET_USER:
if (cpuhw->info.cfvn < 1)
@@ -86,6 +312,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
(cpuhw->info.act_ctl & mtdiag_ctl)))
err = -EOPNOTSUPP;
break;
+ case CPUMF_CTR_SET_MAX:
+ err = -EOPNOTSUPP;
}
put_cpu_var(cpu_cf_events);
@@ -95,7 +323,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
static int validate_ctr_auth(const struct hw_perf_event *hwc)
{
struct cpu_cf_events *cpuhw;
- u64 ctrs_state;
int err = 0;
cpuhw = &get_cpu_var(cpu_cf_events);
@@ -105,8 +332,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
* return with -ENOENT in order to fall back to other
* PMUs that might suffice the event request.
*/
- ctrs_state = cpumf_ctr_ctl[hwc->config_base];
- if (!(ctrs_state & cpuhw->info.auth_ctl))
+ if (!(hwc->config_base & cpuhw->info.auth_ctl))
err = -ENOENT;
put_cpu_var(cpu_cf_events);
@@ -126,7 +352,7 @@ static void cpumf_pmu_enable(struct pmu *pmu)
if (cpuhw->flags & PMU_F_ENABLED)
return;
- err = lcctl(cpuhw->state);
+ err = lcctl(cpuhw->state | cpuhw->dev_state);
if (err) {
pr_err("Enabling the performance measuring unit "
"failed with rc=%x\n", err);
@@ -151,6 +377,7 @@ static void cpumf_pmu_disable(struct pmu *pmu)
return;
inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+ inactive |= cpuhw->dev_state;
err = lcctl(inactive);
if (err) {
pr_err("Disabling the performance measuring unit "
@@ -199,6 +426,14 @@ static const int cpumf_generic_events_user[] = {
[PERF_COUNT_HW_BUS_CYCLES] = -1,
};
+static void cpumf_hw_inuse(void)
+{
+	int users;
+
+	/* Count one more user; the count changes under pmc_reserve_mutex */
+	mutex_lock(&pmc_reserve_mutex);
+	users = atomic_inc_return(&num_events);
+	if (users == 1)		/* First user calls the begin hook */
+		__kernel_cpumcf_begin();
+	mutex_unlock(&pmc_reserve_mutex);
+}
+
static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
{
struct perf_event_attr *attr = &event->attr;
@@ -230,9 +465,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
/* No support for kernel space counters only */
} else if (!attr->exclude_kernel && attr->exclude_user) {
return -EOPNOTSUPP;
-
- /* Count user and kernel space */
- } else {
+ } else { /* Count user and kernel space */
if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
return -EOPNOTSUPP;
ev = cpumf_generic_events_basic[ev];
@@ -260,38 +493,49 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
/*
* Use the hardware perf event structure to store the
* counter number in the 'config' member and the counter
- * set number in the 'config_base'. The counter set number
- * is then later used to enable/disable the counter(s).
+ * set number in the 'config_base' as bit mask.
+ * It is later used to enable/disable the counter(s).
*/
hwc->config = ev;
- hwc->config_base = set;
+ hwc->config_base = cpumf_ctr_ctl[set];
break;
case CPUMF_CTR_SET_MAX:
/* The counter could not be associated to a counter set */
return -EINVAL;
- };
+ }
/* Initialize for using the CPU-measurement counter facility */
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
+ cpumf_hw_inuse();
event->destroy = hw_perf_event_destroy;
/* Finally, validate version and authorization of the counter set */
err = validate_ctr_auth(hwc);
if (!err)
- err = validate_ctr_version(hwc);
+ err = validate_ctr_version(hwc, set);
return err;
}
+/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different
+ * attribute::type values:
+ * - PERF_TYPE_HARDWARE:
+ * - pmu->type:
+ * Handle both types of invocation identically. They address the same hardware.
+ * The result is different when event modifiers exclude_kernel and/or
+ * exclude_user are also set.
+ */
+static int cpumf_pmu_event_type(struct perf_event *event)
+{
+	/* Generic events which are treated as PERF_TYPE_HARDWARE */
+	static const int generic[] = {
+		PERF_COUNT_HW_CPU_CYCLES,
+		PERF_COUNT_HW_INSTRUCTIONS,
+	};
+	u64 ev = event->attr.config;
+	unsigned int i;
+
+	/* A match in either the basic or the user table is sufficient */
+	for (i = 0; i < ARRAY_SIZE(generic); i++) {
+		if (cpumf_generic_events_basic[generic[i]] == ev)
+			return PERF_TYPE_HARDWARE;
+		if (cpumf_generic_events_user[generic[i]] == ev)
+			return PERF_TYPE_HARDWARE;
+	}
+	return PERF_TYPE_RAW;
+}
+
static int cpumf_pmu_event_init(struct perf_event *event)
{
unsigned int type = event->attr.type;
@@ -301,7 +545,7 @@ static int cpumf_pmu_event_init(struct perf_event *event)
err = __hw_perf_event_init(event, type);
else if (event->pmu->type == type)
/* Registered as unknown PMU */
- err = __hw_perf_event_init(event, PERF_TYPE_RAW);
+ err = __hw_perf_event_init(event, cpumf_pmu_event_type(event));
else
return -ENOENT;
@@ -363,16 +607,11 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct hw_perf_event *hwc = &event->hw;
+ int i;
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ if (!(hwc->state & PERF_HES_STOPPED))
return;
- if (WARN_ON_ONCE(hwc->config == -1))
- return;
-
- if (flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-
hwc->state = 0;
/* (Re-)enable and activate the counter set */
@@ -384,30 +623,96 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
* needs to be synchronized. At this point, the counter set can be in
* the inactive or disabled state.
*/
- hw_perf_event_reset(event);
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+ sizeof(cpuhw->start),
+ hwc->config_base, true);
+ } else {
+ hw_perf_event_reset(event);
+ }
+
+ /* Increment refcount for counter sets */
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if ((hwc->config_base & cpumf_ctr_ctl[i]))
+ atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create perf event sample with the counter sets as raw data. The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+ struct cpu_cf_events *cpuhw)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = event->cpu;
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = cpuhw->usedss;
+ raw.frag.data = cpuhw->stop;
+ raw.size = raw.frag.size;
+ data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ debug_sprintf_event(cf_dbg, 3,
+ "%s event %#llx sample_type %#llx raw %d ov %d\n",
+ __func__, event->hw.config,
+ event->attr.sample_type, raw.size, overflow);
+ if (overflow)
+ event->pmu->stop(event, 0);
- /* increment refcount for this counter set */
- atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+ perf_event_update_userpage(event);
+ return overflow;
}
static void cpumf_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct hw_perf_event *hwc = &event->hw;
+ int i;
if (!(hwc->state & PERF_HES_STOPPED)) {
/* Decrement reference count for this counter set and if this
* is the last used counter in the set, clear activation
* control and set the counter set state to inactive.
*/
- if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
- ctr_set_stop(&cpuhw->state, hwc->config_base);
- event->hw.state |= PERF_HES_STOPPED;
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+ continue;
+ if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+ ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+ }
+ hwc->state |= PERF_HES_STOPPED;
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- hw_perf_event_update(event);
- event->hw.state |= PERF_HES_UPTODATE;
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ local64_inc(&event->count);
+ cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+ sizeof(cpuhw->stop),
+ event->hw.config_base,
+ false);
+ if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+ cfdiag_push_sample(event, cpuhw);
+ } else if (cpuhw->flags & PMU_F_RESERVED) {
+ /* Only update when PMU not hotplugged off */
+ hw_perf_event_update(event);
+ }
+ hwc->state |= PERF_HES_UPTODATE;
}
}
@@ -415,29 +720,19 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- /* Check authorization for the counter set to which this
- * counter belongs.
- * For group events transaction, the authorization check is
- * done in cpumf_pmu_commit_txn().
- */
- if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD))
- if (validate_ctr_auth(&event->hw))
- return -ENOENT;
-
ctr_set_enable(&cpuhw->state, event->hw.config_base);
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (flags & PERF_EF_START)
cpumf_pmu_start(event, PERF_EF_RELOAD);
- perf_event_update_userpage(event);
-
return 0;
}
static void cpumf_pmu_del(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ int i;
cpumf_pmu_stop(event, PERF_EF_UPDATE);
@@ -449,112 +744,787 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
* clear enable control and resets all counters in a set. Therefore,
* cpumf_pmu_start() always has to reenable a counter set.
*/
- if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
- ctr_set_disable(&cpuhw->state, event->hw.config_base);
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if (!atomic_read(&cpuhw->ctr_set[i]))
+ ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
+}
- perf_event_update_userpage(event);
+/* Performance monitoring unit for s390x */
+static struct pmu cpumf_pmu = {
+ .task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .pmu_enable = cpumf_pmu_enable, /* lcctl() with state and dev_state combined */
+ .pmu_disable = cpumf_pmu_disable,
+ .event_init = cpumf_pmu_event_init,
+ .add = cpumf_pmu_add, /* Enables the counter set(s), starts on PERF_EF_START */
+ .del = cpumf_pmu_del, /* Stops with update, disables unreferenced counter sets */
+ .start = cpumf_pmu_start, /* Takes a reference on each counter set of the event */
+ .stop = cpumf_pmu_stop, /* Drops the references taken by start */
+ .read = cpumf_pmu_read,
+};
+
+static int cfset_init(void);
+static int __init cpumf_pmu_init(void)
+{
+	int rc;
+
+	if (!kernel_cpumcf_avail())
+		return -ENODEV;
+
+	/* Setup s390dbf facility */
+	cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+	if (!cf_dbg) {
+		pr_err("Registration of s390dbf(cpum_cf) failed\n");
+		return -ENOMEM;
+	}
+	debug_register_view(cf_dbg, &debug_sprintf_view);
+
+	/* Register the PMU, on failure undo the s390dbf setup */
+	cpumf_pmu.attr_groups = cpumf_cf_event_group();
+	rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
+	if (rc) {
+		debug_unregister_view(cf_dbg, &debug_sprintf_view);
+		debug_unregister(cf_dbg);
+		pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+		return rc;
+	}
+
+	/* Setup counter set device, its return value is not evaluated */
+	if (stccm_avail())
+		cfset_init();
+	return 0;
}
-/*
- * Start group events scheduling transaction.
- * Set flags to perform a single test at commit time.
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter set via normal file operations.
+ */
+
+static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */
+static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */
+struct cfset_call_on_cpu_parm { /* Parm struct handed to the per CPU call backs */
+ unsigned int sets; /* Counter set bit mask */
+ atomic_t cpus_ack; /* # CPUs successfully executed func */
+};
+
+static struct cfset_session { /* Anchor of all active requests */
+ struct list_head head; /* Head of list of active processes */
+} cfset_session = {
+ .head = LIST_HEAD_INIT(cfset_session.head)
+};
+
+struct cfset_request { /* One request: CPUs and counter set bit mask */
+ unsigned long ctrset; /* Bit mask of counter set to read */
+ cpumask_t mask; /* CPU mask to read from */
+ struct list_head node; /* Chain to cfset_session.head */
+};
+
+static void cfset_session_init(void)
+{
+ INIT_LIST_HEAD(&cfset_session.head); /* Start with an empty request list */
+}
+
+/* Remove current request from global bookkeeping, that is unlink it from
+ * the list of active requests anchored at cfset_session.head.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_del(struct cfset_request *p)
+{
+ list_del(&p->node);
+}
+
+/* Add current request to global bookkeeping, that is link it into the list
+ * of active requests anchored at cfset_session.head.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_add(struct cfset_request *p)
+{
+ list_add(&p->node, &cfset_session.head);
+}
+
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access
+ * path is currently used.
+ * The cpu_cf_events::dev_state is used to denote counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
*
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
+ * The cpu_cf_events::state is used by the perf_event_open SVC and remains
+ * unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_disable() and their call backs
+ * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable per process
+ * CPU Measurement counter facility.
+ * The XXX_enable() and XXX_disable functions are used to turn off
+ * x86 performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counters sets during scheduling.
+ *
+ * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
+ * device access. The perf_event_open() SVC interface makes a lot of effort
+ * to only run the counters while the calling process is actively scheduled
+ * to run.
+ * When /dev/hwctr interface is also used at the same time, the counter sets
+ * will keep running, even when the process is scheduled off a CPU.
+ * However this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value will be recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
*/
-static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
- WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */
+ /* Check if any counter set used by /dev/hwctr */
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc])) {
+ if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
+ ctr_set_disable(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ ctr_set_stop(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ }
+ }
+ /* Keep perf_event_open counter sets */
+ rc = lcctl(cpuhw->dev_state | cpuhw->state);
+ if (rc)
+ pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+ if (!cpuhw->dev_state)
+ cpuhw->flags &= ~PMU_F_IN_USE;
+ debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+ __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
- cpuhw->txn_flags = txn_flags;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
- return;
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
- perf_pmu_disable(pmu);
- cpuhw->tx_state = cpuhw->state;
+ cpuhw->flags |= PMU_F_IN_USE;
+ ctr_set_enable(&cpuhw->dev_state, p->sets);
+ ctr_set_start(&cpuhw->dev_state, p->sets);
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc]))
+ atomic_inc(&cpuhw->ctr_set[rc]);
+ rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */
+ if (!rc)
+ atomic_inc(&p->cpus_ack);
+ else
+ pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+ debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+ __func__, rc, cpuhw->state, cpuhw->dev_state);
}
-/*
- * Stop and cancel a group events scheduling tranctions.
- * Assumes cpumf_pmu_del() is called for each successful added
- * cpumf_pmu_add() during the transaction.
- */
-static void cpumf_pmu_cancel_txn(struct pmu *pmu)
+static void cfset_release_cpu(void *p)
{
- unsigned int txn_flags;
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ int rc;
+
+ debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
+ __func__, cpuhw->state, cpuhw->dev_state);
+ cpuhw->dev_state = 0;
+ rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */
+ if (rc)
+ pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+}
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+/* This modifies the process CPU mask to adapt it to the currently online
+ * CPUs. Offline CPUs cannot be addressed. This call terminates the access
+ * and is usually followed by close() or a new ioctl(..., START, ...) which
+ * creates a new request structure.
+ */
+static void cfset_all_stop(struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p = {
+ .sets = req->ctrset,
+ };
- txn_flags = cpuhw->txn_flags;
- cpuhw->txn_flags = 0;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
- return;
+ cpumask_and(&req->mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
+}
- WARN_ON(cpuhw->tx_state != cpuhw->state);
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&cfset_ctrset_mutex);
+ /* Open followed by close/exit has no private_data */
+ if (file->private_data) {
+ cfset_all_stop(file->private_data);
+ cfset_session_del(file->private_data);
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+ if (!atomic_dec_return(&cfset_opencnt))
+ on_each_cpu(cfset_release_cpu, NULL, 1);
+ mutex_unlock(&cfset_ctrset_mutex);
- perf_pmu_enable(pmu);
+ hw_perf_event_destroy(NULL);
+ return 0;
}
-/*
- * Commit the group events scheduling transaction. On success, the
- * transaction is closed. On error, the transaction is kept open
- * until cpumf_pmu_cancel_txn() is called.
+static int cfset_open(struct inode *inode, struct file *file)
+{
+	int openers;
+
+	/* Device access is restricted to CAP_SYS_ADMIN */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* The first opener starts with an empty request list */
+	mutex_lock(&cfset_ctrset_mutex);
+	openers = atomic_inc_return(&cfset_opencnt);
+	if (openers == 1)
+		cfset_session_init();
+	mutex_unlock(&cfset_ctrset_mutex);
+
+	/* No request is attached to the file yet */
+	file->private_data = NULL;
+	cpumf_hw_inuse();
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
+static int cfset_all_start(struct cfset_request *req)
+{
+	struct cfset_call_on_cpu_parm parm = {
+		.sets = req->ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t online;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&online, GFP_KERNEL))
+		return -ENOMEM;
+
+	/* Only the online CPUs of the request can be started */
+	cpumask_and(online, &req->mask, cpu_online_mask);
+	on_each_cpu_mask(online, cfset_ioctl_on, &parm, 1);
+	if (atomic_read(&parm.cpus_ack) == cpumask_weight(online))
+		goto out;
+
+	/* Not every CPU acknowledged, turn the counter sets off again */
+	on_each_cpu_mask(online, cfset_ioctl_off, &parm, 1);
+	rc = -EIO;
+	debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
+out:
+	free_cpumask_var(online);
+	return rc;
+}
+
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
*/
-static int cpumf_pmu_commit_txn(struct pmu *pmu)
+static size_t cfset_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
+	size_t per_cpu = 0, total;
+	int set;
+
+	/* Space for the requested counter sets of one CPU */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		if (sets & cpumf_ctr_ctl[set])
+			per_cpu += cpum_cf_ctrset_size(set, &cpuhw->info) *
+				   sizeof(u64) +
+				   sizeof(((struct s390_ctrset_setdata *)0)->set) +
+				   sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	/* Each CPU is prefixed with its number and number of sets */
+	per_cpu += sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		   sizeof(((struct s390_ctrset_cpudata *)0)->no_sets);
+
+	/* All possible CPUs plus the leading number of CPUs */
+	total = sizeof(((struct s390_ctrset_read *)0)->no_cpus) +
+		nr_cpu_ids * per_cpu;
+	put_cpu_ptr(&cpu_cf_events);
+	return total;
+}
+
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *uread;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	uread = (struct s390_ctrset_read __user *)arg;
+	uptr = uread->data;
+
+	/* One record per CPU: CPU number, number of sets, counter set data */
+	for_each_cpu(cpu, mask) {
+		struct cpu_cf_events *hw = per_cpu_ptr(&cpu_cf_events, cpu);
+		struct s390_ctrset_cpudata __user *ucpu = uptr;
+
+		rc = put_user(cpu, &ucpu->cpu_nr);
+		rc |= put_user(hw->sets, &ucpu->no_sets);
+		rc |= copy_to_user(ucpu->data, hw->data, hw->used);
+		if (rc)
+			return -EFAULT;
+		/* Next record starts behind the data of this CPU */
+		uptr += sizeof(struct s390_ctrset_cpudata) + hw->used;
+		cond_resched();
+	}
+
+	/* Finally tell user space how many CPU records were written */
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &uread->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
+			    uptr - (void __user *)uread->data);
+	return 0;
+}
+
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				int ctrset_size, size_t room)
+{
+	/* p: where to store the counter set header and values
+	 * ctrset: counter set number
+	 * ctrset_size: number of counters in this counter set
+	 * room: number of bytes available at p
+	 *
+	 * Returns the number of bytes stored at p, zero if nothing was stored.
+	 */
+	size_t need = sizeof(*p) + sizeof(u64) * ctrset_size;
+
+	/* Without room nothing is stored. Report zero, otherwise the caller
+	 * would account bytes which were never written.
+	 */
+	if (need > room)
+		return 0;
+
+	p->set = cpumf_ctr_ctl[ctrset];
+	p->no_cnts = ctrset_size;
+	if (ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv) == 3)
+		return 0;			/* Nothing stored */
+	return need;
+}
+
+/* Read all counter sets requested in parm (a struct cfset_call_on_cpu_parm)
+ * on the executing CPU and save them back to back in that CPU's
+ * cpu_cf_events data buffer. cpuhw->used is the number of bytes saved and
+ * cpuhw->sets the number of counter sets saved.
+ */
+static void cfset_cpu_read(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 state;
+ struct cfset_call_on_cpu_parm *p = parm;
+ int set, set_size;
+ size_t space;
+
+ /* No data saved yet */
+ cpuhw->used = 0;
+ cpuhw->sets = 0;
+ memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+ /* Scan the counter sets */
+ for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+ /* Next entry starts right behind the data saved so far */
+ struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+ cpuhw->used;
+
+ if (!(p->sets & cpumf_ctr_ctl[set]))
+ continue; /* Counter set not in list */
+ set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
+ space = sizeof(cpuhw->data) - cpuhw->used;
+ /* space is reused: room on input, bytes accounted on return */
+ space = cfset_cpuset_read(sp, set, set_size, space);
+ if (space) {
+ cpuhw->used += space;
+ cpuhw->sets += 1;
+ }
+ }
+ debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
+ cpuhw->sets, cpuhw->used);
+}
+
+/* Save the counter sets requested in req on every CPU that is both in
+ * req->mask and in cpu_online_mask, then copy the saved data to the user
+ * space buffer addressed by arg.
+ *
+ * Returns -ENOMEM if the temporary CPU mask cannot be allocated, otherwise
+ * the return value of cfset_all_copy().
+ */
+static int cfset_all_read(unsigned long arg, struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p;
+ cpumask_var_t mask;
+ int rc;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ p.sets = req->ctrset;
+ /* Restrict the requested CPUs to those currently online */
+ cpumask_and(mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+ rc = cfset_all_copy(arg, mask);
+ free_cpumask_var(mask);
+ return rc;
+}
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+/* Handle the S390_HWCTR_READ subcommand for request req (may be NULL).
+ *
+ * Returns -ENODATA if there is no request or it has no counter sets,
+ * -EFAULT if the struct s390_ctrset_read at arg cannot be read, otherwise
+ * the return value of cfset_all_read().
+ */
+static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
+{
+ struct s390_ctrset_read read;
+ int ret = -ENODATA;
- if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
- cpuhw->txn_flags = 0;
- return 0;
+ if (req && req->ctrset) {
+ /* The copy in 'read' is not used afterwards in this function;
+ * the call only fails when the header at arg is not readable.
+ */
+ if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+ return -EFAULT;
+ ret = cfset_all_read(arg, req);
}
+ return ret;
+}
- /* check if the updated state can be scheduled */
- state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- state >>= CPUMF_LCCTL_ENABLE_SHIFT;
- if ((state & cpuhw->info.auth_ctl) != state)
- return -ENOENT;
+/* Handle the S390_HWCTR_STOP subcommand: pass the request attached to
+ * file to cfset_all_stop() and cfset_session_del(), free it and detach it
+ * from file.
+ *
+ * Returns 0 on success or -ENXIO if no request is attached to file.
+ */
+static long cfset_ioctl_stop(struct file *file)
+{
+ struct cfset_request *req = file->private_data;
+ int ret = -ENXIO;
+
+ if (req) {
+ cfset_all_stop(req);
+ cfset_session_del(req);
+ kfree(req);
+ /* Allow a new start on this file descriptor */
+ file->private_data = NULL;
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Handle the S390_HWCTR_START subcommand.
+ * Validate the struct s390_ctrset_start at arg, allocate a request holding
+ * the user supplied CPU mask and counter sets, store the result of
+ * cfset_needspace() in the user's data_bytes member and start the counter
+ * sets. On success the request is attached to file->private_data.
+ *
+ * Returns 0 on success, otherwise
+ * -EBUSY if a request is already attached to file,
+ * -EFAULT if a user space access fails,
+ * -EINVAL for a wrong version, an unknown counter set, no counter set at
+ * all or an empty CPU mask,
+ * -ENOMEM if the request cannot be allocated,
+ * or the error returned by cfset_all_start().
+ */
+static long cfset_ioctl_start(unsigned long arg, struct file *file)
+{
+ struct s390_ctrset_start __user *ustart;
+ struct s390_ctrset_start start;
+ struct cfset_request *preq;
+ void __user *umask;
+ unsigned int len;
+ int ret = 0;
+ size_t need;
+
+ if (file->private_data)
+ return -EBUSY;
+ ustart = (struct s390_ctrset_start __user *)arg;
+ if (copy_from_user(&start, ustart, sizeof(start)))
+ return -EFAULT;
+ if (start.version != S390_HWCTR_START_VERSION)
+ return -EINVAL;
+ if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+ return -EINVAL; /* Invalid counter set */
+ if (!start.counter_sets)
+ return -EINVAL; /* No counter set at all? */
+
+ preq = kzalloc(sizeof(*preq), GFP_KERNEL);
+ if (!preq)
+ return -ENOMEM;
+ cpumask_clear(&preq->mask);
+ /* Copy at most cpumask_size() bytes of the user supplied mask */
+ len = min_t(u64, start.cpumask_len, cpumask_size());
+ umask = (void __user *)start.cpumask;
+ if (copy_from_user(&preq->mask, umask, len)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ if (cpumask_empty(&preq->mask)) {
+ kfree(preq);
+ return -EINVAL;
+ }
+ /* Report the needed buffer size before the counter sets are started */
+ need = cfset_needspace(start.counter_sets);
+ if (put_user(need, &ustart->data_bytes)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ preq->ctrset = start.counter_sets;
+ ret = cfset_all_start(preq);
+ if (!ret) {
+ cfset_session_add(preq);
+ file->private_data = preq;
+ debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n",
+ __func__, preq->ctrset, need, ret);
+ } else {
+ kfree(preq);
+ }
+ return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ * counter set keeps running until explicitly stopped. Returns the number
+ * of bytes needed to store the counter values. If another S390_HWCTR_START
+ * ioctl subcommand is called without a previous S390_HWCTR_STOP stop
+ * command on the same file descriptor, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from specified CPU list given
+ * with the S390_HWCTR_START command.
+ * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the
+ * previous S390_HWCTR_START subcommand.
+ *
+ * Any other command returns -ENOTTY. Every subcommand runs with
+ * cpus_read_lock() and cfset_ctrset_mutex held.
+ */
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ cpus_read_lock();
+ mutex_lock(&cfset_ctrset_mutex);
+ switch (cmd) {
+ case S390_HWCTR_START:
+ ret = cfset_ioctl_start(arg, file);
+ break;
+ case S390_HWCTR_STOP:
+ ret = cfset_ioctl_stop(file);
+ break;
+ case S390_HWCTR_READ:
+ ret = cfset_ioctl_read(arg, file->private_data);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+ /* Release in reverse order of acquisition */
+ mutex_unlock(&cfset_ctrset_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+/* File operations of the counter set device. cfset_ioctl() serves both
+ * the native and the compat ioctl entry.
+ */
+static const struct file_operations cfset_fops = {
+ .owner = THIS_MODULE,
+ .open = cfset_open,
+ .release = cfset_release,
+ .unlocked_ioctl = cfset_ioctl,
+ .compat_ioctl = cfset_ioctl,
+ .llseek = no_llseek
+};
+
+/* Misc device named S390_HWCTR_DEVICE, using a dynamically assigned minor
+ * number and the cfset_fops file operations.
+ */
+static struct miscdevice cfset_dev = {
+ .name = S390_HWCTR_DEVICE,
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &cfset_fops,
+};
- cpuhw->txn_flags = 0;
- perf_pmu_enable(pmu);
+/* Hotplug add of a CPU. Scan through all active processes and add
+ * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
+ *
+ * For every request on cfset_session.head, cfset_ioctl_on() is called with
+ * the counter sets of that request and cpu is set in the request's mask.
+ * The list is walked with cfset_ctrset_mutex held. Always returns 0.
+ */
+int cfset_online_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ mutex_lock(&cfset_ctrset_mutex);
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ /* NOTE(review): called directly, so it acts on the CPU
+ * executing this callback - presumably the CPU coming
+ * online; confirm against the hotplug state used.
+ */
+ cfset_ioctl_on(&p);
+ cpumask_set_cpu(cpu, &rp->mask);
+ }
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
return 0;
}
-/* Performance monitoring unit for s390x */
-static struct pmu cpumf_pmu = {
+/* Hotplug remove of a CPU. Scan through all active processes and clear
+ * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
+ *
+ * For every request on cfset_session.head, cfset_ioctl_off() is called with
+ * the counter sets of that request and cpu is cleared in the request's mask.
+ * The list is walked with cfset_ctrset_mutex held. Always returns 0.
+ */
+int cfset_offline_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ mutex_lock(&cfset_ctrset_mutex);
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ /* NOTE(review): called directly, so it acts on the CPU
+ * executing this callback - presumably the CPU going
+ * offline; confirm against the hotplug state used.
+ */
+ cfset_ioctl_off(&p);
+ cpumask_clear_cpu(cpu, &rp->mask);
+ }
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return 0;
+}
+
+/* Read callback of the CF_DIAG event. It only writes a debug trace entry
+ * with the event's config value and its current count; no counter value is
+ * read or updated here.
+ */
+static void cfdiag_read(struct perf_event *event)
+{
+ debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
+ event->attr.config, local64_read(&event->count));
+}
+
+/* Return the control bits (cpumf_ctr_ctl[]) of all counter sets that are
+ * set in info.auth_ctl of the current CPU's cpu_cf_events. The result is
+ * zero when no counter set is authorized.
+ */
+static int get_authctrsets(void)
+{
+ struct cpu_cf_events *cpuhw;
+ unsigned long auth = 0;
+ enum cpumf_ctr_set i;
+
+ cpuhw = &get_cpu_var(cpu_cf_events);
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+ auth |= cpumf_ctr_ctl[i];
+ }
+ put_cpu_var(cpu_cf_events);
+ return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets result in specification exception (and panic).
+ *
+ * Returns 0 on success or -EINVAL if no counter set is authorized.
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = 0;
+
+ /* Set sample_period to indicate sampling */
+ event->hw.config = attr->config;
+ event->hw.sample_period = attr->sample_period;
+ local64_set(&event->hw.period_left, event->hw.sample_period);
+ local64_set(&event->count, 0);
+ event->hw.last_period = event->hw.sample_period;
+
+ /* Add all authorized counter sets to config_base. The
+ * hardware init function is either called per-cpu or just once
+ * for all CPUS (event->cpu == -1). This depends on whether
+ * counting is started for all CPUs or on a per workload base where
+ * the perf event moves from one CPU to another CPU.
+ * Checking the authorization on any CPU is fine as the hardware
+ * applies the same authorization settings to all CPUs.
+ */
+ event->hw.config_base = get_authctrsets();
+
+ /* No authorized counter sets, nothing to count/sample */
+ if (!event->hw.config_base)
+ err = -EINVAL;
+
+ debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
+ __func__, err, event->hw.config_base);
+ return err;
+}
+
+/* Event initialization for the CF_DIAG event.
+ *
+ * Returns -ENOENT if the event config is not PERF_EVENT_CPUM_CF_DIAG or the
+ * event type differs from the PMU type, -EOPNOTSUPP for attribute settings
+ * rejected below, otherwise the return value of cfdiag_event_init2().
+ */
+static int cfdiag_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = -ENOENT;
+
+ if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+ event->attr.type != event->pmu->type)
+ goto out;
+
+ /* Raw events are used to access counters directly,
+ * hence do not permit excludes.
+ * This event is useless without PERF_SAMPLE_RAW to return counter set
+ * values as raw data.
+ *
+ * NOTE(review): the test below accepts the event as soon as either
+ * PERF_SAMPLE_CPU or PERF_SAMPLE_RAW is set, so PERF_SAMPLE_RAW alone
+ * is not enforced - confirm this is intended.
+ */
+ if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+ !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Initialize for using the CPU-measurement counter facility */
+ cpumf_hw_inuse();
+ event->destroy = hw_perf_event_destroy;
+
+ err = cfdiag_event_init2(event);
+ /* On failure invoke the destroy callback installed above right away */
+ if (unlikely(err))
+ event->destroy(event);
+out:
+ return err;
+}
+
+/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. Target
+ * are complete counter sets attached as raw data to the artificial event.
+ * This results in complete counter sets available when a process is
+ * scheduled. Contains the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
+/* NULL terminated list of event attributes, CF_DIAG is the only entry */
+static struct attribute *cfdiag_events_attr[] = {
+ CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+ NULL,
+};
+
+/* Format attribute "event": the event is taken from config bits 0-63 */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+/* NULL terminated list of format attributes */
+static struct attribute *cfdiag_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+/* Attribute group named "events" */
+static struct attribute_group cfdiag_events_group = {
+ .name = "events",
+ .attrs = cfdiag_events_attr,
+};
+/* Attribute group named "format" */
+static struct attribute_group cfdiag_format_group = {
+ .name = "format",
+ .attrs = cfdiag_format_attr,
+};
+/* NULL terminated list of both groups, referenced by cf_diag.attr_groups */
+static const struct attribute_group *cfdiag_attr_groups[] = {
+ &cfdiag_events_group,
+ &cfdiag_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable call back functions. They do not
+ * have a pointer to the perf_event structure as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always first parameter) via
+ * 'config' member.
+ *
+ * Only the event_init and read callbacks and the attribute groups are
+ * specific to CF_DIAG; all other callbacks are the cpumf_pmu_* functions.
+ */
+static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .event_init = cfdiag_event_init,
.pmu_enable = cpumf_pmu_enable,
.pmu_disable = cpumf_pmu_disable,
- .event_init = cpumf_pmu_event_init,
.add = cpumf_pmu_add,
.del = cpumf_pmu_del,
.start = cpumf_pmu_start,
.stop = cpumf_pmu_stop,
- .read = cpumf_pmu_read,
- .start_txn = cpumf_pmu_start_txn,
- .commit_txn = cpumf_pmu_commit_txn,
- .cancel_txn = cpumf_pmu_cancel_txn,
+ .read = cfdiag_read,
+
+ .attr_groups = cfdiag_attr_groups
};
-static int __init cpumf_pmu_init(void)
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ *
+ * Returns the size in bytes: one struct cf_trailer_entry plus, for every
+ * counter set with a non-zero size, one struct cf_ctrset_entry header and
+ * eight bytes per counter.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+ size_t max_size = sizeof(struct cf_trailer_entry);
+ enum cpumf_ctr_set i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ size_t size = cpum_cf_ctrset_size(i, info);
+
+ /* Counter sets without counters contribute nothing */
+ if (size)
+ max_size += size * sizeof(u64) +
+ sizeof(struct cf_ctrset_entry);
+ }
+ return max_size;
+}
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second.
+ * The result is stored in cfdiag_cpu_speed. The variable is left unchanged
+ * if neither source delivers a value.
+ */
+static void cfdiag_get_cpu_speed(void)
+{
+ unsigned long mhz;
+
+ if (cpum_sf_avail()) { /* Sampling facility first */
+ struct hws_qsi_info_block si;
+
+ memset(&si, 0, sizeof(si));
+ if (!qsi(&si)) {
+ cfdiag_cpu_speed = si.cpu_speed;
+ return;
+ }
+ }
+
+ /* Fallback: CPU speed extract static part. Used in case
+ * CPU Measurement Sampling Facility is turned off.
+ */
+ mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+ /* -1UL is treated as "no value"; otherwise keep the low 32 bits */
+ if (mhz != -1UL)
+ cfdiag_cpu_speed = mhz & 0xffffffff;
+}
+
+/* Register the counter set misc device and the cpum_cf_diag PMU.
+ *
+ * Returns 0 on success, -ENODEV if qctri() fails, -ENOMEM if the maximum
+ * counter set data does not fit into the start buffer of struct
+ * cpu_cf_events, otherwise the error of the failing registration. The misc
+ * device is deregistered again when the PMU registration fails.
+ */
+static int cfset_init(void)
{
+ struct cpumf_ctr_info info;
+ size_t need;
int rc;
- if (!kernel_cpumcf_avail())
+ if (qctri(&info))
return -ENODEV;
- cpumf_pmu.attr_groups = cpumf_cf_event_group();
- rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
- if (rc)
- pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+ cfdiag_get_cpu_speed();
+ /* Make sure the counter set data fits into predefined buffer. */
+ need = cfdiag_maxsize(&info);
+ if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+ pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+ need);
+ return -ENOMEM;
+ }
+
+ rc = misc_register(&cfset_dev);
+ if (rc) {
+ pr_err("Registration of /dev/%s failed rc=%i\n",
+ cfset_dev.name, rc);
+ goto out;
+ }
+
+ rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+ if (rc) {
+ /* Undo the device registration done above */
+ misc_deregister(&cfset_dev);
+ pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+ rc);
+ }
+out:
return rc;
}
-subsys_initcall(cpumf_pmu_init);
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
index 3bced89caffb..8ee48672233f 100644
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ b/arch/s390/kernel/perf_cpum_cf_common.c
@@ -29,8 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
},
.alert = ATOMIC64_INIT(0),
.state = 0,
+ .dev_state = 0,
.flags = 0,
- .txn_flags = 0,
+ .used = 0,
+ .usedss = 0,
+ .sets = 0
};
/* Indicator whether the CPU-Measurement Counter Facility Support is ready */
static bool cpum_cf_initalized;
@@ -97,25 +100,10 @@ bool kernel_cpumcf_avail(void)
}
EXPORT_SYMBOL(kernel_cpumcf_avail);
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
/* Initialize the CPU-measurement counter facility */
int __kernel_cpumcf_begin(void)
{
int flags = PMC_INIT;
- int err = 0;
-
- spin_lock(&cpumcf_owner_lock);
- if (cpumcf_owner)
- err = -EBUSY;
- else
- cpumcf_owner = __builtin_return_address(0);
- spin_unlock(&cpumcf_owner_lock);
- if (err)
- return err;
on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
@@ -145,10 +133,6 @@ void __kernel_cpumcf_end(void)
on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- spin_lock(&cpumcf_owner_lock);
- cpumcf_owner = NULL;
- spin_unlock(&cpumcf_owner_lock);
}
EXPORT_SYMBOL(__kernel_cpumcf_end);
@@ -162,14 +146,62 @@ static int cpum_cf_setup(unsigned int cpu, int flags)
+/* CPU online callback: set up the counter facility on cpu with PMC_INIT and
+ * then notify the counter set device via cfset_online_cpu().
+ * NOTE(review): the return value of cpum_cf_setup() is no longer
+ * propagated here; confirm that it cannot fail for PMC_INIT.
+ */
static int cpum_cf_online_cpu(unsigned int cpu)
{
- return cpum_cf_setup(cpu, PMC_INIT);
+ cpum_cf_setup(cpu, PMC_INIT);
+ return cfset_online_cpu(cpu);
}
+/* CPU offline callback: notify the counter set device via
+ * cfset_offline_cpu() first, then release the counter facility on cpu with
+ * PMC_RELEASE. Returns the result of cpum_cf_setup().
+ */
static int cpum_cf_offline_cpu(unsigned int cpu)
{
+ cfset_offline_cpu(cpu);
return cpum_cf_setup(cpu, PMC_RELEASE);
}
+/* Return the maximum possible counter set size (in number of 8 byte counters)
+ * depending on type and model number.
+ *
+ * ctrset selects the counter set, info supplies the first (cfvn) and second
+ * (csvn) version numbers the size depends on. The result is 0 for every
+ * counter set and version combination not listed below.
+ */
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+ struct cpumf_ctr_info *info)
+{
+ size_t ctrset_size = 0;
+
+ switch (ctrset) {
+ case CPUMF_CTR_SET_BASIC:
+ if (info->cfvn >= 1)
+ ctrset_size = 6;
+ break;
+ case CPUMF_CTR_SET_USER:
+ if (info->cfvn == 1)
+ ctrset_size = 6;
+ else if (info->cfvn >= 3)
+ ctrset_size = 2;
+ break;
+ case CPUMF_CTR_SET_CRYPTO:
+ if (info->csvn >= 1 && info->csvn <= 5)
+ ctrset_size = 16;
+ else if (info->csvn == 6 || info->csvn == 7)
+ ctrset_size = 20;
+ break;
+ case CPUMF_CTR_SET_EXT:
+ if (info->csvn == 1)
+ ctrset_size = 32;
+ else if (info->csvn == 2)
+ ctrset_size = 48;
+ else if (info->csvn >= 3 && info->csvn <= 5)
+ ctrset_size = 128;
+ else if (info->csvn == 6 || info->csvn == 7)
+ ctrset_size = 160;
+ break;
+ case CPUMF_CTR_SET_MT_DIAG:
+ if (info->csvn > 3)
+ ctrset_size = 48;
+ break;
+ case CPUMF_CTR_SET_MAX:
+ /* Not a counter set, only the upper bound of the enumeration */
+ break;
+ }
+
+ return ctrset_size;
+}
+
static int __init cpum_cf_init(void)
{
int rc;
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index e949ab832ed7..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- * Thomas Richer <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT "cpum_cf_diag"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
-
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd { /* Counter set data per CPU */
- size_t used; /* Bytes used in data/start */
- unsigned char start[PAGE_SIZE]; /* Counter set at event start */
- unsigned char data[PAGE_SIZE]; /* Counter set at event delete */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- * size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
- unsigned int def:16; /* 0-15 Data Entry Format */
- unsigned int set:16; /* 16-31 Counter set identifier */
- unsigned int ctr:16; /* 32-47 Number of stored counters */
- unsigned int res1:16; /* 48-63 Reserved */
-};
-
-struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
- /* 0 - 7 */
- union {
- struct {
- unsigned int clock_base:1; /* TOD clock base set */
- unsigned int speed:1; /* CPU speed set */
- /* Measurement alerts */
- unsigned int mtda:1; /* Loss of MT ctr. data alert */
- unsigned int caca:1; /* Counter auth. change alert */
- unsigned int lcda:1; /* Loss of counter data alert */
- };
- unsigned long flags; /* 0-63 All indicators */
- };
- /* 8 - 15 */
- unsigned int cfvn:16; /* 64-79 Ctr First Version */
- unsigned int csvn:16; /* 80-95 Ctr Second Version */
- unsigned int cpu_speed:32; /* 96-127 CPU speed */
- /* 16 - 23 */
- unsigned long timestamp; /* 128-191 Timestamp (TOD) */
- /* 24 - 55 */
- union {
- struct {
- unsigned long progusage1;
- unsigned long progusage2;
- unsigned long progusage3;
- unsigned long tod_base;
- };
- unsigned long progusage[4];
- };
- /* 56 - 63 */
- unsigned int mach_type:16; /* Machine type */
- unsigned int res1:16; /* Reserved */
- unsigned int res2:32; /* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cpuid cpuid;
-
- te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */
- te->csvn = cpuhw->info.csvn;
-
- get_cpu_id(&cpuid); /* Machine type */
- te->mach_type = cpuid.machine;
- te->cpu_speed = cf_diag_cpu_speed;
- if (te->cpu_speed)
- te->speed = 1;
- te->clock_base = 1; /* Save clock base */
- memcpy(&te->tod_base, &tod_clock_base[1], 8);
- store_tod_clock((__u64 *)&te->timestamp);
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- err = lcctl(cpuhw->state);
- if (err) {
- pr_err("Enabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 inactive;
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (!(cpuhw->flags & PMU_F_ENABLED))
- return;
-
- inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- err = lcctl(inactive);
- if (err) {
- pr_err("Disabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d cf_diag_events %d\n",
- __func__, event, event->cpu,
- atomic_read(&cf_diag_events));
- if (atomic_dec_return(&cf_diag_events) == 0)
- __kernel_cpumcf_end();
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct cpu_cf_events *cpuhw;
- enum cpumf_ctr_set i;
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
- event, event->cpu);
-
- event->hw.config = attr->config;
- event->hw.config_base = 0;
-
- /* Add all authorized counter sets to config_base. The
- * the hardware init function is either called per-cpu or just once
- * for all CPUS (event->cpu == -1). This depends on the whether
- * counting is started for all CPUs or on a per workload base where
- * the perf event moves from one CPU to another CPU.
- * Checking the authorization on any CPU is fine as the hardware
- * applies the same authorization settings to all CPUs.
- */
- cpuhw = &get_cpu_var(cpu_cf_events);
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
- if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
- event->hw.config_base |= cpumf_ctr_ctl[i];
- put_cpu_var(cpu_cf_events);
-
- /* No authorized counter sets, nothing to count/sample */
- if (!event->hw.config_base) {
- err = -EINVAL;
- goto out;
- }
-
- /* Set sample_period to indicate sampling */
- event->hw.sample_period = attr->sample_period;
- local64_set(&event->hw.period_left, event->hw.sample_period);
- event->hw.last_period = event->hw.sample_period;
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
- __func__, err, event->hw.config_base);
- return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- int err = -ENOENT;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d config %#llx type:%u "
- "sample_type %#llx cf_diag_events %d\n", __func__,
- event, event->cpu, attr->config, event->pmu->type,
- attr->sample_type, atomic_read(&cf_diag_events));
-
- if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
- event->attr.type != event->pmu->type)
- goto out;
-
- /* Raw events are used to access counters directly,
- * hence do not permit excludes.
- * This event is usesless without PERF_SAMPLE_RAW to return counter set
- * values as raw data.
- */
- if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
- !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- /* Initialize for using the CPU-measurement counter facility */
- if (atomic_inc_return(&cf_diag_events) == 1) {
- if (__kernel_cpumcf_begin()) {
- atomic_dec(&cf_diag_events);
- err = -EBUSY;
- goto out;
- }
- }
- event->destroy = cf_diag_perf_event_destroy;
-
- err = __hw_perf_event_init(event);
- if (unlikely(err))
- event->destroy(event);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Return the maximum possible counter set size (in number of 8 byte counters)
- * depending on type and model number.
- */
-static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset,
- struct cpumf_ctr_info *info)
-{
- size_t ctrset_size = 0;
-
- switch (ctrset) {
- case CPUMF_CTR_SET_BASIC:
- if (info->cfvn >= 1)
- ctrset_size = 6;
- break;
- case CPUMF_CTR_SET_USER:
- if (info->cfvn == 1)
- ctrset_size = 6;
- else if (info->cfvn >= 3)
- ctrset_size = 2;
- break;
- case CPUMF_CTR_SET_CRYPTO:
- if (info->csvn >= 1 && info->csvn <= 5)
- ctrset_size = 16;
- else if (info->csvn == 6)
- ctrset_size = 20;
- break;
- case CPUMF_CTR_SET_EXT:
- if (info->csvn == 1)
- ctrset_size = 32;
- else if (info->csvn == 2)
- ctrset_size = 48;
- else if (info->csvn >= 3 && info->csvn <= 5)
- ctrset_size = 128;
- else if (info->csvn == 6)
- ctrset_size = 160;
- break;
- case CPUMF_CTR_SET_MT_DIAG:
- if (info->csvn > 3)
- ctrset_size = 48;
- break;
- case CPUMF_CTR_SET_MAX:
- break;
- }
-
- return ctrset_size;
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independend of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
- size_t max_size = sizeof(struct cf_trailer_entry);
- enum cpumf_ctr_set i;
-
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- size_t size = cf_diag_ctrset_size(i, info);
-
- if (size)
- max_size += size * sizeof(u64) +
- sizeof(struct cf_ctrset_entry);
- }
- debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
- max_size);
-
- return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
- size_t room)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- size_t ctrset_size, need = 0;
- int rc = 3; /* Assume write failure */
-
- ctrdata->def = CF_DIAG_CTRSET_DEF;
- ctrdata->set = ctrset;
- ctrdata->res1 = 0;
- ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info);
-
- if (ctrset_size) { /* Save data */
- need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
- if (need <= room)
- rc = ctr_stcctm(ctrset, ctrset_size,
- (u64 *)(ctrdata + 1));
- if (rc != 3)
- ctrdata->ctr = ctrset_size;
- else
- need = 0;
- }
-
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
- " need %zd rc %d\n",
- __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
- cpuhw->info.csvn, need, rc);
- return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
- struct cf_trailer_entry *trailer;
- size_t offset = 0, done;
- int i;
-
- memset(data, 0, sz);
- sz -= sizeof(*trailer); /* Always room for trailer */
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- struct cf_ctrset_entry *ctrdata = data + offset;
-
- if (!(auth & cpumf_ctr_ctl[i]))
- continue; /* Counter set not authorized */
-
- done = cf_diag_getctrset(ctrdata, i, sz - offset);
- offset += done;
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d offset %zu done %zu\n",
- __func__, i, offset, done);
- }
- trailer = data + offset;
- cf_diag_trailer(trailer);
- return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
- for (; --counters >= 0; ++pstart, ++pstop)
- if (*pstop >= *pstart)
- *pstop -= *pstart;
- else
- *pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
- struct cf_trailer_entry *trailer_start, *trailer_stop;
- struct cf_ctrset_entry *ctrstart, *ctrstop;
- size_t offset = 0;
-
- auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
- do {
- ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
- ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
- if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
- pr_err("cpum_cf_diag counter set compare error "
- "in set %i\n", ctrstart->set);
- return 0;
- }
- auth &= ~cpumf_ctr_ctl[ctrstart->set];
- if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
- cf_diag_diffctrset((u64 *)(ctrstart + 1),
- (u64 *)(ctrstop + 1), ctrstart->ctr);
- offset += ctrstart->ctr * sizeof(u64) +
- sizeof(*ctrstart);
- }
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s set %d ctr %d offset %zu auth %lx\n",
- __func__, ctrstart->set, ctrstart->ctr,
- offset, auth);
- } while (ctrstart->def && auth);
-
- /* Save time_stamp from start of event in stop's trailer */
- trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
- trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
- trailer_stop->progusage[0] = trailer_start->timestamp;
-
- return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data. The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
- struct cf_diag_csd *csd)
-{
- struct perf_sample_data data;
- struct perf_raw_record raw;
- struct pt_regs regs;
- int overflow;
-
- /* Setup perf sample */
- perf_sample_data_init(&data, 0, event->hw.last_period);
- memset(&regs, 0, sizeof(regs));
- memset(&raw, 0, sizeof(raw));
-
- if (event->attr.sample_type & PERF_SAMPLE_CPU)
- data.cpu_entry.cpu = event->cpu;
- if (event->attr.sample_type & PERF_SAMPLE_RAW) {
- raw.frag.size = csd->used;
- raw.frag.data = csd->data;
- raw.size = csd->used;
- data.raw = &raw;
- }
-
- overflow = perf_event_overflow(event, &data, &regs);
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s event %p cpu %d sample_type %#llx raw %d "
- "ov %d\n", __func__, event, event->cpu,
- event->attr.sample_type, raw.size, overflow);
- if (overflow)
- event->pmu->stop(event, 0);
-
- perf_event_update_userpage(event);
- return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- /* (Re-)enable and activate all counter sets */
- lcctl(0); /* Reset counter sets */
- hwc->state = 0;
- ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
- lcctl(cpuhw->state); /* Enable counter sets */
- csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
- event->hw.config_base);
- ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
- /* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
-
- /* Deactivate all counter sets */
- ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
- local64_inc(&event->count);
- csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
- event->hw.config_base);
- if (cf_diag_diffctr(csd, event->hw.config_base))
- cf_diag_push_sample(event, csd);
- hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x cpuhw %p\n",
- __func__, event, event->cpu, flags, cpuhw);
-
- if (cpuhw->flags & PMU_F_IN_USE) {
- err = -EAGAIN;
- goto out;
- }
-
- event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- cpuhw->flags |= PMU_F_IN_USE;
- if (flags & PERF_EF_START)
- cf_diag_start(event, PERF_EF_RELOAD);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x\n",
- __func__, event, event->cpu, flags);
-
- cf_diag_stop(event, PERF_EF_UPDATE);
- ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
- ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
- cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
- CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
- NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
- .name = "events",
- .attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
- .name = "format",
- .attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
- &cf_diag_events_group,
- &cf_diag_format_group,
- NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
- .task_ctx_nr = perf_sw_context,
- .pmu_enable = cf_diag_enable,
- .pmu_disable = cf_diag_disable,
- .event_init = cf_diag_event_init,
- .add = cf_diag_add,
- .del = cf_diag_del,
- .start = cf_diag_start,
- .stop = cf_diag_stop,
- .read = cf_diag_read,
-
- .attr_groups = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
- if (cpum_sf_avail()) { /* Sampling facility first */
- struct hws_qsi_info_block si;
-
- memset(&si, 0, sizeof(si));
- if (!qsi(&si)) {
- cf_diag_cpu_speed = si.cpu_speed;
- return;
- }
- }
-
- if (test_facility(34)) { /* CPU speed extract static part */
- unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
- if (mhz != -1UL)
- cf_diag_cpu_speed = mhz & 0xffffffff;
- }
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
- struct cpumf_ctr_info info;
- size_t need;
- int rc;
-
- if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
- return -ENODEV;
- cf_diag_get_cpu_speed();
-
- /* Make sure the counter set data fits into predefined buffer. */
- need = cf_diag_ctrset_maxsize(&info);
- if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
- pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
- need);
- return -ENOMEM;
- }
-
- /* Setup s390dbf facility */
- cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
- if (!cf_diag_dbg) {
- pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
- return -ENOMEM;
- }
- debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
- rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
- if (rc) {
- debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
- debug_unregister(cf_diag_dbg);
- pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
- rc);
- }
- return rc;
-}
-arch_initcall(cf_diag_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 8b33e03e47b8..0d64aafd158f 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -238,6 +238,134 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af);
+CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7);
+CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd);
+CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+
static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES),
CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS),
@@ -286,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = {
NULL,
};
-static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = {
+static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS),
@@ -516,6 +644,141 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = {
NULL,
};
+static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
+static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z16, SORTL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
/* END: CPUM_CF COUNTER DEFINITIONS ===================================== */
static struct attribute_group cpumcf_pmu_events_group = {
@@ -596,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
case 1 ... 5:
csvn = cpumcf_svn_12345_pmu_event_attr;
break;
- case 6:
- csvn = cpumcf_svn_6_pmu_event_attr;
+ case 6 ... 7:
+ csvn = cpumcf_svn_67_pmu_event_attr;
break;
default:
csvn = none;
@@ -624,9 +887,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
break;
case 0x3906:
case 0x3907:
+ model = cpumcf_z14_pmu_event_attr;
+ break;
case 0x8561:
case 0x8562:
- model = cpumcf_z14_pmu_event_attr;
+ model = cpumcf_z15_pmu_event_attr;
+ break;
+ case 0x3931:
+ case 0x3932:
+ model = cpumcf_z16_pmu_event_attr;
break;
default:
model = none;
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index b095b1c78987..332a49965130 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -372,28 +372,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
- unsigned long n_sdb, freq, factor;
+ unsigned long n_sdb, freq;
size_t sample_size;
/* Calculate sampling buffers using 4K pages
*
- * 1. Determine the sample data size which depends on the used
- * sampling functions, for example, basic-sampling or
- * basic-sampling with diagnostic-sampling.
+ * 1. The sampling size is 32 bytes for basic sampling. This size
+ * is the same for all machine types. Diagnostic
+ * sampling uses auxiliary data buffer setup which provides the
+ * memory for SDBs using linux common code auxiliary trace
+ * setup.
*
- * 2. Use the sampling frequency as input. The sampling buffer is
- * designed for almost one second. This can be adjusted through
- * the "factor" variable.
- * In any case, alloc_sampling_buffer() sets the Alert Request
+ * 2. Function alloc_sampling_buffer() sets the Alert Request
* Control indicator to trigger a measurement-alert to harvest
- * sample-data-blocks (sdb).
+ * sample-data-blocks (SDB). This is done per SDB. This
+ * measurement alert interrupt fires quickly enough to handle
+ * one SDB, on very high frequency and work loads there might
+ * be 2 to 3 SDBs available for sample processing.
+ * Currently there is no need to set up an alert request on every
+ * n-th page. This is counterproductive as one IRQ triggers
+ * a very high number of samples to be processed at one IRQ.
*
- * 3. Compute the number of sample-data-blocks and ensure a minimum
- * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not
- * exceed a "calculated" maximum. The symbolic maximum is
- * designed for basic-sampling only and needs to be increased if
- * diagnostic-sampling is active.
- * See also the remarks for these symbolic constants.
+ * 3. Use the sampling frequency as input.
+ * Compute the number of SDBs and ensure a minimum
+ * of CPUM_SF_MIN_SDB. Depending on frequency add some more
+ * SDBs to handle a higher sampling rate.
+ * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
+ * (one SDB) for every 10000 HZ frequency increment.
*
* 4. Compute the number of sample-data-block-tables (SDBT) and
* ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
@@ -401,10 +406,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
*/
sample_size = sizeof(struct hws_basic_entry);
freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
- factor = 1;
- n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
- if (n_sdb < CPUM_SF_MIN_SDB)
- n_sdb = CPUM_SF_MIN_SDB;
+ n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);
/* If there is already a sampling buffer allocated, it is very likely
* that the sampling facility is enabled too. If the event to be
@@ -670,7 +672,7 @@ static void cpumsf_output_event_pid(struct perf_event *event,
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ if (perf_output_begin(&handle, data, event, header.size))
goto out;
/* Update the process ID (see also kernel/events/core.c) */
@@ -879,12 +881,21 @@ out:
return err;
}
+static bool is_callchain_event(struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
+ PERF_SAMPLE_STACK_USER);
+}
+
static int cpumsf_pmu_event_init(struct perf_event *event)
{
int err;
/* No support for taken branch sampling */
- if (has_branch_stack(event))
+ /* No support for callchain, stacks and registers */
+ if (has_branch_stack(event) || is_callchain_event(event))
return -EOPNOTSUPP;
switch (event->attr.type) {
@@ -1168,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
sample = (struct hws_basic_entry *) *sdbt;
while ((unsigned long *) sample < (unsigned long *) te) {
/* Check for an empty sample */
- if (!sample->def)
+ if (!sample->def || sample->LS)
break;
/* Update perf event period */
@@ -1576,6 +1587,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
unsigned long range = 0, size;
unsigned long long overflow = 0;
struct perf_output_handle *handle = &cpuhw->handle;
+ unsigned long num_sdb;
aux = perf_get_aux(handle);
if (WARN_ON_ONCE(!aux))
@@ -1587,13 +1599,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
size >> PAGE_SHIFT);
perf_aux_output_end(handle, size);
+ num_sdb = aux->sfb.num_sdb;
while (!done) {
/* Get an output handle */
aux = perf_aux_output_begin(handle, cpuhw->event);
if (handle->size == 0) {
pr_err("The AUX buffer with %lu pages for the "
"diagnostic-sampling mode is full\n",
- aux->sfb.num_sdb);
+ num_sdb);
debug_sprintf_event(sfdbg, 1,
"%s: AUX buffer used up\n",
__func__);
@@ -1669,7 +1682,7 @@ static void aux_sdb_init(unsigned long sdb)
/* Save clock base */
te->clock_base = 1;
- memcpy(&te->progusage2, &tod_clock_base[1], 8);
+ te->progusage2 = tod_clock_base.tod;
}
/*
@@ -2215,4 +2228,4 @@ out:
}
arch_initcall(init_cpum_sampling_pmu);
-core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640);
+core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 1e75cc983546..c27321cb0969 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -23,27 +23,6 @@
#include <asm/sysinfo.h>
#include <asm/unwind.h>
-const char *perf_pmu_name(void)
-{
- if (cpum_cf_avail() || cpum_sf_avail())
- return "CPU-Measurement Facilities (CPU-MF)";
- return "pmu";
-}
-EXPORT_SYMBOL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
- int num = 0;
-
- if (cpum_cf_avail())
- num += PERF_CPUM_CF_MAX_CTR;
- if (cpum_sf_avail())
- num += PERF_CPUM_SF_MAX_CTR;
-
- return num;
-}
-EXPORT_SYMBOL(perf_num_counters);
-
static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
{
struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
@@ -51,7 +30,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
if (!stack)
return NULL;
- return (struct kvm_s390_sie_block *) stack->empty1[0];
+ return (struct kvm_s390_sie_block *)stack->sie_control_block;
}
static bool is_in_guest(struct pt_regs *regs)
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
new file mode 100644
index 000000000000..6826e2a69a21
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_crypto"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+
+#include <asm/ctl_reg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+static debug_info_t *cfm_dbg;
+static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */
+ /* extracted with QPACI instruction */
+
+DEFINE_STATIC_KEY_FALSE(pai_key);
+
+/* One entry of the raw sample data: the counter number followed by the
+ * counter value. Packed, so an entry is 2 + 8 bytes without padding.
+ */
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+/* Per-CPU state of the PAI crypto PMU. */
+struct paicrypt_map {
+ unsigned long *page; /* Page for CPU to store counters */
+ struct pai_userdata *save; /* Buffer to store non-zero counters */
+ unsigned int users; /* # of PAI crypto users */
+ unsigned int sampler; /* # of PAI crypto samplers */
+ unsigned int counter; /* # of PAI crypto counters */
+ struct perf_event *event; /* Perf event for sampling */
+};
+
+static DEFINE_PER_CPU(struct paicrypt_map, paicrypt_map);
+
+/* Release the PMU if event is the last perf event */
+static DEFINE_MUTEX(pai_reserve_mutex);
+
+/* Event destroy callback, installed by paicrypt_event_init().
+ * Adjust usage counters and remove allocated memory when all users are
+ * gone.
+ */
+static void paicrypt_event_destroy(struct perf_event *event)
+{
+ struct paicrypt_map *cpump = per_cpu_ptr(&paicrypt_map, event->cpu);
+
+ cpump->event = NULL;
+ /* Pairs with static_branch_inc() in paicrypt_event_init() */
+ static_branch_dec(&pai_key);
+ mutex_lock(&pai_reserve_mutex);
+ /* Drop the same usage counter paicrypt_busy() incremented */
+ if (event->attr.sample_period)
+ cpump->sampler -= 1;
+ else
+ cpump->counter -= 1;
+ debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d"
+ " sampler %d counter %d\n", __func__,
+ event->attr.config, event->cpu, cpump->sampler,
+ cpump->counter);
+ /* Last user on this CPU: release counter page and save buffer */
+ if (!cpump->counter && !cpump->sampler) {
+ debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
+ __func__, (unsigned long)cpump->page,
+ cpump->save);
+ free_page((unsigned long)cpump->page);
+ cpump->page = NULL;
+ kvfree(cpump->save);
+ cpump->save = NULL;
+ }
+ mutex_unlock(&pai_reserve_mutex);
+}
+
+/* Fetch counter @nr from the counter page of @cpump. When @kernel is set
+ * the slot PAI_CRYPTO_MAXCTR entries further into the page is read.
+ */
+static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel)
+{
+	int slot = kernel ? nr + PAI_CRYPTO_MAXCTR : nr;
+
+	return cpump->page[slot];
+}
+
+/* Read counter data of the current CPU for @event. A specific event
+ * returns its single counter, selected by the offset of the event number
+ * from PAI_CRYPTO_BASE. Event CRYPTO_ALL (PAI_CRYPTO_BASE itself) returns
+ * the sum of counters 1..paicrypt_cnt. @kernel selects the kernel slots.
+ */
+static u64 paicrypt_getdata(struct perf_event *event, bool kernel)
+{
+	struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
+	u64 cfg = event->attr.config;
+	u64 total = 0;
+	int nr;
+
+	if (cfg == PAI_CRYPTO_BASE) {
+		/* Zero valued counters do not change the sum */
+		for (nr = 1; nr <= paicrypt_cnt; nr++)
+			total += paicrypt_getctr(cpump, nr, kernel);
+		return total;
+	}
+
+	return paicrypt_getctr(cpump, cfg - PAI_CRYPTO_BASE, kernel);
+}
+
+/* Return the counter value of @event, adding up the kernel and user
+ * portions unless excluded by the event attributes.
+ */
+static u64 paicrypt_getall(struct perf_event *event)
+{
+	u64 kernel_part = 0, user_part = 0;
+
+	if (!event->attr.exclude_kernel)
+		kernel_part = paicrypt_getdata(event, true);
+	if (!event->attr.exclude_user)
+		user_part = paicrypt_getdata(event, false);
+
+	return kernel_part + user_part;
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for crypto events.
+ *
+ * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ *
+ * Returns 0 on success, -EBUSY when the requested mode conflicts with the
+ * events already active on that CPU and -ENOMEM when an allocation fails.
+ */
+static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump)
+{
+ unsigned int *use_ptr;
+ int rc = 0;
+
+ mutex_lock(&pai_reserve_mutex);
+ if (a->sample_period) { /* Sampling requested */
+ use_ptr = &cpump->sampler;
+ if (cpump->counter || cpump->sampler)
+ rc = -EBUSY; /* ... sampling/counting active */
+ } else { /* Counting requested */
+ use_ptr = &cpump->counter;
+ if (cpump->sampler)
+ rc = -EBUSY; /* ... and sampling active */
+ }
+ if (rc)
+ goto unlock;
+
+ /* Allocate memory for counter page and counter extraction.
+ * Only the first counting event has to allocate a page.
+ */
+ if (cpump->page)
+ goto unlock;
+
+ rc = -ENOMEM;
+ cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (!cpump->page)
+ goto unlock;
+ /* One entry per counter plus one spare entry */
+ cpump->save = kvmalloc_array(paicrypt_cnt + 1,
+ sizeof(struct pai_userdata), GFP_KERNEL);
+ if (!cpump->save) {
+ /* Undo the page allocation, nothing stays half set up */
+ free_page((unsigned long)cpump->page);
+ cpump->page = NULL;
+ goto unlock;
+ }
+ rc = 0;
+
+unlock:
+ /* If rc is non-zero, do not increment counter/sampler. */
+ if (!rc)
+ *use_ptr += 1;
+ debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d"
+ " counter %d page %#lx save %p rc %d\n", __func__,
+ a->sample_period, cpump->sampler, cpump->counter,
+ (unsigned long)cpump->page, cpump->save, rc);
+ mutex_unlock(&pai_reserve_mutex);
+ return rc;
+}
+
+/* Validate the event attributes and reserve the per-CPU resources for the
+ * event. Might be called on different CPU than the one the event is
+ * intended for, hence the per-CPU data is looked up via event->cpu.
+ *
+ * Returns 0 on success, -ENOENT or -EINVAL for events this PMU does not
+ * accept, or the error code of paicrypt_busy().
+ */
+static int paicrypt_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ struct paicrypt_map *cpump;
+ int rc;
+
+ /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI crypto event must be in valid range */
+ if (a->config < PAI_CRYPTO_BASE ||
+ a->config > PAI_CRYPTO_BASE + paicrypt_cnt)
+ return -EINVAL;
+ /* Allow only CPU wide operation, no process context for now. */
+ if (event->hw.target || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only CRYPTO_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_CRYPTO_BASE)
+ return -EINVAL;
+
+ cpump = per_cpu_ptr(&paicrypt_map, event->cpu);
+ rc = paicrypt_busy(a, cpump);
+ if (rc)
+ return rc;
+
+ /* Event initialization sets last_tag to 0. When later on the events
+ * are deleted and re-added, do not reset the event count value to zero.
+ * Events are added, deleted and re-added when 2 or more events
+ * are active at the same time.
+ */
+ event->hw.last_tag = 0;
+ cpump->event = event;
+ /* paicrypt_event_destroy() undoes the reservation made above */
+ event->destroy = paicrypt_event_destroy;
+
+ if (a->sample_period) {
+ /* Any requested period or frequency is overridden here */
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paicrypt_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which contain the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ /* Pairs with static_branch_dec() in paicrypt_event_destroy() */
+ static_branch_inc(&pai_key);
+ return 0;
+}
+
+/* Add the counter increment since the previous read to the event count
+ * and remember the current counter value for the next read.
+ */
+static void paicrypt_read(struct perf_event *event)
+{
+	u64 last = local64_read(&event->hw.prev_count);
+	u64 now = paicrypt_getall(event);
+
+	local64_set(&event->hw.prev_count, now);
+	/* Unsigned subtraction wraps modulo 2^64, which yields the correct
+	 * increment also when the counter overflowed (now < last).
+	 */
+	local64_add(now - last, &event->count);
+}
+
+/* Start counting. Only the first start after event initialization
+ * (last_tag still zero) records the current counter value as base and
+ * resets the event count; later calls leave the event untouched.
+ */
+static void paicrypt_start(struct perf_event *event, int flags)
+{
+	u64 base;
+
+	if (event->hw.last_tag)
+		return;
+
+	event->hw.last_tag = 1;
+	base = paicrypt_getall(event);		/* Get current value */
+	local64_set(&event->count, 0);
+	local64_set(&event->hw.prev_count, base);
+}
+
+/* Add @event to the PMU on the current CPU. The first user on this CPU
+ * publishes the counter page in the lowcore and sets bit 50 in control
+ * register 0. Always returns 0.
+ */
+static int paicrypt_add(struct perf_event *event, int flags)
+{
+ struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
+ unsigned long ccd;
+
+ if (cpump->users++ == 0) {
+ /* First user: hand the physical address of the page to the CPU */
+ ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
+ WRITE_ONCE(S390_lowcore.ccd, ccd);
+ /* NOTE(review): presumably the PAI crypto enablement bit - confirm */
+ __ctl_set_bit(0, 50);
+ }
+ cpump->event = event;
+ if (flags & PERF_EF_START && !event->attr.sample_period) {
+ /* Only counting needs initial counter value */
+ paicrypt_start(event, PERF_EF_RELOAD);
+ }
+ event->hw.state = 0;
+ /* Sampling events get paicrypt_sched_task() invoked on context switch */
+ if (event->attr.sample_period)
+ perf_sched_cb_inc(event->pmu);
+ return 0;
+}
+
+/* Stop @event: fold the current counter value into the event count and
+ * mark the event as stopped. @flags is not evaluated.
+ */
+static void paicrypt_stop(struct perf_event *event, int flags)
+{
+ paicrypt_read(event);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Remove @event from the PMU on the current CPU. The last user on this
+ * CPU clears bit 50 in control register 0 and withdraws the counter page
+ * from the lowcore.
+ */
+static void paicrypt_del(struct perf_event *event, int flags)
+{
+	struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
+
+	if (event->attr.sample_period) {
+		perf_sched_cb_dec(event->pmu);
+	} else {
+		/* Only counting needs to read counter */
+		paicrypt_stop(event, PERF_EF_UPDATE);
+	}
+
+	if (--cpump->users == 0) {
+		__ctl_clear_bit(0, 50);
+		WRITE_ONCE(S390_lowcore.ccd, 0);
+	}
+}
+
+/* Build the raw sample data in @userdata from the counter page of @cpump
+ * and return the number of bytes stored. Only counters with a non-zero
+ * value are saved, each as one entry of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ * The kernel and user portions of a counter are added up unless excluded.
+ */
+static size_t paicrypt_copy(struct pai_userdata *userdata,
+			    struct paicrypt_map *cpump,
+			    bool exclude_user, bool exclude_kernel)
+{
+	struct pai_userdata *out = userdata;
+	int nr;
+
+	for (nr = 1; nr <= paicrypt_cnt; nr++) {
+		u64 val = 0;
+
+		if (!exclude_kernel)
+			val += paicrypt_getctr(cpump, nr, true);
+		if (!exclude_user)
+			val += paicrypt_getctr(cpump, nr, false);
+		if (!val)
+			continue;
+
+		out->num = nr;
+		out->value = val;
+		out++;
+	}
+	return (out - userdata) * sizeof(struct pai_userdata);
+}
+
+/* Emit one sample with the non-zero counters of the current CPU as raw
+ * data and clear the counter page afterwards. Returns 0 when no event is
+ * active or no counter was incremented, otherwise the return value of
+ * perf_event_overflow().
+ */
+static int paicrypt_push_sample(void)
+{
+ struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
+ struct perf_event *event = cpump->event;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ size_t rawsize;
+ int overflow;
+
+ if (!cpump->event) /* No event active */
+ return 0;
+ rawsize = paicrypt_copy(cpump->save, cpump,
+ cpump->event->attr.exclude_user,
+ cpump->event->attr.exclude_kernel);
+ if (!rawsize) /* No incremented counters */
+ return 0;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ /* Fill in only the sample fields requested via sample_type */
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU) {
+ data.cpu_entry.cpu = smp_processor_id();
+ data.cpu_entry.reserved = 0;
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ /* Raw data is the save buffer filled by paicrypt_copy() above */
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ raw.size = raw.frag.size;
+ data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore page after read */
+ memset(cpump->page, 0, PAGE_SIZE);
+ return overflow;
+}
+
+/* Context switch callback, called on schedule-in and schedule-out. There
+ * is no access to the event structure here, but for sampling only event
+ * CRYPTO_ALL is allowed.
+ */
+static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	/* Nothing to do on schedule-in, the page was clean on event
+	 * installation.
+	 */
+	if (sched_in)
+		return;
+
+	/* Schedule-out: read out the results, a dirty page is cleared */
+	paicrypt_push_sample();
+}
+
+/* Attribute definitions for paicrypt interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported, there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1000 + offset in mapped kernel page.
+ * All CPU Measurement Facility counters identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+/* The event number occupies the complete 64 bit config value */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+/* Attributes of the sysfs "format" group */
+static struct attribute *paicrypt_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+/* sysfs "events" group, its attribute list is created at init time */
+static struct attribute_group paicrypt_events_group = {
+ .name = "events",
+ .attrs = NULL /* Filled in attr_event_init() */
+};
+
+static struct attribute_group paicrypt_format_group = {
+ .name = "format",
+ .attrs = paicrypt_format_attr,
+};
+
+/* NULL terminated list of all sysfs groups of this PMU */
+static const struct attribute_group *paicrypt_attr_groups[] = {
+ &paicrypt_events_group,
+ &paicrypt_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paicrypt = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paicrypt_event_init,
+ .add = paicrypt_add,
+ .del = paicrypt_del,
+ .start = paicrypt_start,
+ .stop = paicrypt_stop,
+ .read = paicrypt_read,
+ .sched_task = paicrypt_sched_task,
+ .attr_groups = paicrypt_attr_groups
+};
+
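+/* List of symbolic PAI counter names, indexed by counter number. Index 0
+ * names the event which sums up all counters. Each name is used as the
+ * name of one sysfs event attribute, see attr_event_init_one().
+ * NOTE(review): entry 114 ends in "AES_256A", the trailing 'A' looks like
+ * a typo. Confirm that nothing in user space depends on this name before
+ * changing it.
+ */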
+static const char * const paicrypt_ctrnames[] = {
+ [0] = "CRYPTO_ALL",
+ [1] = "KM_DEA",
+ [2] = "KM_TDEA_128",
+ [3] = "KM_TDEA_192",
+ [4] = "KM_ENCRYPTED_DEA",
+ [5] = "KM_ENCRYPTED_TDEA_128",
+ [6] = "KM_ENCRYPTED_TDEA_192",
+ [7] = "KM_AES_128",
+ [8] = "KM_AES_192",
+ [9] = "KM_AES_256",
+ [10] = "KM_ENCRYPTED_AES_128",
+ [11] = "KM_ENCRYPTED_AES_192",
+ [12] = "KM_ENCRYPTED_AES_256",
+ [13] = "KM_XTS_AES_128",
+ [14] = "KM_XTS_AES_256",
+ [15] = "KM_XTS_ENCRYPTED_AES_128",
+ [16] = "KM_XTS_ENCRYPTED_AES_256",
+ [17] = "KMC_DEA",
+ [18] = "KMC_TDEA_128",
+ [19] = "KMC_TDEA_192",
+ [20] = "KMC_ENCRYPTED_DEA",
+ [21] = "KMC_ENCRYPTED_TDEA_128",
+ [22] = "KMC_ENCRYPTED_TDEA_192",
+ [23] = "KMC_AES_128",
+ [24] = "KMC_AES_192",
+ [25] = "KMC_AES_256",
+ [26] = "KMC_ENCRYPTED_AES_128",
+ [27] = "KMC_ENCRYPTED_AES_192",
+ [28] = "KMC_ENCRYPTED_AES_256",
+ [29] = "KMC_PRNG",
+ [30] = "KMA_GCM_AES_128",
+ [31] = "KMA_GCM_AES_192",
+ [32] = "KMA_GCM_AES_256",
+ [33] = "KMA_GCM_ENCRYPTED_AES_128",
+ [34] = "KMA_GCM_ENCRYPTED_AES_192",
+ [35] = "KMA_GCM_ENCRYPTED_AES_256",
+ [36] = "KMF_DEA",
+ [37] = "KMF_TDEA_128",
+ [38] = "KMF_TDEA_192",
+ [39] = "KMF_ENCRYPTED_DEA",
+ [40] = "KMF_ENCRYPTED_TDEA_128",
+ [41] = "KMF_ENCRYPTED_TDEA_192",
+ [42] = "KMF_AES_128",
+ [43] = "KMF_AES_192",
+ [44] = "KMF_AES_256",
+ [45] = "KMF_ENCRYPTED_AES_128",
+ [46] = "KMF_ENCRYPTED_AES_192",
+ [47] = "KMF_ENCRYPTED_AES_256",
+ [48] = "KMCTR_DEA",
+ [49] = "KMCTR_TDEA_128",
+ [50] = "KMCTR_TDEA_192",
+ [51] = "KMCTR_ENCRYPTED_DEA",
+ [52] = "KMCTR_ENCRYPTED_TDEA_128",
+ [53] = "KMCTR_ENCRYPTED_TDEA_192",
+ [54] = "KMCTR_AES_128",
+ [55] = "KMCTR_AES_192",
+ [56] = "KMCTR_AES_256",
+ [57] = "KMCTR_ENCRYPTED_AES_128",
+ [58] = "KMCTR_ENCRYPTED_AES_192",
+ [59] = "KMCTR_ENCRYPTED_AES_256",
+ [60] = "KMO_DEA",
+ [61] = "KMO_TDEA_128",
+ [62] = "KMO_TDEA_192",
+ [63] = "KMO_ENCRYPTED_DEA",
+ [64] = "KMO_ENCRYPTED_TDEA_128",
+ [65] = "KMO_ENCRYPTED_TDEA_192",
+ [66] = "KMO_AES_128",
+ [67] = "KMO_AES_192",
+ [68] = "KMO_AES_256",
+ [69] = "KMO_ENCRYPTED_AES_128",
+ [70] = "KMO_ENCRYPTED_AES_192",
+ [71] = "KMO_ENCRYPTED_AES_256",
+ [72] = "KIMD_SHA_1",
+ [73] = "KIMD_SHA_256",
+ [74] = "KIMD_SHA_512",
+ [75] = "KIMD_SHA3_224",
+ [76] = "KIMD_SHA3_256",
+ [77] = "KIMD_SHA3_384",
+ [78] = "KIMD_SHA3_512",
+ [79] = "KIMD_SHAKE_128",
+ [80] = "KIMD_SHAKE_256",
+ [81] = "KIMD_GHASH",
+ [82] = "KLMD_SHA_1",
+ [83] = "KLMD_SHA_256",
+ [84] = "KLMD_SHA_512",
+ [85] = "KLMD_SHA3_224",
+ [86] = "KLMD_SHA3_256",
+ [87] = "KLMD_SHA3_384",
+ [88] = "KLMD_SHA3_512",
+ [89] = "KLMD_SHAKE_128",
+ [90] = "KLMD_SHAKE_256",
+ [91] = "KMAC_DEA",
+ [92] = "KMAC_TDEA_128",
+ [93] = "KMAC_TDEA_192",
+ [94] = "KMAC_ENCRYPTED_DEA",
+ [95] = "KMAC_ENCRYPTED_TDEA_128",
+ [96] = "KMAC_ENCRYPTED_TDEA_192",
+ [97] = "KMAC_AES_128",
+ [98] = "KMAC_AES_192",
+ [99] = "KMAC_AES_256",
+ [100] = "KMAC_ENCRYPTED_AES_128",
+ [101] = "KMAC_ENCRYPTED_AES_192",
+ [102] = "KMAC_ENCRYPTED_AES_256",
+ [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA",
+ [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128",
+ [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192",
+ [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA",
+ [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128",
+ [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192",
+ [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128",
+ [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192",
+ [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256",
+ [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128",
+ [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192",
+ [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A",
+ [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128",
+ [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256",
+ [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128",
+ [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256",
+ [119] = "PCC_SCALAR_MULTIPLY_P256",
+ [120] = "PCC_SCALAR_MULTIPLY_P384",
+ [121] = "PCC_SCALAR_MULTIPLY_P521",
+ [122] = "PCC_SCALAR_MULTIPLY_ED25519",
+ [123] = "PCC_SCALAR_MULTIPLY_ED448",
+ [124] = "PCC_SCALAR_MULTIPLY_X25519",
+ [125] = "PCC_SCALAR_MULTIPLY_X448",
+ [126] = "PRNO_SHA_512_DRNG",
+ [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO",
+ [128] = "PRNO_TRNG",
+ [129] = "KDSA_ECDSA_VERIFY_P256",
+ [130] = "KDSA_ECDSA_VERIFY_P384",
+ [131] = "KDSA_ECDSA_VERIFY_P521",
+ [132] = "KDSA_ECDSA_SIGN_P256",
+ [133] = "KDSA_ECDSA_SIGN_P384",
+ [134] = "KDSA_ECDSA_SIGN_P521",
+ [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256",
+ [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384",
+ [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521",
+ [138] = "KDSA_EDDSA_VERIFY_ED25519",
+ [139] = "KDSA_EDDSA_VERIFY_ED448",
+ [140] = "KDSA_EDDSA_SIGN_ED25519",
+ [141] = "KDSA_EDDSA_SIGN_ED448",
+ [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519",
+ [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448",
+ [144] = "PCKMO_ENCRYPT_DEA_KEY",
+ [145] = "PCKMO_ENCRYPT_TDEA_128_KEY",
+ [146] = "PCKMO_ENCRYPT_TDEA_192_KEY",
+ [147] = "PCKMO_ENCRYPT_AES_128_KEY",
+ [148] = "PCKMO_ENCRYPT_AES_192_KEY",
+ [149] = "PCKMO_ENCRYPT_AES_256_KEY",
+ [150] = "PCKMO_ENCRYPT_ECC_P256_KEY",
+ [151] = "PCKMO_ENCRYPT_ECC_P384_KEY",
+ [152] = "PCKMO_ENCRYPT_ECC_P521_KEY",
+ [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY",
+ [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY",
+ [155] = "IBM_RESERVED_155",
+ [156] = "IBM_RESERVED_156",
+};
+
+/* Free the first @num event attributes referenced by @attrs and the
+ * pointer array itself. Each array entry points into a kzalloc'ed
+ * struct perf_pmu_events_attr, see attr_event_init_one().
+ */
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+	int idx;
+
+	for (idx = 0; idx < num; idx++) {
+		struct device_attribute *dap;
+
+		dap = container_of(attrs[idx], struct device_attribute, attr);
+		kfree(container_of(dap, struct perf_pmu_events_attr, attr));
+	}
+	kfree(attrs);
+}
+
+/* Allocate and set up the sysfs event attribute for counter number @num
+ * and store it in @attrs[@num]. Returns 0 on success and -ENOMEM when the
+ * allocation fails, in which case @attrs[@num] is left untouched.
+ */
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ /* Event number is the counter number offset by PAI_CRYPTO_BASE */
+ pa->id = PAI_CRYPTO_BASE + num;
+ pa->attr.attr.name = paicrypt_ctrnames[num];
+ /* Read-only attribute, no store function */
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly: one attribute per entry
+ * of paicrypt_ctrnames[], collected in a NULL terminated pointer array
+ * which is installed as attribute list of the "events" group.
+ *
+ * Returns 0 on success. On allocation failure everything allocated so far
+ * is released again and -ENOMEM is returned.
+ */
+static int __init attr_event_init(void)
+{
+	struct attribute **attrs;
+	int ret, i;
+
+	attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs),
+			      GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+	for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) {
+		ret = attr_event_init_one(attrs, i);
+		if (ret) {
+			/* Entries 0..i-1 have been allocated successfully.
+			 * attr_event_free() releases all entries below the
+			 * given count, so pass i (not i - 1) to avoid
+			 * leaking the last allocated entry.
+			 */
+			attr_event_free(attrs, i);
+			return ret;
+		}
+	}
+	attrs[i] = NULL;
+	paicrypt_events_group.attrs = attrs;
+	return 0;
+}
+
+/* Initialize the PAI crypto PMU: query the number of mapped counters,
+ * create the sysfs event attributes, set up the s390 debug feature and
+ * register the PMU.
+ *
+ * Returns 0 on success and also when facility bit 196 is not installed or
+ * no counter is reported. On failure everything set up so far is released
+ * again, including the sysfs event attributes, and an error code is
+ * returned.
+ */
+static int __init paicrypt_init(void)
+{
+	struct qpaci_info_block ib;
+	int rc;
+
+	if (!test_facility(196))
+		return 0;
+
+	qpaci(&ib);
+	paicrypt_cnt = ib.num_cc;
+	if (paicrypt_cnt == 0)
+		return 0;
+	/* Limit to the number of counters this driver can handle */
+	if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR)
+		paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1;
+
+	rc = attr_event_init();		/* Export known PAI crypto events */
+	if (rc) {
+		pr_err("Creation of PMU pai_crypto /sysfs failed\n");
+		return rc;
+	}
+
+	/* Setup s390dbf facility */
+	cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+	if (!cfm_dbg) {
+		pr_err("Registration of s390dbf pai_crypto failed\n");
+		rc = -ENOMEM;
+		goto out_attrs;
+	}
+	debug_register_view(cfm_dbg, &debug_sprintf_view);
+
+	rc = perf_pmu_register(&paicrypt, "pai_crypto", -1);
+	if (rc) {
+		pr_err("Registering the pai_crypto PMU failed with rc=%i\n",
+		       rc);
+		goto out_dbg;
+	}
+	return 0;
+
+out_dbg:
+	debug_unregister_view(cfm_dbg, &debug_sprintf_view);
+	debug_unregister(cfm_dbg);
+out_attrs:
+	/* The PMU is not registered, nothing references the attributes */
+	attr_event_free(paicrypt_events_group.attrs,
+			ARRAY_SIZE(paicrypt_ctrnames));
+	paicrypt_events_group.attrs = NULL;
+	return rc;
+}
+
+device_initcall(paicrypt_init);
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
new file mode 100644
index 000000000000..74b53c531e0c
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -0,0 +1,672 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Extension
+ * Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_ext"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+
+#include <asm/cpu_mcf.h>
+#include <asm/ctl_reg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
+#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
+
+static debug_info_t *paiext_dbg;
+static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
+
+/* Mode the PAI extension PMU is used in on one CPU, see paiext_alloc() */
+enum paiext_mode {
+ PAI_MODE_NONE,
+ PAI_MODE_SAMPLING,
+ PAI_MODE_COUNTER,
+};
+
+/* One entry of the raw sample data: the counter number followed by the
+ * counter value. Packed, so an entry is 2 + 8 bytes without padding.
+ */
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+/* Layout of the PAI extension 1 control block area.
+ * The PAI extension control block 1 is pointed to by lowcore
+ * address 0x1508 for each CPU. This control block is 512 bytes in size
+ * and requires a 512 byte boundary alignment.
+ * The members add up to 3 * 8 + 488 = 512 bytes (PAIE1_CB_SZ).
+ */
+struct paiext_cb { /* PAI extension 1 control block */
+ u64 header; /* Not used */
+ u64 reserved1;
+ u64 acc; /* Addr to analytics counter control block */
+ u8 reserved2[488];
+} __packed;
+
+/* State of the PAI extension PMU for one CPU, allocated by paiext_alloc()
+ * for the first event on that CPU and released by paiext_free().
+ */
+struct paiext_map {
+ unsigned long *area; /* Area for CPU to store counters */
+ struct pai_userdata *save; /* Area to store non-zero counters */
+ enum paiext_mode mode; /* Type of event */
+ unsigned int active_events; /* # of PAI Extension users */
+ unsigned int refcnt; /* # of events referring to this map */
+ struct perf_event *event; /* Perf event for sampling */
+ struct paiext_cb *paiext_cb; /* PAI extension control block area */
+};
+
+/* Per CPU pointer to the dynamically allocated state of that CPU */
+struct paiext_mapptr {
+ struct paiext_map *mapptr;
+};
+
+static struct paiext_root { /* Anchor to per CPU data */
+ int refcnt; /* Overall active events */
+ struct paiext_mapptr __percpu *mapptr;
+} paiext_root;
+
+/* Drop one reference on the anchor. The per CPU pointer array is released
+ * when the last reference is gone.
+ */
+static void paiext_root_free(void)
+{
+	paiext_root.refcnt--;
+	if (paiext_root.refcnt)
+		return;
+
+	free_percpu(paiext_root.mapptr);
+	paiext_root.mapptr = NULL;
+}
+
+/* Take one reference on the anchor. The first reference also allocates
+ * the per CPU data dynamically: an array of pointers with one entry for
+ * each possible CPU, which might be more than the number of CPUs
+ * currently online.
+ *
+ * Returns 0 on success and -ENOMEM when the allocation fails.
+ */
+static int paiext_root_alloc(void)
+{
+	if (++paiext_root.refcnt != 1)
+		return 0;
+
+	/* The memory is already zeroed. */
+	paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
+
+	/* Returning without refcnt adjustment on failure is ok. The error
+	 * code is handled by paiext_alloc() which decrements refcnt when an
+	 * event can not be created.
+	 */
+	return paiext_root.mapptr ? 0 : -ENOMEM;
+}
+
+/* Protects the reference counters against concurrent updates and
+ * prohibits concurrent execution of counting and sampling events.
+ * Ensures that analytics counter block is deallocated only when the
+ * sampling and counting on that cpu is zero.
+ * For details see paiext_alloc().
+ */
+static DEFINE_MUTEX(paiext_reserve_mutex);
+
+/* Release the per CPU state anchored at @mp together with all buffers it
+ * refers to and reset the anchor pointer.
+ */
+static void paiext_free(struct paiext_mapptr *mp)
+{
+	struct paiext_map *map = mp->mapptr;
+
+	kfree(map->area);
+	kfree(map->paiext_cb);
+	kvfree(map->save);
+	kfree(map);
+	mp->mapptr = NULL;
+}
+
+/* Event destroy callback, installed by paiext_event_init(). Drop the
+ * references taken by paiext_alloc(): release the state of the event's
+ * CPU when its last reference is gone and drop one reference on the
+ * anchor.
+ */
+static void paiext_event_destroy(struct perf_event *event)
+{
+	struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+	struct paiext_map *cpump = mp->mapptr;
+
+	mutex_lock(&paiext_reserve_mutex);
+	cpump->event = NULL;
+	if (!--cpump->refcnt)		/* Last reference gone */
+		paiext_free(mp);
+	/* Log while @mp is still valid: paiext_root_free() below releases
+	 * the per CPU area @mp points into when the last event is gone.
+	 */
+	debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
+			    event->cpu, mp->mapptr);
+	paiext_root_free();
+	mutex_unlock(&paiext_reserve_mutex);
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for pai_extension events.
+ *
+ * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization functions fails, no other call back will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+/* Returns 0 on success, -ENOMEM when an allocation fails and -EBUSY when
+ * the requested mode conflicts with the mode already active on that CPU.
+ */
+static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+{
+ struct paiext_mapptr *mp;
+ struct paiext_map *cpump;
+ int rc;
+
+ mutex_lock(&paiext_reserve_mutex);
+
+ /* Takes a reference on the anchor, dropped again on any error below */
+ rc = paiext_root_alloc();
+ if (rc)
+ goto unlock;
+
+ mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ cpump = mp->mapptr;
+ if (!cpump) { /* Paiext_map allocated? */
+ rc = -ENOMEM;
+ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+ if (!cpump)
+ goto unlock;
+
+ /* Allocate memory for counter area and counter extraction.
+ * These are
+ * - a 512 byte block and requires 512 byte boundary alignment.
+ * - a 1KB byte block and requires 1KB boundary alignment.
+ * Only the first counting event has to allocate the area.
+ *
+ * Note: This works with commit 59bb47985c1d by default.
+ * Backporting this to kernels without this commit might
+ * need adjustment.
+ */
+ mp->mapptr = cpump;
+ cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
+ cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+ cpump->save = kvmalloc_array(paiext_cnt + 1,
+ sizeof(struct pai_userdata),
+ GFP_KERNEL);
+ /* All three allocations are checked at once, paiext_free()
+ * releases whatever was allocated and resets mp->mapptr.
+ */
+ if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
+ paiext_free(mp);
+ goto unlock;
+ }
+ cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
+ : PAI_MODE_COUNTER;
+ } else {
+ /* Multiple invocation, check whats active.
+ * Supported are multiple counter events or only one sampling
+ * event concurrently at any one time.
+ */
+ if (cpump->mode == PAI_MODE_SAMPLING ||
+ (cpump->mode == PAI_MODE_COUNTER && a->sample_period)) {
+ rc = -EBUSY;
+ goto unlock;
+ }
+ }
+
+ rc = 0;
+ cpump->event = event;
+ ++cpump->refcnt;
+
+unlock:
+ if (rc) {
+ /* Error in allocation of event, decrement anchor. Since
+ * the event in not created, its destroy() function is never
+ * invoked. Adjust the reference counter for the anchor.
+ */
+ paiext_root_free();
+ }
+ mutex_unlock(&paiext_reserve_mutex);
+ /* If rc is non-zero, no increment of counter/sampler was done. */
+ return rc;
+}
+
+/* Validate the event number of @event. Event numbers in the range
+ * PAI_NNPA_BASE up to and including PAI_NNPA_BASE + paiext_cnt are
+ * accepted. For a valid event the offset of the NNPA entry within the
+ * PAI extension 1 control block is saved in the event.
+ *
+ * Returns 0 for a valid event number and -EINVAL otherwise.
+ */
+static int paiext_event_valid(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	if (cfg < PAI_NNPA_BASE || cfg > PAI_NNPA_BASE + paiext_cnt)
+		return -EINVAL;
+
+	/* Offset NNPA in paiext_cb */
+	event->hw.config_base = offsetof(struct paiext_cb, acc);
+	return 0;
+}
+
+/* Validate the event attributes and reserve the per-CPU resources for the
+ * event. Might be called on different CPU than the one the event is
+ * intended for.
+ *
+ * Returns 0 on success, -ENOENT or -EINVAL for events this PMU does not
+ * accept, or the error code of paiext_alloc().
+ */
+static int paiext_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ int rc;
+
+ /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI extension event must be valid and in supported range */
+ rc = paiext_event_valid(event);
+ if (rc)
+ return rc;
+ /* Allow only CPU wide operation, no process context for now. */
+ if (event->hw.target || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only event NNPA_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_NNPA_BASE)
+ return -EINVAL;
+ /* Prohibit exclude_user event selection */
+ if (a->exclude_user)
+ return -EINVAL;
+
+ rc = paiext_alloc(a, event);
+ if (rc)
+ return rc;
+ /* Zero last_tag makes the first paiext_start() set the base value */
+ event->hw.last_tag = 0;
+ /* paiext_event_destroy() undoes the reservation made above */
+ event->destroy = paiext_event_destroy;
+
+ if (a->sample_period) {
+ /* Any requested period or frequency is overridden here */
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paiext_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which are the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ return 0;
+}
+
+/* Return the value of counter number @nr from the counter area of @cpump */
+static u64 paiext_getctr(struct paiext_map *cpump, int nr)
+{
+ return cpump->area[nr];
+}
+
+/* Read counter data of the current CPU for @event. A specific event
+ * returns its single counter, selected by the offset of the event number
+ * from PAI_NNPA_BASE. Event NNPA_ALL (PAI_NNPA_BASE itself) returns the
+ * sum of counters 1..paiext_cnt.
+ */
+static u64 paiext_getdata(struct perf_event *event)
+{
+	struct paiext_mapptr *anchor = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *map = anchor->mapptr;
+	u64 cfg = event->attr.config;
+	u64 total = 0;
+	int nr;
+
+	if (cfg == PAI_NNPA_BASE) {
+		for (nr = 1; nr <= paiext_cnt; nr++)
+			total += paiext_getctr(map, nr);
+		return total;
+	}
+
+	return paiext_getctr(map, cfg - PAI_NNPA_BASE);
+}
+
+/* Return the counter value of @event, a plain wrapper of paiext_getdata() */
+static u64 paiext_getall(struct perf_event *event)
+{
+ return paiext_getdata(event);
+}
+
+/* Add the counter increment since the previous read to the event count
+ * and remember the current counter value for the next read.
+ */
+static void paiext_read(struct perf_event *event)
+{
+	u64 last = local64_read(&event->hw.prev_count);
+	u64 now = paiext_getall(event);
+
+	local64_set(&event->hw.prev_count, now);
+	local64_add(now - last, &event->count);
+}
+
+/* Start counting. Only the first start after event initialization
+ * (last_tag still zero) records the current counter value as base and
+ * resets the event count; later calls leave the event untouched.
+ */
+static void paiext_start(struct perf_event *event, int flags)
+{
+	u64 base;
+
+	if (!event->hw.last_tag) {
+		event->hw.last_tag = 1;
+		base = paiext_getall(event);	/* Get current value */
+		local64_set(&event->hw.prev_count, base);
+		local64_set(&event->count, 0);
+	}
+}
+
+/* Add @event to the PMU on the current CPU. The first active event on
+ * this CPU publishes the control block in the lowcore, links the counter
+ * area into the control block and sets bit 49 in control register 0.
+ * Always returns 0.
+ */
+static int paiext_add(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ if (++cpump->active_events == 1) {
+ S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+ /* NOTE(review): meaning of the low order bit 0x1 is not visible here - confirm */
+ pcb->acc = virt_to_phys(cpump->area) | 0x1;
+ /* Enable CPU instruction lookup for PAIE1 control block */
+ __ctl_set_bit(0, 49);
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+ if (flags & PERF_EF_START && !event->attr.sample_period) {
+ /* Only counting needs initial counter value */
+ paiext_start(event, PERF_EF_RELOAD);
+ }
+ event->hw.state = 0;
+ if (event->attr.sample_period) {
+ /* Sampling: remember the event and request the context switch callback */
+ cpump->event = event;
+ perf_sched_cb_inc(event->pmu);
+ }
+ return 0;
+}
+
+/* Stop @event: fold the current counter value into the event count and
+ * mark the event as stopped. @flags is not evaluated.
+ */
+static void paiext_stop(struct perf_event *event, int flags)
+{
+ paiext_read(event);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Remove @event from the PMU on the current CPU. When the last active
+ * event on this CPU is removed, clear bit 49 in control register 0 and
+ * withdraw the control block and counter area addresses.
+ */
+static void paiext_del(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ if (event->attr.sample_period)
+ perf_sched_cb_dec(event->pmu);
+ if (!event->attr.sample_period) {
+ /* Only counting needs to read counter */
+ paiext_stop(event, PERF_EF_UPDATE);
+ }
+ if (--cpump->active_events == 0) {
+ /* Disable CPU instruction lookup for PAIE1 control block */
+ __ctl_clear_bit(0, 49);
+ pcb->acc = 0;
+ S390_lowcore.aicd = 0;
+ /* Both values have just been cleared, so this logs zeroes */
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+}
+
+/* Build the raw sample data in cpump->save and return the number of bytes
+ * written. Counters 1 to paiext_cnt are scanned; only counters with a
+ * nonzero value are stored, each as one struct pai_userdata entry:
+ * num: Number of counter
+ * value: Value of counter
+ */
+static size_t paiext_copy(struct paiext_map *cpump)
+{
+	struct pai_userdata *out = cpump->save;
+	int nr, used = 0;
+
+	for (nr = 1; nr <= paiext_cnt; nr++) {
+		u64 ctr = paiext_getctr(cpump, nr);
+
+		if (!ctr)
+			continue;
+		out[used].num = nr;
+		out[used].value = ctr;
+		used++;
+	}
+	return used * sizeof(*out);
+}
+
+/* Write a sample when one or more counter values are nonzero.
+ *
+ * Note: The function paiext_sched_task() and paiext_push_sample() are not
+ * invoked after function paiext_del() has been called because of function
+ * perf_sched_cb_dec().
+ * The function paiext_sched_task() and paiext_push_sample() are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paiext_sched_task() as call back
+ * to run at context switch time (see paiext_add()).
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed an
+ * sched_task() callback. That callback is not active after paiext_del()
+ * returns and has deleted the event on that CPU.
+ *
+ * Returns 0 when no counter was nonzero (nothing is written), otherwise
+ * the return value of perf_event_overflow().
+ */
+static int paiext_push_sample(void)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct perf_event *event = cpump->event;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ size_t rawsize;
+ int overflow;
+
+ rawsize = paiext_copy(cpump);
+ if (!rawsize) /* No incremented counters */
+ return 0;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ /* Fill in only the sample fields requested via attr.sample_type */
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = smp_processor_id();
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ /* Raw data is the single fragment built by paiext_copy() */
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ raw.size = raw.frag.size;
+ data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore area after read */
+ memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+ return overflow;
+}
+
+/* Context switch callback, invoked on schedule-in and schedule-out.
+ * There is no access to the event structure here, but for sampling only
+ * event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	/* Nothing to do on schedule-in: the counter page was clean when the
+	 * event was installed.
+	 */
+	if (sched_in)
+		return;
+
+	/* Schedule-out: read out the results; a dirty page is cleared by
+	 * paiext_push_sample() after the sample has been written.
+	 */
+	paiext_push_sample();
+}
+
+/* Attribute definitions for pai extension1 interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported, there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1800 + offset in mapped kernel page.
+ * All CPU Measurement Facility counter identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography
+ * counters.
+ * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+/* Format attribute "event", exposed with the string "config:0-63" */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+/* NULL terminated list of format attributes */
+static struct attribute *paiext_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+/* sysfs group "events"; the attribute list is allocated at init time */
+static struct attribute_group paiext_events_group = {
+ .name = "events",
+ .attrs = NULL, /* Filled in attr_event_init() */
+};
+
+/* sysfs group "format" */
+static struct attribute_group paiext_format_group = {
+ .name = "format",
+ .attrs = paiext_format_attr,
+};
+
+/* NULL terminated list of all attribute groups of this PMU */
+static const struct attribute_group *paiext_attr_groups[] = {
+ &paiext_events_group,
+ &paiext_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters.
+ * Hooks the paiext_* callbacks into the perf core and exports the sysfs
+ * groups listed in paiext_attr_groups. The sched_task callback is used by
+ * sampling events to write a sample at schedule-out time.
+ */
+static struct pmu paiext = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paiext_event_init,
+ .add = paiext_add,
+ .del = paiext_del,
+ .start = paiext_start,
+ .stop = paiext_stop,
+ .read = paiext_read,
+ .sched_task = paiext_sched_task,
+ .attr_groups = paiext_attr_groups,
+};
+
+/* List of symbolic PAI extension 1 NNPA counter names.
+ * The array index is the counter offset: attr_event_init_one() creates one
+ * sysfs event attribute per entry, named after the string and with event
+ * id PAI_NNPA_BASE + index.
+ */
+static const char * const paiext_ctrnames[] = {
+ [0] = "NNPA_ALL",
+ [1] = "NNPA_ADD",
+ [2] = "NNPA_SUB",
+ [3] = "NNPA_MUL",
+ [4] = "NNPA_DIV",
+ [5] = "NNPA_MIN",
+ [6] = "NNPA_MAX",
+ [7] = "NNPA_LOG",
+ [8] = "NNPA_EXP",
+ [9] = "NNPA_IBM_RESERVED_9",
+ [10] = "NNPA_RELU",
+ [11] = "NNPA_TANH",
+ [12] = "NNPA_SIGMOID",
+ [13] = "NNPA_SOFTMAX",
+ [14] = "NNPA_BATCHNORM",
+ [15] = "NNPA_MAXPOOL2D",
+ [16] = "NNPA_AVGPOOL2D",
+ [17] = "NNPA_LSTMACT",
+ [18] = "NNPA_GRUACT",
+ [19] = "NNPA_CONVOLUTION",
+ [20] = "NNPA_MATMUL_OP",
+ [21] = "NNPA_MATMUL_OP_BCAST23",
+ [22] = "NNPA_SMALLBATCH",
+ [23] = "NNPA_LARGEDIM",
+ [24] = "NNPA_SMALLTENSOR",
+ [25] = "NNPA_1MFRAME",
+ [26] = "NNPA_2GFRAME",
+ [27] = "NNPA_ACCESSEXCEPT",
+};
+
+/* Free the event attributes created by attr_event_init_one().
+ * @attrs: attribute pointer array
+ * @num: number of leading entries to free, that is indices 0 to num - 1
+ *
+ * Each entry is mapped back to its enclosing struct perf_pmu_events_attr,
+ * which is then freed. Finally the pointer array itself is freed.
+ */
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+ struct device_attribute *dap;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ /* attribute -> device_attribute -> perf_pmu_events_attr */
+ dap = container_of(attrs[i], struct device_attribute, attr);
+ pa = container_of(dap, struct perf_pmu_events_attr, attr);
+ kfree(pa);
+ }
+ kfree(attrs);
+}
+
+/* Allocate and set up the sysfs event attribute for counter offset @num
+ * and store it in attrs[num].
+ * The attribute is read-only (mode 0444), named paiext_ctrnames[num] and
+ * carries event id PAI_NNPA_BASE + num.
+ * Returns 0 on success and -ENOMEM when the allocation fails; in the
+ * failure case attrs[num] is left untouched.
+ */
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ pa->id = PAI_NNPA_BASE + num;
+ pa->attr.attr.name = paiext_ctrnames[num];
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly.
+ *
+ * Allocates a NULL terminated attribute pointer array with one entry per
+ * name in paiext_ctrnames[] and installs it in paiext_events_group.
+ * Returns 0 on success. On failure everything allocated so far is freed
+ * and a negative error code is returned.
+ */
+static int __init attr_event_init(void)
+{
+	struct attribute **attrs;
+	int ret, i;
+
+	attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
+			      GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+	for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
+		ret = attr_event_init_one(attrs, i);
+		if (ret) {
+			/*
+			 * Entries 0 to i - 1 have been created, entry i has
+			 * not. attr_event_free() takes the number of entries
+			 * to free, so pass i (passing i - 1 would leak the
+			 * last successfully created attribute).
+			 */
+			attr_event_free(attrs, i);
+			return ret;
+		}
+	}
+	attrs[i] = NULL;
+	paiext_events_group.attrs = attrs;
+	return 0;
+}
+
+/* Initialize the PAI extension 1 PMU.
+ *
+ * Does nothing (and returns 0) when facility 197 is not installed or when
+ * QPACI reports no NNPA counters. Otherwise creates the sysfs event
+ * attributes, registers the s390 debug feature area and finally registers
+ * the PMU. Returns 0 on success or a negative error code, in which case
+ * all resources acquired so far are released again.
+ */
+static int __init paiext_init(void)
+{
+	struct qpaci_info_block ib;
+	int rc;
+
+	if (!test_facility(197))
+		return 0;
+
+	qpaci(&ib);
+	paiext_cnt = ib.num_nnpa;
+	/* Never use more counters than this driver knows about */
+	if (paiext_cnt >= PAI_NNPA_MAXCTR)
+		paiext_cnt = PAI_NNPA_MAXCTR;
+	if (!paiext_cnt)
+		return 0;
+
+	rc = attr_event_init();
+	if (rc) {
+		pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
+		return rc;
+	}
+
+	/* Setup s390dbf facility */
+	paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+	if (!paiext_dbg) {
+		pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
+		rc = -ENOMEM;
+		goto out_init;
+	}
+	debug_register_view(paiext_dbg, &debug_sprintf_view);
+
+	rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
+	if (rc) {
+		pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
+		       "rc=%i\n", rc);
+		goto out_pmu;
+	}
+
+	return 0;
+
+out_pmu:
+	debug_unregister_view(paiext_dbg, &debug_sprintf_view);
+	debug_unregister(paiext_dbg);
+out_init:
+	/*
+	 * attr_event_free() expects the number of attribute entries, not
+	 * the array size including the terminating NULL pointer. Passing
+	 * the terminator as well would run container_of() on a NULL pointer.
+	 */
+	attr_event_free(paiext_events_group.attrs,
+			ARRAY_SIZE(paiext_ctrnames));
+	return rc;
+}
+
+device_initcall(paiext_init);
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index 4352a504f235..6e9e5d5e927e 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
/*
* Use the regs from the first interruption and let
diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S
deleted file mode 100644
index eee3a482195a..000000000000
--- a/arch/s390/kernel/pgm_check.S
+++ /dev/null
@@ -1,147 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Program check table.
- *
- * Copyright IBM Corp. 2012
- */
-
-#include <linux/linkage.h>
-
-#define PGM_CHECK(handler) .quad handler
-#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler)
-
-/*
- * The program check table contains exactly 128 (0x00-0x7f) entries. Each
- * line defines the function to be called corresponding to the program check
- * interruption code.
- */
-.section .rodata, "a"
-ENTRY(pgm_check_table)
-PGM_CHECK_DEFAULT /* 00 */
-PGM_CHECK(illegal_op) /* 01 */
-PGM_CHECK(privileged_op) /* 02 */
-PGM_CHECK(execute_exception) /* 03 */
-PGM_CHECK(do_protection_exception) /* 04 */
-PGM_CHECK(addressing_exception) /* 05 */
-PGM_CHECK(specification_exception) /* 06 */
-PGM_CHECK(data_exception) /* 07 */
-PGM_CHECK(overflow_exception) /* 08 */
-PGM_CHECK(divide_exception) /* 09 */
-PGM_CHECK(overflow_exception) /* 0a */
-PGM_CHECK(divide_exception) /* 0b */
-PGM_CHECK(hfp_overflow_exception) /* 0c */
-PGM_CHECK(hfp_underflow_exception) /* 0d */
-PGM_CHECK(hfp_significance_exception) /* 0e */
-PGM_CHECK(hfp_divide_exception) /* 0f */
-PGM_CHECK(do_dat_exception) /* 10 */
-PGM_CHECK(do_dat_exception) /* 11 */
-PGM_CHECK(translation_exception) /* 12 */
-PGM_CHECK(special_op_exception) /* 13 */
-PGM_CHECK_DEFAULT /* 14 */
-PGM_CHECK(operand_exception) /* 15 */
-PGM_CHECK_DEFAULT /* 16 */
-PGM_CHECK_DEFAULT /* 17 */
-PGM_CHECK(transaction_exception) /* 18 */
-PGM_CHECK_DEFAULT /* 19 */
-PGM_CHECK_DEFAULT /* 1a */
-PGM_CHECK(vector_exception) /* 1b */
-PGM_CHECK(space_switch_exception) /* 1c */
-PGM_CHECK(hfp_sqrt_exception) /* 1d */
-PGM_CHECK_DEFAULT /* 1e */
-PGM_CHECK_DEFAULT /* 1f */
-PGM_CHECK_DEFAULT /* 20 */
-PGM_CHECK_DEFAULT /* 21 */
-PGM_CHECK_DEFAULT /* 22 */
-PGM_CHECK_DEFAULT /* 23 */
-PGM_CHECK_DEFAULT /* 24 */
-PGM_CHECK_DEFAULT /* 25 */
-PGM_CHECK_DEFAULT /* 26 */
-PGM_CHECK_DEFAULT /* 27 */
-PGM_CHECK_DEFAULT /* 28 */
-PGM_CHECK_DEFAULT /* 29 */
-PGM_CHECK_DEFAULT /* 2a */
-PGM_CHECK_DEFAULT /* 2b */
-PGM_CHECK_DEFAULT /* 2c */
-PGM_CHECK_DEFAULT /* 2d */
-PGM_CHECK_DEFAULT /* 2e */
-PGM_CHECK_DEFAULT /* 2f */
-PGM_CHECK_DEFAULT /* 30 */
-PGM_CHECK_DEFAULT /* 31 */
-PGM_CHECK_DEFAULT /* 32 */
-PGM_CHECK_DEFAULT /* 33 */
-PGM_CHECK_DEFAULT /* 34 */
-PGM_CHECK_DEFAULT /* 35 */
-PGM_CHECK_DEFAULT /* 36 */
-PGM_CHECK_DEFAULT /* 37 */
-PGM_CHECK(do_dat_exception) /* 38 */
-PGM_CHECK(do_dat_exception) /* 39 */
-PGM_CHECK(do_dat_exception) /* 3a */
-PGM_CHECK(do_dat_exception) /* 3b */
-PGM_CHECK_DEFAULT /* 3c */
-PGM_CHECK_DEFAULT /* 3d */
-PGM_CHECK_DEFAULT /* 3e */
-PGM_CHECK_DEFAULT /* 3f */
-PGM_CHECK(monitor_event_exception) /* 40 */
-PGM_CHECK_DEFAULT /* 41 */
-PGM_CHECK_DEFAULT /* 42 */
-PGM_CHECK_DEFAULT /* 43 */
-PGM_CHECK_DEFAULT /* 44 */
-PGM_CHECK_DEFAULT /* 45 */
-PGM_CHECK_DEFAULT /* 46 */
-PGM_CHECK_DEFAULT /* 47 */
-PGM_CHECK_DEFAULT /* 48 */
-PGM_CHECK_DEFAULT /* 49 */
-PGM_CHECK_DEFAULT /* 4a */
-PGM_CHECK_DEFAULT /* 4b */
-PGM_CHECK_DEFAULT /* 4c */
-PGM_CHECK_DEFAULT /* 4d */
-PGM_CHECK_DEFAULT /* 4e */
-PGM_CHECK_DEFAULT /* 4f */
-PGM_CHECK_DEFAULT /* 50 */
-PGM_CHECK_DEFAULT /* 51 */
-PGM_CHECK_DEFAULT /* 52 */
-PGM_CHECK_DEFAULT /* 53 */
-PGM_CHECK_DEFAULT /* 54 */
-PGM_CHECK_DEFAULT /* 55 */
-PGM_CHECK_DEFAULT /* 56 */
-PGM_CHECK_DEFAULT /* 57 */
-PGM_CHECK_DEFAULT /* 58 */
-PGM_CHECK_DEFAULT /* 59 */
-PGM_CHECK_DEFAULT /* 5a */
-PGM_CHECK_DEFAULT /* 5b */
-PGM_CHECK_DEFAULT /* 5c */
-PGM_CHECK_DEFAULT /* 5d */
-PGM_CHECK_DEFAULT /* 5e */
-PGM_CHECK_DEFAULT /* 5f */
-PGM_CHECK_DEFAULT /* 60 */
-PGM_CHECK_DEFAULT /* 61 */
-PGM_CHECK_DEFAULT /* 62 */
-PGM_CHECK_DEFAULT /* 63 */
-PGM_CHECK_DEFAULT /* 64 */
-PGM_CHECK_DEFAULT /* 65 */
-PGM_CHECK_DEFAULT /* 66 */
-PGM_CHECK_DEFAULT /* 67 */
-PGM_CHECK_DEFAULT /* 68 */
-PGM_CHECK_DEFAULT /* 69 */
-PGM_CHECK_DEFAULT /* 6a */
-PGM_CHECK_DEFAULT /* 6b */
-PGM_CHECK_DEFAULT /* 6c */
-PGM_CHECK_DEFAULT /* 6d */
-PGM_CHECK_DEFAULT /* 6e */
-PGM_CHECK_DEFAULT /* 6f */
-PGM_CHECK_DEFAULT /* 70 */
-PGM_CHECK_DEFAULT /* 71 */
-PGM_CHECK_DEFAULT /* 72 */
-PGM_CHECK_DEFAULT /* 73 */
-PGM_CHECK_DEFAULT /* 74 */
-PGM_CHECK_DEFAULT /* 75 */
-PGM_CHECK_DEFAULT /* 76 */
-PGM_CHECK_DEFAULT /* 77 */
-PGM_CHECK_DEFAULT /* 78 */
-PGM_CHECK_DEFAULT /* 79 */
-PGM_CHECK_DEFAULT /* 7a */
-PGM_CHECK_DEFAULT /* 7b */
-PGM_CHECK_DEFAULT /* 7c */
-PGM_CHECK_DEFAULT /* 7d */
-PGM_CHECK_DEFAULT /* 7e */
-PGM_CHECK_DEFAULT /* 7f */
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 6ccef5f29761..42af4b3aa02b 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -29,6 +29,7 @@
#include <linux/random.h>
#include <linux/export.h>
#include <linux/init_task.h>
+#include <linux/entry-common.h>
#include <asm/cpu_mf.h>
#include <asm/io.h>
#include <asm/processor.h>
@@ -43,9 +44,22 @@
#include <asm/unwind.h>
#include "entry.h"
-asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
+void ret_from_fork(void) asm("ret_from_fork");
-extern void kernel_thread_starter(void);
+/* C part of the return path of a newly created task.
+ * @prev: passed on to schedule_tail()
+ * @regs: register set of the new task
+ *
+ * If @regs does not describe user mode, the task is a kernel thread: the
+ * thread function is taken from gprs[9] and called with the argument from
+ * gprs[10]. Afterwards PIF_SYSCALL is cleared and the common exit to user
+ * mode work is done.
+ */
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs)
+{
+ void (*func)(void *arg);
+
+ schedule_tail(prev);
+
+ if (!user_mode(regs)) {
+ /* Kernel thread */
+ func = (void *)regs->gprs[9];
+ func((void *)regs->gprs[10]);
+ }
+ clear_pt_regs_flag(regs, PIF_SYSCALL);
+ syscall_exit_to_user_mode(regs);
+}
void flush_thread(void)
{
@@ -77,12 +91,26 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
memcpy(dst, src, arch_task_struct_size);
dst->thread.fpu.regs = dst->thread.fpu.fprs;
+
+ /*
+ * Don't transfer over the runtime instrumentation or the guarded
+ * storage control block pointers. These fields are cleared here instead
+ * of in copy_thread() to avoid premature freeing of associated memory
+ * on fork() failure. Wait to clear the RI flag because ->stack still
+ * refers to the source thread.
+ */
+ dst->thread.ri_cb = NULL;
+ dst->thread.gs_cb = NULL;
+ dst->thread.gs_bc_cb = NULL;
+
return 0;
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
+ unsigned long clone_flags = args->flags;
+ unsigned long new_stackp = args->stack;
+ unsigned long tls = args->tls;
struct fake_frame
{
struct stack_frame sf;
@@ -94,7 +122,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
/* Save access registers to new thread structure. */
save_access_regs(&p->thread.acrs[0]);
/* start new process with ar4 pointing to the correct address space */
- p->thread.mm_segment = get_fs();
/* Don't copy debug registers */
memset(&p->thread.per_user, 0, sizeof(p->thread.per_user));
memset(&p->thread.per_event, 0, sizeof(p->thread.per_event));
@@ -106,26 +133,28 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.system_timer = 0;
p->thread.hardirq_timer = 0;
p->thread.softirq_timer = 0;
+ p->thread.last_break = 1;
frame->sf.back_chain = 0;
+ frame->sf.gprs[5] = (unsigned long)frame + sizeof(struct stack_frame);
+ frame->sf.gprs[6] = (unsigned long)p;
/* new return point is ret_from_fork */
- frame->sf.gprs[8] = (unsigned long) ret_from_fork;
+ frame->sf.gprs[8] = (unsigned long)ret_from_fork;
/* fake return stack for resume(), don't go back to schedule */
- frame->sf.gprs[9] = (unsigned long) frame;
+ frame->sf.gprs[9] = (unsigned long)frame;
/* Store access registers to kernel stack of new process. */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(args->fn)) {
/* kernel thread */
memset(&frame->childregs, 0, sizeof(struct pt_regs));
frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
frame->childregs.psw.addr =
- (unsigned long) kernel_thread_starter;
- frame->childregs.gprs[9] = new_stackp; /* function */
- frame->childregs.gprs[10] = arg;
- frame->childregs.gprs[11] = (unsigned long) do_exit;
+ (unsigned long)__ret_from_fork;
+ frame->childregs.gprs[9] = (unsigned long)args->fn;
+ frame->childregs.gprs[10] = (unsigned long)args->fn_arg;
frame->childregs.orig_gpr2 = -1;
-
+ frame->childregs.last_break = 1;
return 0;
}
frame->childregs = *current_pt_regs();
@@ -133,13 +162,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
frame->childregs.flags = 0;
if (new_stackp)
frame->childregs.gprs[15] = new_stackp;
-
- /* Don't copy runtime instrumentation info */
- p->thread.ri_cb = NULL;
+ /*
+ * Clear the runtime instrumentation flag after the above childregs
+ * copy. The CB pointer was already cleared in arch_dup_task_struct().
+ */
frame->childregs.psw.mask &= ~PSW_MASK_RI;
- /* Don't copy guarded storage control block */
- p->thread.gs_cb = NULL;
- p->thread.gs_bc_cb = NULL;
/* Set a new TLS ? */
if (clone_flags & CLONE_SETTLS) {
@@ -150,39 +177,27 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.acrs[1] = (unsigned int)tls;
}
}
+ /*
+ * s390 stores the svc return address in arch_data when calling
+ * sigreturn()/restart_syscall() via vdso. 1 means no valid address
+ * stored.
+ */
+ p->restart_block.arch_data = 1;
return 0;
}
-asmlinkage void execve_tail(void)
+void execve_tail(void)
{
current->thread.fpu.fpc = 0;
asm volatile("sfpc %0" : : "d" (0));
}
-/*
- * fill in the FPU structure for a core dump.
- */
-int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
-{
- save_fpu_regs();
- fpregs->fpc = current->thread.fpu.fpc;
- fpregs->pad = 0;
- if (MACHINE_HAS_VX)
- convert_vx_to_fp((freg_t *)&fpregs->fprs,
- current->thread.fpu.vxrs);
- else
- memcpy(&fpregs->fprs, current->thread.fpu.fprs,
- sizeof(fpregs->fprs));
- return 1;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
struct unwind_state state;
unsigned long ip = 0;
- if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p))
+ if (!task_stack_page(p))
return 0;
if (!try_get_task_stack(p))
@@ -209,13 +224,13 @@ unsigned long get_wchan(struct task_struct *p)
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() & ~PAGE_MASK;
+ sp -= prandom_u32_max(PAGE_SIZE);
return sp & ~0xf;
}
static inline unsigned long brk_rnd(void)
{
- return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
@@ -225,16 +240,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
ret = PAGE_ALIGN(mm->brk + brk_rnd());
return (ret > mm->brk) ? ret : mm->brk;
}
-
-void set_fs_fixup(void)
-{
- struct pt_regs *regs = current_pt_regs();
- static bool warned;
-
- set_fs(USER_DS);
- if (warned)
- return;
- WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
- show_registers(regs);
- warned = true;
-}
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 6ebc2117c66c..a194611ba88c 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,9 +8,9 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/stop_machine.h>
-#include <linux/cpufeature.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
+#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/init.h>
#include <linux/seq_file.h>
@@ -23,8 +23,12 @@
#include <asm/elf.h>
#include <asm/lowcore.h>
#include <asm/param.h>
+#include <asm/sclp.h>
#include <asm/smp.h>
+unsigned long __read_mostly elf_hwcap;
+char elf_platform[ELF_PLATFORM_SIZE];
+
struct cpu_info {
unsigned int cpu_mhz_dynamic;
unsigned int cpu_mhz_static;
@@ -91,23 +95,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, current);
}
-/*
- * cpu_have_feature - Test CPU features on module initialization
- */
-int cpu_have_feature(unsigned int num)
-{
- return elf_hwcap & (1UL << num);
-}
-EXPORT_SYMBOL(cpu_have_feature);
-
static void show_facilities(struct seq_file *m)
{
unsigned int bit;
- long *facilities;
- facilities = (long *)&S390_lowcore.stfle_fac_list;
seq_puts(m, "facilities :");
- for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT)
+ for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT)
seq_printf(m, " %d", bit);
seq_putc(m, '\n');
}
@@ -115,15 +108,33 @@ static void show_facilities(struct seq_file *m)
static void show_cpu_summary(struct seq_file *m, void *v)
{
static const char *hwcap_str[] = {
- "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
- "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs",
- "vxe2", "vxp", "sort", "dflt"
- };
- static const char * const int_hwcap_str[] = {
- "sie"
+ [HWCAP_NR_ESAN3] = "esan3",
+ [HWCAP_NR_ZARCH] = "zarch",
+ [HWCAP_NR_STFLE] = "stfle",
+ [HWCAP_NR_MSA] = "msa",
+ [HWCAP_NR_LDISP] = "ldisp",
+ [HWCAP_NR_EIMM] = "eimm",
+ [HWCAP_NR_DFP] = "dfp",
+ [HWCAP_NR_HPAGE] = "edat",
+ [HWCAP_NR_ETF3EH] = "etf3eh",
+ [HWCAP_NR_HIGH_GPRS] = "highgprs",
+ [HWCAP_NR_TE] = "te",
+ [HWCAP_NR_VXRS] = "vx",
+ [HWCAP_NR_VXRS_BCD] = "vxd",
+ [HWCAP_NR_VXRS_EXT] = "vxe",
+ [HWCAP_NR_GS] = "gs",
+ [HWCAP_NR_VXRS_EXT2] = "vxe2",
+ [HWCAP_NR_VXRS_PDE] = "vxp",
+ [HWCAP_NR_SORT] = "sort",
+ [HWCAP_NR_DFLT] = "dflt",
+ [HWCAP_NR_VXRS_PDE2] = "vxp2",
+ [HWCAP_NR_NNPA] = "nnpa",
+ [HWCAP_NR_PCI_MIO] = "pcimio",
+ [HWCAP_NR_SIE] = "sie",
};
int i, cpu;
+ BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX);
seq_printf(m, "vendor_id : IBM/S390\n"
"# processors : %i\n"
"bogomips per cpu: %lu.%02lu\n",
@@ -134,9 +145,6 @@ static void show_cpu_summary(struct seq_file *m, void *v)
for (i = 0; i < ARRAY_SIZE(hwcap_str); i++)
if (hwcap_str[i] && (elf_hwcap & (1UL << i)))
seq_printf(m, "%s ", hwcap_str[i]);
- for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++)
- if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
- seq_printf(m, "%s ", int_hwcap_str[i]);
seq_puts(m, "\n");
show_facilities(m);
show_cacheinfo(m);
@@ -151,10 +159,158 @@ static void show_cpu_summary(struct seq_file *m, void *v)
}
}
+/* Build the elf_hwcap bit mask at boot time.
+ * Some bits are set unconditionally, the others depend on installed
+ * facility bits, MACHINE_HAS_* flags or SCLP information.
+ * Always returns 0.
+ */
+static int __init setup_hwcaps(void)
+{
+ /* instructions named N3, "backported" to esa-mode */
+ elf_hwcap |= HWCAP_ESAN3;
+
+ /* z/Architecture mode active */
+ elf_hwcap |= HWCAP_ZARCH;
+
+ /* store-facility-list-extended */
+ if (test_facility(7))
+ elf_hwcap |= HWCAP_STFLE;
+
+ /* message-security assist */
+ if (test_facility(17))
+ elf_hwcap |= HWCAP_MSA;
+
+ /* long-displacement */
+ if (test_facility(19))
+ elf_hwcap |= HWCAP_LDISP;
+
+ /* extended-immediate */
+ elf_hwcap |= HWCAP_EIMM;
+
+ /* extended-translation facility 3 enhancement */
+ if (test_facility(22) && test_facility(30))
+ elf_hwcap |= HWCAP_ETF3EH;
+
+ /* decimal floating point & perform floating point operation */
+ if (test_facility(42) && test_facility(44))
+ elf_hwcap |= HWCAP_DFP;
+
+ /* huge page support */
+ if (MACHINE_HAS_EDAT1)
+ elf_hwcap |= HWCAP_HPAGE;
+
+ /* 64-bit register support for 31-bit processes */
+ elf_hwcap |= HWCAP_HIGH_GPRS;
+
+ /* transactional execution */
+ if (MACHINE_HAS_TE)
+ elf_hwcap |= HWCAP_TE;
+
+ /*
+ * Vector extension can be disabled with the "novx" parameter.
+ * Use MACHINE_HAS_VX instead of facility bit 129.
+ */
+ if (MACHINE_HAS_VX) {
+ /* The vector sub-features are only reported together with VXRS */
+ elf_hwcap |= HWCAP_VXRS;
+ if (test_facility(134))
+ elf_hwcap |= HWCAP_VXRS_BCD;
+ if (test_facility(135))
+ elf_hwcap |= HWCAP_VXRS_EXT;
+ if (test_facility(148))
+ elf_hwcap |= HWCAP_VXRS_EXT2;
+ if (test_facility(152))
+ elf_hwcap |= HWCAP_VXRS_PDE;
+ if (test_facility(192))
+ elf_hwcap |= HWCAP_VXRS_PDE2;
+ }
+
+ /* facility bit 150 is reported as "sort" */
+ if (test_facility(150))
+ elf_hwcap |= HWCAP_SORT;
+
+ /* facility bit 151 is reported as "dflt" */
+ if (test_facility(151))
+ elf_hwcap |= HWCAP_DFLT;
+
+ /* facility bit 165 is reported as "nnpa" */
+ if (test_facility(165))
+ elf_hwcap |= HWCAP_NNPA;
+
+ /* guarded storage */
+ if (MACHINE_HAS_GS)
+ elf_hwcap |= HWCAP_GS;
+
+ /* reported as "pcimio" */
+ if (MACHINE_HAS_PCI_MIO)
+ elf_hwcap |= HWCAP_PCI_MIO;
+
+ /* virtualization support */
+ if (sclp.has_sief2)
+ elf_hwcap |= HWCAP_SIE;
+
+ return 0;
+}
+arch_initcall(setup_hwcaps);
+
+/* Derive the ELF platform string from the machine type of the CPU id and
+ * feed the CPU id into the device randomness pool. Unknown machine types
+ * are reported as "z10". Always returns 0.
+ */
+static int __init setup_elf_platform(void)
+{
+	const char *name = "z10";	/* Use "z10" as default. */
+	struct cpuid cpu_id;
+
+	get_cpu_id(&cpu_id);
+	add_device_randomness(&cpu_id, sizeof(cpu_id));
+	switch (cpu_id.machine) {
+	case 0x2817:
+	case 0x2818:
+		name = "z196";
+		break;
+	case 0x2827:
+	case 0x2828:
+		name = "zEC12";
+		break;
+	case 0x2964:
+	case 0x2965:
+		name = "z13";
+		break;
+	case 0x3906:
+	case 0x3907:
+		name = "z14";
+		break;
+	case 0x8561:
+	case 0x8562:
+		name = "z15";
+		break;
+	case 0x3931:
+	case 0x3932:
+		name = "z16";
+		break;
+	}
+	strcpy(elf_platform, name);
+	return 0;
+}
+arch_initcall(setup_elf_platform);
+
+/* Print the topology related lines for CPU @n to the seq_file.
+ * Prints nothing when CONFIG_SCHED_TOPOLOGY is not set.
+ */
+static void show_cpu_topology(struct seq_file *m, unsigned long n)
+{
+#ifdef CONFIG_SCHED_TOPOLOGY
+ seq_printf(m, "physical id : %d\n", topology_physical_package_id(n));
+ seq_printf(m, "core id : %d\n", topology_core_id(n));
+ seq_printf(m, "book id : %d\n", topology_book_id(n));
+ seq_printf(m, "drawer id : %d\n", topology_drawer_id(n));
+ seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n));
+ seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n));
+ /* siblings: number of CPUs set in the core cpumask of CPU n */
+ seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n)));
+ seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n));
+#endif /* CONFIG_SCHED_TOPOLOGY */
+}
+
+/* Print version, identification and machine type of CPU @n, taken from
+ * the per-CPU cpu_info.cpu_id, as hexadecimal numbers.
+ */
+static void show_cpu_ids(struct seq_file *m, unsigned long n)
+{
+ struct cpuid *id = &per_cpu(cpu_info.cpu_id, n);
+
+ seq_printf(m, "version : %02X\n", id->version);
+ seq_printf(m, "identification : %06X\n", id->ident);
+ seq_printf(m, "machine : %04X\n", id->machine);
+}
+
static void show_cpu_mhz(struct seq_file *m, unsigned long n)
{
struct cpu_info *c = per_cpu_ptr(&cpu_info, n);
+ if (!machine_has_cpu_mhz)
+ return;
seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic);
seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static);
}
@@ -165,12 +321,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n)
static int show_cpuinfo(struct seq_file *m, void *v)
{
unsigned long n = (unsigned long) v - 1;
+ unsigned long first = cpumask_first(cpu_online_mask);
- if (!n)
+ if (n == first)
show_cpu_summary(m, v);
- if (!machine_has_cpu_mhz)
- return 0;
seq_printf(m, "\ncpu number : %ld\n", n);
+ show_cpu_topology(m, n);
+ show_cpu_ids(m, n);
show_cpu_mhz(m, n);
return 0;
}
@@ -179,12 +336,14 @@ static inline void *c_update(loff_t *pos)
{
if (*pos)
*pos = cpumask_next(*pos - 1, cpu_online_mask);
+ else
+ *pos = cpumask_first(cpu_online_mask);
return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
- get_online_cpus();
+ cpus_read_lock();
return c_update(pos);
}
@@ -196,7 +355,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
static void c_stop(struct seq_file *m, void *v)
{
- put_online_cpus();
+ cpus_read_unlock();
}
const struct seq_operations cpuinfo_op = {
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 58faa12542a1..53e0209229f8 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -7,6 +7,7 @@
* Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
+#include "asm/ptrace.h"
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
@@ -20,13 +21,10 @@
#include <linux/signal.h>
#include <linux/elf.h>
#include <linux/regset.h>
-#include <linux/tracehook.h>
#include <linux/seccomp.h>
#include <linux/compat.h>
#include <trace/syscall.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/switch_to.h>
@@ -39,9 +37,6 @@
#include "compat_ptrace.h"
#endif
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
void update_cr_regs(struct task_struct *task)
{
struct pt_regs *regs = task_pt_regs(task);
@@ -142,7 +137,7 @@ void ptrace_disable(struct task_struct *task)
memset(&task->thread.per_user, 0, sizeof(task->thread.per_user));
memset(&task->thread.per_event, 0, sizeof(task->thread.per_event));
clear_tsk_thread_flag(task, TIF_SINGLE_STEP);
- clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP);
+ clear_tsk_thread_flag(task, TIF_PER_TRAP);
task->thread.per_flags = 0;
}
@@ -151,38 +146,36 @@ void ptrace_disable(struct task_struct *task)
static inline unsigned long __peek_user_per(struct task_struct *child,
addr_t addr)
{
- struct per_struct_kernel *dummy = NULL;
-
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* Control bits of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy->cr10)
+ else if (addr == offsetof(struct per_struct_kernel, cr10))
/* Start address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->cr11)
+ else if (addr == offsetof(struct per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
-1UL : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->bits)
+ else if (addr == offsetof(struct per_struct_kernel, bits))
/* Single-step bit. */
return test_thread_flag(TIF_SINGLE_STEP) ?
(1UL << (BITS_PER_LONG - 1)) : 0;
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->perc_atmid)
+ else if (addr == offsetof(struct per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (unsigned long)
child->thread.per_event.cause << (BITS_PER_LONG - 16);
- else if (addr == (addr_t) &dummy->address)
+ else if (addr == offsetof(struct per_struct_kernel, address))
/* Address of the last PER trap */
return child->thread.per_event.address;
- else if (addr == (addr_t) &dummy->access_id)
+ else if (addr == offsetof(struct per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (unsigned long)
child->thread.per_event.paid << (BITS_PER_LONG - 8);
@@ -200,61 +193,60 @@ static inline unsigned long __peek_user_per(struct task_struct *child,
*/
static unsigned long __peek_user(struct task_struct *child, addr_t addr)
{
- struct user *dummy = NULL;
addr_t offset, tmp;
- if (addr < (addr_t) &dummy->regs.acrs) {
+ if (addr < offsetof(struct user, regs.acrs)) {
/*
* psw and gprs are stored on the stack
*/
tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr);
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
/* Return a clean psw mask. */
tmp &= PSW_MASK_USER | PSW_MASK_RI;
tmp |= PSW_USER_BITS;
}
- } else if (addr < (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb reading
* from acrs[15]. Result is a 64 bit value. Read the
* 32 bit acrs[15] value and shift it by 32. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
tmp = ((unsigned long) child->thread.acrs[15]) << 32;
else
tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = (addr_t) task_pt_regs(child)->orig_gpr2;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
tmp <<= BITS_PER_LONG - 32;
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
if (MACHINE_HAS_VX)
tmp = *(addr_t *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
@@ -262,11 +254,11 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr)
tmp = *(addr_t *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
tmp = __peek_user_per(child, addr);
} else
@@ -285,8 +277,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -298,8 +290,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
static inline void __poke_user_per(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct per_struct_kernel *dummy = NULL;
-
/*
* There are only three fields in the per_info struct that the
* debugger user can write to.
@@ -312,14 +302,14 @@ static inline void __poke_user_per(struct task_struct *child,
* addresses are used only if single stepping is not in effect.
* Writes to any other field in per_info are ignored.
*/
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -332,14 +322,15 @@ static inline void __poke_user_per(struct task_struct *child,
*/
static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
{
- struct user *dummy = NULL;
addr_t offset;
- if (addr < (addr_t) &dummy->regs.acrs) {
+
+ if (addr < offsetof(struct user, regs.acrs)) {
+ struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
unsigned long mask = PSW_MASK_USER;
mask |= is_ri_task(child) ? PSW_MASK_RI : 0;
@@ -353,38 +344,44 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
/* Invalid addressing mode bits */
return -EINVAL;
}
- *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
- } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
+ *(addr_t *)((addr_t) &regs->psw + addr) = data;
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb writing
* to acrs[15] with a 64 bit value. Ignore the lower
* half of the value and write the upper 32 bit to
* acrs[15]. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
child->thread.acrs[15] = (unsigned int) (data >> 32);
else
*(addr_t *)((addr_t) &child->thread.acrs + offset) = data;
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
task_pt_regs(child)->orig_gpr2 = data;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent writes of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
@@ -393,12 +390,12 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
return -EINVAL;
child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32);
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
if (MACHINE_HAS_VX)
*(addr_t *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = data;
@@ -406,11 +403,11 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
*(addr_t *)((addr_t)
child->thread.fpu.fprs + offset) = data;
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
__poke_user_per(child, addr, data);
}
@@ -427,8 +424,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell indeed...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -536,37 +533,35 @@ long arch_ptrace(struct task_struct *child, long request,
static inline __u32 __peek_user_per_compat(struct task_struct *child,
addr_t addr)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* Control bits of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy32->cr10)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr10))
/* Start address of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->cr11)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PSW32_ADDR_INSN : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->bits)
+ else if (addr == offsetof(struct compat_per_struct_kernel, bits))
/* Single-step bit. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0x80000000 : 0;
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return (__u32) child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return (__u32) child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->perc_atmid)
+ else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (__u32) child->thread.per_event.cause << 16;
- else if (addr == (addr_t) &dummy32->address)
+ else if (addr == offsetof(struct compat_per_struct_kernel, address))
/* Address of the last PER trap */
return (__u32) child->thread.per_event.address;
- else if (addr == (addr_t) &dummy32->access_id)
+ else if (addr == offsetof(struct compat_per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (__u32) child->thread.per_event.paid << 24;
return 0;
@@ -577,21 +572,20 @@ static inline __u32 __peek_user_per_compat(struct task_struct *child,
*/
static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
{
- struct compat_user *dummy32 = NULL;
addr_t offset;
__u32 tmp;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
/* Fake a 31 bit psw mask. */
tmp = (__u32)(regs->psw.mask >> 32);
tmp &= PSW32_MASK_USER | PSW32_MASK_RI;
tmp |= PSW32_USER_BITS;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Fake a 31 bit psw address. */
tmp = (__u32) regs->psw.addr |
(__u32)(regs->psw.mask & PSW_MASK_BA);
@@ -599,38 +593,38 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
/* gpr 0-15 */
tmp = *(__u32 *)((addr_t) &regs->psw + addr*2 + 4);
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
tmp = *(__u32*)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
if (MACHINE_HAS_VX)
tmp = *(__u32 *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
@@ -638,11 +632,11 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
tmp = *(__u32 *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
tmp = __peek_user_per_compat(child, addr);
} else
@@ -669,16 +663,14 @@ static int peek_user_compat(struct task_struct *child,
static inline void __poke_user_per_compat(struct task_struct *child,
addr_t addr, __u32 data)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -689,16 +681,15 @@ static inline void __poke_user_per_compat(struct task_struct *child,
static int __poke_user_compat(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct compat_user *dummy32 = NULL;
__u32 tmp = (__u32) data;
addr_t offset;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw, gprs, acrs and orig_gpr2 are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
__u32 mask = PSW32_MASK_USER;
mask |= is_ri_task(child) ? PSW32_MASK_RI : 0;
@@ -712,37 +703,43 @@ static int __poke_user_compat(struct task_struct *child,
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) |
(regs->psw.mask & PSW_MASK_BA) |
(__u64)(tmp & mask) << 32;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Build a 64 bit psw address from 31 bit address. */
regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN;
/* Transfer 31 bit amode bit to psw mask. */
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) |
(__u64)(tmp & PSW32_ADDR_AMODE);
} else {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct compat_user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
/* gpr 0-15 */
*(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp;
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
*(__u32*)((addr_t) &child->thread.acrs + offset) = tmp;
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
*(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp;
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
* prevent writess of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
@@ -750,12 +747,12 @@ static int __poke_user_compat(struct task_struct *child,
return -EINVAL;
child->thread.fpu.fpc = data;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
if (MACHINE_HAS_VX)
*(__u32 *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = tmp;
@@ -763,11 +760,11 @@ static int __poke_user_compat(struct task_struct *child,
*(__u32 *)((addr_t)
child->thread.fpu.fprs + offset) = tmp;
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
__poke_user_per_compat(child, addr, data);
}
@@ -835,84 +832,20 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
#endif
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
-{
- unsigned long mask = -1UL;
-
- /*
- * The sysc_tracesys code in entry.S stored the system
- * call number to gprs[2].
- */
- if (test_thread_flag(TIF_SYSCALL_TRACE) &&
- (tracehook_report_syscall_entry(regs) ||
- regs->gprs[2] >= NR_syscalls)) {
- /*
- * Tracing decided this syscall should not happen or the
- * debugger stored an invalid system call number. Skip
- * the system call and the system call restart handling.
- */
- clear_pt_regs_flag(regs, PIF_SYSCALL);
- return -1;
- }
-
- /* Do the secure computing check after ptrace. */
- if (secure_computing()) {
- /* seccomp failures shouldn't expose any additional code. */
- return -1;
- }
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->gprs[2]);
-
- if (is_compat_task())
- mask = 0xffffffff;
-
- audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask,
- regs->gprs[3] &mask, regs->gprs[4] &mask,
- regs->gprs[5] &mask);
-
- return regs->gprs[2];
-}
-
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
-{
- audit_syscall_exit(regs);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_exit(regs, regs->gprs[2]);
-
- if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
-}
-
/*
* user_regset definitions.
*/
static int s390_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned pos;
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long))
+ membuf_store(&to, __peek_user(target, pos));
return 0;
}
@@ -953,8 +886,8 @@ static int s390_regs_set(struct task_struct *target,
}
static int s390_fpregs_get(struct task_struct *target,
- const struct user_regset *regset, unsigned int pos,
- unsigned int count, void *kbuf, void __user *ubuf)
+ const struct user_regset *regset,
+ struct membuf to)
{
_s390_fp_regs fp_regs;
@@ -964,8 +897,7 @@ static int s390_fpregs_get(struct task_struct *target,
fp_regs.fpc = target->thread.fpu.fpc;
fpregs_store(&fp_regs, &target->thread.fpu);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fp_regs, 0, -1);
+ return membuf_write(&to, &fp_regs, sizeof(fp_regs));
}
static int s390_fpregs_set(struct task_struct *target,
@@ -1012,20 +944,9 @@ static int s390_fpregs_set(struct task_struct *target,
static int s390_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (count > 0) {
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = target->thread.last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(target->thread.last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, target->thread.last_break);
}
static int s390_last_break_set(struct task_struct *target,
@@ -1038,16 +959,15 @@ static int s390_last_break_set(struct task_struct *target,
static int s390_tdb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct pt_regs *regs = task_pt_regs(target);
- unsigned char *data;
+ size_t size;
if (!(regs->int_code & 0x200))
return -ENODATA;
- data = target->thread.trap_tdb;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256);
+ size = sizeof(target->thread.trap_tdb.data);
+ return membuf_write(&to, target->thread.trap_tdb.data, size);
}
static int s390_tdb_set(struct task_struct *target,
@@ -1060,8 +980,7 @@ static int s390_tdb_set(struct task_struct *target,
static int s390_vxrs_low_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
__u64 vxrs[__NUM_VXRS_LOW];
int i;
@@ -1072,7 +991,7 @@ static int s390_vxrs_low_get(struct task_struct *target,
save_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ return membuf_write(&to, vxrs, sizeof(vxrs));
}
static int s390_vxrs_low_set(struct task_struct *target,
@@ -1101,18 +1020,14 @@ static int s390_vxrs_low_set(struct task_struct *target,
static int s390_vxrs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- __vector128 vxrs[__NUM_VXRS_HIGH];
-
if (!MACHINE_HAS_VX)
return -ENODEV;
if (target == current)
save_fpu_regs();
- memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs));
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ __NUM_VXRS_HIGH * sizeof(__vector128));
}
static int s390_vxrs_high_set(struct task_struct *target,
@@ -1134,12 +1049,9 @@ static int s390_vxrs_high_set(struct task_struct *target,
static int s390_system_call_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- unsigned int *data = &target->thread.system_call;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(unsigned int));
+ return membuf_store(&to, target->thread.system_call);
}
static int s390_system_call_set(struct task_struct *target,
@@ -1154,8 +1066,7 @@ static int s390_system_call_set(struct task_struct *target,
static int s390_gs_cb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_cb;
@@ -1165,8 +1076,7 @@ static int s390_gs_cb_get(struct task_struct *target,
return -ENODATA;
if (target == current)
save_gs_cb(data);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_cb_set(struct task_struct *target,
@@ -1210,8 +1120,7 @@ static int s390_gs_cb_set(struct task_struct *target,
static int s390_gs_bc_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_bc_cb;
@@ -1219,8 +1128,7 @@ static int s390_gs_bc_get(struct task_struct *target,
return -ENODEV;
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_bc_set(struct task_struct *target,
@@ -1256,7 +1164,6 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
cb->pc == 1 &&
cb->qc == 0 &&
cb->reserved2 == 0 &&
- cb->key == PAGE_DEFAULT_KEY &&
cb->reserved3 == 0 &&
cb->reserved4 == 0 &&
cb->reserved5 == 0 &&
@@ -1271,8 +1178,7 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
static int s390_runtime_instr_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct runtime_instr_cb *data = target->thread.ri_cb;
@@ -1281,8 +1187,7 @@ static int s390_runtime_instr_get(struct task_struct *target,
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct runtime_instr_cb));
+ return membuf_write(&to, data, sizeof(struct runtime_instr_cb));
}
static int s390_runtime_instr_set(struct task_struct *target,
@@ -1320,7 +1225,11 @@ static int s390_runtime_instr_set(struct task_struct *target,
kfree(data);
return -EINVAL;
}
-
+ /*
+ * Override access key in any case, since user space should
+ * not be able to set it, nor should it care about it.
+ */
+ ri_cb.key = PAGE_DEFAULT_KEY >> 4;
preempt_disable();
if (!target->thread.ri_cb)
target->thread.ri_cb = data;
@@ -1338,7 +1247,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_regs_get,
+ .regset_get = s390_regs_get,
.set = s390_regs_set,
},
{
@@ -1346,7 +1255,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1354,7 +1263,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(unsigned int),
.align = sizeof(unsigned int),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1362,7 +1271,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_last_break_get,
+ .regset_get = s390_last_break_get,
.set = s390_last_break_set,
},
{
@@ -1370,7 +1279,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1378,7 +1287,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1386,7 +1295,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1394,7 +1303,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1402,7 +1311,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1410,13 +1319,13 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
static const struct user_regset_view user_s390_view = {
- .name = UTS_MACHINE,
+ .name = "s390x",
.e_machine = EM_S390,
.regsets = s390_regsets,
.n = ARRAY_SIZE(s390_regsets)
@@ -1425,28 +1334,15 @@ static const struct user_regset_view user_s390_view = {
#ifdef CONFIG_COMPAT
static int s390_compat_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned n;
+
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user_compat(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user_compat(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t))
+ membuf_store(&to, __peek_user_compat(target, n));
return 0;
}
@@ -1488,29 +1384,14 @@ static int s390_compat_regs_set(struct task_struct *target,
static int s390_compat_regs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
compat_ulong_t *gprs_high;
+ int i;
- gprs_high = (compat_ulong_t *)
- &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)];
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = *gprs_high;
- gprs_high += 2;
- count -= sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(*gprs_high, u++))
- return -EFAULT;
- gprs_high += 2;
- count -= sizeof(*u);
- }
- }
+ gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs;
+ for (i = 0; i < NUM_GPRS; i++, gprs_high += 2)
+ membuf_store(&to, *gprs_high);
return 0;
}
@@ -1549,23 +1430,11 @@ static int s390_compat_regs_high_set(struct task_struct *target,
static int s390_compat_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- compat_ulong_t last_break;
+ compat_ulong_t last_break = target->thread.last_break;
- if (count > 0) {
- last_break = target->thread.last_break;
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, (unsigned long)last_break);
}
static int s390_compat_last_break_set(struct task_struct *target,
@@ -1582,7 +1451,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_get,
+ .regset_get = s390_compat_regs_get,
.set = s390_compat_regs_set,
},
{
@@ -1590,7 +1459,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1598,7 +1467,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(compat_uint_t),
.align = sizeof(compat_uint_t),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1606,7 +1475,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_compat_last_break_get,
+ .regset_get = s390_compat_last_break_get,
.set = s390_compat_last_break_set,
},
{
@@ -1614,7 +1483,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1622,7 +1491,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1630,7 +1499,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1638,7 +1507,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_high_get,
+ .regset_get = s390_compat_regs_high_get,
.set = s390_compat_regs_high_set,
},
{
@@ -1646,7 +1515,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1654,7 +1523,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1662,7 +1531,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S
index fe396673e8a6..a9a1a6f45375 100644
--- a/arch/s390/kernel/relocate_kernel.S
+++ b/arch/s390/kernel/relocate_kernel.S
@@ -2,8 +2,7 @@
/*
* Copyright IBM Corp. 2005
*
- * Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
+ * Author(s): Rolf Adelsberger
*
*/
@@ -15,6 +14,7 @@
* moves the new kernel to its destination...
* %r2 = pointer to first kimage_entry_t
* %r3 = start address - where to jump to after the job is done...
+ * %r4 = subcode
*
* %r5 will be used as temp. storage
* %r6 holds the destination address
@@ -57,7 +57,7 @@ ENTRY(relocate_kernel)
jo 0b
j .base
.done:
- sgr %r0,%r0 # clear register r0
+ lgr %r0,%r4 # subcode
cghi %r3,0
je .diag
la %r4,load_psw-.base(%r13) # load psw-address into the register
diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index 125c7f6e8715..1788a5454b6f 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -57,7 +57,7 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb)
cb->k = 1;
cb->ps = 1;
cb->pc = 1;
- cb->key = PAGE_DEFAULT_KEY;
+ cb->key = PAGE_DEFAULT_KEY >> 4;
cb->v = 1;
}
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b2c2f75860e8..ab19ddb09d65 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -37,7 +37,7 @@
#include <linux/root_dev.h>
#include <linux/console.h>
#include <linux/kernel_stat.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pfn.h>
@@ -49,6 +49,8 @@
#include <linux/memory.h>
#include <linux/compat.h>
#include <linux/start_kernel.h>
+#include <linux/hugetlb.h>
+#include <linux/kmemleak.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
@@ -56,7 +58,7 @@
#include <asm/smp.h>
#include <asm/mmu_context.h>
#include <asm/cpcmd.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/page.h>
@@ -72,7 +74,9 @@
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
#include <asm/mem_detect.h>
+#include <asm/maccess.h>
#include <asm/uv.h>
+#include <asm/asm-offsets.h>
#include "entry.h"
/*
@@ -87,32 +91,72 @@ EXPORT_SYMBOL(console_devno);
unsigned int console_irq = -1;
EXPORT_SYMBOL(console_irq);
-unsigned long elf_hwcap __read_mostly = 0;
-char elf_platform[ELF_PLATFORM_SIZE];
+/*
+ * Some code and data need to stay below 2 GB, even when the kernel is
+ * relocated above 2 GB, because they have to use 31 bit addresses.
+ * Such code and data are part of the .amode31 section.
+ */
+unsigned long __amode31_ref __samode31 = (unsigned long)&_samode31;
+unsigned long __amode31_ref __eamode31 = (unsigned long)&_eamode31;
+unsigned long __amode31_ref __stext_amode31 = (unsigned long)&_stext_amode31;
+unsigned long __amode31_ref __etext_amode31 = (unsigned long)&_etext_amode31;
+struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table;
+struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table;
-unsigned long int_hwcap = 0;
+/*
+ * Control registers CR2, CR5 and CR15 are initialized with addresses
+ * of tables that must be placed below 2G which is handled by the AMODE31
+ * sections.
+ * Because the AMODE31 sections are relocated below 2G at startup,
+ * the content of control registers CR2, CR5 and CR15 must be updated
+ * with new addresses after the relocation. The initial initialization of
+ * control registers occurs in head64.S and then gets updated again after AMODE31
+ * relocation. We must access the relevant AMODE31 tables indirectly via
+ * pointers placed in the .amode31.refs linker section. Those pointers get
+ * updated automatically during AMODE31 relocation and always contain a valid
+ * address within AMODE31 sections.
+ */
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
-int __bootdata_preserved(prot_virt_guest);
-#endif
+static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64);
+
+static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = {
+ [1] = 0xffffffffffffffff
+};
+
+static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = {
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0
+};
+
+static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = {
+ 0, 0, 0x89000000, 0,
+ 0, 0, 0x8a000000, 0
+};
+
+static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31;
+static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31;
+static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
+static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
int __bootdata(noexec_disabled);
-int __bootdata(memory_end_set);
-unsigned long __bootdata(memory_end);
-unsigned long __bootdata(vmalloc_size);
-unsigned long __bootdata(max_physmem_end);
+unsigned long __bootdata(ident_map_size);
struct mem_detect_info __bootdata(mem_detect);
+struct initrd_data __bootdata(initrd_data);
-struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table);
-struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table);
-unsigned long __bootdata_preserved(__swsusp_reset_dma);
-unsigned long __bootdata_preserved(__stext_dma);
-unsigned long __bootdata_preserved(__etext_dma);
-unsigned long __bootdata_preserved(__sdma);
-unsigned long __bootdata_preserved(__edma);
unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata(__amode31_base);
unsigned int __bootdata_preserved(zlib_dfltcc_support);
EXPORT_SYMBOL(zlib_dfltcc_support);
+u64 __bootdata_preserved(stfle_fac_list[16]);
+EXPORT_SYMBOL(stfle_fac_list);
+u64 __bootdata_preserved(alt_stfle_fac_list[16]);
+struct oldmem_data __bootdata_preserved(oldmem_data);
unsigned long VMALLOC_START;
EXPORT_SYMBOL(VMALLOC_START);
@@ -122,6 +166,7 @@ EXPORT_SYMBOL(VMALLOC_END);
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
+unsigned long vmemmap_size;
unsigned long MODULES_VADDR;
unsigned long MODULES_END;
@@ -130,6 +175,14 @@ unsigned long MODULES_END;
struct lowcore *lowcore_ptr[NR_CPUS];
EXPORT_SYMBOL(lowcore_ptr);
+DEFINE_STATIC_KEY_FALSE(cpu_has_bear);
+
+/*
+ * The Write Back bit position in the physaddr is given by the SLPC PCI.
+ * Leaving the mask zero always uses write through which is safe
+ */
+unsigned long mio_wb_bit_mask __ro_after_init;
+
/*
* This is set up by the setup-routine at boot-time
* for S390 need to find out, what we have to setup
@@ -163,7 +216,7 @@ static void __init set_preferred_console(void)
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
else if (CONSOLE_IS_VT220)
- add_preferred_console("ttyS", 1, NULL);
+ add_preferred_console("ttysclp", 0, NULL);
else if (CONSOLE_IS_HVC)
add_preferred_console("hvc", 0, NULL);
}
@@ -248,9 +301,9 @@ static void __init conmode_default(void)
#ifdef CONFIG_CRASH_DUMP
static void __init setup_zfcpdump(void)
{
- if (ipl_info.type != IPL_TYPE_FCP_DUMP)
+ if (!is_ipl_type_dump())
return;
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return;
strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev");
console_loglevel = 2;
@@ -303,17 +356,17 @@ void machine_power_off(void)
void (*pm_power_off)(void) = machine_power_off;
EXPORT_SYMBOL_GPL(pm_power_off);
-void *restart_stack __section(.data);
+void *restart_stack;
unsigned long stack_alloc(void)
{
#ifdef CONFIG_VMAP_STACK
- return (unsigned long)
- __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ void *ret;
+
+ ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ kmemleak_not_leak(ret);
+ return (unsigned long)ret;
#else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
#endif
@@ -339,24 +392,11 @@ int __init arch_early_irq_init(void)
return 0;
}
-static int __init async_stack_realloc(void)
-{
- unsigned long old, new;
-
- old = S390_lowcore.async_stack - STACK_INIT_OFFSET;
- new = stack_alloc();
- if (!new)
- panic("Couldn't allocate async stack");
- S390_lowcore.async_stack = new + STACK_INIT_OFFSET;
- free_pages(old, THREAD_SIZE_ORDER);
- return 0;
-}
-early_initcall(async_stack_realloc);
-
void __init arch_call_rest_init(void)
{
unsigned long stack;
+ smp_reinit_ipl_cpu();
stack = stack_alloc();
if (!stack)
panic("Couldn't allocate kernel stack");
@@ -367,12 +407,18 @@ void __init arch_call_rest_init(void)
set_task_stack_end_magic(current);
stack += STACK_INIT_OFFSET;
S390_lowcore.kernel_stack = stack;
- CALL_ON_STACK_NORETURN(rest_init, stack);
+ call_on_stack_noreturn(rest_init, stack);
}
static void __init setup_lowcore_dat_off(void)
{
- struct lowcore *lc;
+ unsigned long int_psw_mask = PSW_KERNEL_BITS;
+ struct lowcore *abs_lc, *lc;
+ unsigned long mcck_stack;
+ unsigned long flags;
+
+ if (IS_ENABLED(CONFIG_KASAN))
+ int_psw_mask |= PSW_MASK_DAT;
/*
* Setup lowcore for boot cpu
@@ -385,16 +431,15 @@ static void __init setup_lowcore_dat_off(void)
lc->restart_psw.mask = PSW_KERNEL_BITS;
lc->restart_psw.addr = (unsigned long) restart_int_handler;
- lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK;
lc->external_new_psw.addr = (unsigned long) ext_int_handler;
- lc->svc_new_psw.mask = PSW_KERNEL_BITS |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
+ lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK;
lc->svc_new_psw.addr = (unsigned long) system_call;
- lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK;
lc->program_new_psw.addr = (unsigned long) pgm_check_handler;
lc->mcck_new_psw.mask = PSW_KERNEL_BITS;
lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler;
- lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK;
lc->io_new_psw.addr = (unsigned long) io_int_handler;
lc->clock_comparator = clock_comparator_max;
lc->nodat_stack = ((unsigned long) &init_thread_union)
@@ -403,15 +448,8 @@ static void __init setup_lowcore_dat_off(void)
lc->lpp = LPP_MAGIC;
lc->machine_flags = S390_lowcore.machine_flags;
lc->preempt_count = S390_lowcore.preempt_count;
- lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
- nmi_alloc_boot_cpu(lc);
- vdso_alloc_boot_cpu(lc);
- lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
- lc->async_enter_timer = S390_lowcore.async_enter_timer;
+ nmi_alloc_mcesa_early(&lc->mcesad);
+ lc->sys_enter_timer = S390_lowcore.sys_enter_timer;
lc->exit_timer = S390_lowcore.exit_timer;
lc->user_timer = S390_lowcore.user_timer;
lc->system_timer = S390_lowcore.system_timer;
@@ -437,32 +475,55 @@ static void __init setup_lowcore_dat_off(void)
lc->restart_stack = (unsigned long) restart_stack;
lc->restart_fn = (unsigned long) do_restart;
lc->restart_data = 0;
- lc->restart_source = -1UL;
-
- /* Setup absolute zero lowcore */
- mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack);
- mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn);
- mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data);
- mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source);
- mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw);
+ lc->restart_source = -1U;
+
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->restart_stack = lc->restart_stack;
+ abs_lc->restart_fn = lc->restart_fn;
+ abs_lc->restart_data = lc->restart_data;
+ abs_lc->restart_source = lc->restart_source;
+ abs_lc->restart_psw = lc->restart_psw;
+ abs_lc->mcesad = lc->mcesad;
+ put_abs_lowcore(abs_lc, flags);
+
+ mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ if (!mcck_stack)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, THREAD_SIZE, THREAD_SIZE);
+ lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
lc->spinlock_lockval = arch_spin_lockval(0);
lc->spinlock_index = 0;
arch_spin_lock_setup(0);
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
- set_prefix((u32)(unsigned long) lc);
+ set_prefix(__pa(lc));
lowcore_ptr[0] = lc;
}
static void __init setup_lowcore_dat_on(void)
{
+ struct lowcore *abs_lc;
+ unsigned long flags;
+
__ctl_clear_bit(0, 28);
S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT;
S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT;
S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT;
S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT;
__ctl_set_bit(0, 28);
+ __ctl_store(S390_lowcore.cregs_save_area, 0, 15);
+ if (abs_lowcore_map(0, lowcore_ptr[0], true))
+ panic("Couldn't setup absolute lowcore");
+ abs_lowcore_mapped = true;
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->restart_flags = RESTART_FLAG_CTLREGS;
+ abs_lc->program_new_psw = S390_lowcore.program_new_psw;
+ memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area,
+ sizeof(abs_lc->cregs_save_area));
+ put_abs_lowcore(abs_lc, flags);
}
static struct resource code_resource = {
@@ -489,8 +550,9 @@ static struct resource __initdata *standard_resources[] = {
static void __init setup_resources(void)
{
struct resource *res, *std_res, *sub_res;
- struct memblock_region *reg;
+ phys_addr_t start, end;
int j;
+ u64 i;
code_resource.start = (unsigned long) _text;
code_resource.end = (unsigned long) _etext - 1;
@@ -499,7 +561,7 @@ static void __init setup_resources(void)
bss_resource.start = (unsigned long) __bss_start;
bss_resource.end = (unsigned long) __bss_stop - 1;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
res = memblock_alloc(sizeof(*res), 8);
if (!res)
panic("%s: Failed to allocate %zu bytes align=0x%x\n",
@@ -507,8 +569,13 @@ static void __init setup_resources(void)
res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
res->name = "System RAM";
- res->start = reg->base;
- res->end = reg->base + reg->size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in resources, end points to the last byte in
+ * the range.
+ */
+ res->end = end - 1;
request_resource(&iomem_resource, res);
for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
@@ -539,7 +606,8 @@ static void __init setup_resources(void)
* part of the System RAM resource.
*/
if (crashk_res.end) {
- memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
+ memblock_add_node(crashk_res.start, resource_size(&crashk_res),
+ 0, MEMBLOCK_NONE);
memblock_reserve(crashk_res.start, resource_size(&crashk_res));
insert_resource(&iomem_resource, &crashk_res);
}
@@ -548,56 +616,18 @@ static void __init setup_resources(void)
static void __init setup_memory_end(void)
{
- unsigned long vmax, tmp;
-
- /* Choose kernel address space layout: 3 or 4 levels. */
- if (IS_ENABLED(CONFIG_KASAN)) {
- vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)
- ? _REGION1_SIZE
- : _REGION2_SIZE;
- } else {
- tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE;
- tmp = tmp * (sizeof(struct page) + PAGE_SIZE);
- if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE)
- vmax = _REGION2_SIZE; /* 3-level kernel page table */
- else
- vmax = _REGION1_SIZE; /* 4-level kernel page table */
- }
-
- /* module area is at the end of the kernel address space. */
- MODULES_END = vmax;
- MODULES_VADDR = MODULES_END - MODULES_LEN;
- VMALLOC_END = MODULES_VADDR;
- VMALLOC_START = VMALLOC_END - vmalloc_size;
-
- /* Split remaining virtual space between 1:1 mapping & vmemmap array */
- tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
- /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
- tmp = SECTION_ALIGN_UP(tmp);
- tmp = VMALLOC_START - tmp * sizeof(struct page);
- tmp &= ~((vmax >> 11) - 1); /* align to page table level */
- tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS);
- vmemmap = (struct page *) tmp;
-
- /* Take care that memory_end is set and <= vmemmap */
- memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap);
-#ifdef CONFIG_KASAN
- /* fit in kasan shadow memory region between 1:1 and vmemmap */
- memory_end = min(memory_end, KASAN_SHADOW_START);
- vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END);
-#endif
- max_pfn = max_low_pfn = PFN_DOWN(memory_end);
- memblock_remove(memory_end, ULONG_MAX);
-
- pr_notice("The maximum memory size is %luMB\n", memory_end >> 20);
+ memblock_remove(ident_map_size, PHYS_ADDR_MAX - ident_map_size);
+ max_pfn = max_low_pfn = PFN_DOWN(ident_map_size);
+ pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20);
}
#ifdef CONFIG_CRASH_DUMP
/*
- * When kdump is enabled, we have to ensure that no memory from
- * the area [0 - crashkernel memory size] and
- * [crashk_res.start - crashk_res.end] is set offline.
+ * When kdump is enabled, we have to ensure that no memory from the area
+ * [0 - crashkernel memory size] is set offline - it will be exchanged with
+ * the crashkernel memory region when kdump is triggered. The crashkernel
+ * memory region can never get offlined (pages are unmovable).
*/
static int kdump_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
@@ -608,11 +638,7 @@ static int kdump_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res)))
return NOTIFY_BAD;
- if (arg->start_pfn > PFN_DOWN(crashk_res.end))
- return NOTIFY_OK;
- if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start))
- return NOTIFY_OK;
- return NOTIFY_BAD;
+ return NOTIFY_OK;
}
static struct notifier_block kdump_mem_nb = {
@@ -622,39 +648,6 @@ static struct notifier_block kdump_mem_nb = {
#endif
/*
- * Make sure that the area behind memory_end is protected
- */
-static void reserve_memory_end(void)
-{
- if (memory_end_set)
- memblock_reserve(memory_end, ULONG_MAX);
-}
-
-/*
- * Make sure that oldmem, where the dump is stored, is protected
- */
-static void reserve_oldmem(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
-}
-
-/*
- * Make sure that oldmem, where the dump is stored, is protected
- */
-static void remove_oldmem(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
-}
-
-/*
* Reserve memory for kdump kernel to be loaded with kexec
*/
static void __init reserve_crashkernel(void)
@@ -664,7 +657,7 @@ static void __init reserve_crashkernel(void)
phys_addr_t low, high;
int rc;
- rc = parse_crashkernel(boot_command_line, memory_end, &crash_size,
+ rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size,
&crash_base);
crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
@@ -678,9 +671,9 @@ static void __init reserve_crashkernel(void)
return;
}
- low = crash_base ?: OLDMEM_BASE;
+ low = crash_base ?: oldmem_data.start;
high = low + crash_size;
- if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) {
+ if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) {
/* The crashkernel fits into OLDMEM, reuse OLDMEM */
crash_base = low;
} else {
@@ -694,8 +687,9 @@ static void __init reserve_crashkernel(void)
return;
}
low = crash_base ?: low;
- crash_base = memblock_find_in_range(low, high, crash_size,
- KEXEC_CRASH_MEM_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ KEXEC_CRASH_MEM_ALIGN,
+ low, high);
}
if (!crash_base) {
@@ -704,10 +698,12 @@ static void __init reserve_crashkernel(void)
return;
}
- if (register_memory_notifier(&kdump_mem_nb))
+ if (register_memory_notifier(&kdump_mem_nb)) {
+ memblock_phys_free(crash_base, crash_size);
return;
+ }
- if (!OLDMEM_BASE && MACHINE_IS_VM)
+ if (!oldmem_data.start && MACHINE_IS_VM)
diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
@@ -726,11 +722,11 @@ static void __init reserve_crashkernel(void)
static void __init reserve_initrd(void)
{
#ifdef CONFIG_BLK_DEV_INITRD
- if (!INITRD_START || !INITRD_SIZE)
+ if (!initrd_data.start || !initrd_data.size)
return;
- initrd_start = INITRD_START;
- initrd_end = initrd_start + INITRD_SIZE;
- memblock_reserve(INITRD_START, INITRD_SIZE);
+ initrd_start = (unsigned long)__va(initrd_data.start);
+ initrd_end = initrd_start + initrd_data.size;
+ memblock_reserve(initrd_data.start, initrd_data.size);
#endif
}
@@ -758,7 +754,7 @@ static void __init free_mem_detect_info(void)
get_mem_detect_reserved(&start, &size);
if (size)
- memblock_free(start, size);
+ memblock_phys_free(start, size);
}
static const char * __init get_mem_info_source(void)
@@ -781,8 +777,8 @@ static void __init memblock_add_mem_detect_info(void)
unsigned long start, end;
int i;
- memblock_dbg("physmem info source: %s (%hhd)\n",
- get_mem_info_source(), mem_detect.info_source);
+ pr_debug("physmem info source: %s (%hhd)\n",
+ get_mem_info_source(), mem_detect.info_source);
/* keep memblock lists close to the kernel */
memblock_set_bottom_up(true);
for_each_mem_detect_block(i, &start, &end) {
@@ -790,7 +786,7 @@ static void __init memblock_add_mem_detect_info(void)
memblock_physmem_add(start, end - start);
}
memblock_set_bottom_up(false);
- memblock_dump_all();
+ memblock_set_node(0, ULONG_MAX, &memblock.memory, 0);
}
/*
@@ -799,10 +795,10 @@ static void __init memblock_add_mem_detect_info(void)
static void __init check_initrd(void)
{
#ifdef CONFIG_BLK_DEV_INITRD
- if (INITRD_START && INITRD_SIZE &&
- !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) {
+ if (initrd_data.start && initrd_data.size &&
+ !memblock_is_region_memory(initrd_data.start, initrd_data.size)) {
pr_err("The initial RAM disk does not fit into the memory\n");
- memblock_free(INITRD_START, INITRD_SIZE);
+ memblock_phys_free(initrd_data.start, initrd_data.size);
initrd_start = initrd_end = 0;
}
#endif
@@ -813,176 +809,68 @@ static void __init check_initrd(void)
*/
static void __init reserve_kernel(void)
{
- unsigned long start_pfn = PFN_UP(__pa(_end));
-
- memblock_reserve(0, HEAD_END);
- memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
- - (unsigned long)_stext);
- memblock_reserve(__sdma, __edma - __sdma);
+ memblock_reserve(0, STARTUP_NORMAL_OFFSET);
+ memblock_reserve(OLDMEM_BASE, sizeof(unsigned long));
+ memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long));
+ memblock_reserve(__amode31_base, __eamode31 - __samode31);
+ memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP);
+ memblock_reserve(__pa(_stext), _end - _stext);
}
static void __init setup_memory(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/*
* Init storage key for present memory
*/
- for_each_memblock(memory, reg) {
- storage_key_init_range(reg->base, reg->base + reg->size);
- }
- psw_set_key(PAGE_DEFAULT_KEY);
+ for_each_mem_range(i, &start, &end)
+ storage_key_init_range(start, end);
- /* Only cosmetics */
- memblock_enforce_memory_limit(memblock_end_of_DRAM());
+ psw_set_key(PAGE_DEFAULT_KEY);
}
-/*
- * Setup hardware capabilities.
- */
-static int __init setup_hwcaps(void)
+static void __init relocate_amode31_section(void)
{
- static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 };
- struct cpuid cpu_id;
- int i;
+ unsigned long amode31_size = __eamode31 - __samode31;
+ long amode31_offset = __amode31_base - __samode31;
+ long *ptr;
- /*
- * The store facility list bits numbers as found in the principles
- * of operation are numbered with bit 1UL<<31 as number 0 to
- * bit 1UL<<0 as number 31.
- * Bit 0: instructions named N3, "backported" to esa-mode
- * Bit 2: z/Architecture mode is active
- * Bit 7: the store-facility-list-extended facility is installed
- * Bit 17: the message-security assist is installed
- * Bit 19: the long-displacement facility is installed
- * Bit 21: the extended-immediate facility is installed
- * Bit 22: extended-translation facility 3 is installed
- * Bit 30: extended-translation facility 3 enhancement facility
- * These get translated to:
- * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1,
- * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3,
- * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and
- * HWCAP_S390_ETF3EH bit 8 (22 && 30).
- */
- for (i = 0; i < 6; i++)
- if (test_facility(stfl_bits[i]))
- elf_hwcap |= 1UL << i;
+ pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
- if (test_facility(22) && test_facility(30))
- elf_hwcap |= HWCAP_S390_ETF3EH;
+ /* Move original AMODE31 section to the new one */
+ memmove((void *)__amode31_base, (void *)__samode31, amode31_size);
+ /* Zero out the old AMODE31 section to catch invalid accesses within it */
+ memset((void *)__samode31, 0, amode31_size);
- /*
- * Check for additional facilities with store-facility-list-extended.
- * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0
- * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information
- * as stored by stfl, bits 32-xxx contain additional facilities.
- * How many facility words are stored depends on the number of
- * doublewords passed to the instruction. The additional facilities
- * are:
- * Bit 42: decimal floating point facility is installed
- * Bit 44: perform floating point operation facility is installed
- * translated to:
- * HWCAP_S390_DFP bit 6 (42 && 44).
- */
- if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44))
- elf_hwcap |= HWCAP_S390_DFP;
-
- /*
- * Huge page support HWCAP_S390_HPAGE is bit 7.
- */
- if (MACHINE_HAS_EDAT1)
- elf_hwcap |= HWCAP_S390_HPAGE;
-
- /*
- * 64-bit register support for 31-bit processes
- * HWCAP_S390_HIGH_GPRS is bit 9.
- */
- elf_hwcap |= HWCAP_S390_HIGH_GPRS;
-
- /*
- * Transactional execution support HWCAP_S390_TE is bit 10.
- */
- if (MACHINE_HAS_TE)
- elf_hwcap |= HWCAP_S390_TE;
-
- /*
- * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension
- * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX
- * instead of facility bit 129.
- */
- if (MACHINE_HAS_VX) {
- elf_hwcap |= HWCAP_S390_VXRS;
- if (test_facility(134))
- elf_hwcap |= HWCAP_S390_VXRS_EXT;
- if (test_facility(135))
- elf_hwcap |= HWCAP_S390_VXRS_BCD;
- if (test_facility(148))
- elf_hwcap |= HWCAP_S390_VXRS_EXT2;
- if (test_facility(152))
- elf_hwcap |= HWCAP_S390_VXRS_PDE;
- }
- if (test_facility(150))
- elf_hwcap |= HWCAP_S390_SORT;
- if (test_facility(151))
- elf_hwcap |= HWCAP_S390_DFLT;
-
- /*
- * Guarded storage support HWCAP_S390_GS is bit 12.
- */
- if (MACHINE_HAS_GS)
- elf_hwcap |= HWCAP_S390_GS;
-
- get_cpu_id(&cpu_id);
- add_device_randomness(&cpu_id, sizeof(cpu_id));
- switch (cpu_id.machine) {
- case 0x2064:
- case 0x2066:
- default: /* Use "z900" as default for 64 bit kernels. */
- strcpy(elf_platform, "z900");
- break;
- case 0x2084:
- case 0x2086:
- strcpy(elf_platform, "z990");
- break;
- case 0x2094:
- case 0x2096:
- strcpy(elf_platform, "z9-109");
- break;
- case 0x2097:
- case 0x2098:
- strcpy(elf_platform, "z10");
- break;
- case 0x2817:
- case 0x2818:
- strcpy(elf_platform, "z196");
- break;
- case 0x2827:
- case 0x2828:
- strcpy(elf_platform, "zEC12");
- break;
- case 0x2964:
- case 0x2965:
- strcpy(elf_platform, "z13");
- break;
- case 0x3906:
- case 0x3907:
- strcpy(elf_platform, "z14");
- break;
- case 0x8561:
- case 0x8562:
- strcpy(elf_platform, "z15");
- break;
- }
-
- /*
- * Virtualization support HWCAP_INT_SIE is bit 0.
- */
- if (sclp.has_sief2)
- int_hwcap |= HWCAP_INT_SIE;
+ /* Update all AMODE31 region references */
+ for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++)
+ *ptr += amode31_offset;
+}
- return 0;
+/* This must be called after AMODE31 relocation */
+static void __init setup_cr(void)
+{
+ union ctlreg2 cr2;
+ union ctlreg5 cr5;
+ union ctlreg15 cr15;
+
+ __ctl_duct[1] = (unsigned long)__ctl_aste;
+ __ctl_duct[2] = (unsigned long)__ctl_aste;
+ __ctl_duct[4] = (unsigned long)__ctl_duald;
+
+ /* Update control registers CR2, CR5 and CR15 */
+ __ctl_store(cr2.val, 2, 2);
+ __ctl_store(cr5.val, 5, 5);
+ __ctl_store(cr15.val, 15, 15);
+ cr2.ducto = (unsigned long)__ctl_duct >> 6;
+ cr5.pasteo = (unsigned long)__ctl_duct >> 6;
+ cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3;
+ __ctl_load(cr2.val, 2, 2);
+ __ctl_load(cr5.val, 5, 5);
+ __ctl_load(cr15.val, 15, 15);
}
-arch_initcall(setup_hwcaps);
/*
* Add system information as device randomness
@@ -991,14 +879,15 @@ static void __init setup_randomness(void)
{
struct sysinfo_3_2_2 *vmms;
- vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE,
- PAGE_SIZE);
+ vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!vmms)
panic("Failed to allocate memory for sysinfo structure\n");
-
if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
- memblock_free((unsigned long) vmms, PAGE_SIZE);
+ memblock_free(vmms, PAGE_SIZE);
+
+ if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG))
+ static_branch_enable(&s390_arch_random_available);
}
/*
@@ -1025,8 +914,7 @@ static void __init setup_control_program_code(void)
{
union diag318_info diag318_info = {
.cpnc = CPNC_LINUX,
- .cpvc_linux = 0,
- .cpvc_distro = {0},
+ .cpvc = 0,
};
if (!sclp.has_diag318)
@@ -1096,14 +984,12 @@ void __init setup_arch(char **cmdline_p)
ROOT_DEV = Root_RAM0;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (unsigned long) _end;
+ setup_initial_init_mm(_text, _etext, _edata, _end);
if (IS_ENABLED(CONFIG_EXPOLINE_AUTO))
nospec_auto_detect();
+ jump_label_init();
parse_early_param();
#ifdef CONFIG_CRASH_DUMP
/* Deactivate elfcorehdr= kernel parameter */
@@ -1116,41 +1002,37 @@ void __init setup_arch(char **cmdline_p)
setup_control_program_code();
/* Do some memory reservations *before* memory is added to memblock */
- reserve_memory_end();
- reserve_oldmem();
reserve_kernel();
reserve_initrd();
reserve_certificate_list();
reserve_mem_detect_info();
+ memblock_set_current_limit(ident_map_size);
memblock_allow_resize();
/* Get information about *all* installed memory */
memblock_add_mem_detect_info();
free_mem_detect_info();
- remove_oldmem();
-
- /*
- * Make sure all chunks are MAX_ORDER aligned so we don't need the
- * extra checks that HOLES_IN_ZONE would require.
- *
- * Is this still required?
- */
- memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
-
setup_memory_end();
+ memblock_dump_all();
setup_memory();
- dma_contiguous_reserve(memory_end);
+
+ relocate_amode31_section();
+ setup_cr();
+ setup_uv();
+ dma_contiguous_reserve(ident_map_size);
vmcp_cma_reserve();
+ if (MACHINE_HAS_EDAT2)
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
check_initrd();
reserve_crashkernel();
#ifdef CONFIG_CRASH_DUMP
/*
- * Be aware that smp_save_dump_cpus() triggers a system reset.
+ * Be aware that smp_save_dump_secondary_cpus() triggers a system reset.
* Therefore CPU and device initialization should be done afterwards.
*/
- smp_save_dump_cpus();
+ smp_save_dump_secondary_cpus();
#endif
setup_resources();
@@ -1162,16 +1044,22 @@ void __init setup_arch(char **cmdline_p)
smp_detect_cpus();
topology_init_early();
+ if (test_facility(193))
+ static_branch_enable(&cpu_has_bear);
+
/*
* Create kernel page tables and switch to virtual addressing.
*/
paging_init();
-
+ memcpy_real_init();
/*
* After paging_init created the kernel page table, the new PSWs
* in lowcore can now run with DAT enabled.
*/
setup_lowcore_dat_on();
+#ifdef CONFIG_CRASH_DUMP
+ smp_save_dump_ipl_cpu();
+#endif
/* Setup default console */
conmode_default();
@@ -1181,7 +1069,7 @@ void __init setup_arch(char **cmdline_p)
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_init_branches();
- /* Setup zfcpdump support */
+ /* Setup zfcp/nvme dump support */
setup_zfcpdump();
/* Add system specific data to the random pool */
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index e6fca5498e1f..38258f817048 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -16,6 +16,7 @@
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
+#include <linux/entry-common.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
@@ -24,13 +25,13 @@
#include <linux/tty.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
-#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
#include "entry.h"
/*
@@ -139,7 +140,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
{
_sigregs user_sregs;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
@@ -332,15 +333,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ka->sa.sa_flags & SA_RESTORER) {
+ if (ka->sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ka->sa.sa_restorer;
- } else {
- /* Signal frame without vector registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, sigreturn);
/* Set up registers for signal handler */
regs->gprs[14] = restorer;
@@ -395,14 +391,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ksig->ka.sa.sa_restorer;
- } else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, rt_sigreturn);
/* Create siginfo on the signal stack */
if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -459,7 +451,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset,
* the kernel can handle, and then we build all the user-level signal handling
* stack-frames in one go after that.
*/
-void do_signal(struct pt_regs *regs)
+
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
struct ksignal ksig;
sigset_t *oldset = sigmask_to_save();
@@ -487,7 +480,7 @@ void do_signal(struct pt_regs *regs)
regs->gprs[2] = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->gprs[2] = regs->orig_gpr2;
regs->psw.addr =
@@ -498,6 +491,7 @@ void do_signal(struct pt_regs *regs)
}
/* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL);
+
rseq_signal_deliver(&ksig, regs);
if (is_compat_task())
handle_signal32(&ksig, oldset, regs);
@@ -513,16 +507,22 @@ void do_signal(struct pt_regs *regs)
switch (regs->gprs[2]) {
case -ERESTART_RESTARTBLOCK:
/* Restart with sys_restart_syscall */
- regs->int_code = __NR_restart_syscall;
- /* fallthrough */
+ regs->gprs[2] = regs->orig_gpr2;
+ current->restart_block.arch_data = regs->psw.addr;
+ if (is_compat_task())
+ regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
+ else
+ regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
+ if (test_thread_flag(TIF_SINGLE_STEP))
+ clear_thread_flag(TIF_PER_TRAP);
+ break;
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
- /* Restart system call with magic TIF bit. */
regs->gprs[2] = regs->orig_gpr2;
- set_pt_regs_flag(regs, PIF_SYSCALL);
+ regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
if (test_thread_flag(TIF_SINGLE_STEP))
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
break;
}
}
@@ -532,10 +532,3 @@ void do_signal(struct pt_regs *regs)
*/
restore_saved_sigmask();
}
-
-void do_notify_resume(struct pt_regs *regs)
-{
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
-}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a08bd2522dd9..0031325ce4bc 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -5,7 +5,6 @@
* Copyright IBM Corp. 1999, 2012
* Author(s): Denis Joseph Barrow,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*
* based on other smp stuff by
* (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net>
@@ -30,6 +29,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irqflags.h>
+#include <linux/irq_work.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/sched/hotplug.h>
@@ -45,9 +45,8 @@
#include <asm/irq.h>
#include <asm/tlbflush.h>
#include <asm/vtimer.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/sclp.h>
-#include <asm/vdso.h>
#include <asm/debug.h>
#include <asm/os_info.h>
#include <asm/sigp.h>
@@ -55,12 +54,16 @@
#include <asm/nmi.h>
#include <asm/stacktrace.h>
#include <asm/topology.h>
+#include <asm/vdso.h>
+#include <asm/maccess.h>
#include "entry.h"
enum {
ec_schedule = 0,
ec_call_function_single,
ec_stop_cpu,
+ ec_mcck_pending,
+ ec_irq_work,
};
enum {
@@ -71,7 +74,6 @@ enum {
static DEFINE_PER_CPU(struct cpu *, cpu_device);
struct pcpu {
- struct lowcore *lowcore; /* lowcore page(s) for the cpu */
unsigned long ec_mask; /* bit mask for ec_xxx functions */
unsigned long ec_clk; /* sigp timestamp for ec_xxx */
signed char state; /* physical cpu state */
@@ -93,6 +95,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
#endif
static unsigned int smp_max_threads __initdata = -1U;
+cpumask_t cpu_setup_mask;
static int __init early_nosmt(char *s)
{
@@ -145,7 +148,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm)
static inline int pcpu_stopped(struct pcpu *pcpu)
{
- u32 uninitialized_var(status);
+ u32 status;
if (__pcpu_sigp(pcpu->address, SIGP_SENSE,
0, &status) != SIGP_CC_STATUS_STORED)
@@ -188,99 +191,95 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
{
- unsigned long async_stack, nodat_stack;
+ unsigned long async_stack, nodat_stack, mcck_stack;
struct lowcore *lc;
- if (pcpu != &pcpu_devices[0]) {
- pcpu->lowcore = (struct lowcore *)
- __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
- nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
- if (!pcpu->lowcore || !nodat_stack)
- goto out;
- } else {
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- }
+ lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
+ nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
async_stack = stack_alloc();
- if (!async_stack)
+ mcck_stack = stack_alloc();
+ if (!lc || !nodat_stack || !async_stack || !mcck_stack)
goto out;
- lc = pcpu->lowcore;
memcpy(lc, &S390_lowcore, 512);
memset((char *) lc + 512, 0, sizeof(*lc) - 512);
lc->async_stack = async_stack + STACK_INIT_OFFSET;
lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET;
+ lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
lc->cpu_nr = cpu;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
- if (nmi_alloc_per_cpu(lc))
- goto out_async;
- if (vdso_alloc_per_cpu(lc))
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
+ if (nmi_alloc_mcesa(&lc->mcesad))
+ goto out;
+ if (abs_lowcore_map(cpu, lc, true))
goto out_mcesa;
lowcore_ptr[cpu] = lc;
- pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc);
+ pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc));
return 0;
out_mcesa:
- nmi_free_per_cpu(lc);
-out_async:
- stack_free(async_stack);
+ nmi_free_mcesa(&lc->mcesad);
out:
- if (pcpu != &pcpu_devices[0]) {
- free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
- }
+ stack_free(mcck_stack);
+ stack_free(async_stack);
+ free_pages(nodat_stack, THREAD_SIZE_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
return -ENOMEM;
}
static void pcpu_free_lowcore(struct pcpu *pcpu)
{
- unsigned long async_stack, nodat_stack, lowcore;
-
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET;
- lowcore = (unsigned long) pcpu->lowcore;
+ unsigned long async_stack, nodat_stack, mcck_stack;
+ struct lowcore *lc;
+ int cpu;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET;
+ async_stack = lc->async_stack - STACK_INIT_OFFSET;
+ mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET;
pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
- lowcore_ptr[pcpu - pcpu_devices] = NULL;
- vdso_free_per_cpu(pcpu->lowcore);
- nmi_free_per_cpu(pcpu->lowcore);
+ lowcore_ptr[cpu] = NULL;
+ abs_lowcore_unmap(cpu);
+ nmi_free_mcesa(&lc->mcesad);
stack_free(async_stack);
- if (pcpu == &pcpu_devices[0])
- return;
+ stack_free(mcck_stack);
free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages(lowcore, LC_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
}
static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc = lowcore_ptr[cpu];
cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
lc->cpu_nr = cpu;
+ lc->restart_flags = RESTART_FLAG_CTLREGS;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
lc->percpu_offset = __per_cpu_offset[cpu];
lc->kernel_asce = S390_lowcore.kernel_asce;
- lc->user_asce = S390_lowcore.kernel_asce;
+ lc->user_asce = s390_invalid_asce;
lc->machine_flags = S390_lowcore.machine_flags;
lc->user_timer = lc->system_timer =
lc->steal_timer = lc->avg_steal_timer = 0;
__ctl_store(lc->cregs_save_area, 0, 15);
lc->cregs_save_area[1] = lc->kernel_asce;
- lc->cregs_save_area[7] = lc->vdso_asce;
+ lc->cregs_save_area[7] = lc->user_asce;
save_access_regs((unsigned int *) lc->access_regs_save_area);
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
arch_spin_lock_setup(cpu);
}
static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
lc->kernel_stack = (unsigned long) task_stack_page(tsk)
+ THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
lc->current_task = (unsigned long) tsk;
@@ -296,40 +295,59 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
- lc->restart_stack = lc->nodat_stack;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ lc->restart_stack = lc->kernel_stack;
lc->restart_fn = (unsigned long) func;
lc->restart_data = (unsigned long) data;
- lc->restart_source = -1UL;
+ lc->restart_source = -1U;
pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
}
+typedef void (pcpu_delegate_fn)(void *);
+
/*
* Call function via PSW restart on pcpu and stop the current cpu.
*/
-static void __pcpu_delegate(void (*func)(void*), void *data)
+static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
{
func(data); /* should not return */
}
-static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
- void (*func)(void *),
- void *data, unsigned long stack)
+static void pcpu_delegate(struct pcpu *pcpu,
+ pcpu_delegate_fn *func,
+ void *data, unsigned long stack)
{
- struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
- unsigned long source_cpu = stap();
+ struct lowcore *lc, *abs_lc;
+ unsigned int source_cpu;
+ unsigned long flags;
+ lc = lowcore_ptr[pcpu - pcpu_devices];
+ source_cpu = stap();
__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- if (pcpu->address == source_cpu)
- CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
+ if (pcpu->address == source_cpu) {
+ call_on_stack(2, stack, void, __pcpu_delegate,
+ pcpu_delegate_fn *, func, void *, data);
+ }
/* Stop target cpu (if func returns this stops the current cpu). */
pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
/* Restart func on the target cpu and stop the current cpu. */
- mem_assign_absolute(lc->restart_stack, stack);
- mem_assign_absolute(lc->restart_fn, (unsigned long) func);
- mem_assign_absolute(lc->restart_data, (unsigned long) data);
- mem_assign_absolute(lc->restart_source, source_cpu);
+ if (lc) {
+ lc->restart_stack = stack;
+ lc->restart_fn = (unsigned long)func;
+ lc->restart_data = (unsigned long)data;
+ lc->restart_source = source_cpu;
+ } else {
+ abs_lc = get_abs_lowcore(&flags);
+ abs_lc->restart_stack = stack;
+ abs_lc->restart_fn = (unsigned long)func;
+ abs_lc->restart_data = (unsigned long)data;
+ abs_lc->restart_source = source_cpu;
+ put_abs_lowcore(abs_lc, flags);
+ }
__bpon();
asm volatile(
"0: sigp 0,%0,%2 # sigp restart to target cpu\n"
@@ -382,7 +400,7 @@ void smp_call_online_cpu(void (*func)(void *), void *data)
*/
void smp_call_ipl_cpu(void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu_devices->lowcore;
+ struct lowcore *lc = lowcore_ptr[0];
if (pcpu_devices[0].address == stap())
lc = &S390_lowcore;
@@ -401,7 +419,12 @@ int smp_find_processor_id(u16 address)
return -1;
}
-bool arch_vcpu_is_preempted(int cpu)
+void schedule_mcck_handler(void)
+{
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending);
+}
+
+bool notrace arch_vcpu_is_preempted(int cpu)
{
if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
return false;
@@ -411,7 +434,7 @@ bool arch_vcpu_is_preempted(int cpu)
}
EXPORT_SYMBOL(arch_vcpu_is_preempted);
-void smp_yield_cpu(int cpu)
+void notrace smp_yield_cpu(int cpu)
{
if (!MACHINE_HAS_DIAG9C)
return;
@@ -419,6 +442,7 @@ void smp_yield_cpu(int cpu)
asm volatile("diag %0,0,0x9c"
: : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
@@ -426,10 +450,12 @@ void smp_yield_cpu(int cpu)
*/
void notrace smp_emergency_stop(void)
{
- cpumask_t cpumask;
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ static cpumask_t cpumask;
u64 end;
int cpu;
+ arch_spin_lock(&lock);
cpumask_copy(&cpumask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpumask);
@@ -450,6 +476,7 @@ void notrace smp_emergency_stop(void)
break;
cpu_relax();
}
+ arch_spin_unlock(&lock);
}
NOKPROBE_SYMBOL(smp_emergency_stop);
@@ -495,6 +522,10 @@ static void smp_handle_ext_call(void)
scheduler_ipi();
if (test_bit(ec_call_function_single, &bits))
generic_smp_call_function_single_interrupt();
+ if (test_bit(ec_mcck_pending, &bits))
+ __s390_handle_mcck();
+ if (test_bit(ec_irq_work, &bits))
+ irq_work_run();
}
static void do_ext_call_interrupt(struct ext_code ext_code,
@@ -527,6 +558,13 @@ void smp_send_reschedule(int cpu)
pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
}
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
+{
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work);
+}
+#endif
+
/*
* parameter area for the set/clear control bit callbacks
*/
@@ -549,44 +587,52 @@ static void smp_ctl_bit_callback(void *info)
__ctl_load(cregs, 0, 15);
}
-/*
- * Set a bit in a control register of all cpus
- */
-void smp_ctl_set_bit(int cr, int bit)
-{
- struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr };
-
- on_each_cpu(smp_ctl_bit_callback, &parms, 1);
-}
-EXPORT_SYMBOL(smp_ctl_set_bit);
+static DEFINE_SPINLOCK(ctl_lock);
-/*
- * Clear a bit in a control register of all cpus
- */
-void smp_ctl_clear_bit(int cr, int bit)
+void smp_ctl_set_clear_bit(int cr, int bit, bool set)
{
- struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr };
-
+ struct ec_creg_mask_parms parms = { .cr = cr, };
+ struct lowcore *abs_lc;
+ unsigned long flags;
+ u64 ctlreg;
+
+ if (set) {
+ parms.orval = 1UL << bit;
+ parms.andval = -1UL;
+ } else {
+ parms.orval = 0;
+ parms.andval = ~(1UL << bit);
+ }
+ spin_lock(&ctl_lock);
+ abs_lc = get_abs_lowcore(&flags);
+ ctlreg = abs_lc->cregs_save_area[cr];
+ ctlreg = (ctlreg & parms.andval) | parms.orval;
+ abs_lc->cregs_save_area[cr] = ctlreg;
+ put_abs_lowcore(abs_lc, flags);
+ spin_unlock(&ctl_lock);
on_each_cpu(smp_ctl_bit_callback, &parms, 1);
}
-EXPORT_SYMBOL(smp_ctl_clear_bit);
+EXPORT_SYMBOL(smp_ctl_set_clear_bit);
#ifdef CONFIG_CRASH_DUMP
int smp_store_status(int cpu)
{
- struct pcpu *pcpu = pcpu_devices + cpu;
+ struct lowcore *lc;
+ struct pcpu *pcpu;
unsigned long pa;
- pa = __pa(&pcpu->lowcore->floating_pt_save_area);
+ pcpu = pcpu_devices + cpu;
+ lc = lowcore_ptr[cpu];
+ pa = __pa(&lc->floating_pt_save_area);
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
if (!MACHINE_HAS_VX && !MACHINE_HAS_GS)
return 0;
- pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK);
+ pa = lc->mcesad & MCESA_ORIGIN_MASK;
if (MACHINE_HAS_GS)
- pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK;
+ pa |= lc->mcesad & MCESA_LC_MASK;
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
@@ -596,14 +642,14 @@ int smp_store_status(int cpu)
/*
* Collect CPU state of the previous, crashed system.
* There are four cases:
- * 1) standard zfcp dump
- * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 1) standard zfcp/nvme dump
+ * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The boot CPU state is located in
* the absolute lowcore of the memory stored in the HSA. The zcore code
* will copy the boot CPU state from the HSA.
- * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory)
- * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory)
+ * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The firmware or the boot-loader
* stored the registers of the boot CPU in the absolute lowcore in the
@@ -619,42 +665,39 @@ int smp_store_status(int cpu)
* This case does not exist for s390 anymore, setup_arch explicitly
* deactivates the elfcorehdr= kernel parameter
*/
-static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+static bool dump_available(void)
{
- __vector128 *vxrs = (__vector128 *) page;
-
- if (is_boot_cpu)
- vxrs = boot_cpu_vector_save_area;
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page);
- save_area_add_vxrs(sa, vxrs);
+ return oldmem_data.start || is_ipl_type_dump();
}
-static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+void __init smp_save_dump_ipl_cpu(void)
{
- void *regs = (void *) page;
+ struct save_area *sa;
+ void *regs;
- if (is_boot_cpu)
- copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512);
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page);
+ if (!dump_available())
+ return;
+ sa = save_area_alloc(true);
+ regs = memblock_alloc(512, 8);
+ if (!sa || !regs)
+ panic("could not allocate memory for boot CPU save area\n");
+ copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512);
save_area_add_regs(sa, regs);
+ memblock_free(regs, 512);
+ if (MACHINE_HAS_VX)
+ save_area_add_vxrs(sa, boot_cpu_vector_save_area);
}
-void __init smp_save_dump_cpus(void)
+void __init smp_save_dump_secondary_cpus(void)
{
int addr, boot_cpu_addr, max_cpu_addr;
struct save_area *sa;
- unsigned long page;
- bool is_boot_cpu;
+ void *page;
- if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP))
- /* No previous system present, normal boot. */
+ if (!dump_available())
return;
/* Allocate a page as dumping area for the store status sigps */
- page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31);
+ page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
if (!page)
panic("ERROR: Failed to allocate %lx bytes below %lx\n",
PAGE_SIZE, 1UL << 31);
@@ -664,29 +707,23 @@ void __init smp_save_dump_cpus(void)
boot_cpu_addr = stap();
max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev;
for (addr = 0; addr <= max_cpu_addr; addr++) {
+ if (addr == boot_cpu_addr)
+ continue;
if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) ==
SIGP_CC_NOT_OPERATIONAL)
continue;
- is_boot_cpu = (addr == boot_cpu_addr);
- /* Allocate save area */
- sa = save_area_alloc(is_boot_cpu);
+ sa = save_area_alloc(false);
if (!sa)
panic("could not allocate memory for save area\n");
- if (MACHINE_HAS_VX)
- /* Get the vector registers */
- smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page);
- /*
- * For a zfcp dump OLDMEM_BASE == NULL and the registers
- * of the boot CPU are stored in the HSA. To retrieve
- * these registers an SCLP request is required which is
- * done by drivers/s390/char/zcore.c:init_cpu_info()
- */
- if (!is_boot_cpu || OLDMEM_BASE)
- /* Get the CPU registers */
- smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
+ __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page));
+ save_area_add_regs(sa, page);
+ if (MACHINE_HAS_VX) {
+ __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page));
+ save_area_add_vxrs(sa, page);
+ }
}
memblock_free(page, PAGE_SIZE);
- diag_dma_ops.diag308_reset();
+ diag_amode31_ops.diag308_reset();
pcpu_set_smt(0);
}
#endif /* CONFIG_CRASH_DUMP */
@@ -701,6 +738,11 @@ int smp_cpu_get_polarization(int cpu)
return pcpu_devices[cpu].polarization;
}
+int smp_cpu_get_cpu_address(int cpu)
+{
+ return pcpu_devices[cpu].address;
+}
+
static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
{
static int use_sigp_detection;
@@ -760,11 +802,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
{
struct sclp_core_entry *core;
- cpumask_t avail;
+ static cpumask_t avail;
bool configured;
u16 core_id;
int nr, i;
+ cpus_read_lock();
+ mutex_lock(&smp_cpu_state_mutex);
nr = 0;
cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask);
/*
@@ -785,6 +829,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
configured = i < info->configured;
nr += smp_add_core(&info->core[i], &avail, configured, early);
}
+ mutex_unlock(&smp_cpu_state_mutex);
+ cpus_read_unlock();
return nr;
}
@@ -832,71 +878,52 @@ void __init smp_detect_cpus(void)
pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
/* Add CPUs present at boot */
- get_online_cpus();
__smp_rescan_cpus(info, true);
- put_online_cpus();
- memblock_free_early((unsigned long)info, sizeof(*info));
+ memblock_free(info, sizeof(*info));
}
-static void smp_init_secondary(void)
+/*
+ * Activate a secondary processor.
+ */
+static void smp_start_secondary(void *cpuvoid)
{
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
S390_lowcore.last_update_clock = get_tod_clock();
+ S390_lowcore.restart_stack = (unsigned long)restart_stack;
+ S390_lowcore.restart_fn = (unsigned long)do_restart;
+ S390_lowcore.restart_data = 0;
+ S390_lowcore.restart_source = -1U;
+ S390_lowcore.restart_flags = 0;
restore_access_regs(S390_lowcore.access_regs_save_area);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- set_cpu_flag(CIF_ASCE_SECONDARY);
cpu_init();
- preempt_disable();
+ rcu_cpu_starting(cpu);
init_cpu_timer();
vtime_init();
+ vdso_getcpu_init();
pfault_init();
- notify_cpu_starting(smp_processor_id());
+ cpumask_set_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
+ notify_cpu_starting(cpu);
if (topology_cpu_dedicated(cpu))
set_cpu_flag(CIF_DEDICATED_CPU);
else
clear_cpu_flag(CIF_DEDICATED_CPU);
- set_cpu_online(smp_processor_id(), true);
+ set_cpu_online(cpu, true);
inc_irq_stat(CPU_RST);
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-/*
- * Activate a secondary processor.
- */
-static void __no_sanitize_address smp_start_secondary(void *cpuvoid)
-{
- S390_lowcore.restart_stack = (unsigned long) restart_stack;
- S390_lowcore.restart_fn = (unsigned long) do_restart;
- S390_lowcore.restart_data = 0;
- S390_lowcore.restart_source = -1UL;
- __ctl_load(S390_lowcore.cregs_save_area, 0, 15);
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
-}
-
/* Upping and downing of CPUs */
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
- struct pcpu *pcpu;
- int base, i, rc;
+ struct pcpu *pcpu = pcpu_devices + cpu;
+ int rc;
- pcpu = pcpu_devices + cpu;
if (pcpu->state != CPU_STATE_CONFIGURED)
return -EIO;
- base = smp_get_base_cpu(cpu);
- for (i = 0; i <= smp_cpu_mtid; i++) {
- if (base + i < nr_cpu_ids)
- if (cpu_online(base + i))
- break;
- }
- /*
- * If this is the first CPU of the core to get online
- * do an initial CPU reset.
- */
- if (i > smp_cpu_mtid &&
- pcpu_sigp_retry(pcpu_devices + base, SIGP_INITIAL_CPU_RESET, 0) !=
+ if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) !=
SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
@@ -924,10 +951,14 @@ early_param("possible_cpus", _setup_possible_cpus);
int __cpu_disable(void)
{
unsigned long cregs[16];
+ int cpu;
/* Handle possible pending IPIs */
smp_handle_ext_call();
- set_cpu_online(smp_processor_id(), false);
+ cpu = smp_processor_id();
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
/* Disable pseudo page faults on this cpu. */
pfault_fini();
/* Disable interrupt sources via control register. */
@@ -990,15 +1021,10 @@ void __init smp_prepare_boot_cpu(void)
WARN_ON(!cpu_present(0) || !cpu_online(0));
pcpu->state = CPU_STATE_CONFIGURED;
- pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix();
S390_lowcore.percpu_offset = __per_cpu_offset[0];
smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
}
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-}
-
void __init smp_setup_processor_id(void)
{
pcpu_devices[0].address = stap();
@@ -1041,7 +1067,7 @@ static ssize_t cpu_configure_store(struct device *dev,
return -EINVAL;
if (val != 0 && val != 1)
return -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
rc = -EBUSY;
/* disallow configuration changes of online cpus and cpu 0 */
@@ -1090,7 +1116,7 @@ static ssize_t cpu_configure_store(struct device *dev,
}
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
@@ -1128,6 +1154,7 @@ static int smp_cpu_online(unsigned int cpu)
return sysfs_create_group(&s->kobj, &cpu_online_attr_group);
}
+
static int smp_cpu_pre_down(unsigned int cpu)
{
struct device *s = &per_cpu(cpu_device, cpu)->dev;
@@ -1176,11 +1203,7 @@ int __ref smp_rescan_cpus(void)
if (!info)
return -ENOMEM;
smp_get_core_info(info, 0);
- get_online_cpus();
- mutex_lock(&smp_cpu_state_mutex);
nr = __smp_rescan_cpus(info, false);
- mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
kfree(info);
if (nr)
topology_schedule_update();
@@ -1223,3 +1246,60 @@ out:
return rc;
}
subsys_initcall(s390_smp_init);
+
+static __always_inline void set_new_lowcore(struct lowcore *lc)
+{
+ union register_pair dst, src;
+ u32 pfx;
+
+ src.even = (unsigned long) &S390_lowcore;
+ src.odd = sizeof(S390_lowcore);
+ dst.even = (unsigned long) lc;
+ dst.odd = sizeof(*lc);
+ pfx = __pa(lc);
+
+ asm volatile(
+ " mvcl %[dst],%[src]\n"
+ " spx %[pfx]\n"
+ : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair)
+ : [pfx] "Q" (pfx)
+ : "memory", "cc");
+}
+
+int __init smp_reinit_ipl_cpu(void)
+{
+ unsigned long async_stack, nodat_stack, mcck_stack;
+ struct lowcore *lc, *lc_ipl;
+ unsigned long flags, cr0;
+ u64 mcesad;
+
+ lc_ipl = lowcore_ptr[0];
+ lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
+ nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+ async_stack = stack_alloc();
+ mcck_stack = stack_alloc();
+ if (!lc || !nodat_stack || !async_stack || !mcck_stack || nmi_alloc_mcesa(&mcesad))
+ panic("Couldn't allocate memory");
+
+ local_irq_save(flags);
+ local_mcck_disable();
+ set_new_lowcore(lc);
+ S390_lowcore.nodat_stack = nodat_stack + STACK_INIT_OFFSET;
+ S390_lowcore.async_stack = async_stack + STACK_INIT_OFFSET;
+ S390_lowcore.mcck_stack = mcck_stack + STACK_INIT_OFFSET;
+ __ctl_store(cr0, 0, 0);
+ __ctl_clear_bit(0, 28); /* disable lowcore protection */
+ S390_lowcore.mcesad = mcesad;
+ __ctl_load(cr0, 0, 0);
+ if (abs_lowcore_map(0, lc, false))
+ panic("Couldn't remap absolute lowcore");
+ lowcore_ptr[0] = lc;
+ local_mcck_enable();
+ local_irq_restore(flags);
+
+ free_pages(lc_ipl->async_stack - STACK_INIT_OFFSET, THREAD_SIZE_ORDER);
+ memblock_free_late(__pa(lc_ipl->mcck_stack - STACK_INIT_OFFSET), THREAD_SIZE);
+ memblock_free_late(__pa(lc_ipl), sizeof(*lc_ipl));
+
+ return 0;
+}
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index fc5419ac64c8..7ee455e8e3d5 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -3,7 +3,6 @@
* Stack trace management functions
*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/stacktrace.h>
@@ -19,17 +18,11 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
unwind_for_each_frame(&state, task, regs, 0) {
addr = unwind_get_return_address(&state);
- if (!addr || !consume_entry(cookie, addr, false))
+ if (!addr || !consume_entry(cookie, addr))
break;
}
}
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack. Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
void *cookie, struct task_struct *task)
{
@@ -52,11 +45,11 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
* Mark stacktraces with kretprobed functions on them
* as unreliable.
*/
- if (state.ip == (unsigned long)kretprobe_trampoline)
+ if (state.ip == (unsigned long)__kretprobe_trampoline)
return -EINVAL;
#endif
- if (!consume_entry(cookie, addr, false))
+ if (!consume_entry(cookie, addr))
return -EINVAL;
}
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 888cc2f166db..4d141e2c132e 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -395,19 +395,18 @@ out:
+/*
+ * Issue the instruction with opcode 0xB256 (emitted via .insn) for subcode 0.
+ *
+ * @vaddr: passed in the even register of the second register pair
+ * @rc:    receives the content of the odd register of that pair after the
+ *         instruction has executed
+ *
+ * Returns the condition code set by the instruction (extracted with
+ * ipm + srl 28).
+ */
static int sthyi(u64 vaddr, u64 *rc)
{
- register u64 code asm("0") = 0;
- register u64 addr asm("2") = vaddr;
- register u64 rcode asm("3");
+ union register_pair r1 = { .even = 0, }; /* subcode */
+ union register_pair r2 = { .even = vaddr, };
int cc;
asm volatile(
- ".insn rre,0xB2560000,%[code],%[addr]\n"
+ ".insn rre,0xB2560000,%[r1],%[r2]\n"
"ipm %[cc]\n"
"srl %[cc],28\n"
- : [cc] "=d" (cc), "=d" (rcode)
- : [code] "d" (code), [addr] "a" (addr)
+ : [cc] "=&d" (cc), [r2] "+&d" (r2.pair)
+ : [r1] "d" (r1.pair)
: "memory", "cc");
- *rc = rcode;
+ *rc = r2.odd;
return cc;
}
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c
deleted file mode 100644
index 75b7b307946e..000000000000
--- a/arch/s390/kernel/suspend.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Suspend support specific for s390.
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- */
-
-#include <linux/pfn.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <asm/ctl_reg.h>
-#include <asm/ipl.h>
-#include <asm/cio.h>
-#include <asm/sections.h>
-#include "entry.h"
-
-/*
- * The restore of the saved pages in an hibernation image will set
- * the change and referenced bits in the storage key for each page.
- * Overindication of the referenced bits after an hibernation cycle
- * does not cause any harm but the overindication of the change bits
- * would cause trouble.
- * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each
- * page to the most significant byte of the associated page frame
- * number in the hibernation image.
- */
-
-/*
- * Key storage is allocated as a linked list of pages.
- * The size of the keys array is (PAGE_SIZE - sizeof(long))
- */
-struct page_key_data {
- struct page_key_data *next;
- unsigned char data[];
-};
-
-#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *))
-
-static struct page_key_data *page_key_data;
-static struct page_key_data *page_key_rp, *page_key_wp;
-static unsigned long page_key_rx, page_key_wx;
-unsigned long suspend_zero_pages;
-
-/*
- * For each page in the hibernation image one additional byte is
- * stored in the most significant byte of the page frame number.
- * On suspend no additional memory is required but on resume the
- * keys need to be memorized until the page data has been restored.
- * Only then can the storage keys be set to their old state.
- */
-unsigned long page_key_additional_pages(unsigned long pages)
-{
- return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
-}
-
-/*
- * Free page_key_data list of arrays.
- */
-void page_key_free(void)
-{
- struct page_key_data *pkd;
-
- while (page_key_data) {
- pkd = page_key_data;
- page_key_data = pkd->next;
- free_page((unsigned long) pkd);
- }
-}
-
-/*
- * Allocate page_key_data list of arrays with enough room to store
- * one byte for each page in the hibernation image.
- */
-int page_key_alloc(unsigned long pages)
-{
- struct page_key_data *pk;
- unsigned long size;
-
- size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
- while (size--) {
- pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL);
- if (!pk) {
- page_key_free();
- return -ENOMEM;
- }
- pk->next = page_key_data;
- page_key_data = pk;
- }
- page_key_rp = page_key_wp = page_key_data;
- page_key_rx = page_key_wx = 0;
- return 0;
-}
-
-/*
- * Save the storage key into the upper 8 bits of the page frame number.
- */
-void page_key_read(unsigned long *pfn)
-{
- struct page *page;
- unsigned long addr;
- unsigned char key;
-
- page = pfn_to_page(*pfn);
- addr = (unsigned long) page_address(page);
- key = (unsigned char) page_get_storage_key(addr) & 0x7f;
- if (arch_test_page_nodat(page))
- key |= 0x80;
- *(unsigned char *) pfn = key;
-}
-
-/*
- * Extract the storage key from the upper 8 bits of the page frame number
- * and store it in the page_key_data list of arrays.
- */
-void page_key_memorize(unsigned long *pfn)
-{
- page_key_wp->data[page_key_wx] = *(unsigned char *) pfn;
- *(unsigned char *) pfn = 0;
- if (++page_key_wx < PAGE_KEY_DATA_SIZE)
- return;
- page_key_wp = page_key_wp->next;
- page_key_wx = 0;
-}
-
-/*
- * Get the next key from the page_key_data list of arrays and set the
- * storage key of the page referred by @address. If @address refers to
- * a "safe" page the swsusp_arch_resume code will transfer the storage
- * key from the buffer page to the original page.
- */
-void page_key_write(void *address)
-{
- struct page *page;
- unsigned char key;
-
- key = page_key_rp->data[page_key_rx];
- page_set_storage_key((unsigned long) address, key & 0x7f, 0);
- page = virt_to_page(address);
- if (key & 0x80)
- arch_set_page_nodat(page, 0);
- else
- arch_set_page_dat(page, 0);
- if (++page_key_rx >= PAGE_KEY_DATA_SIZE)
- return;
- page_key_rp = page_key_rp->next;
- page_key_rx = 0;
-}
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin));
- unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end));
- unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1;
- unsigned long stext_pfn = PFN_DOWN(__pa(_stext));
-
- /* Always save lowcore pages (LC protection might be enabled). */
- if (pfn <= LC_PAGES)
- return 0;
- if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn)
- return 1;
- /* Skip memory holes and read-only pages (DCSS, ...). */
- if (pfn >= stext_pfn && pfn <= end_rodata_pfn)
- return 0;
- if (tprot(PFN_PHYS(pfn)))
- return 1;
- return 0;
-}
-
-/*
- * PM notifier callback for suspend
- */
-static int suspend_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER);
- if (!suspend_zero_pages)
- return NOTIFY_BAD;
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- free_pages(suspend_zero_pages, LC_ORDER);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init suspend_pm_init(void)
-{
- pm_notifier(suspend_pm_cb, 0);
- return 0;
-}
-arch_initcall(suspend_pm_init);
-
-void save_processor_state(void)
-{
- /* swsusp_arch_suspend() actually saves all cpu register contents.
- * Machine checks must be disabled since swsusp_arch_suspend() stores
- * register contents to their lowcore save areas. That's the same
- * place where register contents on machine checks would be saved.
- * To avoid register corruption disable machine checks.
- * We must also disable machine checks in the new psw mask for
- * program checks, since swsusp_arch_suspend() may generate program
- * checks. Disabling machine checks for all other new psw masks is
- * just paranoia.
- */
- local_mcck_disable();
- /* Disable lowcore protection */
- __ctl_clear_bit(0,28);
- S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK;
-}
-
-void restore_processor_state(void)
-{
- S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK;
- /* Enable lowcore protection */
- __ctl_set_bit(0,28);
- local_mcck_enable();
-}
-
-/* Called at the end of swsusp_arch_resume */
-void s390_early_resume(void)
-{
- lgr_info_log();
- channel_subsystem_reinit();
- zpci_rescan();
-}
diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S
deleted file mode 100644
index a7baf0b5f818..000000000000
--- a/arch/s390/kernel/swsusp.S
+++ /dev/null
@@ -1,276 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 64-bit swsusp implementation
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/sigp.h>
-
-/*
- * Save register context in absolute 0 lowcore and call swsusp_save() to
- * create in-memory kernel image. The context is saved in the designated
- * "store status" memory locations (see POP).
- * We return from this function twice. The first time during the suspend to
- * disk process. The second time via the swsusp_arch_resume() function
- * (see below) in the resume process.
- * This function runs with disabled interrupts.
- */
- GEN_BR_THUNK %r14
-
- .section .text
-ENTRY(swsusp_arch_suspend)
- lg %r1,__LC_NODAT_STACK
- stmg %r6,%r15,__SF_GPRS(%r1)
- aghi %r1,-STACK_FRAME_OVERHEAD
- stg %r15,__SF_BACKCHAIN(%r1)
- lgr %r15,%r1
-
- /* Store FPU registers */
- brasl %r14,save_fpu_regs
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Store prefix register on stack */
- stpx __SF_EMPTY(%r15)
-
- /* Save prefix register contents for lowcore copy */
- llgf %r10,__SF_EMPTY(%r15)
-
- /* Get pointer to save area */
- lghi %r1,0x1000
-
- /* Save CPU address */
- stap __LC_EXT_CPU_ADDR(%r0)
-
- /* Store registers */
- mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */
- stam %a0,%a15,0x340(%r1) /* store access registers */
- stctg %c0,%c15,0x380(%r1) /* store control registers */
- stmg %r0,%r15,0x280(%r1) /* store general registers */
-
- stpt 0x328(%r1) /* store timer */
- stck __SF_EMPTY(%r15) /* store clock */
- stckc 0x330(%r1) /* store clock comparator */
-
- /* Update cputime accounting before going to sleep */
- lg %r0,__LC_LAST_UPDATE_TIMER
- slg %r0,0x328(%r1)
- alg %r0,__LC_SYSTEM_TIMER
- stg %r0,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1)
- lg %r0,__LC_LAST_UPDATE_CLOCK
- slg %r0,__SF_EMPTY(%r15)
- alg %r0,__LC_STEAL_TIMER
- stg %r0,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Save absolute zero pages */
- larl %r2,suspend_zero_pages
- lg %r2,0(%r2)
- lghi %r4,0
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Copy lowcore to absolute zero lowcore */
- lghi %r2,0
- lgr %r4,%r10
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Save image */
- brasl %r14,swsusp_save
-
- /* Restore prefix register and return */
- lghi %r1,0x1000
- spx 0x318(%r1)
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_suspend)
-
-/*
- * Restore saved memory image to correct place and restore register context.
- * Then we return to the function that called swsusp_arch_suspend().
- * swsusp_arch_resume() runs with disabled interrupts.
- */
-ENTRY(swsusp_arch_resume)
- stmg %r6,%r15,__SF_GPRS(%r15)
- lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
- stg %r1,__SF_BACKCHAIN(%r15)
-
- /* Make all free pages stable */
- lghi %r2,1
- brasl %r14,arch_set_page_states
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Restore saved image */
- larl %r1,restore_pblist
- lg %r1,0(%r1)
- ltgr %r1,%r1
- jz 2f
-0:
- lg %r2,8(%r1)
- lg %r4,0(%r1)
- iske %r0,%r4
- lghi %r3,PAGE_SIZE
- lghi %r5,PAGE_SIZE
-1:
- mvcle %r2,%r4,0
- jo 1b
- lg %r2,8(%r1)
- sske %r0,%r2
- lg %r1,16(%r1)
- ltgr %r1,%r1
- jnz 0b
-2:
- ptlb /* flush tlb */
-
- /* Reset System */
- larl %r1,.Lnew_pgm_check_psw
- epsw %r2,%r3
- stm %r2,%r3,0(%r1)
- mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1)
- larl %r1,__swsusp_reset_dma
- lg %r1,0(%r1)
- BASR_EX %r14,%r1
- larl %r1,smp_cpu_mt_shift
- icm %r1,15,0(%r1)
- jz smt_done
- llgfr %r1,%r1
-smt_loop:
- sigp %r1,%r0,SIGP_SET_MULTI_THREADING
- brc 8,smt_done /* accepted */
- brc 2,smt_loop /* busy, try again */
-smt_done:
- larl %r1,.Lnew_pgm_check_psw
- lpswe 0(%r1)
-pgm_check_entry:
-
- /* Switch to original suspend CPU */
- larl %r1,.Lresume_cpu /* Resume CPU address: r2 */
- stap 0(%r1)
- llgh %r2,0(%r1)
- llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */
- cgr %r1,%r2
- je restore_registers /* r1 = r2 -> nothing to do */
- larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */
- mvc __LC_RST_NEW_PSW(16,%r0),0(%r4)
-3:
- sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */
- brc 8,4f /* accepted */
- brc 2,3b /* busy, try again */
-
- /* Suspend CPU not available -> panic */
- larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD
- larl %r2,.Lpanic_string
- brasl %r14,sclp_early_printk_force
- larl %r3,.Ldisabled_wait_31
- lpsw 0(%r3)
-4:
- /* Switch to suspend CPU */
- sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */
- brc 2,4b /* busy, try again */
-5:
- sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */
- brc 2,5b /* busy, try again */
-6: j 6b
-
-restart_suspend:
- larl %r1,.Lresume_cpu
- llgh %r2,0(%r1)
-7:
- sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */
- brc 8,7b /* accepted, status 0, still running */
- brc 2,7b /* busy, try again */
- tmll %r9,0x40 /* Test if resume CPU is stopped */
- jz 7b
-
-restore_registers:
- /* Restore registers */
- lghi %r13,0x1000 /* %r1 = pointer to save area */
-
- /* Ignore time spent in suspended state. */
- llgf %r1,0x318(%r13)
- stck __LC_LAST_UPDATE_CLOCK(%r1)
- spt 0x328(%r13) /* reprogram timer */
- //sckc 0x330(%r13) /* set clock comparator */
-
- lctlg %c0,%c15,0x380(%r13) /* load control registers */
- lam %a0,%a15,0x340(%r13) /* load access registers */
-
- /* Load old stack */
- lg %r15,0x2f8(%r13)
-
- /* Save prefix register */
- mvc __SF_EMPTY(4,%r15),0x318(%r13)
-
- /* Restore absolute zero pages */
- lghi %r2,0
- larl %r4,suspend_zero_pages
- lg %r4,0(%r4)
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Restore prefix register */
- spx __SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Make all free pages unstable */
- lghi %r2,0
- brasl %r14,arch_set_page_states
-
- /* Call arch specific early resume code */
- brasl %r14,s390_early_resume
-
- /* Return 0 */
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_resume)
-
- .section .data..nosave,"aw",@progbits
- .align 8
-.Ldisabled_wait_31:
- .long 0x000a0000,0x00000000
-.Lpanic_string:
- .asciz "Resume not possible because suspend CPU is no longer available\n"
- .align 8
-.Lrestart_suspend_psw:
- .quad 0x0000000180000000,restart_suspend
-.Lnew_pgm_check_psw:
- .quad 0,pgm_check_entry
-.Lresume_cpu:
- .byte 0,0
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/syscall.c
index 202fa73ac167..dc2355c623d6 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/syscall.c
@@ -29,6 +29,13 @@
#include <linux/unistd.h>
#include <linux/ipc.h>
#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/thread_info.h>
+#include <linux/entry-common.h>
+
+#include <asm/ptrace.h>
+#include <asm/vtime.h>
+
#include "entry.h"
/*
@@ -100,3 +107,64 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
+
+/*
+ * Dispatch the system call described by @regs.
+ *
+ * The syscall number is taken from the low 16 bits of regs->int_code. If
+ * those are zero, it is taken from the low 16 bits of gpr1 and written back
+ * into int_code. The result of the call is stored in gpr2.
+ */
+static void do_syscall(struct pt_regs *regs)
+{
+ unsigned long nr;
+
+ nr = regs->int_code & 0xffff;
+ if (!nr) {
+ nr = regs->gprs[1] & 0xffff;
+ regs->int_code &= ~0xffffUL;
+ regs->int_code |= nr;
+ }
+
+ /* The ptrace ABI exposes the syscall number in gpr2, see below */
+ regs->gprs[2] = nr;
+
+ /*
+ * restart_syscall with bit 0 of arch_data clear: continue at the PSW
+ * address kept in arch_data and set arch_data to 1 so that this is
+ * done only once.
+ */
+ if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
+ regs->psw.addr = current->restart_block.arch_data;
+ current->restart_block.arch_data = 1;
+ }
+ /* The entry work may hand back a different syscall number */
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /*
+ * In the s390 ptrace ABI, both the syscall number and the return value
+ * use gpr2. However, userspace puts the syscall number either in the
+ * svc instruction itself, or uses gpr1. To make at least skipping syscalls
+ * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
+ * and if set, the syscall will be skipped.
+ */
+
+ if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+ goto out;
+ regs->gprs[2] = -ENOSYS;
+ /*
+ * nr is unsigned, so every value outside of the table, including a
+ * negative one, is rejected here with -ENOSYS. An out of range number
+ * is the exceptional case, therefore unlikely().
+ */
+ if (unlikely(nr >= NR_syscalls))
+ goto out;
+ /* Call the handler again for as long as it requests a PGSTE restart */
+ do {
+ regs->gprs[2] = current->thread.sys_call_table[nr](regs);
+ } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
+out:
+ syscall_exit_to_user_mode_work(regs);
+}
+
+/*
+ * C level entry point for system calls.
+ *
+ * @regs:     register set of the interrupted context
+ * @per_trap: if non-zero, TIF_PER_TRAP is set for the current thread
+ *
+ * Fills in the parts of @regs that come from the lowcore, enables
+ * interrupts, runs the system call via do_syscall() and finally calls
+ * exit_to_user_mode().
+ */
+void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
+{
+ add_random_kstack_offset();
+ enter_from_user_mode(regs);
+ /* Take over the SVC old PSW and interruption code from the lowcore */
+ regs->psw = S390_lowcore.svc_old_psw;
+ regs->int_code = S390_lowcore.svc_int_code;
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+
+ local_irq_enable();
+ /* Keep the original gpr2; do_syscall() overwrites gprs[2] */
+ regs->orig_gpr2 = regs->gprs[2];
+
+ if (per_trap)
+ set_thread_flag(TIF_PER_TRAP);
+
+ /* Start with clean flags, then mark @regs as a system call */
+ regs->flags = 0;
+ set_pt_regs_flag(regs, PIF_SYSCALL);
+ do_syscall(regs);
+ exit_to_user_mode();
+}
diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile
index b98f25029b8e..fb85e797946d 100644
--- a/arch/s390/kernel/syscalls/Makefile
+++ b/arch/s390/kernel/syscalls/Makefile
@@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y)
# Create output directory if not already present
-_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \
- $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+$(shell mkdir -p $(uapi) $(kapi))
filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $<
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index bd7bd3581a0f..799147658dee 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -26,7 +26,7 @@
16 32 lchown - sys_lchown16
19 common lseek sys_lseek compat_sys_lseek
20 common getpid sys_getpid sys_getpid
-21 common mount sys_mount compat_sys_mount
+21 common mount sys_mount sys_mount
22 common umount sys_oldumount sys_oldumount
23 32 setuid - sys_setuid16
24 32 getuid - sys_getuid16
@@ -122,7 +122,7 @@
131 common quotactl sys_quotactl sys_quotactl
132 common getpgid sys_getpgid sys_getpgid
133 common fchdir sys_fchdir sys_fchdir
-134 common bdflush sys_bdflush sys_bdflush
+134 common bdflush sys_ni_syscall sys_ni_syscall
135 common sysfs sys_sysfs sys_sysfs
136 common personality sys_s390_personality sys_s390_personality
137 common afs_syscall - -
@@ -134,11 +134,11 @@
142 64 select sys_select -
143 common flock sys_flock sys_flock
144 common msync sys_msync sys_msync
-145 common readv sys_readv compat_sys_readv
-146 common writev sys_writev compat_sys_writev
+145 common readv sys_readv sys_readv
+146 common writev sys_writev sys_writev
147 common getsid sys_getsid sys_getsid
148 common fdatasync sys_fdatasync sys_fdatasync
-149 common _sysctl sys_sysctl compat_sys_sysctl
+149 common _sysctl - -
150 common mlock sys_mlock sys_mlock
151 common munlock sys_munlock sys_munlock
152 common mlockall sys_mlockall sys_mlockall
@@ -274,9 +274,9 @@
265 common statfs64 sys_statfs64 compat_sys_statfs64
266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages
-268 common mbind sys_mbind compat_sys_mbind
-269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+268 common mbind sys_mbind sys_mbind
+269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy
+270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy
271 common mq_open sys_mq_open compat_sys_mq_open
272 common mq_unlink sys_mq_unlink sys_mq_unlink
273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32
@@ -293,7 +293,7 @@
284 common inotify_init sys_inotify_init sys_inotify_init
285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch
286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch
-287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages
+287 common migrate_pages sys_migrate_pages sys_migrate_pages
288 common openat sys_openat compat_sys_openat
289 common mkdirat sys_mkdirat sys_mkdirat
290 common mknodat sys_mknodat sys_mknodat
@@ -316,8 +316,8 @@
306 common splice sys_splice sys_splice
307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range
308 common tee sys_tee sys_tee
-309 common vmsplice sys_vmsplice compat_sys_vmsplice
-310 common move_pages sys_move_pages compat_sys_move_pages
+309 common vmsplice sys_vmsplice sys_vmsplice
+310 common move_pages sys_move_pages sys_move_pages
311 common getcpu sys_getcpu sys_getcpu
312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
313 common utimes sys_utimes sys_utimes_time32
@@ -347,8 +347,8 @@
337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32
338 common syncfs sys_syncfs sys_syncfs
339 common setns sys_setns sys_setns
-340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
-341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
+340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv
+341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev
342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr
343 common kcmp sys_kcmp sys_kcmp
344 common finit_module sys_finit_module sys_finit_module
@@ -372,8 +372,8 @@
362 common connect sys_connect sys_connect
363 common listen sys_listen sys_listen
364 common accept4 sys_accept4 sys_accept4
-365 common getsockopt sys_getsockopt compat_sys_getsockopt
-366 common setsockopt sys_setsockopt compat_sys_setsockopt
+365 common getsockopt sys_getsockopt sys_getsockopt
+366 common setsockopt sys_setsockopt sys_setsockopt
367 common getsockname sys_getsockname sys_getsockname
368 common getpeername sys_getpeername sys_getpeername
369 common sendto sys_sendto sys_sendto
@@ -438,5 +438,18 @@
433 common fspick sys_fspick sys_fspick
434 common pidfd_open sys_pidfd_open sys_pidfd_open
435 common clone3 sys_clone3 sys_clone3
+436 common close_range sys_close_range sys_close_range
437 common openat2 sys_openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448 common process_mrelease sys_process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 2ac3c9b56a13..b5e364358ce4 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -14,6 +14,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/slab.h>
+#include <asm/asm-extable.h>
#include <asm/ebcdic.h>
#include <asm/debug.h>
#include <asm/sysinfo.h>
@@ -25,19 +26,22 @@ int topology_max_mnest;
+/*
+ * Execute STSI with the block at @sysinfo as operand.
+ *
+ * General register 0 is loaded with (@fc << 28) | @sel1 and general register
+ * 1 with @sel2 by explicit "lr" instructions inside the asm. That is why
+ * registers 0 and 1 are listed as clobbered instead of being bound to
+ * register variables.
+ *
+ * Returns 0 if the instruction completed with condition code 0. If the
+ * condition code is non-zero, or if the exception table entry is taken, the
+ * result is -EOPNOTSUPP. *@lvl is set to the upper four bits of the value
+ * found in general register 0 afterwards.
+ */
static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl)
{
- register int r0 asm("0") = (fc << 28) | sel1;
- register int r1 asm("1") = sel2;
+ int r0 = (fc << 28) | sel1;
int rc = 0;
asm volatile(
- " stsi 0(%3)\n"
+ " lr 0,%[r0]\n"
+ " lr 1,%[r1]\n"
+ " stsi 0(%[sysinfo])\n"
"0: jz 2f\n"
- "1: lhi %1,%4\n"
- "2:\n"
+ "1: lhi %[rc],%[retval]\n"
+ "2: lr %[r0],0\n"
EX_TABLE(0b, 1b)
- : "+d" (r0), "+d" (rc)
- : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP)
- : "cc", "memory");
+ : [r0] "+d" (r0), [rc] "+d" (rc)
+ : [r1] "d" (sel2),
+ [sysinfo] "a" (sysinfo),
+ [retval] "K" (-EOPNOTSUPP)
+ : "cc", "0", "1", "memory");
*lvl = ((unsigned int) r0) >> 28;
return rc;
}
diff --git a/arch/s390/boot/text_dma.S b/arch/s390/kernel/text_amode31.S
index 9715715c4c28..2c8b14cc5556 100644
--- a/arch/s390/boot/text_dma.S
+++ b/arch/s390/kernel/text_amode31.S
@@ -6,38 +6,28 @@
*/
#include <linux/linkage.h>
+#include <asm/asm-extable.h>
#include <asm/errno.h>
#include <asm/sigp.h>
-#ifdef CC_USING_EXPOLINE
- .pushsection .dma.text.__s390_indirect_jump_r14,"axG"
-__dma__s390_indirect_jump_r14:
- larl %r1,0f
- ex 0,0(%r1)
- j .
-0: br %r14
- .popsection
-#endif
-
- .section .dma.text,"ax"
+ .section .amode31.text,"ax"
/*
* Simplified version of expoline thunk. The normal thunks can not be used here,
* because they might be more than 2 GB away, and not reachable by the relative
* branch. No comdat, exrl, etc. optimizations used here, because it only
* affects a few functions that are not performance-relevant.
*/
- .macro BR_EX_DMA_r14
-#ifdef CC_USING_EXPOLINE
- jg __dma__s390_indirect_jump_r14
-#else
- br %r14
-#endif
+ .macro BR_EX_AMODE31_r14
+ larl %r1,0f
+ ex 0,0(%r1)
+ j .
+0: br %r14
.endm
/*
- * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode)
+ * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode)
*/
-ENTRY(_diag14_dma)
+ENTRY(_diag14_amode31)
lgr %r1,%r2
lgr %r2,%r3
lgr %r3,%r4
@@ -50,14 +40,14 @@ ENTRY(_diag14_dma)
.Ldiag14_fault:
sam64
lgfr %r2,%r5
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault)
-ENDPROC(_diag14_dma)
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault)
+ENDPROC(_diag14_amode31)
/*
- * int _diag210_dma(struct diag210 *addr)
+ * int _diag210_amode31(struct diag210 *addr)
*/
-ENTRY(_diag210_dma)
+ENTRY(_diag210_amode31)
lgr %r1,%r2
lhi %r2,-1
sam31
@@ -68,57 +58,40 @@ ENTRY(_diag210_dma)
.Ldiag210_fault:
sam64
lgfr %r2,%r2
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault)
-ENDPROC(_diag210_dma)
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault)
+ENDPROC(_diag210_amode31)
/*
- * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode)
+ * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode)
*/
-ENTRY(_diag26c_dma)
+ENTRY(_diag26c_amode31)
lghi %r5,-EOPNOTSUPP
sam31
diag %r2,%r4,0x26c
.Ldiag26c_ex:
sam64
lgfr %r2,%r5
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex)
-ENDPROC(_diag26c_dma)
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex)
+ENDPROC(_diag26c_amode31)
/*
- * void _diag0c_dma(struct hypfs_diag0c_entry *entry)
+ * void _diag0c_amode31(struct hypfs_diag0c_entry *entry)
*/
-ENTRY(_diag0c_dma)
+ENTRY(_diag0c_amode31)
sam31
diag %r2,%r2,0x0c
sam64
- BR_EX_DMA_r14
-ENDPROC(_diag0c_dma)
-
-/*
- * void _swsusp_reset_dma(void)
- */
-ENTRY(_swsusp_reset_dma)
- larl %r1,restart_entry
- larl %r2,.Lrestart_diag308_psw
- og %r1,0(%r2)
- stg %r1,0(%r0)
- lghi %r0,0
- diag %r0,%r0,0x308
-restart_entry:
- lhi %r1,1
- sigp %r1,%r0,SIGP_SET_ARCHITECTURE
- sam64
- BR_EX_DMA_r14
-ENDPROC(_swsusp_reset_dma)
+ BR_EX_AMODE31_r14
+ENDPROC(_diag0c_amode31)
/*
- * void _diag308_reset_dma(void)
+ * void _diag308_reset_amode31(void)
*
* Calls diag 308 subcode 1 and continues execution
*/
-ENTRY(_diag308_reset_dma)
+ENTRY(_diag308_reset_amode31)
larl %r4,.Lctlregs # Save control registers
stctg %c0,%c15,0(%r4)
lg %r2,0(%r4) # Disable lowcore protection
@@ -135,7 +108,7 @@ ENTRY(_diag308_reset_dma)
larl %r4,.Lcontinue_psw # Save PSW flags
epsw %r2,%r3
stm %r2,%r3,0(%r4)
- larl %r4,restart_part2 # Setup restart PSW at absolute 0
+ larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0
larl %r3,.Lrestart_diag308_psw
og %r4,0(%r3) # Save PSW
lghi %r3,0
@@ -143,7 +116,7 @@ ENTRY(_diag308_reset_dma)
lghi %r1,1
lghi %r0,0
diag %r0,%r1,0x308
-restart_part2:
+.Lrestart_part2:
lhi %r0,0 # Load r0 with zero
lhi %r1,2 # Use mode 2 = ESAME (dump)
sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode
@@ -155,19 +128,21 @@ restart_part2:
larl %r4,.Lprefix # Restore prefix register
spx 0(%r4)
larl %r4,.Lcontinue_psw # Restore PSW flags
+ larl %r2,.Lcontinue
+ stg %r2,8(%r4)
lpswe 0(%r4)
.Lcontinue:
- BR_EX_DMA_r14
-ENDPROC(_diag308_reset_dma)
+ BR_EX_AMODE31_r14
+ENDPROC(_diag308_reset_amode31)
- .section .dma.data,"aw",@progbits
+ .section .amode31.data,"aw",@progbits
.align 8
.Lrestart_diag308_psw:
.long 0x00080000,0x80000000
.align 8
.Lcontinue_psw:
- .quad 0,.Lcontinue
+ .quad 0,0
.align 8
.Lctlreg0:
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index f9d070d016e3..6b7b6d5e3632 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -41,6 +41,9 @@
#include <linux/gfp.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
+#include <vdso/vsyscall.h>
+#include <vdso/clocksource.h>
+#include <vdso/helpers.h>
#include <asm/facility.h>
#include <asm/delay.h>
#include <asm/div64.h>
@@ -52,11 +55,7 @@
#include <asm/cio.h>
#include "entry.h"
-unsigned char tod_clock_base[16] __aligned(8) = {
- /* Force to data section. */
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
+union tod_clock tod_clock_base __section(".data");
EXPORT_SYMBOL_GPL(tod_clock_base);
u64 clock_comparator_max = -1ULL;
@@ -69,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier);
unsigned char ptff_function_mask[16];
-static unsigned long long lpar_offset;
-static unsigned long long initial_leap_seconds;
-static unsigned long long tod_steering_end;
-static long long tod_steering_delta;
+static unsigned long lpar_offset;
+static unsigned long initial_leap_seconds;
+static unsigned long tod_steering_end;
+static long tod_steering_delta;
/*
* Get time offsets with PTFF
@@ -81,10 +80,12 @@ void __init time_early_init(void)
{
struct ptff_qto qto;
struct ptff_qui qui;
+ int cs;
/* Initialize TOD steering parameters */
- tod_steering_end = *(unsigned long long *) &tod_clock_base[1];
- vdso_data->ts_end = tod_steering_end;
+ tod_steering_end = tod_clock_base.tod;
+ for (cs = 0; cs < CS_BASES; cs++)
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
if (!test_facility(28))
return;
@@ -97,7 +98,7 @@ void __init time_early_init(void)
/* get initial leap seconds */
if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0)
- initial_leap_seconds = (unsigned long long)
+ initial_leap_seconds = (unsigned long)
((long) qui.old_leap * 4096000000L);
}
@@ -110,18 +111,13 @@ unsigned long long notrace sched_clock(void)
}
NOKPROBE_SYMBOL(sched_clock);
-static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt)
+static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt)
{
- unsigned long long high, low, rem, sec, nsec;
+ unsigned long rem, sec, nsec;
- /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */
- high = (*(unsigned long long *) clk) >> 4;
- low = (*(unsigned long long *)&clk[7]) << 4;
- /* Calculate seconds and nano-seconds */
- sec = high;
+ sec = clk->us;
rem = do_div(sec, 1000000);
- nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32;
-
+ nsec = ((clk->sus + (rem << 12)) * 125) >> 9;
xt->tv_sec = sec;
xt->tv_nsec = nsec;
}
@@ -201,30 +197,26 @@ static void stp_reset(void);
void read_persistent_clock64(struct timespec64 *ts)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- get_tod_clock_ext(clk);
- *(__u64 *) &clk[1] -= delta;
- if (*(__u64 *) &clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, ts);
+ store_tod_clock_ext(&clk);
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, ts);
}
void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
struct timespec64 *boot_offset)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
struct timespec64 boot_time;
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE);
- *(__u64 *)&clk[1] -= delta;
- if (*(__u64 *)&clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, &boot_time);
+ clk = tod_clock_base;
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, &boot_time);
read_persistent_clock64(wall_time);
*boot_offset = timespec64_sub(*wall_time, boot_time);
@@ -232,12 +224,12 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
static u64 read_tod_clock(struct clocksource *cs)
{
- unsigned long long now, adj;
+ unsigned long now, adj;
preempt_disable(); /* protect from changes to steering parameters */
now = get_tod_clock();
adj = tod_steering_end - now;
- if (unlikely((s64) adj >= 0))
+ if (unlikely((s64) adj > 0))
/*
* manually steer by 1 cycle every 2^16 cycles. This
* corresponds to shifting the tod delta by 15. 1s is
@@ -253,10 +245,11 @@ static struct clocksource clocksource_tod = {
.name = "tod",
.rating = 400,
.read = read_tod_clock,
- .mask = -1ULL,
+ .mask = CLOCKSOURCE_MASK(64),
.mult = 1000,
.shift = 12,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TOD,
};
struct clocksource * __init clocksource_default_clock(void)
@@ -264,55 +257,6 @@ struct clocksource * __init clocksource_default_clock(void)
return &clocksource_tod;
}
-void update_vsyscall(struct timekeeper *tk)
-{
- u64 nsecps;
-
- if (tk->tkr_mono.clock != &clocksource_tod)
- return;
-
- /* Make userspace gettimeofday spin until we're done. */
- ++vdso_data->tb_update_count;
- smp_wmb();
- vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
- vdso_data->xtime_clock_sec = tk->xtime_sec;
- vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
- vdso_data->wtom_clock_sec =
- tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
- + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
- nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
- while (vdso_data->wtom_clock_nsec >= nsecps) {
- vdso_data->wtom_clock_nsec -= nsecps;
- vdso_data->wtom_clock_sec++;
- }
-
- vdso_data->xtime_coarse_sec = tk->xtime_sec;
- vdso_data->xtime_coarse_nsec =
- (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
- vdso_data->wtom_coarse_sec =
- vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_coarse_nsec =
- vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
- while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) {
- vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC;
- vdso_data->wtom_coarse_sec++;
- }
-
- vdso_data->tk_mult = tk->tkr_mono.mult;
- vdso_data->tk_shift = tk->tkr_mono.shift;
- smp_wmb();
- ++vdso_data->tb_update_count;
-}
-
-extern struct timezone sys_tz;
-
-void update_vsyscall_tz(void)
-{
- vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
- vdso_data->tz_dsttime = sys_tz.tz_dsttime;
-}
-
/*
* Initialize the TOD clock and the CPU timer of
* the boot cpu.
@@ -341,11 +285,12 @@ void __init time_init(void)
}
static DEFINE_PER_CPU(atomic_t, clock_sync_word);
-static DEFINE_MUTEX(clock_sync_mutex);
+static DEFINE_MUTEX(stp_mutex);
static unsigned long clock_sync_flags;
-#define CLOCK_SYNC_HAS_STP 0
-#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_HAS_STP 0
+#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_STPINFO_VALID 2
/*
* The get_clock function for the physical clock. It will get the current
@@ -419,18 +364,15 @@ static inline int check_sync_clock(void)
* Apply clock delta to the global data structures.
* This is called once on the CPU that performed the clock sync.
*/
-static void clock_sync_global(unsigned long long delta)
+static void clock_sync_global(long delta)
{
unsigned long now, adj;
struct ptff_qto qto;
+ int cs;
/* Fixup the monotonic sched clock. */
- *(unsigned long long *) &tod_clock_base[1] += delta;
- if (*(unsigned long long *) &tod_clock_base[1] < delta)
- /* Epoch overflow */
- tod_clock_base[0]++;
+ tod_clock_base.eitod += delta;
/* Adjust TOD steering parameters. */
- vdso_data->tb_update_count++;
now = get_tod_clock();
adj = tod_steering_end - now;
if (unlikely((s64) adj >= 0))
@@ -439,12 +381,14 @@ static void clock_sync_global(unsigned long long delta)
-(adj >> 15) : (adj >> 15);
tod_steering_delta += delta;
if ((abs(tod_steering_delta) >> 48) != 0)
- panic("TOD clock sync offset %lli is too large to drift\n",
+ panic("TOD clock sync offset %li is too large to drift\n",
tod_steering_delta);
tod_steering_end = now + (abs(tod_steering_delta) << 15);
- vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1;
- vdso_data->ts_end = tod_steering_end;
- vdso_data->tb_update_count++;
+ for (cs = 0; cs < CS_BASES; cs++) {
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
+ vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta;
+ }
+
/* Update LPAR offset. */
if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
lpar_offset = qto.tod_epoch_difference;
@@ -456,7 +400,7 @@ static void clock_sync_global(unsigned long long delta)
* Apply clock delta to the per-CPU data structures of this CPU.
* This is called for each online CPU after the call to clock_sync_global.
*/
-static void clock_sync_local(unsigned long long delta)
+static void clock_sync_local(long delta)
{
/* Add the delta to the clock comparator. */
if (S390_lowcore.clock_comparator != clock_comparator_max) {
@@ -480,7 +424,7 @@ static void __init time_init_wq(void)
struct clock_sync_data {
atomic_t cpus;
int in_sync;
- unsigned long long clock_delta;
+ long clock_delta;
};
/*
@@ -491,7 +435,6 @@ static struct stp_sstpi stp_info;
static void *stp_page;
static void stp_work_fn(struct work_struct *work);
-static DEFINE_MUTEX(stp_work_mutex);
static DECLARE_WORK(stp_work, stp_work_fn);
static struct timer_list stp_timer;
@@ -582,10 +525,26 @@ void stp_queue_work(void)
queue_work(time_sync_wq, &stp_work);
}
+static int __store_stpinfo(void)
+{
+ int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+
+ if (rc)
+ clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ else
+ set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ return rc;
+}
+
+static int stpinfo_valid(void)
+{
+ return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+}
+
static int stp_sync_clock(void *data)
{
struct clock_sync_data *sync = data;
- unsigned long long clock_delta;
+ long clock_delta, flags;
static int first;
int rc;
@@ -595,19 +554,18 @@ static int stp_sync_clock(void *data)
while (atomic_read(&sync->cpus) != 0)
cpu_relax();
rc = 0;
- if (stp_info.todoff[0] || stp_info.todoff[1] ||
- stp_info.todoff[2] || stp_info.todoff[3] ||
- stp_info.tmd != 2) {
+ if (stp_info.todoff || stp_info.tmd != 2) {
+ flags = vdso_update_begin();
rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0,
&clock_delta);
if (rc == 0) {
sync->clock_delta = clock_delta;
clock_sync_global(clock_delta);
- rc = chsc_sstpi(stp_page, &stp_info,
- sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc == 0 && stp_info.tmd != 2)
rc = -EAGAIN;
}
+ vdso_update_end(flags);
}
sync->in_sync = rc ? -EAGAIN : 1;
xchg(&first, 0);
@@ -627,6 +585,81 @@ static int stp_sync_clock(void *data)
return 0;
}
+static int stp_clear_leap(void)
+{
+ struct __kernel_timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(txc));
+
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return ret;
+
+ txc.modes = ADJ_STATUS;
+ txc.status &= ~(STA_INS|STA_DEL);
+ return do_adjtimex(&txc);
+}
+
+static void stp_check_leap(void)
+{
+ struct stp_stzi stzi;
+ struct stp_lsoib *lsoib = &stzi.lsoib;
+ struct __kernel_timex txc;
+ int64_t timediff;
+ int leapdiff, ret;
+
+ if (!stp_info.lu || !check_sync_clock()) {
+ /*
+ * Either a scheduled leap second was removed by the operator,
+ * or STP is out of sync. In both cases, clear the leap second
+ * kernel flags.
+ */
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ return;
+ }
+
+ if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) {
+ pr_err("stzi failed\n");
+ return;
+ }
+
+ timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC;
+ leapdiff = lsoib->nlso - lsoib->also;
+
+ if (leapdiff != 1 && leapdiff != -1) {
+ pr_err("Cannot schedule %d leap seconds\n", leapdiff);
+ return;
+ }
+
+ if (timediff < 0) {
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ } else if (timediff < 7200) {
+ memset(&txc, 0, sizeof(txc));
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return;
+
+ txc.modes = ADJ_STATUS;
+ if (leapdiff > 0)
+ txc.status |= STA_INS;
+ else
+ txc.status |= STA_DEL;
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ pr_err("failed to set leap second flags\n");
+ /* arm Timer to clear leap second flags */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC));
+ } else {
+ /* The day the leap second is scheduled for hasn't been reached. Retry
+ * in one hour.
+ */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC));
+ }
+}
+
/*
* STP work. Check for the STP state and take over the clock
* synchronization if the STP clock source is usable.
@@ -637,7 +670,7 @@ static void stp_work_fn(struct work_struct *work)
int rc;
/* prevent multiple execution. */
- mutex_lock(&stp_work_mutex);
+ mutex_lock(&stp_mutex);
if (!stp_online) {
chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL);
@@ -645,33 +678,34 @@ static void stp_work_fn(struct work_struct *work)
goto out_unlock;
}
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL);
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL);
if (rc)
goto out_unlock;
- rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc || stp_info.c == 0)
goto out_unlock;
/* Skip synchronization if the clock is already in sync. */
- if (check_sync_clock())
- goto out_unlock;
-
- memset(&stp_sync, 0, sizeof(stp_sync));
- cpus_read_lock();
- atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
- stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
- cpus_read_unlock();
+ if (!check_sync_clock()) {
+ memset(&stp_sync, 0, sizeof(stp_sync));
+ cpus_read_lock();
+ atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
+ stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
+ cpus_read_unlock();
+ }
if (!check_sync_clock())
/*
* There is a usable clock but the synchonization failed.
* Retry after a second.
*/
- mod_timer(&stp_timer, jiffies + HZ);
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC));
+ else if (stp_info.lu)
+ stp_check_leap();
out_unlock:
- mutex_unlock(&stp_work_mutex);
+ mutex_unlock(&stp_mutex);
}
/*
@@ -682,115 +716,178 @@ static struct bus_type stp_subsys = {
.dev_name = "stp",
};
-static ssize_t stp_ctn_id_show(struct device *dev,
+static ssize_t ctn_id_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%016llx\n",
- *(unsigned long long *) stp_info.ctnid);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%016lx\n",
+ *(unsigned long *) stp_info.ctnid);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
+static DEVICE_ATTR_RO(ctn_id);
-static ssize_t stp_ctn_type_show(struct device *dev,
+static ssize_t ctn_type_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.ctn);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.ctn);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
+static DEVICE_ATTR_RO(ctn_type);
-static ssize_t stp_dst_offset_show(struct device *dev,
+static ssize_t dst_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x2000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x2000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
+static DEVICE_ATTR_RO(dst_offset);
-static ssize_t stp_leap_seconds_show(struct device *dev,
+static ssize_t leap_seconds_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x8000))
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x8000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ mutex_unlock(&stp_mutex);
+ return ret;
+}
+
+static DEVICE_ATTR_RO(leap_seconds);
+
+static ssize_t leap_seconds_scheduled_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct stp_stzi stzi;
+ ssize_t ret;
+
+ mutex_lock(&stp_mutex);
+ if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) {
+ mutex_unlock(&stp_mutex);
return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ }
+
+ ret = chsc_stzi(stp_page, &stzi, sizeof(stzi));
+ mutex_unlock(&stp_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (!stzi.lsoib.p)
+ return sprintf(buf, "0,0\n");
+
+ return sprintf(buf, "%lu,%d\n",
+ tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC,
+ stzi.lsoib.nlso - stzi.lsoib.also);
}
-static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
+static DEVICE_ATTR_RO(leap_seconds_scheduled);
-static ssize_t stp_stratum_show(struct device *dev,
+static ssize_t stratum_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
+static DEVICE_ATTR_RO(stratum);
-static ssize_t stp_time_offset_show(struct device *dev,
+static ssize_t time_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x0800))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int) stp_info.tto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x0800))
+ ret = sprintf(buf, "%i\n", (int) stp_info.tto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
+static DEVICE_ATTR_RO(time_offset);
-static ssize_t stp_time_zone_offset_show(struct device *dev,
+static ssize_t time_zone_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x4000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x4000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_zone_offset, 0400,
- stp_time_zone_offset_show, NULL);
+static DEVICE_ATTR_RO(time_zone_offset);
-static ssize_t stp_timing_mode_show(struct device *dev,
+static ssize_t timing_mode_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tmd);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tmd);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
+static DEVICE_ATTR_RO(timing_mode);
-static ssize_t stp_timing_state_show(struct device *dev,
+static ssize_t timing_state_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tst);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tst);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
+static DEVICE_ATTR_RO(timing_state);
-static ssize_t stp_online_show(struct device *dev,
+static ssize_t online_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%i\n", stp_online);
}
-static ssize_t stp_online_store(struct device *dev,
+static ssize_t online_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -801,14 +898,14 @@ static ssize_t stp_online_store(struct device *dev,
return -EINVAL;
if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
return -EOPNOTSUPP;
- mutex_lock(&clock_sync_mutex);
+ mutex_lock(&stp_mutex);
stp_online = value;
if (stp_online)
set_bit(CLOCK_SYNC_STP, &clock_sync_flags);
else
clear_bit(CLOCK_SYNC_STP, &clock_sync_flags);
queue_work(time_sync_wq, &stp_work);
- mutex_unlock(&clock_sync_mutex);
+ mutex_unlock(&stp_mutex);
return count;
}
@@ -816,46 +913,27 @@ static ssize_t stp_online_store(struct device *dev,
* Can't use DEVICE_ATTR because the attribute should be named
* stp/online but dev_attr_online already exists in this file ..
*/
-static struct device_attribute dev_attr_stp_online = {
- .attr = { .name = "online", .mode = 0600 },
- .show = stp_online_show,
- .store = stp_online_store,
-};
-
-static struct device_attribute *stp_attributes[] = {
- &dev_attr_ctn_id,
- &dev_attr_ctn_type,
- &dev_attr_dst_offset,
- &dev_attr_leap_seconds,
- &dev_attr_stp_online,
- &dev_attr_stratum,
- &dev_attr_time_offset,
- &dev_attr_time_zone_offset,
- &dev_attr_timing_mode,
- &dev_attr_timing_state,
+static DEVICE_ATTR_RW(online);
+
+static struct attribute *stp_dev_attrs[] = {
+ &dev_attr_ctn_id.attr,
+ &dev_attr_ctn_type.attr,
+ &dev_attr_dst_offset.attr,
+ &dev_attr_leap_seconds.attr,
+ &dev_attr_online.attr,
+ &dev_attr_leap_seconds_scheduled.attr,
+ &dev_attr_stratum.attr,
+ &dev_attr_time_offset.attr,
+ &dev_attr_time_zone_offset.attr,
+ &dev_attr_timing_mode.attr,
+ &dev_attr_timing_state.attr,
NULL
};
+ATTRIBUTE_GROUPS(stp_dev);
static int __init stp_init_sysfs(void)
{
- struct device_attribute **attr;
- int rc;
-
- rc = subsys_system_register(&stp_subsys, NULL);
- if (rc)
- goto out;
- for (attr = stp_attributes; *attr; attr++) {
- rc = device_create_file(stp_subsys.dev_root, *attr);
- if (rc)
- goto out_unreg;
- }
- return 0;
-out_unreg:
- for (; attr >= stp_attributes; attr--)
- device_remove_file(stp_subsys.dev_root, *attr);
- bus_unregister(&stp_subsys);
-out:
- return rc;
+ return subsys_system_register(&stp_subsys, stp_dev_groups);
}
device_initcall(stp_init_sysfs);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3627953007ed..c6eecd4a5302 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2007, 2011
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "cpu"
@@ -26,7 +25,6 @@
#include <linux/nodemask.h>
#include <linux/node.h>
#include <asm/sysinfo.h>
-#include <asm/numa.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@@ -63,50 +61,55 @@ static struct mask_info drawer_info;
struct cpu_topology_s390 cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
-cpumask_t cpus_with_topology;
-
-static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
+static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu)
{
- cpumask_t mask;
+ static cpumask_t mask;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
switch (topology_mode) {
case TOPOLOGY_MODE_HW:
while (info) {
if (cpumask_test_cpu(cpu, &info->mask)) {
- mask = info->mask;
+ cpumask_copy(&mask, &info->mask);
break;
}
info = info->next;
}
- if (cpumask_empty(&mask))
- cpumask_copy(&mask, cpumask_of(cpu));
break;
case TOPOLOGY_MODE_PACKAGE:
cpumask_copy(&mask, cpu_present_mask);
break;
default:
- /* fallthrough */
+ fallthrough;
case TOPOLOGY_MODE_SINGLE:
- cpumask_copy(&mask, cpumask_of(cpu));
break;
}
- return mask;
+ cpumask_and(&mask, &mask, &cpu_setup_mask);
+out:
+ cpumask_copy(dst, &mask);
}
-static cpumask_t cpu_thread_map(unsigned int cpu)
+static void cpu_thread_map(cpumask_t *dst, unsigned int cpu)
{
- cpumask_t mask;
+ static cpumask_t mask;
int i;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
if (topology_mode != TOPOLOGY_MODE_HW)
- return mask;
+ goto out;
cpu -= cpu % (smp_cpu_mtid + 1);
- for (i = 0; i <= smp_cpu_mtid; i++)
- if (cpu_present(cpu + i))
+ for (i = 0; i <= smp_cpu_mtid; i++) {
+ if (cpumask_test_cpu(cpu + i, &cpu_setup_mask))
cpumask_set_cpu(cpu + i, &mask);
- return mask;
+ }
+out:
+ cpumask_copy(dst, &mask);
}
#define TOPOLOGY_CORE_BITS 64
@@ -138,7 +141,6 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
cpumask_set_cpu(lcpu + i, &drawer->mask);
cpumask_set_cpu(lcpu + i, &book->mask);
cpumask_set_cpu(lcpu + i, &socket->mask);
- cpumask_set_cpu(lcpu + i, &cpus_with_topology);
smp_cpu_set_polarization(lcpu + i, tl_core->pp);
}
}
@@ -245,17 +247,18 @@ int topology_set_cpu_management(int fc)
return rc;
}
-static void update_cpu_masks(void)
+void update_cpu_masks(void)
{
- struct cpu_topology_s390 *topo;
- int cpu, id;
+ struct cpu_topology_s390 *topo, *topo_package, *topo_sibling;
+ int cpu, sibling, pkg_first, smt_first, id;
for_each_possible_cpu(cpu) {
topo = &cpu_topology[cpu];
- topo->thread_mask = cpu_thread_map(cpu);
- topo->core_mask = cpu_group_map(&socket_info, cpu);
- topo->book_mask = cpu_group_map(&book_info, cpu);
- topo->drawer_mask = cpu_group_map(&drawer_info, cpu);
+ cpu_thread_map(&topo->thread_mask, cpu);
+ cpu_group_map(&topo->core_mask, &socket_info, cpu);
+ cpu_group_map(&topo->book_mask, &book_info, cpu);
+ cpu_group_map(&topo->drawer_mask, &drawer_info, cpu);
+ topo->booted_cores = 0;
if (topology_mode != TOPOLOGY_MODE_HW) {
id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu;
topo->thread_id = cpu;
@@ -263,11 +266,23 @@ static void update_cpu_masks(void)
topo->socket_id = id;
topo->book_id = id;
topo->drawer_id = id;
- if (cpu_present(cpu))
- cpumask_set_cpu(cpu, &cpus_with_topology);
}
}
- numa_update_cpu_topology();
+ for_each_online_cpu(cpu) {
+ topo = &cpu_topology[cpu];
+ pkg_first = cpumask_first(&topo->core_mask);
+ topo_package = &cpu_topology[pkg_first];
+ if (cpu == pkg_first) {
+ for_each_cpu(sibling, &topo->core_mask) {
+ topo_sibling = &cpu_topology[sibling];
+ smt_first = cpumask_first(&topo_sibling->thread_mask);
+ if (sibling == smt_first)
+ topo_package->booted_cores++;
+ }
+ } else {
+ topo->booted_cores = topo_package->booted_cores;
+ }
+ }
}
void store_topology(struct sysinfo_15_1_x *info)
@@ -289,7 +304,6 @@ static int __arch_update_cpu_topology(void)
int rc = 0;
mutex_lock(&smp_cpu_state_mutex);
- cpumask_clear(&cpus_with_topology);
if (MACHINE_HAS_TOPOLOGY) {
rc = 1;
store_topology(info);
@@ -346,9 +360,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0);
static void set_topology_timer(void)
{
if (atomic_add_unless(&topology_poll, -1, 0))
- mod_timer(&topology_timer, jiffies + HZ / 10);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100));
else
- mod_timer(&topology_timer, jiffies + HZ * 60);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC));
}
void topology_expect_change(void)
@@ -391,7 +405,7 @@ static ssize_t dispatching_store(struct device *dev,
if (val != 0 && val != 1)
return -EINVAL;
rc = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
if (cpu_management == val)
goto out;
@@ -402,7 +416,7 @@ static ssize_t dispatching_store(struct device *dev,
topology_expect_change();
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR_RW(dispatching);
@@ -554,6 +568,7 @@ void __init topology_init_early(void)
alloc_masks(info, &book_info, 2);
alloc_masks(info, &drawer_info, 3);
out:
+ cpumask_set_cpu(0, &cpu_setup_mask);
__arch_update_cpu_topology();
__arch_update_dedicated_flag(NULL);
}
@@ -584,7 +599,7 @@ static int __init topology_setup(char *str)
early_param("topology", topology_setup);
static int topology_ctl_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int enabled = topology_is_enabled();
int new_mode;
diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c
index 490b52e85014..11a669f3cc93 100644
--- a/arch/s390/kernel/trace.c
+++ b/arch/s390/kernel/trace.c
@@ -14,7 +14,7 @@ EXPORT_TRACEPOINT_SYMBOL(s390_diagnose);
static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth);
-void trace_s390_diagnose_norecursion(int diag_nr)
+void notrace trace_s390_diagnose_norecursion(int diag_nr)
{
unsigned long flags;
unsigned int *depth;
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index dc75588d7894..1d2aa448d103 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -13,8 +13,11 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
+#include "asm/irqflags.h"
+#include "asm/ptrace.h"
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/randomize_kstack.h>
#include <linux/extable.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
@@ -23,7 +26,10 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>
+#include <linux/entry-common.h>
+#include <asm/asm-extable.h>
#include <asm/fpu/api.h>
+#include <asm/vtime.h>
#include "entry.h"
static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -31,7 +37,7 @@ static inline void __user *get_trap_ip(struct pt_regs *regs)
unsigned long address;
if (regs->int_code & 0x200)
- address = *(unsigned long *)(current->thread.trap_tdb + 24);
+ address = current->thread.trap_tdb.data[3];
else
address = regs->psw.addr;
return (void __user *) (address - (regs->int_code >> 16));
@@ -48,13 +54,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
force_sig_fault(si_signo, si_code, get_trap_ip(regs));
report_user_fault(regs, si_signo, 0);
} else {
- const struct exception_table_entry *fixup;
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup)
- regs->psw.addr = extable_fixup(fixup);
- else {
+ if (!fixup_exception(regs))
die(regs, str);
- }
}
}
@@ -78,17 +79,17 @@ void do_per_trap(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_per_trap);
-void default_trap_handler(struct pt_regs *regs)
+static void default_trap_handler(struct pt_regs *regs)
{
if (user_mode(regs)) {
report_user_fault(regs, SIGSEGV, 0);
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
} else
die(regs, "Unknown program exception");
}
#define DO_ERROR_INFO(name, signr, sicode, str) \
-void name(struct pt_regs *regs) \
+static void name(struct pt_regs *regs) \
{ \
do_trap(regs, signr, sicode, str); \
}
@@ -140,13 +141,13 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc)
do_trap(regs, SIGFPE, si_code, "floating point exception");
}
-void translation_exception(struct pt_regs *regs)
+static void translation_specification_exception(struct pt_regs *regs)
{
/* May never happen. */
- panic("Translation exception");
+ panic("Translation-Specification Exception");
}
-void illegal_op(struct pt_regs *regs)
+static void illegal_op(struct pt_regs *regs)
{
__u8 opcode[6];
__u16 __user *location;
@@ -188,7 +189,7 @@ NOKPROBE_SYMBOL(illegal_op);
DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN,
"specification exception");
-void vector_exception(struct pt_regs *regs)
+static void vector_exception(struct pt_regs *regs)
{
int si_code, vic;
@@ -222,7 +223,7 @@ void vector_exception(struct pt_regs *regs)
do_trap(regs, SIGFPE, si_code, "vector exception");
}
-void data_exception(struct pt_regs *regs)
+static void data_exception(struct pt_regs *regs)
{
save_fpu_regs();
if (current->thread.fpu.fpc & FPC_DXC_MASK)
@@ -231,7 +232,7 @@ void data_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_ILLOPN, "data exception");
}
-void space_switch_exception(struct pt_regs *regs)
+static void space_switch_exception(struct pt_regs *regs)
{
/* Set user psw back to home space mode. */
if (user_mode(regs))
@@ -240,18 +241,14 @@ void space_switch_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event");
}
-void monitor_event_exception(struct pt_regs *regs)
+static void monitor_event_exception(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
-
if (user_mode(regs))
return;
switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) {
case BUG_TRAP_TYPE_NONE:
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup)
- regs->psw.addr = extable_fixup(fixup);
+ fixup_exception(regs);
break;
case BUG_TRAP_TYPE_WARN:
break;
@@ -271,10 +268,12 @@ void kernel_stack_overflow(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kernel_stack_overflow);
-static void test_monitor_call(void)
+static void __init test_monitor_call(void)
{
int val = 1;
+ if (!IS_ENABLED(CONFIG_BUG))
+ return;
asm volatile(
" mc 0,0\n"
"0: xgr %0,%0\n"
@@ -287,7 +286,118 @@ static void test_monitor_call(void)
void __init trap_init(void)
{
- sort_extable(__start_dma_ex_table, __stop_dma_ex_table);
local_mcck_enable();
test_monitor_call();
}
+
+static void (*pgm_check_table[128])(struct pt_regs *regs);
+
+/*
+ * Program check handler. Copies the program interruption code and the
+ * translation-exception code from lowcore into @regs, records PER events,
+ * and then calls the pgm_check_table[] entry selected by the interruption
+ * code masked with PGM_INT_CODE_MASK.
+ */
+void noinstr __do_pgm_check(struct pt_regs *regs)
+{
+ unsigned int trapnr;
+ irqentry_state_t state;
+
+ regs->int_code = S390_lowcore.pgm_int_code;
+ regs->int_parm_long = S390_lowcore.trans_exc_code;
+
+ state = irqentry_enter(regs);
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ /* without cpu_has_bear, last_break values below 4096 are replaced by 1 */
+ if (!static_branch_likely(&cpu_has_bear)) {
+ if (regs->last_break < 4096)
+ regs->last_break = 1;
+ }
+ current->thread.last_break = regs->last_break;
+ }
+
+ if (S390_lowcore.pgm_code & 0x0200) {
+ /* transaction abort */
+ current->thread.trap_tdb = S390_lowcore.pgm_tdb;
+ }
+
+ if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) {
+ if (user_mode(regs)) {
+ struct per_event *ev = &current->thread.per_event;
+
+ /* user space: save the PER event data and flag the task */
+ set_thread_flag(TIF_PER_TRAP);
+ ev->address = S390_lowcore.per_address;
+ ev->cause = S390_lowcore.per_code_combined;
+ ev->paid = S390_lowcore.per_access_id;
+ } else {
+ /* PER event in kernel is kprobes */
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+ do_per_trap(regs);
+ /* kernel PER events skip the table dispatch below */
+ goto out;
+ }
+ }
+
+ if (!irqs_disabled_flags(regs->psw.mask))
+ trace_hardirqs_on();
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+
+ /* an interruption code of 0 after masking is not dispatched */
+ trapnr = regs->int_code & PGM_INT_CODE_MASK;
+ if (trapnr)
+ pgm_check_table[trapnr](regs);
+out:
+ local_irq_disable();
+ irqentry_exit(regs, state);
+}
+
+/*
+ * The program check table contains exactly 128 (0x00-0x7f) entries. Each
+ * line defines the function to be called corresponding to the program check
+ * interruption code.
+ */
+static void (*pgm_check_table[128])(struct pt_regs *regs) = {
+ [0x00] = default_trap_handler,
+ [0x01] = illegal_op,
+ [0x02] = privileged_op,
+ [0x03] = execute_exception,
+ [0x04] = do_protection_exception,
+ [0x05] = addressing_exception,
+ [0x06] = specification_exception,
+ [0x07] = data_exception,
+ [0x08] = overflow_exception,
+ [0x09] = divide_exception,
+ [0x0a] = overflow_exception,
+ [0x0b] = divide_exception,
+ [0x0c] = hfp_overflow_exception,
+ [0x0d] = hfp_underflow_exception,
+ [0x0e] = hfp_significance_exception,
+ [0x0f] = hfp_divide_exception,
+ [0x10] = do_dat_exception,
+ [0x11] = do_dat_exception,
+ [0x12] = translation_specification_exception,
+ [0x13] = special_op_exception,
+ [0x14] = default_trap_handler,
+ [0x15] = operand_exception,
+ [0x16] = default_trap_handler,
+ [0x17] = default_trap_handler,
+ [0x18] = transaction_exception,
+ [0x19] = default_trap_handler,
+ [0x1a] = default_trap_handler,
+ [0x1b] = vector_exception,
+ [0x1c] = space_switch_exception,
+ [0x1d] = hfp_sqrt_exception,
+ [0x1e ... 0x37] = default_trap_handler,
+ [0x38] = do_dat_exception,
+ [0x39] = do_dat_exception,
+ [0x3a] = do_dat_exception,
+ [0x3b] = do_dat_exception,
+ [0x3c] = default_trap_handler,
+ [0x3d] = do_secure_storage_access,
+ [0x3e] = do_non_secure_storage_access,
+ [0x3f] = do_secure_storage_violation,
+ [0x40] = monitor_event_exception,
+ [0x41 ... 0x7f] = default_trap_handler,
+};
+
+/*
+ * COND_TRAP(x) emits assembler directives that declare x as a weak symbol
+ * and set it to default_trap_handler. The three handlers below are
+ * referenced from pgm_check_table[]; NOTE(review): presumably a strong
+ * definition elsewhere overrides the weak alias when that code is built
+ * in - confirm against the files that define these handlers.
+ */
+#define COND_TRAP(x) asm( \
+ ".weak " __stringify(x) "\n\t" \
+ ".set " __stringify(x) "," \
+ __stringify(default_trap_handler))
+
+COND_TRAP(do_secure_storage_access);
+COND_TRAP(do_non_secure_storage_access);
+COND_TRAP(do_secure_storage_violation);
diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c
index 707fd99f6734..0ece156fdd7c 100644
--- a/arch/s390/kernel/unwind_bc.c
+++ b/arch/s390/kernel/unwind_bc.c
@@ -64,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state)
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
reliable = false;
regs = NULL;
- if (!__kernel_text_address(ip)) {
- /* skip bogus %r14 */
+ /* skip bogus %r14 or if is the same as regs->psw.addr */
+ if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) {
state->regs = NULL;
return unwind_next_frame(state);
}
@@ -103,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state)
if (sp & 0x7)
goto out_err;
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->regs = regs;
state->reliable = reliable;
+ state->ip = unwind_recover_ret_addr(state, ip);
return true;
out_err:
@@ -161,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
}
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->reliable = true;
+ state->ip = unwind_recover_ret_addr(state, ip);
if (!first_frame)
return;
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index 5007fac01bb5..b88345ef8bd9 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
return -EINVAL;
if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)
return -EINVAL;
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
auprobe->saved_per = psw_bits(regs->psw).per;
auprobe->saved_int_code = regs->int_code;
regs->int_code = UPROBE_TRAP_NR;
@@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
/* fix per address */
current->thread.per_event.address = utask->vaddr;
/* trigger per event */
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
return 0;
}
@@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
case DIE_SSTEP:
if (uprobe_post_sstep_notifier(regs))
return NOTIFY_STOP;
+ break;
default:
break;
}
@@ -176,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -193,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(ptr) __ptr = (ptr); \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)__ptr & mask) \
+ if ((u64 __force)__ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (put_user(*(input), __ptr)) \
__rc = EMU_ADDRESSING; \
@@ -212,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -259,7 +254,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len)
return;
current->thread.per_event.address = regs->psw.addr;
current->thread.per_event.cause = PER_EVENT_STORE >> 16;
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
/*
@@ -326,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs)
break;
case 0xc6:
switch (insn->opc1) {
- case 0x02: /* pfdrl */
- if (!test_facility(34))
- rc = EMU_ILLEGAL_OP;
- break;
case 0x04: /* cghrl */
rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64);
break;
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
new file mode 100644
index 000000000000..f9810d2a267c
--- /dev/null
+++ b/arch/s390/kernel/uv.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Ultravisor functions and initialization
+ *
+ * Copyright IBM Corp. 2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+struct uv_info __bootdata_preserved(uv_info);
+
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+EXPORT_SYMBOL(prot_virt_host);
+EXPORT_SYMBOL(uv_info);
+
+/*
+ * Issue the UVC_CMD_INIT_UV Ultravisor call, passing @stor_base and
+ * @stor_len as the storage origin and length in the control block.
+ *
+ * Returns 0 on success. If uv_call() reports a failure, the rc and rrc
+ * from the control block header are logged and -1 is returned.
+ */
+static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
+{
+	struct uv_cb_init uvcb = {
+		.header.cmd = UVC_CMD_INIT_UV,
+		.header.len = sizeof(uvcb),
+		.stor_origin = stor_base,
+		.stor_len = stor_len,
+	};
+
+	if (uv_call(0, (uint64_t)&uvcb)) {
+		/* both codes are printed as 0x-prefixed hex values */
+		pr_err("Ultravisor init failed with rc: 0x%x rrc: 0x%x\n",
+		       uvcb.header.rc, uvcb.header.rrc);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Allocate the Ultravisor base storage and pass it to the Ultravisor via
+ * uv_init(). Does nothing unless is_prot_virt_host() is true. If either
+ * the allocation or uv_init() fails, prot_virt_host is cleared.
+ */
+void __init setup_uv(void)
+{
+	void *uv_stor_base;
+
+	if (!is_prot_virt_host())
+		return;
+
+	/* uv_base_stor_len bytes, SZ_1M aligned, with SZ_2G as minimum address */
+	uv_stor_base = memblock_alloc_try_nid(
+		uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+		MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+	if (!uv_stor_base) {
+		pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+			uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) {
+		/* the Ultravisor did not take the memory, give it back */
+		memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	pr_info("Reserving %luMB as ultravisor base storage\n",
+		uv_info.uv_base_stor_len >> 20);
+	return;
+fail:
+	/* terminate the message so it does not run into the next log line */
+	pr_info("Disabling support for protected virtualization\n");
+	prot_virt_host = 0;
+}
+
+/*
+ * Requests the Ultravisor to pin the page in the shared state. This will
+ * cause an intercept when the guest attempts to unshare the pinned page.
+ *
+ * Returns 0 on success, -EINVAL if uv_call() reports a failure.
+ */
+static int uv_pin_shared(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr,
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Requests the Ultravisor to destroy a guest page and make it
+ * accessible to the host. The destroy clears the page instead of
+ * exporting.
+ *
+ * @paddr: Absolute host address of page to be destroyed
+ *
+ * Returns 0 on success, and also when the call fails with rc 0x107 and
+ * rrc 0xd; -EINVAL for every other failure.
+ */
+static int uv_destroy_page(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_DESTR_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb)) {
+ /*
+ * Older firmware uses 107/d as an indication of a non secure
+ * page. Let us emulate the newer variant (no-op).
+ */
+ if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd)
+ return 0;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Destroy the page at @paddr via uv_destroy_page() and, on success, clear
+ * PG_arch_1 in its page flags. An additional page reference is taken for
+ * the duration of the call and dropped again before returning.
+ *
+ * The caller must already hold a reference to the page
+ *
+ * Returns the result of uv_destroy_page().
+ */
+int uv_destroy_owned_page(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_destroy_page(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Requests the Ultravisor to encrypt a guest page and make it
+ * accessible to the host for paging (export).
+ *
+ * @paddr: Absolute host address of page to be exported
+ *
+ * Returns 0 on success, -EINVAL if uv_call() reports a failure.
+ */
+int uv_convert_from_secure(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Export the page at @paddr via uv_convert_from_secure() and, on success,
+ * clear PG_arch_1 in its page flags. An additional page reference is taken
+ * for the duration of the call and dropped again before returning.
+ *
+ * The caller must already hold a reference to the page
+ *
+ * Returns the result of uv_convert_from_secure().
+ */
+int uv_convert_owned_from_secure(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_convert_from_secure(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Compute the reference count a page is expected to have when nobody
+ * holds an extra pin on it. Modelled on similar helpers elsewhere in the
+ * kernel, slightly adapted: a secure page can never be a huge page, so
+ * that case is not considered.
+ *
+ * The result is the map count, plus one if the page is in the swap cache;
+ * otherwise plus one if the page has a mapping, and one more on top of
+ * that if it also has private data.
+ */
+static int expected_page_refs(struct page *page)
+{
+	int refs = page_mapcount(page);
+
+	if (PageSwapCache(page))
+		return refs + 1;
+	if (!page_mapping(page))
+		return refs;
+	return refs + (page_has_private(page) ? 2 : 1);
+}
+
+/*
+ * Perform the Ultravisor call described by @uvcb on the page mapped by
+ * @ptep, with the page's reference count frozen for the duration of the
+ * call. @addr is not used by this function.
+ *
+ * Returns:
+ *   0        the UVC completed with UVC_CC_OK
+ *   -ENXIO   the pte is not present or invalid, maps a page other than
+ *            @exp_page, or the UVC failed with rc 0x10a
+ *   -EAGAIN  the page is under writeback, or the UVC returned busy or
+ *            partial completion
+ *   -EBUSY   the reference count did not match expected_page_refs()
+ *   -EINVAL  any other UVC failure
+ */
+static int make_secure_pte(pte_t *ptep, unsigned long addr,
+ struct page *exp_page, struct uv_cb_header *uvcb)
+{
+ pte_t entry = READ_ONCE(*ptep);
+ struct page *page;
+ int expected, cc = 0;
+
+ if (!pte_present(entry))
+ return -ENXIO;
+ if (pte_val(entry) & _PAGE_INVALID)
+ return -ENXIO;
+
+ page = pte_page(entry);
+ if (page != exp_page)
+ return -ENXIO;
+ if (PageWriteback(page))
+ return -EAGAIN;
+ expected = expected_page_refs(page);
+ if (!page_ref_freeze(page, expected))
+ return -EBUSY;
+ /* set before the UVC is issued, so the bit may over-indicate */
+ set_bit(PG_arch_1, &page->flags);
+ /*
+ * If the UVC does not succeed or fail immediately, we don't want to
+ * loop for long, or we might get stall notifications.
+ * On the other hand, this is a complex scenario and we are holding a lot of
+ * locks, so we can't easily sleep and reschedule. We try only once,
+ * and if the UVC returned busy or partial completion, we return
+ * -EAGAIN and we let the callers deal with it.
+ */
+ cc = __uv_call(0, (u64)uvcb);
+ page_ref_unfreeze(page, expected);
+ /*
+ * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+ * If busy or partially completed, return -EAGAIN.
+ */
+ if (cc == UVC_CC_OK)
+ return 0;
+ else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
+ return -EAGAIN;
+ return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+}
+
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * No export is needed also when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it's brought in the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ *
+ * @gmap:  guest address space the page belongs to
+ * @gaddr: guest address of the page
+ * @uvcb:  Ultravisor control block of the call to perform on the page
+ *
+ * Returns 0 on success, -EFAULT if the guest address cannot be translated,
+ * has no VMA, is backed by hugetlb, or cannot be faulted in, and -EAGAIN
+ * if the caller should retry. Other errors from make_secure_pte() are
+ * passed through.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+	struct vm_area_struct *vma;
+	bool local_drain = false;
+	spinlock_t *ptelock;
+	unsigned long uaddr;
+	struct page *page;
+	pte_t *ptep;
+	int rc;
+
+again:
+	rc = -EFAULT;
+	mmap_read_lock(gmap->mm);
+
+	uaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(uaddr))
+		goto out;
+	vma = vma_lookup(gmap->mm, uaddr);
+	if (!vma)
+		goto out;
+	/*
+	 * Secure pages cannot be huge and userspace should not combine both.
+	 * In case userspace does it anyway this will result in an -EFAULT for
+	 * the unpack. The guest is thus never reaching secure mode. If
+	 * userspace is playing dirty tricks with mapping huge pages later
+	 * on this will result in a segmentation fault.
+	 */
+	if (is_vm_hugetlb_page(vma))
+		goto out;
+
+	rc = -ENXIO;
+	page = follow_page(vma, uaddr, FOLL_WRITE);
+	if (IS_ERR_OR_NULL(page))
+		goto out;
+
+	lock_page(page);
+	ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+	/*
+	 * get_locked_pte() can return NULL. In that case rc stays -ENXIO,
+	 * so the page is faulted in below and the caller retries, instead
+	 * of dereferencing a NULL pte pointer in make_secure_pte().
+	 */
+	if (ptep) {
+		if (should_export_before_import(uvcb, gmap->mm))
+			uv_convert_from_secure(page_to_phys(page));
+		rc = make_secure_pte(ptep, uaddr, page, uvcb);
+		pte_unmap_unlock(ptep, ptelock);
+	}
+	unlock_page(page);
+out:
+	mmap_read_unlock(gmap->mm);
+
+	if (rc == -EAGAIN) {
+		/*
+		 * If we are here because the UVC returned busy or partial
+		 * completion, this is just a useless check, but it is safe.
+		 *
+		 * NOTE(review): no page reference is held here (follow_page()
+		 * was called without FOLL_GET) - confirm the page cannot go
+		 * away once the locks above have been dropped.
+		 */
+		wait_on_page_writeback(page);
+	} else if (rc == -EBUSY) {
+		/*
+		 * If we have tried a local drain and the page refcount
+		 * still does not match our expected safe value, try with a
+		 * system wide drain. This is needed if the pagevecs holding
+		 * the page are on a different CPU.
+		 */
+		if (local_drain) {
+			lru_add_drain_all();
+			/* We give up here, and let the caller try again */
+			return -EAGAIN;
+		}
+		/*
+		 * We are here if the page refcount does not match the
+		 * expected safe value. The main culprits are usually
+		 * pagevecs. With lru_add_drain() we drain the pagevecs
+		 * on the local CPU so that hopefully the refcount will
+		 * reach the expected safe value.
+		 */
+		lru_add_drain();
+		local_drain = true;
+		/* And now we try again immediately after draining */
+		goto again;
+	} else if (rc == -ENXIO) {
+		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+			return -EFAULT;
+		return -EAGAIN;
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+/*
+ * Build a UVC_CMD_CONV_TO_SEC_STOR control block for @gaddr using the
+ * guest handle stored in @gmap and pass it to gmap_make_secure().
+ * Returns whatever gmap_make_secure() returns.
+ */
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = gmap->guest_handle,
+ .gaddr = gaddr,
+ };
+
+ return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Return: 0 if the page was destroyed or exported, or if follow_page()
+ * found no page; -EFAULT if @gaddr cannot be translated, has no VMA, or
+ * lies in a hugetlb VMA; otherwise the error from the export attempt.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+ struct vm_area_struct *vma;
+ unsigned long uaddr;
+ struct page *page;
+ int rc;
+
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Huge pages should not be able to become secure
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ /* from here on a missing page is not treated as an error */
+ rc = 0;
+ /* we take an extra reference here */
+ page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ rc = uv_destroy_owned_page(page_to_phys(page));
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_owned_from_secure(page_to_phys(page));
+ /* drop the reference taken by follow_page(FOLL_GET) */
+ put_page(page);
+out:
+ mmap_read_unlock(gmap->mm);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having 2
+ * parallel make_page_accessible is fine, as the UV calls will become a
+ * no-op if the page is already exported.
+ *
+ * Returns 0 if the page is huge, does not have PG_arch_1 set, or was
+ * successfully pinned shared or exported; otherwise the error from
+ * uv_convert_from_secure().
+ */
+int arch_make_page_accessible(struct page *page)
+{
+ int rc = 0;
+
+ /* Hugepage cannot be protected, so nothing to do */
+ if (PageHuge(page))
+ return 0;
+
+ /*
+ * PG_arch_1 is used in 3 places:
+ * 1. for kernel page tables during early boot
+ * 2. for storage keys of huge pages and KVM
+ * 3. As an indication that this page might be secure. This can
+ * overindicate, e.g. we set the bit before calling
+ * convert_to_secure.
+ * As secure pages are never huge, all 3 variants can coexist.
+ */
+ if (!test_bit(PG_arch_1, &page->flags))
+ return 0;
+
+ /* first try to pin the page as shared ... */
+ rc = uv_pin_shared(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ /* ... and fall back to exporting it if pinning failed */
+ rc = uv_convert_from_secure(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+/*
+ * sysfs show callback: prints the four uv_info.inst_calls_list words in
+ * hex, one per line. Uses sysfs_emit() like the other show callbacks in
+ * this file.
+ */
+static ssize_t uv_query_facilities(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n%lx\n%lx\n%lx\n",
+			  uv_info.inst_calls_list[0],
+			  uv_info.inst_calls_list[1],
+			  uv_info.inst_calls_list[2],
+			  uv_info.inst_calls_list[3]);
+}
+
+static struct kobj_attribute uv_query_facilities_attr =
+ __ATTR(facilities, 0444, uv_query_facilities, NULL);
+
+static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_ver_attr =
+ __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
+ __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.guest_cpu_stor_len in hex. Uses
+ * sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.guest_cpu_stor_len);
+}
+
+static struct kobj_attribute uv_query_dump_cpu_len_attr =
+ __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.conf_dump_storage_state_len in hex.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
+					       struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.conf_dump_storage_state_len);
+}
+
+static struct kobj_attribute uv_query_dump_storage_state_len_attr =
+ __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.conf_dump_finalize_len in hex.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
+					  struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.conf_dump_finalize_len);
+}
+
+static struct kobj_attribute uv_query_dump_finalize_len_attr =
+ __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL);
+
+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+ __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.max_guest_cpu_id + 1 in decimal.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%d\n", uv_info.max_guest_cpu_id + 1);
+}
+
+static struct kobj_attribute uv_query_max_guest_cpus_attr =
+ __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.max_num_sec_conf in decimal.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_max_guest_vms(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%d\n", uv_info.max_num_sec_conf);
+}
+
+static struct kobj_attribute uv_query_max_guest_vms_attr =
+ __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.max_sec_stor_addr in hex.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.max_sec_stor_addr);
+}
+
+static struct kobj_attribute uv_query_max_guest_addr_attr =
+ __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.supp_att_req_hdr_ver in hex.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
+					     struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.supp_att_req_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
+ __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
+
+/*
+ * sysfs show callback: prints uv_info.supp_att_pflags in hex.
+ * Uses sysfs_emit() like the other show callbacks in this file.
+ */
+static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "%lx\n", uv_info.supp_att_pflags);
+}
+
+static struct kobj_attribute uv_query_supp_att_pflags_attr =
+ __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
+
+static struct attribute *uv_query_attrs[] = {
+ &uv_query_facilities_attr.attr,
+ &uv_query_feature_indications_attr.attr,
+ &uv_query_max_guest_cpus_attr.attr,
+ &uv_query_max_guest_vms_attr.attr,
+ &uv_query_max_guest_addr_attr.attr,
+ &uv_query_supp_se_hdr_ver_attr.attr,
+ &uv_query_supp_se_hdr_pcf_attr.attr,
+ &uv_query_dump_storage_state_len_attr.attr,
+ &uv_query_dump_finalize_len_attr.attr,
+ &uv_query_dump_cpu_len_attr.attr,
+ &uv_query_supp_att_req_hdr_ver_attr.attr,
+ &uv_query_supp_att_pflags_attr.attr,
+ NULL,
+};
+
+static struct attribute_group uv_query_attr_group = {
+ .attrs = uv_query_attrs,
+};
+
+/*
+ * sysfs show callback: prints prot_virt_guest in decimal, or 0 when
+ * CONFIG_PROTECTED_VIRTUALIZATION_GUEST is not set. Uses sysfs_emit()
+ * like the other show callbacks in this file.
+ */
+static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *page)
+{
+	int val = 0;
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+	val = prot_virt_guest;
+#endif
+	return sysfs_emit(page, "%d\n", val);
+}
+
+/*
+ * sysfs show callback: prints prot_virt_host in decimal, or 0 when
+ * CONFIG_KVM is not enabled. Uses sysfs_emit() like the other show
+ * callbacks in this file.
+ */
+static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *page)
+{
+	int val = 0;
+
+#if IS_ENABLED(CONFIG_KVM)
+	val = prot_virt_host;
+#endif
+
+	return sysfs_emit(page, "%d\n", val);
+}
+
+static struct kobj_attribute uv_prot_virt_guest =
+ __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL);
+
+static struct kobj_attribute uv_prot_virt_host =
+ __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL);
+
+static const struct attribute *uv_prot_virt_attrs[] = {
+ &uv_prot_virt_guest.attr,
+ &uv_prot_virt_host.attr,
+ NULL,
+};
+
+static struct kset *uv_query_kset;
+static struct kobject *uv_kobj;
+
+/*
+ * Register the Ultravisor sysfs entries: a "uv" kobject below
+ * firmware_kobj holding the prot_virt_guest/prot_virt_host files, and a
+ * "query" kset below it holding the uv_query_attrs group.
+ *
+ * Returns 0 on success, and also when facility 158 is not available (in
+ * which case nothing is registered). On failure everything created so far
+ * is removed again and a negative error code is returned.
+ */
+static int __init uv_info_init(void)
+{
+ int rc = -ENOMEM;
+
+ if (!test_facility(158))
+ return 0;
+
+ uv_kobj = kobject_create_and_add("uv", firmware_kobj);
+ if (!uv_kobj)
+ return -ENOMEM;
+
+ rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs);
+ if (rc)
+ goto out_kobj;
+
+ uv_query_kset = kset_create_and_add("query", NULL, uv_kobj);
+ if (!uv_query_kset) {
+ rc = -ENOMEM;
+ goto out_ind_files;
+ }
+
+ rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group);
+ if (!rc)
+ return 0;
+
+ /* error unwinding: undo the steps above in reverse order */
+ kset_unregister(uv_query_kset);
+out_ind_files:
+ sysfs_remove_files(uv_kobj, uv_prot_virt_attrs);
+out_kobj:
+ kobject_del(uv_kobj);
+ kobject_put(uv_kobj);
+ return rc;
+}
+device_initcall(uv_info_init);
+#endif
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index bcc9bdb39ba2..3105ca5bd470 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -6,269 +6,275 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
-#include <linux/init.h>
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
#include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/elf.h>
-#include <linux/security.h>
-#include <linux/memblock.h>
-#include <linux/compat.h>
-#include <asm/asm-offsets.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/sections.h>
+#include <linux/smp.h>
+#include <linux/time_namespace.h>
+#include <linux/random.h>
+#include <vdso/datapage.h>
#include <asm/vdso.h>
-#include <asm/facility.h>
-extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
-static struct page **vdso64_pagelist;
+extern char vdso64_start[], vdso64_end[];
+extern char vdso32_start[], vdso32_end[];
-/*
- * Should the kernel map a VDSO page into processes and pass its
- * address down to glibc upon exec()?
- */
-unsigned int __read_mostly vdso_enabled = 1;
-
-static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page **vdso_pagelist;
- unsigned long vdso_pages;
-
- vdso_pagelist = vdso64_pagelist;
- vdso_pages = vdso64_pages;
-
- if (vmf->pgoff >= vdso_pages)
- return VM_FAULT_SIGBUS;
+static struct vm_special_mapping vvar_mapping;
- vmf->page = vdso_pagelist[vmf->pgoff];
- get_page(vmf->page);
- return 0;
-}
-
-static int vdso_mremap(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma)
-{
- unsigned long vdso_pages;
-
- vdso_pages = vdso64_pages;
-
- if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start)
- return -EINVAL;
-
- if (WARN_ON_ONCE(current->mm != vma->vm_mm))
- return -EFAULT;
+static union {
+ struct vdso_data data[CS_BASES];
+ u8 page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
- current->mm->context.vdso_base = vma->vm_start;
- return 0;
-}
+struct vdso_data *vdso_data = vdso_data_store.data;
-static const struct vm_special_mapping vdso_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
- .mremap = vdso_mremap,
+enum vvar_pages {
+ VVAR_DATA_PAGE_OFFSET,
+ VVAR_TIMENS_PAGE_OFFSET,
+ VVAR_NR_PAGES,
};
-static int __init vdso_setup(char *str)
+#ifdef CONFIG_TIME_NS
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
- bool enabled;
-
- if (!kstrtobool(str, &enabled))
- vdso_enabled = enabled;
- return 1;
+ return (struct vdso_data *)(vvar_page);
}
-__setup("vdso=", vdso_setup);
-/*
- * The vdso data page
- */
-static union {
- struct vdso_data data;
- u8 page[PAGE_SIZE];
-} vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = &vdso_data_store.data;
-
-/*
- * Setup vdso data page.
- */
-static void __init vdso_init_data(struct vdso_data *vd)
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
- vd->ectg_available = test_facility(31);
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+ WARN(1, "vvar_page accessed remotely");
+ return NULL;
}
/*
- * Allocate/free per cpu vdso data.
+ * The VVAR page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
*/
-#define SEGMENT_ORDER 2
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
-/*
- * The initial vdso_data structure for the boot CPU. Eventually
- * it is replaced with a properly allocated structure in vdso_init.
- * This is necessary because a valid S390_lowcore.vdso_per_cpu_data
- * pointer is required to be able to return from an interrupt or
- * program check. See the exit paths in entry.S.
- */
-struct vdso_data boot_vdso_data __initdata;
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ unsigned long size = vma->vm_end - vma->vm_start;
-void __init vdso_alloc_boot_cpu(struct lowcore *lowcore)
+ if (!vma_is_special_mapping(vma, &vvar_mapping))
+ continue;
+ zap_page_range(vma, vma->vm_start, size);
+ break;
+ }
+ mmap_read_unlock(mm);
+ return 0;
+}
+#else
+static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
- lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data;
+ return NULL;
}
+#endif
-int vdso_alloc_per_cpu(struct lowcore *lowcore)
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
- unsigned long segment_table, page_table, page_frame;
- struct vdso_per_cpu_data *vd;
-
- segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
- page_table = get_zeroed_page(GFP_KERNEL);
- page_frame = get_zeroed_page(GFP_KERNEL);
- if (!segment_table || !page_table || !page_frame)
- goto out;
- arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER);
- arch_set_page_dat(virt_to_page(page_table), 0);
-
- /* Initialize per-cpu vdso data page */
- vd = (struct vdso_per_cpu_data *) page_frame;
- vd->cpu_nr = lowcore->cpu_nr;
- vd->node_id = cpu_to_node(vd->cpu_nr);
-
- /* Set up page table for the vdso address space */
- memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES);
- memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE);
-
- *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
- *(unsigned long *) page_table = _PAGE_PROTECT + page_frame;
-
- lowcore->vdso_asce = segment_table +
- _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
- lowcore->vdso_per_cpu_data = page_frame;
+ struct page *timens_page = find_timens_vvar_page(vma);
+ unsigned long addr, pfn;
+ vm_fault_t err;
+
+ switch (vmf->pgoff) {
+ case VVAR_DATA_PAGE_OFFSET:
+ pfn = virt_to_pfn(vdso_data);
+ if (timens_page) {
+ /*
+ * Fault in VVAR page too, since it will be accessed
+ * to get clock data anyway.
+ */
+ addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+ pfn = page_to_pfn(timens_page);
+ }
+ break;
+#ifdef CONFIG_TIME_NS
+ case VVAR_TIMENS_PAGE_OFFSET:
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
+ * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+ pfn = virt_to_pfn(vdso_data);
+ break;
+#endif /* CONFIG_TIME_NS */
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+}
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma)
+{
+ current->mm->context.vdso_base = vma->vm_start;
return 0;
-
-out:
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
- return -ENOMEM;
}
-void vdso_free_per_cpu(struct lowcore *lowcore)
-{
- unsigned long segment_table, page_table, page_frame;
+static struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
- segment_table = lowcore->vdso_asce & PAGE_MASK;
- page_table = *(unsigned long *) segment_table;
- page_frame = *(unsigned long *) page_table;
+static struct vm_special_mapping vdso64_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
+static struct vm_special_mapping vdso32_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
+
+int vdso_getcpu_init(void)
+{
+ set_tod_programmable_field(smp_processor_id());
+ return 0;
}
+early_initcall(vdso_getcpu_init); /* Must be called before SMP init */
-/*
- * This is called from binfmt_elf, we create the special vma for the
- * vDSO and insert it into the mm struct tree
- */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+/*
+ * Map the vvar pages followed by the vdso text into the current mm.
+ *
+ * @addr is passed to get_unmapped_area() as the requested start address and
+ * @vdso_mapping_len is the length of the whole area (vvar pages plus text).
+ * The 31 bit or the 64 bit vdso image is selected via is_compat_task().
+ *
+ * Returns 0 on success, -EINTR if taking the mmap lock was interrupted, or
+ * the error reported by get_unmapped_area()/_install_special_mapping().
+ */
+static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
{
+ unsigned long vvar_start, vdso_text_start, vdso_text_len;
+ struct vm_special_mapping *vdso_mapping;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long vdso_pages;
- unsigned long vdso_base;
int rc;
- if (!vdso_enabled)
- return 0;
-
- if (is_compat_task())
- return 0;
-
- vdso_pages = vdso64_pages;
- /*
- * vDSO has a problem and was disabled, just don't "enable" it for
- * the process
- */
- if (vdso_pages == 0)
- return 0;
-
- /*
- * pick a base address for the vDSO in process space. We try to put
- * it at vdso_base which is the "natural" base for it, but we might
- * fail and end up putting it elsewhere.
- */
- if (down_write_killable(&mm->mmap_sem))
+ BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0);
- if (IS_ERR_VALUE(vdso_base)) {
- rc = vdso_base;
- goto out_up;
- }
- /*
- * our vma flags don't have VM_WRITE so by default, the process
- * isn't allowed to write those pages.
- * gdb can break that with ptrace interface, and thus trigger COW
- * on those pages but it's then your responsibility to never do that
- * on the "data" page of the vDSO or you'll stop getting kernel
- * updates and your nice userland gettimeofday will be totally dead.
- * It's fine to use that for setting breakpoints in the vDSO code
- * pages though.
- */
- vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+ if (is_compat_task()) {
+ vdso_text_len = vdso32_end - vdso32_start;
+ vdso_mapping = &vdso32_mapping;
+ } else {
+ vdso_text_len = vdso64_end - vdso64_start;
+ vdso_mapping = &vdso64_mapping;
+ }
+ vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0);
+ rc = vvar_start;
+ if (IS_ERR_VALUE(vvar_start))
+ goto out;
+ vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+ rc = PTR_ERR(vma);
+ if (IS_ERR(vma))
+ goto out;
+ vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE;
+ /* VM_MAYWRITE for COW so gdb can set breakpoints */
+ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- &vdso_mapping);
+ vdso_mapping);
if (IS_ERR(vma)) {
+ /* Undo the complete vvar mapping, not just its first page */
+ do_munmap(mm, vvar_start, VVAR_NR_PAGES * PAGE_SIZE, NULL);
rc = PTR_ERR(vma);
- goto out_up;
+ } else {
+ current->mm->context.vdso_base = vdso_text_start;
+ rc = 0;
}
+out:
+ mmap_write_unlock(mm);
+ return rc;
+}
- current->mm->context.vdso_base = vdso_base;
- rc = 0;
+static unsigned long vdso_addr(unsigned long start, unsigned long len)
+{
+ unsigned long addr, end, offset;
-out_up:
- up_write(&mm->mmap_sem);
- return rc;
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= VDSO_BASE)
+ end = VDSO_BASE;
+ end -= len;
+
+ if (end > start) {
+ offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+ return addr;
}
-static int __init vdso_init(void)
+unsigned long vdso_size(void)
{
- int i;
+ unsigned long size = VVAR_NR_PAGES * PAGE_SIZE;
- vdso_init_data(vdso_data);
+ if (is_compat_task())
+ size += vdso32_end - vdso32_start;
+ else
+ size += vdso64_end - vdso64_start;
+ return PAGE_ALIGN(size);
+}
- /* Calculate the size of the 64 bit vDSO */
- vdso64_pages = ((&vdso64_end - &vdso64_start
- + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ unsigned long addr = VDSO_BASE;
+ unsigned long size = vdso_size();
- /* Make sure pages are in the correct state */
- vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- BUG_ON(vdso64_pagelist == NULL);
- for (i = 0; i < vdso64_pages - 1; i++) {
- struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- get_page(pg);
- vdso64_pagelist[i] = pg;
- }
- vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
- vdso64_pagelist[vdso64_pages] = NULL;
- if (vdso_alloc_per_cpu(&S390_lowcore))
- BUG();
+ if (current->flags & PF_RANDOMIZE)
+ addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size);
+ return map_vdso(addr, size);
+}
- get_page(virt_to_page(vdso_data));
+static struct page ** __init vdso_setup_pages(void *start, void *end)
+{
+ int pages = (end - start) >> PAGE_SHIFT;
+ struct page **pagelist;
+ int i;
+ pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (!pagelist)
+ panic("%s: Cannot allocate page list for VDSO", __func__);
+ for (i = 0; i < pages; i++)
+ pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
+ return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+ vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
+ if (IS_ENABLED(CONFIG_COMPAT))
+ vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
return 0;
}
-early_initcall(vdso_init);
+arch_initcall(vdso_init);
diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore
new file mode 100644
index 000000000000..5167384843b9
--- /dev/null
+++ b/arch/s390/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
new file mode 100644
index 000000000000..245bddfe9bc0
--- /dev/null
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso
+
+KCOV_INSTRUMENT := n
+ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
+ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+
+include $(srctree)/lib/vdso/Makefile
+obj-vdso32 = vdso_user_wrapper-32.o note-32.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+KBUILD_AFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m31 -s
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
+
+LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \
+ --hash-style=both --build-id=sha1 -melf_s390 -T
+
+$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+
+obj-y += vdso32_wrapper.o
+targets += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling, ubsan, kasan and kcsan for VDSO code
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+ $(call if_changed,ld)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj-vdso32): %-32.o: %.S FORCE
+ $(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32as = VDSO32A $@
+ cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+ cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso32.so: $(obj)/vdso32.so.dbg
+ @mkdir -p $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+vdso_install: vdso32.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..9c4f951e227d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_compat_*; produce a header
+# file of constant offsets (vdso32_offset_*) into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 000000000000..db19d0680a0a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..edf5ff1debe1
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 32 bits vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+
+ . = ALIGN(16);
+ .text : {
+ *(.text .stub .text.* .gnu.linkonce.t.*)
+ } :text
+ PROVIDE(__etext = .);
+ PROVIDE(_etext = .);
+ PROVIDE(etext = .);
+
+ /*
+ * Other stuff is appended to the text segment:
+ */
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata1 : { *(.rodata1) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+ .rela.dyn ALIGN(8) : { *(.rela.dyn) }
+ .got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
+
+ _end = .;
+ PROVIDE(end = .);
+
+ /*
+ * Stabs debugging sections are here too.
+ */
+ .stab 0 : { *(.stab) }
+ .stabstr 0 : { *(.stabstr) }
+ .stab.excl 0 : { *(.stab.excl) }
+ .stab.exclstr 0 : { *(.stab.exclstr) }
+ .stab.index 0 : { *(.stab.index) }
+ .stab.indexstr 0 : { *(.stab.indexstr) }
+ .comment 0 : { *(.comment) }
+
+ /*
+ * DWARF debug sections.
+ * Symbols in the DWARF debugging sections are relative to the
+ * beginning of the section so we begin them at 0.
+ */
+ /* DWARF 1 */
+ .debug 0 : { *(.debug) }
+ .line 0 : { *(.line) }
+ /* GNU DWARF 1 extensions */
+ .debug_srcinfo 0 : { *(.debug_srcinfo) }
+ .debug_sfnames 0 : { *(.debug_sfnames) }
+ /* DWARF 1.1 and DWARF 2 */
+ .debug_aranges 0 : { *(.debug_aranges) }
+ .debug_pubnames 0 : { *(.debug_pubnames) }
+ /* DWARF 2 */
+ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+ .debug_abbrev 0 : { *(.debug_abbrev) }
+ .debug_line 0 : { *(.debug_line) }
+ .debug_frame 0 : { *(.debug_frame) }
+ .debug_str 0 : { *(.debug_str) }
+ .debug_loc 0 : { *(.debug_loc) }
+ .debug_macinfo 0 : { *(.debug_macinfo) }
+ /* SGI/MIPS DWARF 2 extensions */
+ .debug_weaknames 0 : { *(.debug_weaknames) }
+ .debug_funcnames 0 : { *(.debug_funcnames) }
+ .debug_typenames 0 : { *(.debug_typenames) }
+ .debug_varnames 0 : { *(.debug_varnames) }
+ /* DWARF 3 */
+ .debug_pubtypes 0 : { *(.debug_pubtypes) }
+ .debug_ranges 0 : { *(.debug_ranges) }
+ .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+
+ /DISCARD/ : {
+ *(.note.GNU-stack)
+ *(.branch_lt)
+ *(.data .data.* .gnu.linkonce.d.* .sdata*)
+ *(.bss .sbss .dynbss .dynsbss)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ VDSO_VERSION_STRING {
+ global:
+ /*
+ * Has to be there for the kernel to find
+ */
+ __kernel_compat_restart_syscall;
+ __kernel_compat_rt_sigreturn;
+ __kernel_compat_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 000000000000..de2fb930471a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+ __PAGE_ALIGNED_DATA
+
+ .globl vdso32_start, vdso32_end
+ .balign PAGE_SIZE
+vdso32_start:
+ .incbin "arch/s390/kernel/vdso32/vdso32.so"
+ .balign PAGE_SIZE
+vdso32_end:
+
+ .previous
diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
new file mode 100644
index 000000000000..3f42f27f978c
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/unistd.h>
+#include <asm/dwarf.h>
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_compat_\func
+ .type __kernel_compat_\func,@function
+ .align 8
+__kernel_compat_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_compat_\func,.-__kernel_compat_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore
index 3fd18cf9fec2..4ec80685fecc 100644
--- a/arch/s390/kernel/vdso64/.gitignore
+++ b/arch/s390/kernel/vdso64/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vdso64.lds
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index bec19e7e6e1c..9e2b95a222a9 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -1,44 +1,53 @@
# SPDX-License-Identifier: GPL-2.0
-# List of files in the vdso, has to be asm only for now
+# List of files in the vdso
KCOV_INSTRUMENT := n
+ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
+ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
-obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o
+include $(srctree)/lib/vdso/Makefile
+obj-vdso64 = vdso_user_wrapper.o note.o
+obj-cvdso64 = vdso64_generic.o getcpu.o
+VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK)
+CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE)
+CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE)
# Build rules
-targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
+targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg
obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64))
KBUILD_AFLAGS += -DBUILD_VDSO
-KBUILD_CFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS))
KBUILD_AFLAGS_64 += -m64 -s
KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin
-KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
- -Wl,--hash-style=both
+ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \
+ --hash-style=both --build-id=sha1 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)
obj-y += vdso64_wrapper.o
-extra-y += vdso64.lds
+targets += vdso64.lds
CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
# Disable gcov profiling, ubsan and kasan for VDSO code
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
# Force dependency (incbin is bad)
$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
# link rule for the .so file, .lds has to be first
-$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
- $(call if_changed,vdso64ld)
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE
+ $(call if_changed,ld)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
@@ -49,11 +58,14 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(obj-vdso64): %.o: %.S FORCE
$(call if_changed_dep,vdso64as)
+$(obj-cvdso64): %.o: %.c FORCE
+ $(call if_changed_dep,vdso64cc)
+
# actual build commands
-quiet_cmd_vdso64ld = VDSO64L $@
- cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@
quiet_cmd_vdso64as = VDSO64A $@
cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso64cc = VDSO64C $@
+ cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $<
# install commands for the unstripped file
quiet_cmd_vdso_install = INSTALL $@
@@ -64,3 +76,11 @@ vdso64.so: $(obj)/vdso64.so.dbg
$(call cmd,vdso_install)
vdso_install: vdso64.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S
deleted file mode 100644
index 081435398e0a..000000000000
--- a/arch/s390/kernel/vdso64/clock_getres.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_getres() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_clock_getres
- .type __kernel_clock_getres,@function
-__kernel_clock_getres:
- CFI_STARTPROC
- larl %r1,4f
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 0f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 0f
- larl %r1,3f
- cghi %r2,__CLOCK_REALTIME
- je 0f
- cghi %r2,__CLOCK_MONOTONIC
- je 0f
- cghi %r2,__CLOCK_THREAD_CPUTIME_ID
- je 0f
- cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- jne 2f
- larl %r5,_vdso_data
- icm %r0,15,__LC_ECTG_OK(%r5)
- jz 2f
-0: ltgr %r3,%r3
- jz 1f /* res == NULL */
- lg %r0,0(%r1)
- xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */
- stg %r0,8(%r3) /* store tp->tv_usec */
-1: lghi %r2,0
- br %r14
-2: lghi %r1,__NR_clock_getres /* fallback to svc */
- svc 0
- br %r14
- CFI_ENDPROC
-3: .quad __CLOCK_REALTIME_RES
-4: .quad __CLOCK_COARSE_RES
- .size __kernel_clock_getres,.-__kernel_clock_getres
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
deleted file mode 100644
index 9d2ee79b90f2..000000000000
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ /dev/null
@@ -1,163 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_gettime() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_clock_gettime
- .type __kernel_clock_gettime,@function
-__kernel_clock_gettime:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 4f
- cghi %r2,__CLOCK_REALTIME
- je 5f
- cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- je 9f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 3f
- cghi %r2,__CLOCK_MONOTONIC
- jne 12f
-
- /* CLOCK_MONOTONIC */
-0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- lg %r0,__VDSO_WTOM_SEC(%r5)
- lg %r1,1(%r15)
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_WTOM_NSEC(%r5)
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- larl %r5,13f
-1: clg %r1,0(%r5)
- jl 2f
- slg %r1,0(%r5)
- aghi %r0,1
- j 1b
-2: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CLOCK_MONOTONIC_COARSE */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 3b
- lg %r0,__VDSO_WTOM_CRS_SEC(%r5)
- lg %r1,__VDSO_WTOM_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 3b
- j 2b
-
- /* CLOCK_REALTIME_COARSE */
-4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 4b
- lg %r0,__VDSO_XTIME_CRS_SEC(%r5)
- lg %r1,__VDSO_XTIME_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 4b
- j 7f
-
- /* CLOCK_REALTIME */
-5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 5b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 17f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 18f
- lcgr %r0,%r0 /* negative TOD offset */
-18: algr %r1,%r0 /* add steering offset */
-17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 5b
- larl %r5,13f
-6: clg %r1,0(%r5)
- jl 7f
- slg %r1,0(%r5)
- aghi %r0,1
- j 6b
-7: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CPUCLOCK_VIRT for this thread */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-9: lghi %r4,0
- icm %r0,15,__VDSO_ECTG_OK(%r5)
- jz 12f
- sacf 256 /* Magic ectg instruction */
- .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
- sacf 0
- algr %r1,%r0 /* r1 = cputime as TOD value */
- mghi %r1,1000 /* convert to nanoseconds */
- srlg %r1,%r1,12 /* r1 = cputime in nanosec */
- lgr %r4,%r1
- larl %r5,13f
- srlg %r1,%r1,9 /* divide by 1000000000 */
- mlg %r0,8(%r5)
- srlg %r0,%r0,11 /* r0 = tv_sec */
- stg %r0,0(%r3)
- msg %r0,0(%r5) /* calculate tv_nsec */
- slgr %r4,%r0 /* r4 = tv_nsec */
- stg %r4,8(%r3)
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* Fallback to system call */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-12: lghi %r1,__NR_clock_gettime
- svc 0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-
-13: .quad 1000000000
-14: .quad 19342813113834067
- .size __kernel_clock_gettime,.-__kernel_clock_gettime
diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..37f05cb38dad
--- /dev/null
+++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_*; produce a header file
+# of constant offsets (vdso64_offset_*) into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S
deleted file mode 100644
index 3c04f7328500..000000000000
--- a/arch/s390/kernel/vdso64/getcpu.S
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of getcpu() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2016
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_getcpu
- .type __kernel_getcpu,@function
-__kernel_getcpu:
- CFI_STARTPROC
- sacf 256
- lm %r4,%r5,__VDSO_GETCPU_VAL(%r0)
- sacf 0
- ltgr %r2,%r2
- jz 2f
- st %r5,0(%r2)
-2: ltgr %r3,%r3
- jz 3f
- st %r4,0(%r3)
-3: lghi %r2,0
- br %r14
- CFI_ENDPROC
- .size __kernel_getcpu,.-__kernel_getcpu
diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c
new file mode 100644
index 000000000000..5c5d4a848b76
--- /dev/null
+++ b/arch/s390/kernel/vdso64/getcpu.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright IBM Corp. 2020 */
+
+#include <linux/compiler.h>
+#include <linux/getcpu.h>
+#include <asm/timex.h>
+#include "vdso.h"
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ union tod_clock clk;
+
+ /* CPU number is stored in the programmable field of the TOD clock */
+ store_tod_clock_ext(&clk);
+ if (cpu)
+ *cpu = clk.pf;
+ /* NUMA node is always zero */
+ if (node)
+ *node = 0;
+ return 0;
+}
diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S
deleted file mode 100644
index aebe10dc7c99..000000000000
--- a/arch/s390/kernel/vdso64/gettimeofday.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of gettimeofday() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_gettimeofday
- .type __kernel_gettimeofday,@function
-__kernel_gettimeofday:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_ADJUST_CFA_OFFSET 16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
-0: ltgr %r3,%r3 /* check if tz is NULL */
- je 1f
- mvc 0(8,%r3),__VDSO_TIMEZONE(%r5)
-1: ltgr %r2,%r2 /* check if tv is NULL */
- je 4f
- lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 6f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 7f
- lcgr %r0,%r0 /* negative TOD offset */
-7: algr %r1,%r0 /* add steering offset */
-6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- srlg %r1,%r1,0(%r5) /* >> tk->shift */
- larl %r5,5f
-2: clg %r1,0(%r5)
- jl 3f
- slg %r1,0(%r5)
- aghi %r0,1
- j 2b
-3: stg %r0,0(%r2) /* store tv->tv_sec */
- slgr %r0,%r0 /* tv_nsec -> tv_usec */
- ml %r0,8(%r5)
- srlg %r0,%r0,6
- stg %r0,8(%r2) /* store tv->tv_usec */
-4: lghi %r2,0
- aghi %r15,16
- CFI_ADJUST_CFA_OFFSET -16
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-5: .quad 1000000000
- .long 274877907
- .size __kernel_gettimeofday,.-__kernel_gettimeofday
diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h
new file mode 100644
index 000000000000..34c7a2312f9d
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H
+#define __ARCH_S390_KERNEL_VDSO64_VDSO_H
+
+#include <vdso/datapage.h>
+
+struct getcpu_cache;
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts);
+
+#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 7ddb116b5e2e..4461ea151e49 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -13,7 +13,11 @@ ENTRY(_start)
SECTIONS
{
- . = VDSO64_LBASE + SIZEOF_HEADERS;
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
@@ -47,6 +51,7 @@ SECTIONS
.rela.dyn ALIGN(8) : { *(.rela.dyn) }
.got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
_end = .;
PROVIDE(end = .);
@@ -94,9 +99,6 @@ SECTIONS
.debug_ranges 0 : { *(.debug_ranges) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
- . = ALIGN(PAGE_SIZE);
- PROVIDE(_vdso_data = .);
-
/DISCARD/ : {
*(.note.GNU-stack)
*(.branch_lt)
@@ -136,7 +138,9 @@ VERSION
__kernel_clock_gettime;
__kernel_clock_getres;
__kernel_getcpu;
-
+ __kernel_restart_syscall;
+ __kernel_rt_sigreturn;
+ __kernel_sigreturn;
local: *;
};
}
diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c
new file mode 100644
index 000000000000..a9aa75643c08
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso64_generic.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/vdso/gettimeofday.c"
+#include "vdso.h"
+
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv,
+ struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_getres(clock, ts);
+}
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
new file mode 100644
index 000000000000..97f0c0a669a5
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vdso.h>
+#include <asm/unistd.h>
+#include <asm/asm-offsets.h>
+#include <asm/dwarf.h>
+#include <asm/ptrace.h>
+
+#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8)
+
+/*
+ * Older glibc versions called the vdso without allocating a stack frame. This
+ * wrapper is just used to allocate a stack frame. See
+ * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e
+ * for details.
+ */
+.macro vdso_func func
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ .align 8
+__kernel_\func:
+ CFI_STARTPROC
+ aghi %r15,-WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE)
+ CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
+ stg %r14,STACK_FRAME_OVERHEAD(%r15)
+ brasl %r14,__s390_vdso_\func
+ lg %r14,STACK_FRAME_OVERHEAD(%r15)
+ aghi %r15,WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
+ CFI_RESTORE 15
+ br %r14
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_func gettimeofday
+vdso_func clock_getres
+vdso_func clock_gettime
+vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ .align 8
+__kernel_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 37695499717d..5ea3830af0cc 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,12 +5,14 @@
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <asm/ftrace.lds.h>
/*
* Put .bss..swapper_pg_dir as the first thing in .bss. This will
* make sure it has 16k alignment.
*/
-#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir)
+#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \
+ *(.bss..invalid_pg_dir)
/* Handle ro_after_init data on our own. */
#define RO_AFTER_INIT_DATA
@@ -45,8 +47,8 @@ SECTIONS
KPROBES_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
+ FTRACE_HOTPATCH_TRAMPOLINES_TEXT
*(.text.*_indirect_*)
- *(.fixup)
*(.gnu.warning)
. = ALIGN(PAGE_SIZE);
_etext = .; /* End of text section */
@@ -70,6 +72,13 @@ SECTIONS
RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE)
BOOT_DATA_PRESERVED
+ . = ALIGN(8);
+ .amode31.refs : {
+ _start_amode31_refs = .;
+ *(.amode31.refs)
+ _end_amode31_refs = .;
+ }
+
_edata = .; /* End of data section */
/* will be freed after init */
@@ -122,6 +131,7 @@ SECTIONS
/*
* Table with the patch locations to undo expolines
*/
+ . = ALIGN(4);
.nospec_call_table : {
__nospec_call_start = . ;
*(.s390_indirect*)
@@ -135,6 +145,32 @@ SECTIONS
BOOT_DATA
+ /*
+ * .amode31 section for code, data, ex_table that need to stay
+ * below 2 GB, even when the kernel is relocated above 2 GB.
+ */
+ . = ALIGN(PAGE_SIZE);
+ _samode31 = .;
+ .amode31.text : {
+ _stext_amode31 = .;
+ *(.amode31.text)
+ *(.amode31.text.*_indirect_*)
+ . = ALIGN(PAGE_SIZE);
+ _etext_amode31 = .;
+ }
+ . = ALIGN(16);
+ .amode31.ex_table : {
+ _start_amode31_ex_table = .;
+ KEEP(*(.amode31.ex_table))
+ _stop_amode31_ex_table = .;
+ }
+ . = ALIGN(PAGE_SIZE);
+ .amode31.data : {
+ *(.amode31.data)
+ }
+ . = ALIGN(PAGE_SIZE);
+ _eamode31 = .;
+
/* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE);
INIT_DATA_SECTION(0x100)
@@ -176,11 +212,13 @@ SECTIONS
QUAD(__dynsym_start) /* dynsym_start */
QUAD(__rela_dyn_start) /* rela_dyn_start */
QUAD(__rela_dyn_end) /* rela_dyn_end */
+ QUAD(_eamode31 - _samode31) /* amode31_size */
} :NONE
/* Debugging sections. */
STABS_DEBUG
DWARF_DEBUG
+ ELF_DETAILS
/* Sections to be discarded */
DISCARDS
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 8df10d3c8f6c..9436f3053b88 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -13,7 +13,7 @@
#include <linux/timex.h>
#include <linux/types.h>
#include <linux/time.h>
-
+#include <asm/alternative.h>
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/cpu_mf.h>
@@ -130,13 +130,10 @@ static int do_account_vtime(struct task_struct *tsk)
clock = S390_lowcore.last_update_clock;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
" stckf %1" /* Store current tod clock value */
-#else
- " stck %1" /* Store current tod clock value */
-#endif
: "=Q" (S390_lowcore.last_update_timer),
- "=Q" (S390_lowcore.last_update_clock));
+ "=Q" (S390_lowcore.last_update_clock)
+ : : "cc");
clock = S390_lowcore.last_update_clock - clock;
timer -= S390_lowcore.last_update_timer;
@@ -216,41 +213,56 @@ void vtime_flush(struct task_struct *tsk)
avg_steal = S390_lowcore.avg_steal_timer / 2;
if ((s64) steal > 0) {
S390_lowcore.steal_timer = 0;
- account_steal_time(steal);
+ account_steal_time(cputime_to_nsecs(steal));
avg_steal += steal;
}
S390_lowcore.avg_steal_timer = avg_steal;
}
+static u64 vtime_delta(void)
+{
+ u64 timer = S390_lowcore.last_update_timer;
+
+ S390_lowcore.last_update_timer = get_vtimer();
+
+ return timer - S390_lowcore.last_update_timer;
+}
+
/*
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_account_kernel(struct task_struct *tsk)
{
- u64 timer;
+ u64 delta = vtime_delta();
- timer = S390_lowcore.last_update_timer;
- S390_lowcore.last_update_timer = get_vtimer();
- timer -= S390_lowcore.last_update_timer;
-
- if ((tsk->flags & PF_VCPU) && (irq_count() == 0))
- S390_lowcore.guest_timer += timer;
- else if (hardirq_count())
- S390_lowcore.hardirq_timer += timer;
- else if (in_serving_softirq())
- S390_lowcore.softirq_timer += timer;
+ if (tsk->flags & PF_VCPU)
+ S390_lowcore.guest_timer += delta;
else
- S390_lowcore.system_timer += timer;
+ S390_lowcore.system_timer += delta;
- virt_timer_forward(timer);
+ virt_timer_forward(delta);
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-
-void vtime_account_kernel(struct task_struct *tsk)
-__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_kernel);
+void vtime_account_softirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.softirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
+void vtime_account_hardirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.hardirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
/*
* Sorted add to a list. List is linear searched until first bigger
* element is found.
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index d3db3d7ed077..33f4ff909476 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -7,7 +7,7 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
def_bool y
prompt "KVM"
- ---help---
+ help
Say Y here to get to see options for using your Linux host to run other
operating systems inside virtual machines (guests).
This option alone does not add any kernel code.
@@ -33,7 +33,9 @@ config KVM
select HAVE_KVM_NO_POLL
select SRCU
select KVM_VFIO
- ---help---
+ select INTERVAL_TREE
+ select MMU_NOTIFIER
+ help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
on any 64bit machine.
@@ -49,14 +51,10 @@ config KVM
config KVM_S390_UCONTROL
bool "Userspace controlled virtual machines"
depends on KVM
- ---help---
+ help
Allow CAP_SYS_ADMIN users to create KVM virtual machines that are
controlled by userspace.
If unsure, say N.
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 05ee90a5ea08..02217fb4ae10 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -3,12 +3,12 @@
#
# Copyright IBM Corp. 2008
-KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
+include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
-kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o vsie.o
+kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3fb54ec2cf3e..807fa9da1e72 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -2,7 +2,7 @@
/*
* handling diagnose instructions
*
- * Copyright IBM Corp. 2008, 2011
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -10,7 +10,6 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
@@ -25,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE;
- vcpu->stat.diagnose_10++;
+ vcpu->stat.instruction_diagnose_10++;
if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end
|| start < 2 * PAGE_SIZE)
@@ -75,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx",
vcpu->run->s.regs.gprs[rx]);
- vcpu->stat.diagnose_258++;
+ vcpu->stat.instruction_diagnose_258++;
if (vcpu->run->s.regs.gprs[rx] & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
@@ -146,18 +145,31 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
- vcpu->stat.diagnose_44++;
+ vcpu->stat.instruction_diagnose_44++;
kvm_vcpu_on_spin(vcpu, true);
return 0;
}
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+ /* Reset the count on a new slice */
+ if (time_after(jiffies, cur_slice)) {
+ cur_slice = jiffies;
+ forward_cnt = diag9c_forwarding_hz / HZ;
+ }
+ return forward_cnt-- <= 0 ? 1 : 0;
+}
+
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
int tid;
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
- vcpu->stat.diagnose_9c++;
+ vcpu->stat.instruction_diagnose_9c++;
/* yield to self */
if (tid == vcpu->vcpu_id)
@@ -168,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
if (!tcpu)
goto no_yield;
- /* target already running */
- if (READ_ONCE(tcpu->cpu) >= 0)
- goto no_yield;
+ /* target guest VCPU already running */
+ if (READ_ONCE(tcpu->cpu) >= 0) {
+ if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+ goto no_yield;
+
+ /* target host CPU already running */
+ if (!vcpu_is_preempted(tcpu->cpu))
+ goto no_yield;
+ smp_yield_cpu(tcpu->cpu);
+ VCPU_EVENT(vcpu, 5,
+ "diag time slice end directed to %d: yield forwarded",
+ tid);
+ vcpu->stat.diag_9c_forward++;
+ return 0;
+ }
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;
@@ -179,7 +203,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
return 0;
no_yield:
VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid);
- vcpu->stat.diagnose_9c_ignored++;
+ vcpu->stat.diag_9c_ignored++;
return 0;
}
@@ -189,7 +213,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff;
VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode);
- vcpu->stat.diagnose_308++;
+ vcpu->stat.instruction_diagnose_308++;
switch (subcode) {
case 3:
vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR;
@@ -201,6 +225,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+ /*
+ * no need to check the return value of vcpu_stop as it can only have
+ * an error for protvirt, but protvirt means user cpu state
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
kvm_s390_vcpu_stop(vcpu);
vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM;
@@ -217,7 +245,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
{
int ret;
- vcpu->stat.diagnose_500++;
+ vcpu->stat.instruction_diagnose_500++;
/* No virtio-ccw notification? Get out quickly. */
if (!vcpu->kvm->arch.css_support ||
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
@@ -271,7 +299,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
case 0x500:
return __diag_virtio_hypercall(vcpu);
default:
- vcpu->stat.diagnose_other++;
+ vcpu->stat.instruction_diagnose_other++;
return -EOPNOTSUPP;
}
}
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 07d30ffcfa41..0243b6e38d36 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -9,8 +9,9 @@
#include <linux/vmalloc.h>
#include <linux/mm_types.h>
#include <linux/err.h>
+#include <linux/pgtable.h>
+#include <linux/bitfield.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -261,77 +262,77 @@ struct aste {
/* .. more fields there */
};
-int ipte_lock_held(struct kvm_vcpu *vcpu)
+int ipte_lock_held(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII) {
+ if (sclp.has_siif) {
int rc;
- read_lock(&vcpu->kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0;
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_lock(&kvm->arch.sca_lock);
+ rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
+ read_unlock(&kvm->arch.sca_lock);
return rc;
}
- return vcpu->kvm->arch.ipte_lock_count != 0;
+ return kvm->arch.ipte_lock_count != 0;
}
-static void ipte_lock_simple(struct kvm_vcpu *vcpu)
+static void ipte_lock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count++;
- if (vcpu->kvm->arch.ipte_lock_count > 1)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count++;
+ if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.k) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
+static void ipte_unlock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count--;
- if (vcpu->kvm->arch.ipte_lock_count)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count--;
+ if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ read_unlock(&kvm->arch.sca_lock);
+ wake_up(&kvm->arch.ipte_wq);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_lock_siif(struct kvm_vcpu *vcpu)
+static void ipte_lock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.kg) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
@@ -339,15 +340,15 @@ retry:
new.k = 1;
new.kh++;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
}
-static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
+static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
@@ -355,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
if (!new.kh)
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ wake_up(&kvm->arch.ipte_wq);
}
-void ipte_lock(struct kvm_vcpu *vcpu)
+void ipte_lock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_lock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_lock_siif(kvm);
else
- ipte_lock_simple(vcpu);
+ ipte_lock_simple(kvm);
}
-void ipte_unlock(struct kvm_vcpu *vcpu)
+void ipte_unlock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_unlock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_unlock_siif(kvm);
else
- ipte_unlock_simple(vcpu);
+ ipte_unlock_simple(kvm);
}
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
@@ -488,10 +489,12 @@ enum prot_type {
PROT_TYPE_ALC = 2,
PROT_TYPE_DAT = 3,
PROT_TYPE_IEP = 4,
+ /* Dummy value for passing an initialized value when code != PGM_PROTECTION */
+ PROT_NONE,
};
-static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
- u8 ar, enum gacc_mode mode, enum prot_type prot)
+static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+ enum gacc_mode mode, enum prot_type prot, bool terminate)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
struct trans_exc_code_bits *tec;
@@ -503,9 +506,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
switch (code) {
case PGM_PROTECTION:
switch (prot) {
+ case PROT_NONE:
+ /* We should never get here, acts like termination */
+ WARN_ON_ONCE(1);
+ break;
case PROT_TYPE_IEP:
tec->b61 = 1;
- /* FALL THROUGH */
+ fallthrough;
case PROT_TYPE_LA:
tec->b56 = 1;
break;
@@ -514,12 +521,17 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
break;
case PROT_TYPE_ALC:
tec->b60 = 1;
- /* FALL THROUGH */
+ fallthrough;
case PROT_TYPE_DAT:
tec->b61 = 1;
break;
}
- /* FALL THROUGH */
+ if (terminate) {
+ tec->b56 = 0;
+ tec->b60 = 0;
+ tec->b61 = 0;
+ }
+ fallthrough;
case PGM_ASCE_TYPE:
case PGM_PAGE_TRANSLATION:
case PGM_REGION_FIRST_TRANS:
@@ -534,7 +546,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
tec->addr = gva >> PAGE_SHIFT;
tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
- /* FALL THROUGH */
+ fallthrough;
case PGM_ALEN_TRANSLATION:
case PGM_ALE_SEQUENCE:
case PGM_ASTE_VALIDITY:
@@ -551,6 +563,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
return code;
}
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+ enum gacc_mode mode, enum prot_type prot)
+{
+ return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false);
+}
+
static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
unsigned long ga, u8 ar, enum gacc_mode mode)
{
@@ -677,7 +695,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rfte.p;
ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
@@ -695,7 +713,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rste.p;
ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
@@ -723,7 +741,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rtte.fc0.p;
ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
@@ -794,48 +812,270 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
- unsigned long *pages, unsigned long nr_pages,
- const union asce asce, enum gacc_mode mode)
+static int vm_check_access_key(struct kvm *kvm, u8 access_key,
+ enum gacc_mode mode, gpa_t gpa)
+{
+ u8 storage_key, access_control;
+ bool fetch_protected;
+ unsigned long hva;
+ int r;
+
+ if (access_key == 0)
+ return 0;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+
+ mmap_read_lock(current->mm);
+ r = get_guest_storage_key(current->mm, hva, &storage_key);
+ mmap_read_unlock(current->mm);
+ if (r)
+ return r;
+ access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
+ if (access_control == access_key)
+ return 0;
+ fetch_protected = storage_key & _PAGE_FP_BIT;
+ if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+ return 0;
+ return PGM_PROTECTION;
+}
+
+static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode,
+ union asce asce)
+{
+ psw_t *psw = &vcpu->arch.sie_block->gpsw;
+ unsigned long override;
+
+ if (mode == GACC_FETCH || mode == GACC_IFETCH) {
+ /* check if fetch protection override enabled */
+ override = vcpu->arch.sie_block->gcr[0];
+ override &= CR0_FETCH_PROTECTION_OVERRIDE;
+ /* not applicable if subject to DAT && private space */
+ override = override && !(psw_bits(*psw).dat && asce.p);
+ return override;
+ }
+ return false;
+}
+
+static bool fetch_prot_override_applies(unsigned long ga, unsigned int len)
+{
+ return ga < 2048 && ga + len <= 2048;
+}
+
+static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu)
+{
+ /* check if storage protection override enabled */
+ return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE;
+}
+
+static bool storage_prot_override_applies(u8 access_control)
+{
+ /* matches special storage protection override key (9) -> allow */
+ return access_control == PAGE_SPO_ACC;
+}
+
+static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
+ enum gacc_mode mode, union asce asce, gpa_t gpa,
+ unsigned long ga, unsigned int len)
+{
+ u8 storage_key, access_control;
+ unsigned long hva;
+ int r;
+
+ /* access key 0 matches any storage key -> allow */
+ if (access_key == 0)
+ return 0;
+ /*
+ * caller needs to ensure that gfn is accessible, so we can
+ * assume that this cannot fail
+ */
+ hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa));
+ mmap_read_lock(current->mm);
+ r = get_guest_storage_key(current->mm, hva, &storage_key);
+ mmap_read_unlock(current->mm);
+ if (r)
+ return r;
+ access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
+ /* access key matches storage key -> allow */
+ if (access_control == access_key)
+ return 0;
+ if (mode == GACC_FETCH || mode == GACC_IFETCH) {
+ /* it is a fetch and fetch protection is off -> allow */
+ if (!(storage_key & _PAGE_FP_BIT))
+ return 0;
+ if (fetch_prot_override_applicable(vcpu, mode, asce) &&
+ fetch_prot_override_applies(ga, len))
+ return 0;
+ }
+ if (storage_prot_override_applicable(vcpu) &&
+ storage_prot_override_applies(access_control))
+ return 0;
+ return PGM_PROTECTION;
+}
+
+/**
+ * guest_range_to_gpas() - Calculate guest physical addresses of page fragments
+ * covering a logical range
+ * @vcpu: virtual cpu
+ * @ga: guest address, start of range
+ * @ar: access register
+ * @gpas: output argument, may be NULL
+ * @len: length of range in bytes
+ * @asce: address-space-control element to use for translation
+ * @mode: access mode
+ * @access_key: access key to match the range's storage keys against
+ *
+ * Translate a logical range to a series of guest absolute addresses,
+ * such that the concatenation of page fragments starting at each gpa make up
+ * the whole range.
+ * The translation is performed as if done by the cpu for the given @asce, @ar,
+ * @mode and state of the @vcpu.
+ * If the translation causes an exception, its program interruption code is
+ * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified
+ * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject
+ * a correct exception into the guest.
+ * The resulting gpas are stored into @gpas, unless it is NULL.
+ *
+ * Note: All fragments except the first one start at the beginning of a page.
+ * When deriving the boundaries of a fragment from a gpa, all but the last
+ * fragment end at the end of the page.
+ *
+ * Return:
+ * * 0 - success
+ * * <0 - translation could not be performed, for example if guest
+ * memory could not be accessed
+ * * >0 - an access exception occurred. In this case the returned value
+ * is the program interruption code and the contents of pgm may
+ * be used to inject an exception into the guest.
+ */
+static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ unsigned long *gpas, unsigned long len,
+ const union asce asce, enum gacc_mode mode,
+ u8 access_key)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
+ unsigned int offset = offset_in_page(ga);
+ unsigned int fragment_len;
int lap_enabled, rc = 0;
enum prot_type prot;
+ unsigned long gpa;
lap_enabled = low_address_protection_enabled(vcpu, asce);
- while (nr_pages) {
+ while (min(PAGE_SIZE - offset, len) > 0) {
+ fragment_len = min(PAGE_SIZE - offset, len);
ga = kvm_s390_logical_to_effective(vcpu, ga);
if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
PROT_TYPE_LA);
- ga &= PAGE_MASK;
if (psw_bits(*psw).dat) {
- rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
+ rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot);
if (rc < 0)
return rc;
} else {
- *pages = kvm_s390_real_to_abs(vcpu, ga);
- if (kvm_is_error_gpa(vcpu->kvm, *pages))
+ gpa = kvm_s390_real_to_abs(vcpu, ga);
+ if (kvm_is_error_gpa(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
+ prot = PROT_NONE;
+ }
}
if (rc)
return trans_exc(vcpu, rc, ga, ar, mode, prot);
- ga += PAGE_SIZE;
- pages++;
- nr_pages--;
+ rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga,
+ fragment_len);
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC);
+ if (gpas)
+ *gpas++ = gpa;
+ offset = 0;
+ ga += fragment_len;
+ len -= fragment_len;
}
return 0;
}
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
- unsigned long len, enum gacc_mode mode)
+static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len)
+{
+ const unsigned int offset = offset_in_page(gpa);
+ const gfn_t gfn = gpa_to_gfn(gpa);
+ int rc;
+
+ if (mode == GACC_STORE)
+ rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
+ else
+ rc = kvm_read_guest_page(kvm, gfn, data, offset, len);
+ return rc;
+}
+
+static int
+access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len, u8 access_key)
+{
+ struct kvm_memory_slot *slot;
+ bool writable;
+ gfn_t gfn;
+ hva_t hva;
+ int rc;
+
+ gfn = gpa >> PAGE_SHIFT;
+ slot = gfn_to_memslot(kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that can't occur (they're unsupported).
+ * Don't try to actually handle that case.
+ */
+ if (!writable && mode == GACC_STORE)
+ return -EOPNOTSUPP;
+ hva += offset_in_page(gpa);
+ if (mode == GACC_STORE)
+ rc = copy_to_user_key((void __user *)hva, data, len, access_key);
+ else
+ rc = copy_from_user_key(data, (void __user *)hva, len, access_key);
+ if (rc)
+ return PGM_PROTECTION;
+ if (mode == GACC_STORE)
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ return 0;
+}
+
+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len, enum gacc_mode mode, u8 access_key)
+{
+ int offset = offset_in_page(gpa);
+ int fragment_len;
+ int rc;
+
+ while (min(PAGE_SIZE - offset, len) > 0) {
+ fragment_len = min(PAGE_SIZE - offset, len);
+ rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key);
+ if (rc)
+ return rc;
+ offset = 0;
+ len -= fragment_len;
+ data += fragment_len;
+ gpa += fragment_len;
+ }
+ return 0;
+}
+
+int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, enum gacc_mode mode,
+ u8 access_key)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- unsigned long _len, nr_pages, gpa, idx;
- unsigned long pages_array[2];
- unsigned long *pages;
+ unsigned long nr_pages, idx;
+ unsigned long gpa_array[2];
+ unsigned int fragment_len;
+ unsigned long *gpas;
+ enum prot_type prot;
int need_ipte_lock;
union asce asce;
+ bool try_storage_prot_override;
+ bool try_fetch_prot_override;
int rc;
if (!len)
@@ -845,55 +1085,90 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
- pages = pages_array;
- if (nr_pages > ARRAY_SIZE(pages_array))
- pages = vmalloc(array_size(nr_pages, sizeof(unsigned long)));
- if (!pages)
+ gpas = gpa_array;
+ if (nr_pages > ARRAY_SIZE(gpa_array))
+ gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long)));
+ if (!gpas)
return -ENOMEM;
+ try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce);
+ try_storage_prot_override = storage_prot_override_applicable(vcpu);
need_ipte_lock = psw_bits(*psw).dat && !asce.r;
if (need_ipte_lock)
- ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
- for (idx = 0; idx < nr_pages && !rc; idx++) {
- gpa = *(pages + idx) + (ga & ~PAGE_MASK);
- _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (mode == GACC_STORE)
- rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
+ ipte_lock(vcpu->kvm);
+ /*
+ * Since we do the access further down ultimately via a move instruction
+ * that does key checking and returns an error in case of a protection
+ * violation, we don't need to do the check during address translation.
+ * Skip it by passing access key 0, which matches any storage key,
+ * obviating the need for any further checks. As a result the check is
+ * handled entirely in hardware on access, we only need to take care to
+ * forego key protection checking if fetch protection override applies or
+ * retry with the special key 9 in case of storage protection override.
+ */
+ rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0);
+ if (rc)
+ goto out_unlock;
+ for (idx = 0; idx < nr_pages; idx++) {
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len);
+ if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) {
+ rc = access_guest_page(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len);
+ } else {
+ rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, access_key);
+ }
+ if (rc == PGM_PROTECTION && try_storage_prot_override)
+ rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, PAGE_SPO_ACC);
+ if (rc)
+ break;
+ len -= fragment_len;
+ data += fragment_len;
+ ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len);
+ }
+ if (rc > 0) {
+ bool terminate = (mode == GACC_STORE) && (idx > 0);
+
+ if (rc == PGM_PROTECTION)
+ prot = PROT_TYPE_KEYC;
else
- rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
- len -= _len;
- ga += _len;
- data += _len;
+ prot = PROT_NONE;
+ rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
}
+out_unlock:
if (need_ipte_lock)
- ipte_unlock(vcpu);
- if (nr_pages > ARRAY_SIZE(pages_array))
- vfree(pages);
+ ipte_unlock(vcpu->kvm);
+ if (nr_pages > ARRAY_SIZE(gpa_array))
+ vfree(gpas);
return rc;
}
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode)
{
- unsigned long _len, gpa;
+ unsigned int fragment_len;
+ unsigned long gpa;
int rc = 0;
while (len && !rc) {
gpa = kvm_s390_real_to_abs(vcpu, gra);
- _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (mode)
- rc = write_guest_abs(vcpu, gpa, data, _len);
- else
- rc = read_guest_abs(vcpu, gpa, data, _len);
- len -= _len;
- gra += _len;
- data += _len;
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len);
+ rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len);
+ len -= fragment_len;
+ gra += fragment_len;
+ data += fragment_len;
}
return rc;
}
/**
- * guest_translate_address - translate guest logical into guest absolute address
+ * guest_translate_address_with_key - translate guest logical into guest absolute address
+ * @vcpu: virtual cpu
+ * @gva: Guest virtual address
+ * @ar: Access register
+ * @gpa: Guest physical address
+ * @mode: Translation access mode
+ * @access_key: access key to match the storage key with
*
* Parameter semantics are the same as the ones from guest_translate.
* The memory contents at the guest address are not changed.
@@ -901,11 +1176,10 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* Note: The IPTE lock is not taken during this function, so the caller
* has to take care of this.
*/
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long *gpa, enum gacc_mode mode)
+int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long *gpa, enum gacc_mode mode,
+ u8 access_key)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- enum prot_type prot;
union asce asce;
int rc;
@@ -913,49 +1187,62 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
- if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE)
- return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
- mode, PROT_TYPE_LA);
- }
+ return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode,
+ access_key);
+}
- if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */
- rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
- if (rc > 0)
- return trans_exc(vcpu, rc, gva, 0, mode, prot);
- } else {
- *gpa = kvm_s390_real_to_abs(vcpu, gva);
- if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
- }
+/**
+ * check_gva_range - test a range of guest virtual addresses for accessibility
+ * @vcpu: virtual cpu
+ * @gva: Guest virtual address
+ * @ar: Access register
+ * @length: Length of test range
+ * @mode: Translation access mode
+ * @access_key: access key to match the storage keys with
+ */
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long length, enum gacc_mode mode, u8 access_key)
+{
+ union asce asce;
+ int rc = 0;
+
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
+ if (rc)
+ return rc;
+ ipte_lock(vcpu->kvm);
+ rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode,
+ access_key);
+ ipte_unlock(vcpu->kvm);
return rc;
}
/**
- * check_gva_range - test a range of guest virtual addresses for accessibility
+ * check_gpa_range - test a range of guest physical addresses for accessibility
+ * @kvm: virtual machine instance
+ * @gpa: guest physical address
+ * @length: length of test range
+ * @mode: access mode to test, relevant for storage keys
+ * @access_key: access key to match the storage keys with
*/
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long length, enum gacc_mode mode)
+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+ enum gacc_mode mode, u8 access_key)
{
- unsigned long gpa;
- unsigned long currlen;
+ unsigned int fragment_len;
int rc = 0;
- ipte_lock(vcpu);
- while (length > 0 && !rc) {
- currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
- rc = guest_translate_address(vcpu, gva, ar, &gpa, mode);
- gva += currlen;
- length -= currlen;
+ while (length && !rc) {
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length);
+ rc = vm_check_access_key(kvm, access_key, mode, gpa);
+ length -= fragment_len;
+ gpa += fragment_len;
}
- ipte_unlock(vcpu);
-
return rc;
}
/**
* kvm_s390_check_low_addr_prot_real - check for low-address protection
+ * @vcpu: virtual cpu
* @gra: Guest real address
*
* Checks whether an address is subject to low-address protection and set
@@ -976,7 +1263,10 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
+ * @dat_protection: referenced memory is write protected
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1324,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@@ -1050,7 +1341,8 @@ shadow_r2t:
rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ }
+ fallthrough;
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
@@ -1059,6 +1351,7 @@ shadow_r2t:
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@@ -1076,7 +1369,8 @@ shadow_r3t:
rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ }
+ fallthrough;
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
@@ -1085,6 +1379,7 @@ shadow_r3t:
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@@ -1111,7 +1406,8 @@ shadow_sgt:
rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ }
+ fallthrough;
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
@@ -1120,6 +1416,7 @@ shadow_sgt:
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@@ -1154,6 +1451,8 @@ shadow_pgt:
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@@ -1162,21 +1461,21 @@ shadow_pgt:
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
- down_read(&sg->mm->mmap_sem);
+ mmap_read_lock(sg->mm);
/*
* We don't want any guest-2 tables to change - so the parent
* tables/pointers we read stay valid - unshadowing is however
* always possible - only guest_table_lock protects us.
*/
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
if (rc)
@@ -1188,8 +1487,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
@@ -1198,7 +1509,7 @@ shadow_page:
pte.p |= dat_protection;
if (!rc)
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
- ipte_unlock(vcpu);
- up_read(&sg->mm->mmap_sem);
+ ipte_unlock(vcpu->kvm);
+ mmap_read_unlock(sg->mm);
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index f4c51756c462..9408d6cc8e2c 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -18,17 +18,14 @@
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra by applying the given prefix.
*/
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
- unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
- unsigned long prefix = kvm_s390_get_prefix(vcpu);
-
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
}
/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+ unsigned long gra)
+{
+ return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+ unsigned long ga)
+{
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+ return ga;
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+ return ga & ((1UL << 31) - 1);
+ return ga & ((1UL << 24) - 1);
+}
+
+/**
* kvm_s390_logical_to_effective - convert guest logical to effective address
* @vcpu: guest virtual cpu
* @ga: guest logical address
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
- return ga;
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
- return ga & ((1UL << 31) - 1);
- return ga & ((1UL << 24) - 1);
+ return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
@@ -158,24 +186,34 @@ enum gacc_mode {
GACC_IFETCH,
};
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
- u8 ar, unsigned long *gpa, enum gacc_mode mode);
+int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long *gpa, enum gacc_mode mode,
+ u8 access_key);
+
int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long length, enum gacc_mode mode);
+ unsigned long length, enum gacc_mode mode, u8 access_key);
+
+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+ enum gacc_mode mode, u8 access_key);
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
- unsigned long len, enum gacc_mode mode);
+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len, enum gacc_mode mode, u8 access_key);
+
+int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, enum gacc_mode mode,
+ u8 access_key);
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
/**
- * write_guest - copy data from kernel space to guest space
+ * write_guest_with_key - copy data from kernel space to guest space
* @vcpu: virtual cpu
* @ga: guest address
* @ar: access register
* @data: source address in kernel space
* @len: number of bytes to copy
+ * @access_key: access key the storage key needs to match
*
* Copy @len bytes from @data (kernel space) to @ga (guest address).
* In order to copy data to guest space the PSW of the vcpu is inspected:
@@ -186,8 +224,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* The addressing mode of the PSW is also inspected, so that address wrap
* around is taken into account for 24-, 31- and 64-bit addressing mode,
* if the to be copied data crosses page boundaries in guest address space.
- * In addition also low address and DAT protection are inspected before
- * copying any data (key protection is currently not implemented).
+ * In addition low address, DAT and key protection checks are performed before
+ * copying any data.
*
* This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu.
* In case of an access exception (e.g. protection exception) pgm will contain
@@ -215,10 +253,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* if data has been changed in guest space in case of an exception.
*/
static inline __must_check
+int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, u8 access_key)
+{
+ return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE,
+ access_key);
+}
+
+/**
+ * write_guest - copy data from kernel space to guest space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @ar: access register
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * The behaviour of write_guest is identical to write_guest_with_key, except
+ * that the PSW access key is used instead of an explicit argument.
+ */
+static inline __must_check
int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return write_guest_with_key(vcpu, ga, ar, data, len, access_key);
+}
+
+/**
+ * read_guest_with_key - copy data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @ar: access register
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ * @access_key: access key the storage key needs to match
+ *
+ * Copy @len bytes from @ga (guest address) to @data (kernel space).
+ *
+ * The behaviour of read_guest_with_key is identical to write_guest_with_key,
+ * except that data will be copied from guest space to kernel space.
+ */
+static inline __must_check
+int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, u8 access_key)
+{
+ return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH,
+ access_key);
}
/**
@@ -231,14 +312,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
*
* Copy @len bytes from @ga (guest address) to @data (kernel space).
*
- * The behaviour of read_guest is identical to write_guest, except that
- * data will be copied from guest space to kernel space.
+ * The behaviour of read_guest is identical to read_guest_with_key, except
+ * that the PSW access key is used instead of an explicit argument.
*/
static inline __must_check
int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return read_guest_with_key(vcpu, ga, ar, data, len, access_key);
}
/**
@@ -259,7 +342,10 @@ static inline __must_check
int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH,
+ access_key);
}
/**
@@ -354,12 +440,16 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
return access_guest_real(vcpu, gra, data, len, 0);
}
-void ipte_lock(struct kvm_vcpu *vcpu);
-void ipte_unlock(struct kvm_vcpu *vcpu);
-int ipte_lock_held(struct kvm_vcpu *vcpu);
+void ipte_lock(struct kvm *kvm);
+void ipte_unlock(struct kvm *kvm);
+int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 394a5f53805b..3765c4223bf9 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE)
return -EINVAL;
- wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL);
+ wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT);
if (!wp_info->old_data)
return -ENOMEM;
/* try to backup the original value */
@@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_wp > 0) {
wp_info = kmalloc_array(nr_wp,
sizeof(*wp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!wp_info) {
ret = -ENOMEM;
goto error;
@@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_bp > 0) {
bp_info = kmalloc_array(nr_bp,
sizeof(*bp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!bp_info) {
ret = -ENOMEM;
goto error;
@@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu)
if (!wp_info || !wp_info->old_data || wp_info->len <= 0)
continue;
- temp = kmalloc(wp_info->len, GFP_KERNEL);
+ temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT);
if (!temp)
continue;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index a389fa85cca2..88112065d941 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -2,7 +2,7 @@
/*
* in-kernel handling for sie intercepts
*
- * Copyright IBM Corp. 2008, 2014
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -12,10 +12,10 @@
#include <linux/errno.h>
#include <linux/pagemap.h>
-#include <asm/kvm_host.h>
#include <asm/asm-offsets.h>
#include <asm/irq.h>
#include <asm/sysinfo.h>
+#include <asm/uv.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu)
return rc;
}
+ /*
+ * no need to check the return value of vcpu_stop as it can only have
+ * an error for protvirt, but protvirt means user cpu state
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
kvm_s390_vcpu_stop(vcpu);
return -EOPNOTSUPP;
@@ -231,6 +235,13 @@ static int handle_prog(struct kvm_vcpu *vcpu)
vcpu->stat.exit_program_interruption++;
+ /*
+ * Intercept 8 indicates a loop of specification exceptions
+ * for protected guests.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EOPNOTSUPP;
+
if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
rc = kvm_s390_handle_per_event(vcpu);
if (rc)
@@ -258,6 +269,7 @@ static int handle_prog(struct kvm_vcpu *vcpu)
/**
* handle_external_interrupt - used for external interruption interceptions
+ * @vcpu: virtual cpu
*
* This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
* the new PSW does not have external interrupts disabled. In the first case,
@@ -304,7 +316,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
}
/**
- * Handle MOVE PAGE partial execution interception.
+ * handle_mvpg_pei - Handle MOVE PAGE partial execution interception.
+ * @vcpu: virtual cpu
*
* This interception can only happen for guests with DAT disabled and
* addresses that are currently not mapped in the host. Thus we try to
@@ -318,18 +331,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- /* Make sure that the source is paged-in */
- rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
- reg2, &srcaddr, GACC_FETCH);
+ /* Ensure that the source is paged-in, no actual access -> no key checking */
+ rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2],
+ reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
if (rc != 0)
return rc;
- /* Make sure that the destination is paged-in */
- rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
- reg1, &dstaddr, GACC_STORE);
+ /* Ensure that the destination is paged-in, no actual access -> no key checking */
+ rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1],
+ reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
@@ -384,10 +397,10 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
goto out;
}
- if (addr & ~PAGE_MASK)
+ if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sctns)
return -ENOMEM;
@@ -395,10 +408,15 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
out:
if (!cc) {
- r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
- if (r) {
- free_page((unsigned long)sctns);
- return kvm_s390_inject_prog_cond(vcpu, r);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy((void *)(sida_origin(vcpu->arch.sie_block)),
+ sctns, PAGE_SIZE);
+ } else {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
}
}
@@ -444,6 +462,97 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
}
+static int handle_pv_spx(struct kvm_vcpu *vcpu)
+{
+ u32 pref = *(u32 *)vcpu->arch.sie_block->sidad;
+
+ kvm_s390_set_prefix(vcpu, pref);
+ trace_kvm_s390_handle_prefix(vcpu, 1, pref);
+ return 0;
+}
+
+static int handle_pv_sclp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+
+ spin_lock(&fi->lock);
+ /*
+ * 2 cases:
+ * a: an sccb answering interrupt was already pending or in flight.
+ * As the sccb value is not known we can simply set some value to
+ * trigger delivery of a saved SCCB. UV will then use its saved
+ * copy of the SCCB value.
+ * b: an error SCCB interrupt needs to be injected so we also inject
+ * a fake SCCB address. Firmware will use the proper one.
+ * This makes sure, that both errors and real sccb returns will only
+ * be delivered after a notification intercept (instruction has
+ * finished) but not after others.
+ */
+ fi->srv_signal.ext_params |= 0x43000;
+ set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+ clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
+ spin_unlock(&fi->lock);
+ return 0;
+}
+
+static int handle_pv_uvc(struct kvm_vcpu *vcpu)
+{
+ struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad;
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm),
+ .gaddr = guest_uvcb->paddr,
+ };
+ int rc;
+
+ if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) {
+ WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n",
+ guest_uvcb->header.cmd);
+ return 0;
+ }
+ rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb);
+ /*
+ * If the unpin did not succeed, the guest will exit again for the UVC
+ * and we will retry the unpin.
+ */
+ if (rc == -EINVAL)
+ return 0;
+ /*
+ * If we got -EAGAIN here, we simply return it. It will eventually
+ * get propagated all the way to userspace, which should then try
+ * again.
+ */
+ return rc;
+}
+
+static int handle_pv_notification(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (vcpu->arch.sie_block->ipa == 0xb210)
+ return handle_pv_spx(vcpu);
+ if (vcpu->arch.sie_block->ipa == 0xb220)
+ return handle_pv_sclp(vcpu);
+ if (vcpu->arch.sie_block->ipa == 0xb9a4)
+ return handle_pv_uvc(vcpu);
+ if (vcpu->arch.sie_block->ipa >> 8 == 0xae) {
+ /*
+ * Besides external call, other SIGP orders also cause a
+ * 108 (pv notify) intercept. In contrast to external call,
+ * these orders need to be emulated and hence the appropriate
+ * place to handle them is in handle_instruction().
+ * So first try kvm_s390_handle_sigp_pei() and if that isn't
+ * successful, go on with handle_instruction().
+ */
+ ret = kvm_s390_handle_sigp_pei(vcpu);
+ if (!ret)
+ return ret;
+ }
+
+ return handle_instruction(vcpu);
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
int rc, per_rc = 0;
@@ -480,6 +589,28 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
case ICPT_KSS:
rc = kvm_s390_skey_check_enable(vcpu);
break;
+ case ICPT_MCHKREQ:
+ case ICPT_INT_ENABLE:
+ /*
+ * PSW bit 13 or a CR (0, 6, 14) changed and we might
+ * now be able to deliver interrupts. The pre-run code
+ * will take care of this.
+ */
+ rc = 0;
+ break;
+ case ICPT_PV_INSTR:
+ rc = handle_instruction(vcpu);
+ break;
+ case ICPT_PV_NOTIFY:
+ rc = handle_pv_notification(vcpu);
+ break;
+ case ICPT_PV_PREF:
+ rc = 0;
+ gmap_convert_to_secure(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu));
+ gmap_convert_to_secure(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index c06c89d370a7..ab569faf0df2 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2,7 +2,7 @@
/*
* handling kvm guest interrupts
*
- * Copyright IBM Corp. 2008, 2015
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
*/
@@ -28,9 +28,11 @@
#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
+#include "pci.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
@@ -324,8 +326,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu)
{
- return vcpu->kvm->arch.float_int.pending_irqs |
- vcpu->arch.local_int.pending_irqs;
+ unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs |
+ vcpu->arch.local_int.pending_irqs;
+
+ pending &= ~vcpu->kvm->arch.float_int.masked_irqs;
+ return pending;
}
static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
@@ -383,10 +388,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
__clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask);
if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK))
__clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask);
- if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
+ if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) {
__clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
+ __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask);
+ }
if (psw_mchk_disabled(vcpu))
active_mask &= ~IRQ_PEND_MCHK_MASK;
+ /* PV guest cpus can have a single interruption injected at a time. */
+ if (kvm_s390_pv_cpu_get_handle(vcpu) &&
+ vcpu->arch.sie_block->iictl != IICTL_CODE_NONE)
+ active_mask &= ~(IRQ_PEND_EXT_II_MASK |
+ IRQ_PEND_IO_MASK |
+ IRQ_PEND_MCHK_MASK);
/*
* Check both floating and local interrupt's cr14 because
* bit IRQ_PEND_MCHK_REP could be set in both cases.
@@ -408,13 +421,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -479,19 +492,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
vcpu->stat.deliver_cputm++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
0, 0);
-
- rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
- (u16 *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER;
+ } else {
+ rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
+ (u16 *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -499,19 +516,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
vcpu->stat.deliver_ckc++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
0, 0);
-
- rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
- (u16 __user *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP;
+ } else {
+ rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
+ (u16 __user *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -553,6 +574,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
union mci mci;
int rc;
+ /*
+ * All other possible payload for a machine check (e.g. the register
+ * contents in the save area) will be handled by the ultravisor, as
+ * the hypervisor does not have the needed information for
+ * protected guests.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK;
+ vcpu->arch.sie_block->mcic = mchk->mcic;
+ vcpu->arch.sie_block->faddr = mchk->failing_storage_address;
+ vcpu->arch.sie_block->edc = mchk->ext_damage_code;
+ return 0;
+ }
+
mci.val = mchk->mcic;
/* take care of lazy register loading */
save_fpu_regs();
@@ -669,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
/*
* We indicate floating repressible conditions along with
* other pending conditions. Channel Report Pending and Channel
- * Subsystem damage are the only two and and are indicated by
+ * Subsystem damage are the only two and are indicated by
* bits in mcic and masked in cr14.
*/
if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
@@ -696,17 +731,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart");
vcpu->stat.deliver_restart_signal++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0);
- rc = write_guest_lc(vcpu,
- offsetof(struct lowcore, restart_old_psw),
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART;
+ } else {
+ rc = write_guest_lc(vcpu,
+ offsetof(struct lowcore, restart_old_psw),
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_RESTART, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -748,6 +787,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu)
vcpu->stat.deliver_emergency_signal++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
cpu_addr, 0);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG;
+ vcpu->arch.sie_block->extcpuaddr = cpu_addr;
+ return 0;
+ }
rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG,
(u16 *)__LC_EXT_INT_CODE);
@@ -776,6 +821,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_INT_EXTERNAL_CALL,
extcall.code, 0);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL;
+ vcpu->arch.sie_block->extcpuaddr = extcall.code;
+ return 0;
+ }
rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL,
(u16 *)__LC_EXT_INT_CODE);
@@ -787,6 +838,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code)
+{
+ switch (code) {
+ case PGM_SPECIFICATION:
+ vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION;
+ break;
+ case PGM_OPERAND:
+ vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -807,6 +873,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
pgm_info.code, 0);
+ /* PER is handled by the ultravisor */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER);
+
switch (pgm_info.code & ~PGM_PER) {
case PGM_AFX_TRANSLATION:
case PGM_ASX_TRANSLATION:
@@ -818,7 +888,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
case PGM_PRIMARY_AUTHORITY:
case PGM_SECONDARY_AUTHORITY:
nullifying = true;
- /* fall through */
+ fallthrough;
case PGM_SPACE_SWITCH:
rc = put_guest_lc(vcpu, pgm_info.trans_exc_code,
(u64 *)__LC_TRANS_EXC_CODE);
@@ -892,7 +962,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
/* bit 1+2 of the target are the ilc, so we can directly use ilen */
rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
- (u64 *) __LC_LAST_BREAK);
+ (u64 *) __LC_PGM_LAST_BREAK);
rc |= put_guest_lc(vcpu, pgm_info.code,
(u16 *)__LC_PGM_INT_CODE);
rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
@@ -902,20 +972,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+#define SCCB_MASK 0xFFFFFFF8
+#define SCCB_EVENT_PENDING 0x3
+
+static int write_sclp(struct kvm_vcpu *vcpu, u32 parm)
+{
+ int rc;
+
+ if (kvm_s390_pv_cpu_get_handle(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG;
+ vcpu->arch.sie_block->eiparams = parm;
+ return 0;
+ }
+
+ rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= put_guest_lc(vcpu, parm,
+ (u32 *)__LC_EXT_PARAMS);
+
+ return rc ? -EFAULT : 0;
+}
+
static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
{
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_ext_info ext;
- int rc = 0;
spin_lock(&fi->lock);
- if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
+ if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) ||
+ !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
spin_unlock(&fi->lock);
return 0;
}
ext = fi->srv_signal;
memset(&fi->srv_signal, 0, sizeof(ext));
clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+ clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x",
@@ -924,16 +1023,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
ext.ext_params, 0);
- rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= put_guest_lc(vcpu, ext.ext_params,
- (u32 *)__LC_EXT_PARAMS);
+ return write_sclp(vcpu, ext.ext_params);
+}
- return rc ? -EFAULT : 0;
+static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+ struct kvm_s390_ext_info ext;
+
+ spin_lock(&fi->lock);
+ if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) {
+ spin_unlock(&fi->lock);
+ return 0;
+ }
+ ext = fi->srv_signal;
+ /* only clear the event bit */
+ fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING;
+ clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+ spin_unlock(&fi->lock);
+
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event");
+ vcpu->stat.deliver_service_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
+ ext.ext_params, 0);
+
+ return write_sclp(vcpu, SCCB_EVENT_PENDING);
}
static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
@@ -1028,6 +1142,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io)
{
int rc;
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_IO;
+ vcpu->arch.sie_block->subchannel_id = io->subchannel_id;
+ vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr;
+ vcpu->arch.sie_block->io_int_parm = io->io_int_parm;
+ vcpu->arch.sie_block->io_int_word = io->io_int_word;
+ return 0;
+ }
+
rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID);
rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR);
rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM);
@@ -1166,7 +1289,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
/* already expired? */
if (cputm >> 63)
return 0;
- return min(sltime, tod_to_ns(cputm));
+ return min_t(u64, sltime, tod_to_ns(cputm));
}
} else if (cpu_timer_interrupts_enabled(vcpu)) {
sltime = kvm_s390_get_cpu_timer(vcpu);
@@ -1213,10 +1336,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL);
VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
no_timer:
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_vcpu_halt(vcpu);
+ vcpu->valid_wakeup = false;
__unset_cpu_idle(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
hrtimer_cancel(&vcpu->arch.ckc_timer);
return 0;
@@ -1329,6 +1453,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
case IRQ_PEND_EXT_SERVICE:
rc = __deliver_service(vcpu);
break;
+ case IRQ_PEND_EXT_SERVICE_EV:
+ rc = __deliver_service_ev(vcpu);
+ break;
case IRQ_PEND_PFAULT_DONE:
rc = __deliver_pfault_done(vcpu);
break;
@@ -1421,7 +1548,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
return -EINVAL;
- if (sclp.has_sigpif)
+ if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
return sca_inject_ext_call(vcpu, src_id);
if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
@@ -1668,7 +1795,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
goto out;
}
gisa_out:
- tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (tmp_inti) {
tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
tmp_inti->io.io_int_word = isc_to_int_word(isc);
@@ -1681,9 +1808,6 @@ out:
return inti;
}
-#define SCCB_MASK 0xFFFFFFF8
-#define SCCB_EVENT_PENDING 0x3
-
static int __inject_service(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti)
{
@@ -1692,6 +1816,11 @@ static int __inject_service(struct kvm *kvm,
kvm->stat.inject_service_signal++;
spin_lock(&fi->lock);
fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
+
+ /* We always allow events, track them separately from the sccb ints */
+ if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING)
+ set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+
/*
* Early versions of the QEMU s390 bios will inject several
* service interrupts after another without handling a
@@ -1773,6 +1902,12 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
kvm->stat.inject_io++;
isc = int_word_to_isc(inti->io.io_int_word);
+ /*
+ * We do not use the lock checking variant as this is just a
+ * performance optimization and we do not hold the lock here.
+ * This is ok as the code will pick interrupts from both "lists"
+ * for delivery.
+ */
if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) {
VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
gisa_set_ipm_gisc(gi->origin, isc);
@@ -1834,7 +1969,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
if (!(type & KVM_S390_INT_IO_AI_MASK &&
- kvm->arch.gisa_int.origin))
+ kvm->arch.gisa_int.origin) ||
+ kvm_s390_pv_cpu_get_handle(dst_vcpu))
kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
break;
default:
@@ -1881,7 +2017,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti;
int rc;
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -1981,6 +2117,13 @@ int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu)
return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
}
+int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+
+ return test_bit(IRQ_PEND_RESTART, &li->pending_irqs);
+}
+
void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -2080,6 +2223,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
int i;
+ mutex_lock(&kvm->lock);
+ if (!kvm_s390_pv_is_protected(kvm))
+ fi->masked_irqs = 0;
+ mutex_unlock(&kvm->lock);
spin_lock(&fi->lock);
fi->pending_irqs = 0;
memset(&fi->srv_signal, 0, sizeof(fi->srv_signal));
@@ -2146,7 +2293,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
n++;
}
}
- if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) {
+ if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) ||
+ test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) {
if (n == max_irqs) {
/* signal userspace to try again */
ret = -ENOMEM;
@@ -2275,7 +2423,7 @@ static int enqueue_floating_irq(struct kvm_device *dev,
return -EINVAL;
while (len >= sizeof(struct kvm_s390_irq)) {
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2323,13 +2471,10 @@ static int register_io_adapter(struct kvm_device *dev,
if (dev->kvm->arch.adapters[adapter_info.id] != NULL)
return -EINVAL;
- adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+ adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT);
if (!adapter)
return -ENOMEM;
- INIT_LIST_HEAD(&adapter->maps);
- init_rwsem(&adapter->maps_lock);
- atomic_set(&adapter->nr_maps, 0);
adapter->id = adapter_info.id;
adapter->isc = adapter_info.isc;
adapter->maskable = adapter_info.maskable;
@@ -2354,87 +2499,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
return ret;
}
-static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
-{
- struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
- struct s390_map_info *map;
- int ret;
-
- if (!adapter || !addr)
- return -EINVAL;
-
- map = kzalloc(sizeof(*map), GFP_KERNEL);
- if (!map) {
- ret = -ENOMEM;
- goto out;
- }
- INIT_LIST_HEAD(&map->list);
- map->guest_addr = addr;
- map->addr = gmap_translate(kvm->arch.gmap, addr);
- if (map->addr == -EFAULT) {
- ret = -EFAULT;
- goto out;
- }
- ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
- if (ret < 0)
- goto out;
- BUG_ON(ret != 1);
- down_write(&adapter->maps_lock);
- if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
- list_add_tail(&map->list, &adapter->maps);
- ret = 0;
- } else {
- put_page(map->page);
- ret = -EINVAL;
- }
- up_write(&adapter->maps_lock);
-out:
- if (ret)
- kfree(map);
- return ret;
-}
-
-static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
-{
- struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
- struct s390_map_info *map, *tmp;
- int found = 0;
-
- if (!adapter || !addr)
- return -EINVAL;
-
- down_write(&adapter->maps_lock);
- list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
- if (map->guest_addr == addr) {
- found = 1;
- atomic_dec(&adapter->nr_maps);
- list_del(&map->list);
- put_page(map->page);
- kfree(map);
- break;
- }
- }
- up_write(&adapter->maps_lock);
-
- return found ? 0 : -EINVAL;
-}
-
void kvm_s390_destroy_adapters(struct kvm *kvm)
{
int i;
- struct s390_map_info *map, *tmp;
- for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
- if (!kvm->arch.adapters[i])
- continue;
- list_for_each_entry_safe(map, tmp,
- &kvm->arch.adapters[i]->maps, list) {
- list_del(&map->list);
- put_page(map->page);
- kfree(map);
- }
+ for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
kfree(kvm->arch.adapters[i]);
- }
}
static int modify_io_adapter(struct kvm_device *dev,
@@ -2456,11 +2526,14 @@ static int modify_io_adapter(struct kvm_device *dev,
if (ret > 0)
ret = 0;
break;
+ /*
+ * The following operations are no longer needed and therefore no-ops.
+ * The gpa to hva translation is done when an IRQ route is set up. The
+ * set_irq code uses get_user_pages_remote() to do the actual write.
+ */
case KVM_S390_IO_ADAPTER_MAP:
- ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
- break;
case KVM_S390_IO_ADAPTER_UNMAP:
- ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+ ret = 0;
break;
default:
ret = -EINVAL;
@@ -2595,7 +2668,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
int r = 0;
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
switch (attr->group) {
@@ -2699,19 +2772,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
}
-static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
- u64 addr)
+static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
{
- struct s390_map_info *map;
+ struct page *page = NULL;
- if (!adapter)
- return NULL;
-
- list_for_each_entry(map, &adapter->maps, list) {
- if (map->guest_addr == addr)
- return map;
- }
- return NULL;
+ mmap_read_lock(kvm->mm);
+ get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
+ &page, NULL, NULL);
+ mmap_read_unlock(kvm->mm);
+ return page;
}
static int adapter_indicators_set(struct kvm *kvm,
@@ -2720,30 +2789,35 @@ static int adapter_indicators_set(struct kvm *kvm,
{
unsigned long bit;
int summary_set, idx;
- struct s390_map_info *info;
+ struct page *ind_page, *summary_page;
void *map;
- info = get_map_info(adapter, adapter_int->ind_addr);
- if (!info)
+ ind_page = get_map_page(kvm, adapter_int->ind_addr);
+ if (!ind_page)
return -1;
- map = page_address(info->page);
- bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
- set_bit(bit, map);
- idx = srcu_read_lock(&kvm->srcu);
- mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
- set_page_dirty_lock(info->page);
- info = get_map_info(adapter, adapter_int->summary_addr);
- if (!info) {
- srcu_read_unlock(&kvm->srcu, idx);
+ summary_page = get_map_page(kvm, adapter_int->summary_addr);
+ if (!summary_page) {
+ put_page(ind_page);
return -1;
}
- map = page_address(info->page);
- bit = get_ind_bit(info->addr, adapter_int->summary_offset,
- adapter->swap);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ map = page_address(ind_page);
+ bit = get_ind_bit(adapter_int->ind_addr,
+ adapter_int->ind_offset, adapter->swap);
+ set_bit(bit, map);
+ mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(ind_page);
+ map = page_address(summary_page);
+ bit = get_ind_bit(adapter_int->summary_addr,
+ adapter_int->summary_offset, adapter->swap);
summary_set = test_and_set_bit(bit, map);
- mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
- set_page_dirty_lock(info->page);
+ mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(summary_page);
srcu_read_unlock(&kvm->srcu, idx);
+
+ put_page(ind_page);
+ put_page(summary_page);
return summary_set ? 0 : 1;
}
@@ -2765,9 +2839,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
adapter = get_io_adapter(kvm, e->adapter.adapter_id);
if (!adapter)
return -1;
- down_read(&adapter->maps_lock);
ret = adapter_indicators_set(kvm, adapter, &e->adapter);
- up_read(&adapter->maps_lock);
if ((ret > 0) && !adapter->masked) {
ret = kvm_s390_inject_airq(kvm, adapter);
if (ret == 0)
@@ -2818,23 +2890,27 @@ int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
- int ret;
+ u64 uaddr;
switch (ue->type) {
+ /* we store the userspace addresses instead of the guest addresses */
case KVM_IRQ_ROUTING_S390_ADAPTER:
e->set = set_adapter_int;
- e->adapter.summary_addr = ue->u.adapter.summary_addr;
- e->adapter.ind_addr = ue->u.adapter.ind_addr;
+ uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
+ if (uaddr == -EFAULT)
+ return -EFAULT;
+ e->adapter.summary_addr = uaddr;
+ uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
+ if (uaddr == -EFAULT)
+ return -EFAULT;
+ e->adapter.ind_addr = uaddr;
e->adapter.summary_offset = ue->u.adapter.summary_offset;
e->adapter.ind_offset = ue->u.adapter.ind_offset;
e->adapter.adapter_id = ue->u.adapter.adapter_id;
- ret = 0;
- break;
+ return 0;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
-
- return ret;
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
@@ -2983,18 +3059,19 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
+ u8 vcpu_isc_mask;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
- deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
- if (deliverable_mask) {
+ vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
+ if (deliverable_mask & vcpu_isc_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
@@ -3015,7 +3092,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
__airqs_kick_single_vcpu(kvm, pending_mask);
hrtimer_forward_now(timer, ns_to_ktime(gi->expires));
return HRTIMER_RESTART;
- };
+ }
return HRTIMER_NORESTART;
}
@@ -3095,9 +3172,33 @@ void kvm_s390_gisa_init(struct kvm *kvm)
VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
}
+void kvm_s390_gisa_enable(struct kvm *kvm)
+{
+ struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u32 gisa_desc;
+
+ if (gi->origin)
+ return;
+ kvm_s390_gisa_init(kvm);
+ gisa_desc = kvm_s390_get_gisa_desc(kvm);
+ if (!gisa_desc)
+ return;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ vcpu->arch.sie_block->gd = gisa_desc;
+ vcpu->arch.sie_block->eca |= ECA_AIV;
+ VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
+ vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
void kvm_s390_gisa_destroy(struct kvm *kvm)
{
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_s390_gisa *gisa = gi->origin;
if (!gi->origin)
return;
@@ -3108,6 +3209,25 @@ void kvm_s390_gisa_destroy(struct kvm *kvm)
cpu_relax();
hrtimer_cancel(&gi->timer);
gi->origin = NULL;
+ VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa);
+}
+
+void kvm_s390_gisa_disable(struct kvm *kvm)
+{
+ struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!gi->origin)
+ return;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ vcpu->arch.sie_block->eca &= ~ECA_AIV;
+ vcpu->arch.sie_block->gd = 0U;
+ mutex_unlock(&vcpu->mutex);
+ VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id);
+ }
+ kvm_s390_gisa_destroy(kvm);
}
/**
@@ -3193,10 +3313,87 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
-static void gib_alert_irq_handler(struct airq_struct *airq, bool floating)
+static void aen_host_forward(unsigned long si)
+{
+ struct kvm_s390_gisa_interrupt *gi;
+ struct zpci_gaite *gaite;
+ struct kvm *kvm;
+
+ gaite = (struct zpci_gaite *)aift->gait +
+ (si * sizeof(struct zpci_gaite));
+ if (gaite->count == 0)
+ return;
+ if (gaite->aisb != 0)
+ set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb));
+
+ kvm = kvm_s390_pci_si_to_kvm(aift, si);
+ if (!kvm)
+ return;
+ gi = &kvm->arch.gisa_int;
+
+ if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) ||
+ !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) {
+ gisa_set_ipm_gisc(gi->origin, gaite->gisc);
+ if (hrtimer_active(&gi->timer))
+ hrtimer_cancel(&gi->timer);
+ hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
+ kvm->stat.aen_forward++;
+ }
+}
+
+static void aen_process_gait(u8 isc)
+{
+ bool found = false, first = true;
+ union zpci_sic_iib iib = {{0}};
+ unsigned long si, flags;
+
+ spin_lock_irqsave(&aift->gait_lock, flags);
+
+ if (!aift->gait) {
+ spin_unlock_irqrestore(&aift->gait_lock, flags);
+ return;
+ }
+
+ for (si = 0;;) {
+ /* Scan adapter summary indicator bit vector */
+ si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv));
+ if (si == -1UL) {
+ if (first || found) {
+ /* Re-enable interrupts. */
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc,
+ &iib);
+ first = found = false;
+ } else {
+ /* Interrupts on and all bits processed */
+ break;
+ }
+ found = false;
+ si = 0;
+ /* Scan again after re-enabling interrupts */
+ continue;
+ }
+ found = true;
+ aen_host_forward(si);
+ }
+
+ spin_unlock_irqrestore(&aift->gait_lock, flags);
+}
+
+static void gib_alert_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;
+
inc_irq_stat(IRQIO_GAL);
- process_gib_alert_list();
+
+ if ((info->forward || info->error) &&
+ IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ aen_process_gait(info->isc);
+ if (info->aism != 0)
+ process_gib_alert_list();
+ } else {
+ process_gib_alert_list();
+ }
}
static struct airq_struct gib_alert_irq = {
@@ -3208,6 +3405,11 @@ void kvm_s390_gib_destroy(void)
{
if (!gib)
return;
+ if (kvm_s390_pci_interp_allowed() && aift) {
+ mutex_lock(&aift->aift_lock);
+ kvm_s390_pci_aen_exit();
+ mutex_unlock(&aift->aift_lock);
+ }
chsc_sgib(0);
unregister_adapter_interrupt(&gib_alert_irq);
free_page((unsigned long)gib);
@@ -3223,7 +3425,7 @@ int kvm_s390_gib_init(u8 nisc)
goto out;
}
- gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!gib) {
rc = -ENOMEM;
goto out;
@@ -3245,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc)
goto out_unreg_gal;
}
+ if (kvm_s390_pci_interp_allowed()) {
+ if (kvm_s390_pci_aen_init(nisc)) {
+ pr_err("Initializing AEN for PCI failed\n");
+ rc = -EIO;
+ goto out_unreg_gal;
+ }
+ }
+
KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
goto out;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c2e6d4ba4e23..bc491a73815c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2,11 +2,10 @@
/*
* hosting IBM Z kernel virtual machines (s390x)
*
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Christian Ehrhardt <ehrhardt@de.ibm.com>
* Jason J. Herne <jjherne@us.ibm.com>
*/
@@ -31,11 +30,12 @@
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
#include <linux/string.h>
+#include <linux/pgtable.h>
+#include <linux/mmu_notifier.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>
@@ -44,8 +44,11 @@
#include <asm/cpacf.h>
#include <asm/timex.h>
#include <asm/ap.h>
+#include <asm/uv.h>
+#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "pci.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -56,118 +59,130 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "userspace_handled", VCPU_STAT(exit_userspace) },
- { "exit_null", VCPU_STAT(exit_null) },
- { "exit_validity", VCPU_STAT(exit_validity) },
- { "exit_stop_request", VCPU_STAT(exit_stop_request) },
- { "exit_external_request", VCPU_STAT(exit_external_request) },
- { "exit_io_request", VCPU_STAT(exit_io_request) },
- { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
- { "exit_instruction", VCPU_STAT(exit_instruction) },
- { "exit_pei", VCPU_STAT(exit_pei) },
- { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
- { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
- { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
- { "instruction_lctl", VCPU_STAT(instruction_lctl) },
- { "instruction_stctl", VCPU_STAT(instruction_stctl) },
- { "instruction_stctg", VCPU_STAT(instruction_stctg) },
- { "deliver_ckc", VCPU_STAT(deliver_ckc) },
- { "deliver_cputm", VCPU_STAT(deliver_cputm) },
- { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
- { "deliver_external_call", VCPU_STAT(deliver_external_call) },
- { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
- { "deliver_virtio", VCPU_STAT(deliver_virtio) },
- { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
- { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
- { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
- { "deliver_program", VCPU_STAT(deliver_program) },
- { "deliver_io", VCPU_STAT(deliver_io) },
- { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
- { "exit_wait_state", VCPU_STAT(exit_wait_state) },
- { "inject_ckc", VCPU_STAT(inject_ckc) },
- { "inject_cputm", VCPU_STAT(inject_cputm) },
- { "inject_external_call", VCPU_STAT(inject_external_call) },
- { "inject_float_mchk", VM_STAT(inject_float_mchk) },
- { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
- { "inject_io", VM_STAT(inject_io) },
- { "inject_mchk", VCPU_STAT(inject_mchk) },
- { "inject_pfault_done", VM_STAT(inject_pfault_done) },
- { "inject_program", VCPU_STAT(inject_program) },
- { "inject_restart", VCPU_STAT(inject_restart) },
- { "inject_service_signal", VM_STAT(inject_service_signal) },
- { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
- { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
- { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
- { "inject_virtio", VM_STAT(inject_virtio) },
- { "instruction_epsw", VCPU_STAT(instruction_epsw) },
- { "instruction_gs", VCPU_STAT(instruction_gs) },
- { "instruction_io_other", VCPU_STAT(instruction_io_other) },
- { "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
- { "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
- { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
- { "instruction_ptff", VCPU_STAT(instruction_ptff) },
- { "instruction_stidp", VCPU_STAT(instruction_stidp) },
- { "instruction_sck", VCPU_STAT(instruction_sck) },
- { "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
- { "instruction_spx", VCPU_STAT(instruction_spx) },
- { "instruction_stpx", VCPU_STAT(instruction_stpx) },
- { "instruction_stap", VCPU_STAT(instruction_stap) },
- { "instruction_iske", VCPU_STAT(instruction_iske) },
- { "instruction_ri", VCPU_STAT(instruction_ri) },
- { "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
- { "instruction_sske", VCPU_STAT(instruction_sske) },
- { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
- { "instruction_essa", VCPU_STAT(instruction_essa) },
- { "instruction_stsi", VCPU_STAT(instruction_stsi) },
- { "instruction_stfl", VCPU_STAT(instruction_stfl) },
- { "instruction_tb", VCPU_STAT(instruction_tb) },
- { "instruction_tpi", VCPU_STAT(instruction_tpi) },
- { "instruction_tprot", VCPU_STAT(instruction_tprot) },
- { "instruction_tsch", VCPU_STAT(instruction_tsch) },
- { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
- { "instruction_sie", VCPU_STAT(instruction_sie) },
- { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
- { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
- { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
- { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
- { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
- { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
- { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
- { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
- { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
- { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
- { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
- { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
- { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
- { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
- { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
- { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
- { "instruction_diag_10", VCPU_STAT(diagnose_10) },
- { "instruction_diag_44", VCPU_STAT(diagnose_44) },
- { "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
- { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
- { "instruction_diag_258", VCPU_STAT(diagnose_258) },
- { "instruction_diag_308", VCPU_STAT(diagnose_308) },
- { "instruction_diag_500", VCPU_STAT(diagnose_500) },
- { "instruction_diag_other", VCPU_STAT(diagnose_other) },
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = { /* VM-scope stat descriptors: generic entries first, then the counters below */
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, inject_io),
+ STATS_DESC_COUNTER(VM, inject_float_mchk),
+ STATS_DESC_COUNTER(VM, inject_pfault_done),
+ STATS_DESC_COUNTER(VM, inject_service_signal),
+ STATS_DESC_COUNTER(VM, inject_virtio),
+ STATS_DESC_COUNTER(VM, aen_forward) /* has no counterpart in the removed debugfs_entries[] table */
};
-struct kvm_s390_tod_clock_ext {
- __u8 epoch_idx;
- __u64 tod;
- __u8 reserved[7];
-} __packed;
+const struct kvm_stats_header kvm_vm_stats_header = { /* layout description for the VM stats: header, id, descriptors, data */
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), /* tracks the array above automatically */
+ .id_offset = sizeof(struct kvm_stats_header), /* id starts right after the header */
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, /* KVM_STATS_NAME_SIZE bytes are left between id_offset and the descriptors */
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc), /* data starts right after the whole descriptor array */
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { /* vCPU-scope stat descriptors: generic entries first, then one counter per line */
+ KVM_GENERIC_VCPU_STATS(), /* NOTE(review): halt_successful_poll, halt_attempted_poll, halt_poll_invalid and halt_wakeup from the removed table are not listed below — presumably supplied here; confirm */
+ STATS_DESC_COUNTER(VCPU, exit_userspace),
+ STATS_DESC_COUNTER(VCPU, exit_null),
+ STATS_DESC_COUNTER(VCPU, exit_external_request),
+ STATS_DESC_COUNTER(VCPU, exit_io_request),
+ STATS_DESC_COUNTER(VCPU, exit_external_interrupt),
+ STATS_DESC_COUNTER(VCPU, exit_stop_request),
+ STATS_DESC_COUNTER(VCPU, exit_validity),
+ STATS_DESC_COUNTER(VCPU, exit_instruction),
+ STATS_DESC_COUNTER(VCPU, exit_pei),
+ STATS_DESC_COUNTER(VCPU, halt_no_poll_steal),
+ STATS_DESC_COUNTER(VCPU, instruction_lctl),
+ STATS_DESC_COUNTER(VCPU, instruction_lctlg),
+ STATS_DESC_COUNTER(VCPU, instruction_stctl),
+ STATS_DESC_COUNTER(VCPU, instruction_stctg),
+ STATS_DESC_COUNTER(VCPU, exit_program_interruption),
+ STATS_DESC_COUNTER(VCPU, exit_instr_and_program),
+ STATS_DESC_COUNTER(VCPU, exit_operation_exception),
+ STATS_DESC_COUNTER(VCPU, deliver_ckc),
+ STATS_DESC_COUNTER(VCPU, deliver_cputm),
+ STATS_DESC_COUNTER(VCPU, deliver_external_call),
+ STATS_DESC_COUNTER(VCPU, deliver_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_service_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_virtio),
+ STATS_DESC_COUNTER(VCPU, deliver_stop_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_prefix_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_restart_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_program),
+ STATS_DESC_COUNTER(VCPU, deliver_io),
+ STATS_DESC_COUNTER(VCPU, deliver_machine_check),
+ STATS_DESC_COUNTER(VCPU, exit_wait_state),
+ STATS_DESC_COUNTER(VCPU, inject_ckc),
+ STATS_DESC_COUNTER(VCPU, inject_cputm),
+ STATS_DESC_COUNTER(VCPU, inject_external_call),
+ STATS_DESC_COUNTER(VCPU, inject_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, inject_mchk),
+ STATS_DESC_COUNTER(VCPU, inject_pfault_init),
+ STATS_DESC_COUNTER(VCPU, inject_program),
+ STATS_DESC_COUNTER(VCPU, inject_restart),
+ STATS_DESC_COUNTER(VCPU, inject_set_prefix),
+ STATS_DESC_COUNTER(VCPU, inject_stop_signal),
+ STATS_DESC_COUNTER(VCPU, instruction_epsw),
+ STATS_DESC_COUNTER(VCPU, instruction_gs),
+ STATS_DESC_COUNTER(VCPU, instruction_io_other),
+ STATS_DESC_COUNTER(VCPU, instruction_lpsw),
+ STATS_DESC_COUNTER(VCPU, instruction_lpswe),
+ STATS_DESC_COUNTER(VCPU, instruction_pfmf),
+ STATS_DESC_COUNTER(VCPU, instruction_ptff),
+ STATS_DESC_COUNTER(VCPU, instruction_sck),
+ STATS_DESC_COUNTER(VCPU, instruction_sckpf),
+ STATS_DESC_COUNTER(VCPU, instruction_stidp),
+ STATS_DESC_COUNTER(VCPU, instruction_spx),
+ STATS_DESC_COUNTER(VCPU, instruction_stpx),
+ STATS_DESC_COUNTER(VCPU, instruction_stap),
+ STATS_DESC_COUNTER(VCPU, instruction_iske),
+ STATS_DESC_COUNTER(VCPU, instruction_ri),
+ STATS_DESC_COUNTER(VCPU, instruction_rrbe),
+ STATS_DESC_COUNTER(VCPU, instruction_sske),
+ STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock),
+ STATS_DESC_COUNTER(VCPU, instruction_stsi),
+ STATS_DESC_COUNTER(VCPU, instruction_stfl),
+ STATS_DESC_COUNTER(VCPU, instruction_tb),
+ STATS_DESC_COUNTER(VCPU, instruction_tpi),
+ STATS_DESC_COUNTER(VCPU, instruction_tprot),
+ STATS_DESC_COUNTER(VCPU, instruction_tsch),
+ STATS_DESC_COUNTER(VCPU, instruction_sie),
+ STATS_DESC_COUNTER(VCPU, instruction_essa),
+ STATS_DESC_COUNTER(VCPU, instruction_sthyi),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_start),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_arch),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_restart),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_10),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_44),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c),
+ STATS_DESC_COUNTER(VCPU, diag_9c_ignored),
+ STATS_DESC_COUNTER(VCPU, diag_9c_forward), /* has no counterpart in the removed debugfs_entries[] table */
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_258),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
+ STATS_DESC_COUNTER(VCPU, pfault_sync) /* has no counterpart in the removed debugfs_entries[] table */
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = { /* same layout as kvm_vm_stats_header, sized from the vCPU descriptor array */
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), /* tracks the array above automatically */
+ .id_offset = sizeof(struct kvm_stats_header), /* id starts right after the header */
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, /* KVM_STATS_NAME_SIZE bytes are left between id_offset and the descriptors */
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc), /* data starts right after the whole descriptor array */
+};
/* allow nested virtualization in KVM (if enabled by user space) */
static int nested;
@@ -184,6 +199,16 @@ static u8 halt_poll_max_steal = 10;
module_param(halt_poll_max_steal, byte, 0644);
MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling");
+/* If true (the default), the GISA is initialized and used when available. */
+static bool use_gisa = true;
+module_param(use_gisa, bool, 0644);
+MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+
+/* Maximum diag9c forwarding per second; not static, no initializer, so it starts at 0 (off). */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -207,7 +232,7 @@ static unsigned long kvm_s390_fac_size(void)
BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64);
BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64);
BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) >
- sizeof(S390_lowcore.stfle_fac_list));
+ sizeof(stfle_fac_list));
return SIZE_INTERNAL;
}
@@ -220,6 +245,7 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
static struct gmap_notifier gmap_notifier;
static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
+debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
int kvm_arch_hardware_enable(void)
@@ -228,13 +254,15 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-int kvm_arch_check_processor_compat(void)
+int kvm_arch_check_processor_compat(void *opaque)
{
return 0; /* opaque is never read; the check always reports success */
}
+/* forward declarations */
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
+static int sca_switch_to_extended(struct kvm *kvm);
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
{
@@ -269,7 +297,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
unsigned long long *delta = v;
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -293,7 +321,7 @@ static struct notifier_block kvm_clock_notifier = {
.notifier_call = kvm_clock_sync,
};
-int kvm_arch_hardware_setup(void)
+int kvm_arch_hardware_setup(void *opaque)
{
gmap_notifier.notifier_call = kvm_gmap_notifier;
gmap_register_pte_notifier(&gmap_notifier);
@@ -319,31 +347,31 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{ /* Returns 1 when the PLO instruction issued for nr leaves cc == 0, otherwise 0. */
- register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ unsigned long function = (unsigned long)nr | 0x100; /* nr with bit 0x100 set; presumably selects the "test bit" form mentioned below — confirm */
int cc;
asm volatile(
+ " lgr 0,%[function]\n" /* r0 is loaded by hand, hence the "0" clobber below */
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
" ipm %0\n" /* fetch the condition code into cc ... */
" srl %0,28\n" /* ... and shift it down so it can be compared as a small integer */
: "=d" (cc)
- : "d" (r0)
- : "cc");
+ : [function] "d" (function)
+ : "cc", "0");
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
{ /* Issues the instruction encoded by opcode with r0 = 0 and r1 = query; nothing is returned. */
- register unsigned long r0 asm("0") = 0; /* query function */
- register unsigned long r1 asm("1") = (unsigned long) query;
-
asm volatile(
- /* Parameter regs are ignored */
+ " lghi 0,0\n" /* r0 = 0 */
+ " lgr 1,%[query]\n" /* r1 = address passed in query */
+ /* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
:
- : "d" (r0), "a" (r1), [opc] "i" (opcode)
- : "cc", "memory");
+ : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
+ : "cc", "memory", "0", "1"); /* r0/r1 are written by hand above; "memory" presumably covers the store through r1 — confirm */
}
#define INSN_SORTL 0xb938
@@ -460,7 +488,12 @@ int kvm_arch_init(void *opaque)
if (!kvm_s390_dbf)
return -ENOMEM;
- if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view))
+ kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long));
+ if (!kvm_s390_dbf_uv)
+ goto out;
+
+ if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) ||
+ debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view))
goto out;
kvm_s390_cpu_feat_init();
@@ -472,6 +505,14 @@ int kvm_arch_init(void *opaque)
goto out;
}
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ rc = kvm_s390_pci_init();
+ if (rc) {
+ pr_err("Unable to allocate AIFT for PCI\n");
+ goto out;
+ }
+ }
+
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
goto out;
@@ -486,7 +527,10 @@ out:
void kvm_arch_exit(void)
{
kvm_s390_gib_destroy(); /* runs before the PCI teardown: its visible hunk still uses aift under aift_lock */
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_exit(); /* mirrors the kvm_s390_pci_init() call guarded the same way in kvm_arch_init() */
debug_unregister(kvm_s390_dbf);
+ debug_unregister(kvm_s390_dbf_uv); /* second debug area, registered as "kvm-uv" in kvm_arch_init() */
}
/* Section: device related */
@@ -530,8 +574,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_AIS:
case KVM_CAP_S390_AIS_MIGRATION:
case KVM_CAP_S390_VCPU_RESETS:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_S390_DIAG318:
+ case KVM_CAP_S390_MEM_OP_EXTENSION:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ r = KVM_GUESTDBG_VALID_MASK;
+ break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage && !kvm_is_ucontrol(kvm))
@@ -548,6 +598,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
else if (sclp.has_esca && sclp.has_64bscao)
r = KVM_S390_ESCA_CPU_SLOTS;
+ if (ext == KVM_CAP_NR_VCPUS)
+ r = min_t(unsigned int, num_online_cpus(), r);
break;
case KVM_CAP_S390_COW:
r = MACHINE_HAS_ESOP;
@@ -564,14 +616,42 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_BPB:
r = test_facility(82);
break;
+ case KVM_CAP_S390_PROTECTED:
+ r = is_prot_virt_host();
+ break;
+ case KVM_CAP_S390_PROTECTED_DUMP: {
+ u64 pv_cmds_dump[] = {
+ BIT_UVC_CMD_DUMP_INIT,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE,
+ BIT_UVC_CMD_DUMP_CPU,
+ BIT_UVC_CMD_DUMP_COMPLETE,
+ };
+ int i;
+
+ r = is_prot_virt_host();
+
+ for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) {
+ if (!test_bit_inv(pv_cmds_dump[i],
+ (unsigned long *)&uv_info.inst_calls_list)) {
+ r = 0;
+ break;
+ }
+ }
+ break;
+ }
+ case KVM_CAP_S390_ZPCI_OP:
+ r = kvm_s390_pci_interp_allowed();
+ break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = test_facility(11);
+ break;
default:
r = 0;
}
return r;
}
-static void kvm_s390_sync_dirty_log(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
int i;
gfn_t cur_gfn, last_gfn;
@@ -612,9 +692,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
{
int r;
unsigned long n;
- struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ int is_dirty;
if (kvm_is_ucontrol(kvm))
return -EINVAL;
@@ -625,14 +704,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
- slots = kvm_memslots(kvm);
- memslot = id_to_memslot(slots, log->slot);
- r = -ENOENT;
- if (!memslot->dirty_bitmap)
- goto out;
-
- kvm_s390_sync_dirty_log(kvm, memslot);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
+ r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
if (r)
goto out;
@@ -649,7 +721,7 @@ out:
static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -698,6 +770,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
set_kvm_facility(kvm->arch.model.fac_mask, 152);
set_kvm_facility(kvm->arch.model.fac_list, 152);
}
+ if (test_facility(192)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 192);
+ set_kvm_facility(kvm->arch.model.fac_list, 192);
+ }
r = 0;
} else
r = -EINVAL;
@@ -754,9 +830,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
r = -EINVAL;
else {
r = 0;
- down_write(&kvm->mm->mmap_sem);
+ mmap_write_lock(kvm->mm);
kvm->mm->context.allow_gmap_hpage_1m = 1;
- up_write(&kvm->mm->mmap_sem);
+ mmap_write_unlock(kvm->mm);
/*
* We might have to create fake 4k page
* tables. To avoid that the hardware works on
@@ -780,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
icpt_operexc_on_all_vcpus(kvm);
r = 0;
break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = -EINVAL;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus) {
+ r = -EBUSY;
+ } else if (test_facility(11)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 11);
+ set_kvm_facility(kvm->arch.model.fac_list, 11);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
+ r ? "(not available)" : "(success)");
+ break;
default:
r = -EINVAL;
break;
@@ -899,7 +989,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_s390_vcpu_block_all(kvm);
@@ -982,9 +1072,45 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
return 0;
}
+static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu)
+{ /* Sets the zPCI-related ECB2/ECB3 bits in this vCPU's SIE block; no-op until the VM has use_zpci_interp set. */
+ /* Only set the ECB bits after guest requests zPCI interpretation */
+ if (!vcpu->kvm->arch.use_zpci_interp)
+ return;
+
+ vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI;
+ vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; /* NOTE(review): '+' equals '|' only if the two flags are distinct bits — confirm */
+}
+
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm)
+{ /* Marks the VM as using zPCI interpretation and applies the setting to every existing vCPU. */
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ lockdep_assert_held(&kvm->lock); /* caller must already hold kvm->lock */
+
+ if (!kvm_s390_pci_interp_allowed())
+ return; /* nothing is changed when interpretation is not allowed */
+
+ /*
+ * If host is configured for PCI and the necessary facilities are
+ * available, turn on interpretation for the life of this guest
+ */
+ kvm->arch.use_zpci_interp = 1; /* set before the loop: kvm_s390_vcpu_pci_setup() returns early without it */
+
+ kvm_s390_vcpu_block_all(kvm); /* vCPUs stay blocked while their SIE blocks are modified */
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_vcpu_pci_setup(vcpu);
+ kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
+ }
+
+ kvm_s390_vcpu_unblock_all(kvm);
+}
+
static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
{
- int cx;
+ unsigned long cx;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(cx, vcpu, kvm)
@@ -1000,13 +1126,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
struct kvm_memory_slot *ms;
struct kvm_memslots *slots;
unsigned long ram_pages = 0;
- int slotnr;
+ int bkt;
/* migration mode already enabled */
if (kvm->arch.migration_mode)
return 0;
slots = kvm_memslots(kvm);
- if (!slots || !slots->used_slots)
+ if (!slots || kvm_memslots_empty(slots))
return -EINVAL;
if (!kvm->arch.use_cmma) {
@@ -1014,8 +1140,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
return 0;
}
/* mark all the pages in active slots as dirty */
- for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
- ms = slots->memslots + slotnr;
+ kvm_for_each_memslot(ms, bkt, slots) {
if (!ms->dirty_bitmap)
return -EINVAL;
/*
@@ -1082,6 +1207,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm,
return 0;
}
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
+
static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_s390_vm_tod_clock gtod;
@@ -1091,7 +1218,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx)
return -EINVAL;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
gtod.epoch_idx, gtod.tod);
@@ -1122,7 +1249,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
sizeof(gtod.tod)))
return -EFAULT;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod);
return 0;
}
@@ -1134,6 +1261,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
if (attr->flags)
return -EINVAL;
+ mutex_lock(&kvm->lock);
+ /*
+ * For protected guests, the TOD is managed by the ultravisor, so trying
+ * to change it will never bring the expected results.
+ */
+ if (kvm_s390_pv_is_protected(kvm)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
switch (attr->attr) {
case KVM_S390_VM_TOD_EXT:
ret = kvm_s390_set_tod_ext(kvm, attr);
@@ -1148,23 +1285,26 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -ENXIO;
break;
}
+
+out_unlock:
+ mutex_unlock(&kvm->lock);
return ret;
}
static void kvm_s390_get_tod_clock(struct kvm *kvm,
struct kvm_s390_vm_tod_clock *gtod)
{
- struct kvm_s390_tod_clock_ext htod;
+ union tod_clock clk;
preempt_disable();
- get_tod_clock_ext((char *)&htod);
+ store_tod_clock_ext(&clk);
- gtod->tod = htod.tod + kvm->arch.epoch;
+ gtod->tod = clk.tod + kvm->arch.epoch;
gtod->epoch_idx = 0;
if (test_kvm_facility(kvm, 139)) {
- gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx;
- if (gtod->tod < htod.tod)
+ gtod->epoch_idx = clk.ei + kvm->arch.epdx;
+ if (gtod->tod < clk.tod)
gtod->epoch_idx += 1;
}
@@ -1244,7 +1384,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -EBUSY;
goto out;
}
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1296,8 +1436,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
mutex_unlock(&kvm->lock);
return -EBUSY;
}
- bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
data.feat[0],
@@ -1406,7 +1545,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_processor *proc;
int ret = 0;
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1434,7 +1573,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_machine *mach;
int ret = 0;
- mach = kzalloc(sizeof(*mach), GFP_KERNEL);
+ mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT);
if (!mach) {
ret = -ENOMEM;
goto out;
@@ -1443,8 +1582,8 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
mach->ibc = sclp.ibc;
memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
- memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.stfle_fac_list));
+ memcpy((unsigned long *)&mach->fac_list, stfle_fac_list,
+ sizeof(stfle_fac_list));
VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx",
kvm->arch.model.ibc,
kvm->arch.model.cpuid);
@@ -1468,8 +1607,7 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm,
{
struct kvm_s390_vm_cpu_feat data;
- bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
return -EFAULT;
VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
@@ -1484,9 +1622,7 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
{
struct kvm_s390_vm_cpu_feat data;
- bitmap_copy((unsigned long *) data.feat,
- kvm_s390_available_cpu_feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
return -EFAULT;
VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
@@ -1659,6 +1795,57 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
}
+/**
+ * kvm_s390_update_topology_change_report - update CPU topology change report
+ * @kvm: guest KVM description
+ * @val: set or clear the MTCR bit
+ *
+ * Updates the Multiprocessor Topology-Change-Report bit to signal
+ * the guest with a topology change.
+ * This is only relevant if the topology facility is present.
+ *
+ * The SCA version, bsca or esca, doesn't matter as offset is the same.
+ */
+static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
+{
+ union sca_utility new, old;
+ struct bsca_block *sca;
+
+ read_lock(&kvm->arch.sca_lock); /* held as reader for as long as kvm->arch.sca is dereferenced */
+ sca = kvm->arch.sca;
+ do {
+ old = READ_ONCE(sca->utility); /* snapshot the whole utility word */
+ new = old;
+ new.mtcr = val; /* only the mtcr field of the copy is changed */
+ } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); /* retry if the word changed since the snapshot */
+ read_unlock(&kvm->arch.sca_lock);
+}
+
+static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{ /* Returns 0 on success, -ENXIO when facility 11 is not enabled for this VM. */
+ if (!test_kvm_facility(kvm, 11))
+ return -ENXIO;
+
+ kvm_s390_update_topology_change_report(kvm, !!attr->attr); /* the new value comes from attr->attr itself (normalized to 0/1); attr->addr is not read */
+ return 0;
+}
+
+static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{ /* Copies the current mtcr value to user space; returns -ENXIO without facility 11, else put_user()'s result. */
+ u8 topo;
+
+ if (!test_kvm_facility(kvm, 11))
+ return -ENXIO;
+
+ read_lock(&kvm->arch.sca_lock); /* held as reader while kvm->arch.sca is dereferenced */
+ topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
+ read_unlock(&kvm->arch.sca_lock);
+
+ return put_user(topo, (u8 __user *)attr->addr); /* a single u8 is written at the user address in attr->addr */
+}
+
static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
@@ -1679,6 +1866,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_set_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_set_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1704,6 +1894,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_get_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_get_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1777,6 +1970,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = 0;
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO;
+ break;
default:
ret = -ENXIO;
break;
@@ -1802,11 +1998,11 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
@@ -1820,7 +2016,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
break;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (!r) {
r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
@@ -1847,7 +2043,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -1864,7 +2060,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
goto out;
i = 0;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
while (i < args->count) {
unlocked = false;
@@ -1882,7 +2078,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r) {
- r = fixup_user_fault(current, current->mm, hva,
+ r = fixup_user_fault(current->mm, hva,
FAULT_FLAG_WRITE, &unlocked);
if (r)
break;
@@ -1891,7 +2087,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
i++;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
out:
kvfree(keys);
return r;
@@ -1906,38 +2102,6 @@ out:
/* for consistency */
#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
-/*
- * Similar to gfn_to_memslot, but returns the index of a memslot also when the
- * address falls in a hole. In that case the index of one of the memslots
- * bordering the hole is returned.
- */
-static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
-{
- int start = 0, end = slots->used_slots;
- int slot = atomic_read(&slots->lru_slot);
- struct kvm_memory_slot *memslots = slots->memslots;
-
- if (gfn >= memslots[slot].base_gfn &&
- gfn < memslots[slot].base_gfn + memslots[slot].npages)
- return slot;
-
- while (start < end) {
- slot = start + (end - start) / 2;
-
- if (gfn >= memslots[slot].base_gfn)
- end = slot;
- else
- start = slot + 1;
- }
-
- if (gfn >= memslots[start].base_gfn &&
- gfn < memslots[start].base_gfn + memslots[start].npages) {
- atomic_set(&slots->lru_slot, start);
- }
-
- return start;
-}
-
static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
u8 *res, unsigned long bufsize)
{
@@ -1961,27 +2125,32 @@ static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
return 0;
}
+static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots,
+ gfn_t gfn)
+{
+ return ____gfn_to_memslot(slots, gfn, true);
+}
+
static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
unsigned long cur_gfn)
{
- int slotidx = gfn_to_memslot_approx(slots, cur_gfn);
- struct kvm_memory_slot *ms = slots->memslots + slotidx;
+ struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
unsigned long ofs = cur_gfn - ms->base_gfn;
+ struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
if (ms->base_gfn + ms->npages <= cur_gfn) {
- slotidx--;
+ mnode = rb_next(mnode);
/* If we are above the highest slot, wrap around */
- if (slotidx < 0)
- slotidx = slots->used_slots - 1;
+ if (!mnode)
+ mnode = rb_first(&slots->gfn_tree);
- ms = slots->memslots + slotidx;
+ ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = 0;
}
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
- while ((slotidx > 0) && (ofs >= ms->npages)) {
- slotidx--;
- ms = slots->memslots + slotidx;
- ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
+ while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
+ ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
+ ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
}
return ms->base_gfn + ofs;
}
@@ -1993,6 +2162,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *ms;
+ if (unlikely(kvm_memslots_empty(slots)))
+ return 0;
+
cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
ms = gfn_to_memslot(kvm, cur_gfn);
args->count = 0;
@@ -2000,7 +2172,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
if (!ms)
return 0;
next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
+ mem_end = kvm_s390_get_gfn_end(slots);
while (args->count < bufsize) {
hva = gfn_to_hva(kvm, cur_gfn);
@@ -2074,14 +2246,14 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
if (!values)
return -ENOMEM;
- down_read(&kvm->mm->mmap_sem);
+ mmap_read_lock(kvm->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
if (peek)
ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
else
ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&kvm->mm->mmap_sem);
+ mmap_read_unlock(kvm->mm);
if (kvm->arch.migration_mode)
args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
@@ -2131,7 +2303,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
goto out;
}
- down_read(&kvm->mm->mmap_sem);
+ mmap_read_lock(kvm->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
@@ -2146,18 +2318,495 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
set_pgste_bits(kvm->mm, hva, mask, pgstev);
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&kvm->mm->mmap_sem);
+ mmap_read_unlock(kvm->mm);
if (!kvm->mm->context.uses_cmm) {
- down_write(&kvm->mm->mmap_sem);
+ mmap_write_lock(kvm->mm);
kvm->mm->context.uses_cmm = 1;
- up_write(&kvm->mm->mmap_sem);
+ mmap_write_unlock(kvm->mm);
}
out:
vfree(bits);
return r;
}
+/**
+ * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to
+ * non protected.
+ * @kvm: the VM whose protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Does not stop in case of error, tries to convert as many
+ * CPUs as possible. In case of error, the RC and RRC of the first error are
+ * returned.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u16 _rc, _rrc;
+ int ret = 0;
+
+ /*
+ * We ignore failures and try to destroy as many CPUs as possible.
+ * At the same time we must not free the assigned resources when
+ * this fails, as the ultravisor has still access to that memory.
+ * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
+ * behind.
+ * We want to return the first failure rc and rrc, though.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) {
+ *rc = _rc;
+ *rrc = _rrc;
+ ret = -EIO;
+ }
+ mutex_unlock(&vcpu->mutex);
+ }
+ /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */
+ if (use_gisa)
+ kvm_s390_gisa_enable(kvm);
+ return ret;
+}
+
+/**
+ * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM
+ * to protected.
+ * @kvm: the VM whose non-protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Tries to undo the conversion in case of error.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ unsigned long i;
+ int r = 0;
+ u16 dummy;
+
+ struct kvm_vcpu *vcpu;
+
+ /* Disable the GISA if the ultravisor does not support AIV. */
+ if (!test_bit_inv(BIT_UV_FEAT_AIV, &uv_info.uv_feature_indications))
+ kvm_s390_gisa_disable(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
+ mutex_unlock(&vcpu->mutex);
+ if (r)
+ break;
+ }
+ if (r)
+ kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+ return r;
+}
+
+/*
+ * Here we provide user space with a direct interface to query UV
+ * related data like UV maxima and available features as well as
+ * feature specific data.
+ *
+ * To facilitate future extension of the data structures we'll try to
+ * write data up to the maximum requested length.
+ */
+static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
+{
+ ssize_t len_min;
+
+ switch (info->header.id) {
+ case KVM_PV_INFO_VM: {
+ len_min = sizeof(info->header) + sizeof(info->vm);
+
+ if (info->header.len_max < len_min)
+ return -EINVAL;
+
+ memcpy(info->vm.inst_calls_list,
+ uv_info.inst_calls_list,
+ sizeof(uv_info.inst_calls_list));
+
+ /* It's max cpuid not max cpus, so it's off by one */
+ info->vm.max_cpus = uv_info.max_guest_cpu_id + 1;
+ info->vm.max_guests = uv_info.max_num_sec_conf;
+ info->vm.max_guest_addr = uv_info.max_sec_stor_addr;
+ info->vm.feature_indication = uv_info.uv_feature_indications;
+
+ return len_min;
+ }
+ case KVM_PV_INFO_DUMP: {
+ len_min = sizeof(info->header) + sizeof(info->dump);
+
+ if (info->header.len_max < len_min)
+ return -EINVAL;
+
+ info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len;
+ info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len;
+ info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len;
+ return len_min;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
+ struct kvm_s390_pv_dmp dmp)
+{
+ int r = -EINVAL;
+ void __user *result_buff = (void __user *)dmp.buff_addr;
+
+ switch (dmp.subcmd) {
+ case KVM_PV_DUMP_INIT: {
+ if (kvm->arch.pv.dumping)
+ break;
+
+ /*
+ * Block SIE entry as concurrent dump UVCs could lead
+ * to validities.
+ */
+ kvm_s390_vcpu_block_all(kvm);
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ if (!r) {
+ kvm->arch.pv.dumping = true;
+ } else {
+ kvm_s390_vcpu_unblock_all(kvm);
+ r = -EINVAL;
+ }
+ break;
+ }
+ case KVM_PV_DUMP_CONFIG_STOR_STATE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ /*
+ * gaddr is an output parameter since we might stop
+ * early. As dmp will be copied back in our caller, we
+ * don't need to do it ourselves.
+ */
+ r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ case KVM_PV_DUMP_COMPLETE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ r = -EINVAL;
+ if (dmp.buff_len < uv_info.conf_dump_finalize_len)
+ break;
+
+ r = kvm_s390_pv_dump_complete(kvm, result_buff,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
+{
+ int r = 0;
+ u16 dummy;
+ void __user *argp = (void __user *)cmd->data;
+
+ switch (cmd->cmd) {
+ case KVM_PV_ENABLE: {
+ r = -EINVAL;
+ if (kvm_s390_pv_is_protected(kvm))
+ break;
+
+ /*
+ * FMT 4 SIE needs esca. As we never switch back to bsca from
+ * esca, we need no cleanup in the error cases below
+ */
+ r = sca_switch_to_extended(kvm);
+ if (r)
+ break;
+
+ mmap_write_lock(current->mm);
+ r = gmap_mark_unmergeable();
+ mmap_write_unlock(current->mm);
+ if (r)
+ break;
+
+ r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
+ if (r)
+ break;
+
+ r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
+ if (r)
+ kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+
+ /* we need to block service interrupts from now on */
+ set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ }
+ case KVM_PV_DISABLE: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+ /*
+ * If a CPU could not be destroyed, destroy VM will also fail.
+ * There is no point in trying to destroy it. Instead return
+ * the rc and rrc from the first CPU that failed destroying.
+ */
+ if (r)
+ break;
+ r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
+
+ /* no need to block service interrupts any more */
+ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ }
+ case KVM_PV_SET_SEC_PARMS: {
+ struct kvm_s390_pv_sec_parm parms = {};
+ void *hdr;
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&parms, argp, sizeof(parms)))
+ break;
+
+ /* Currently restricted to 8KB */
+ r = -EINVAL;
+ if (parms.length > PAGE_SIZE * 2)
+ break;
+
+ r = -ENOMEM;
+ hdr = vmalloc(parms.length);
+ if (!hdr)
+ break;
+
+ r = -EFAULT;
+ if (!copy_from_user(hdr, (void __user *)parms.origin,
+ parms.length))
+ r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
+ &cmd->rc, &cmd->rrc);
+
+ vfree(hdr);
+ break;
+ }
+ case KVM_PV_UNPACK: {
+ struct kvm_s390_pv_unp unp = {};
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&unp, argp, sizeof(unp)))
+ break;
+
+ r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ case KVM_PV_VERIFY: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
+ cmd->rrc);
+ break;
+ }
+ case KVM_PV_PREP_RESET: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ break;
+ }
+ case KVM_PV_UNSHARE_ALL: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ break;
+ }
+ case KVM_PV_INFO: {
+ struct kvm_s390_pv_info info = {};
+ ssize_t data_len;
+
+ /*
+ * No need to check the VM protection here.
+ *
+ * Maybe user space wants to query some of the data
+ * when the VM is still unprotected. If we see the
+ * need to fence a new data command we can still
+ * return an error in the info handler.
+ */
+
+ r = -EFAULT;
+ if (copy_from_user(&info, argp, sizeof(info.header)))
+ break;
+
+ r = -EINVAL;
+ if (info.header.len_max < sizeof(info.header))
+ break;
+
+ data_len = kvm_s390_handle_pv_info(&info);
+ if (data_len < 0) {
+ r = data_len;
+ break;
+ }
+ /*
+ * If a data command struct is extended (multiple
+ * times) this can be used to determine how much of it
+ * is valid.
+ */
+ info.header.len_written = data_len;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &info, data_len))
+ break;
+
+ r = 0;
+ break;
+ }
+ case KVM_PV_DUMP: {
+ struct kvm_s390_pv_dmp dmp;
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&dmp, argp, sizeof(dmp)))
+ break;
+
+ r = kvm_s390_pv_dmp(kvm, cmd, dmp);
+ if (r)
+ break;
+
+ if (copy_to_user(argp, &dmp, sizeof(dmp))) {
+ r = -EFAULT;
+ break;
+ }
+
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ }
+ return r;
+}
+
+static bool access_key_invalid(u8 access_key)
+{
+ return access_key > 0xf;
+}
+
+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ u64 supported_flags;
+ void *tmpbuf = NULL;
+ int r, srcu_idx;
+
+ supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
+ | KVM_S390_MEMOP_F_CHECK_ONLY;
+ if (mop->flags & ~supported_flags || !mop->size)
+ return -EINVAL;
+ if (mop->size > MEM_OP_MAX_SIZE)
+ return -E2BIG;
+ /*
+ * This is technically a heuristic only, if the kvm->lock is not
+ * taken, it is not guaranteed that the vm is/remains non-protected.
+ * This is ok from a kernel perspective, wrongdoing is detected
+ * on the access, -EFAULT is returned and the vm may crash the
+ * next time it accesses the memory in question.
+ * There is no sane usecase to do switching and a memop on two
+ * different CPUs at the same time.
+ */
+ if (kvm_s390_pv_get_handle(kvm))
+ return -EINVAL;
+ if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
+ if (access_key_invalid(mop->key))
+ return -EINVAL;
+ } else {
+ mop->key = 0;
+ }
+ if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
+ tmpbuf = vmalloc(mop->size);
+ if (!tmpbuf)
+ return -ENOMEM;
+ }
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ r = PGM_ADDRESSING;
+ goto out_unlock;
+ }
+
+ switch (mop->op) {
+ case KVM_S390_MEMOP_ABSOLUTE_READ: {
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key);
+ } else {
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_FETCH, mop->key);
+ if (r == 0) {
+ if (copy_to_user(uaddr, tmpbuf, mop->size))
+ r = -EFAULT;
+ }
+ }
+ break;
+ }
+ case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
+ } else {
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+ r = -EFAULT;
+ break;
+ }
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_STORE, mop->key);
+ }
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ vfree(tmpbuf);
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2255,6 +2904,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
mutex_unlock(&kvm->slots_lock);
break;
}
+ case KVM_S390_PV_COMMAND: {
+ struct kvm_pv_cmd args;
+
+ /* protvirt means user cpu state */
+ kvm_s390_set_user_cpu_state_ctrl(kvm);
+ r = 0;
+ if (!is_prot_virt_host()) {
+ r = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ if (args.flags) {
+ r = -EINVAL;
+ break;
+ }
+ mutex_lock(&kvm->lock);
+ r = kvm_s390_handle_pv(kvm, &args);
+ mutex_unlock(&kvm->lock);
+ if (copy_to_user(argp, &args, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ break;
+ }
+ case KVM_S390_MEM_OP: {
+ struct kvm_s390_mem_op mem_op;
+
+ if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
+ r = kvm_s390_vm_mem_op(kvm, &mem_op);
+ else
+ r = -EFAULT;
+ break;
+ }
+ case KVM_S390_ZPCI_OP: {
+ struct kvm_s390_zpci_op args;
+
+ r = -EINVAL;
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ break;
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ r = kvm_s390_pci_zpci_op(kvm, &args);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -2299,12 +2997,26 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm)
kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
}
+/*
+ * kvm_arch_crypto_set_masks
+ *
+ * @kvm: pointer to the target guest's KVM struct containing the crypto masks
+ * to be set.
+ * @apm: the mask identifying the accessible AP adapters
+ * @aqm: the mask identifying the accessible AP domains
+ * @adm: the mask identifying the accessible AP control domains
+ *
+ * Set the masks that identify the adapters, domains and control domains to
+ * which the KVM guest is granted access.
+ *
+ * Note: The kvm->lock mutex must be locked by the caller before invoking this
+ * function.
+ */
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
unsigned long *aqm, unsigned long *adm)
{
struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb;
- mutex_lock(&kvm->lock);
kvm_s390_vcpu_block_all(kvm);
switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) {
@@ -2335,13 +3047,23 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
/* recreate the shadow crycb for each vcpu */
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
kvm_s390_vcpu_unblock_all(kvm);
- mutex_unlock(&kvm->lock);
}
EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks);
+/*
+ * kvm_arch_crypto_clear_masks
+ *
+ * @kvm: pointer to the target guest's KVM struct containing the crypto masks
+ * to be cleared.
+ *
+ * Clear the masks that identify the adapters, domains and control domains to
+ * which the KVM guest is granted access.
+ *
+ * Note: The kvm->lock mutex must be locked by the caller before invoking this
+ * function.
+ */
void kvm_arch_crypto_clear_masks(struct kvm *kvm)
{
- mutex_lock(&kvm->lock);
kvm_s390_vcpu_block_all(kvm);
memset(&kvm->arch.crypto.crycb->apcb0, 0,
@@ -2353,7 +3075,6 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm)
/* recreate the shadow crycb for each vcpu */
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
kvm_s390_vcpu_unblock_all(kvm);
- mutex_unlock(&kvm->lock);
}
EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks);
@@ -2370,6 +3091,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm)
{
kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
kvm_s390_set_crycb_format(kvm);
+ init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem);
if (!test_kvm_facility(kvm, 76))
return;
@@ -2392,9 +3114,17 @@ static void sca_dispose(struct kvm *kvm)
kvm->arch.sca = NULL;
}
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_clear_list(kvm);
+
+ __kvm_arch_free_vm(kvm);
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -2439,7 +3169,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
kvm->arch.sie_page2 =
- (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!kvm->arch.sie_page2)
goto out_err;
@@ -2447,10 +3177,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
for (i = 0; i < kvm_s390_fac_size(); i++) {
- kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] &
+ kvm->arch.model.fac_mask[i] = stfle_fac_list[i] &
(kvm_s390_fac_base[i] |
kvm_s390_fac_ext[i]);
- kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
+ kvm->arch.model.fac_list[i] = stfle_fac_list[i] &
kvm_s390_fac_base[i];
}
kvm->arch.model.subfuncs = kvm_s390_available_subfunc;
@@ -2474,6 +3204,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_crypto_init(kvm);
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ mutex_lock(&kvm->lock);
+ kvm_s390_pci_init_list(kvm);
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+ mutex_unlock(&kvm->lock);
+ }
+
mutex_init(&kvm->arch.float_int.ais_lock);
spin_lock_init(&kvm->arch.float_int.lock);
for (i = 0; i < FIRQ_LIST_COUNT; i++)
@@ -2504,7 +3241,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.use_skf = sclp.has_skey;
spin_lock_init(&kvm->arch.start_stop_lock);
kvm_s390_vsie_init(kvm);
- kvm_s390_gisa_init(kvm);
+ if (use_gisa)
+ kvm_s390_gisa_init(kvm);
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -2518,43 +3256,52 @@ out_err:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ u16 rc, rrc;
+
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
kvm_s390_clear_local_irqs(vcpu);
kvm_clear_async_pf_completion_queue(vcpu);
if (!kvm_is_ucontrol(vcpu->kvm))
sca_del_vcpu(vcpu);
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
if (kvm_is_ucontrol(vcpu->kvm))
gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
+ /* We can not hold the vcpu mutex here, we are already dying */
+ if (kvm_s390_pv_cpu_get_handle(vcpu))
+ kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
free_page((unsigned long)(vcpu->arch.sie_block));
}
-static void kvm_free_vcpus(struct kvm *kvm)
-{
- unsigned int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_destroy(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
-
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
-}
-
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- kvm_free_vcpus(kvm);
+ u16 rc, rrc;
+
+ kvm_destroy_vcpus(kvm);
sca_dispose(kvm);
- debug_unregister(kvm->arch.dbf);
kvm_s390_gisa_destroy(kvm);
+ /*
+ * We are already at the end of life and kvm->lock is not taken.
+ * This is ok as the file descriptor is closed by now and nobody
+ * can mess with the pv state. To avoid lockdep_assert_held from
+ * complaining we do not use kvm_s390_pv_is_protected.
+ */
+ if (kvm_s390_pv_get_handle(kvm))
+ kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
+ /*
+ * Remove the mmu notifier only when the whole KVM VM is torn down,
+ * and only if one was registered to begin with. If the VM is
+ * currently not protected, but has previously been protected,
+ * then it's possible that the notifier is still registered.
+ */
+ if (kvm->arch.pv.mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm);
+
+ debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
gmap_remove(kvm->arch.gmap);
@@ -2647,10 +3394,13 @@ static int sca_switch_to_extended(struct kvm *kvm)
struct bsca_block *old_sca = kvm->arch.sca;
struct esca_block *new_sca;
struct kvm_vcpu *vcpu;
- unsigned int vcpu_idx;
+ unsigned long vcpu_idx;
u32 scaol, scaoh;
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
+ if (kvm->arch.use_esca)
+ return 0;
+
+ new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_sca)
return -ENOMEM;
@@ -2694,9 +3444,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (!sclp.has_esca || !sclp.has_64bscao)
return false;
- mutex_lock(&kvm->lock);
rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
- mutex_unlock(&kvm->lock);
return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
}
@@ -2883,7 +3631,7 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
{
- vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
+ vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.sie_block->cbrlo)
return -ENOMEM;
return 0;
@@ -2901,6 +3649,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
{
int rc = 0;
+ u16 uvrc, uvrrc;
atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
CPUSTAT_SM |
@@ -2918,8 +3667,12 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= ECB_SRSI;
+ if (test_kvm_facility(vcpu->kvm, 11))
+ vcpu->arch.sie_block->ecb |= ECB_PTF;
if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= ECB_TE;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ vcpu->arch.sie_block->ecb |= ECB_SPECI;
if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
@@ -2968,6 +3721,16 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_crypto_setup(vcpu);
+ kvm_s390_vcpu_pci_setup(vcpu);
+
+ mutex_lock(&vcpu->kvm->lock);
+ if (kvm_s390_pv_is_protected(vcpu->kvm)) {
+ rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
+ if (rc)
+ kvm_s390_vcpu_unsetup_cmma(vcpu);
+ }
+ mutex_unlock(&vcpu->kvm->lock);
+
return rc;
}
@@ -2984,7 +3747,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int rc;
BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
- sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
+ sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sie_page)
return -ENOMEM;
@@ -2997,9 +3760,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->icpua = vcpu->vcpu_id;
spin_lock_init(&vcpu->arch.local_int.lock);
- vcpu->arch.sie_block->gd = (u32)(u64)vcpu->kvm->arch.gisa_int.origin;
- if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
- vcpu->arch.sie_block->gd |= GISA_FORMAT1;
+ vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm);
seqcount_init(&vcpu->arch.cputm_seqcount);
vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
@@ -3009,7 +3770,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
KVM_SYNC_ACRS |
KVM_SYNC_CRS |
KVM_SYNC_ARCH0 |
- KVM_SYNC_PFAULT;
+ KVM_SYNC_PFAULT |
+ KVM_SYNC_DIAG318;
kvm_s390_set_prefix(vcpu, 0);
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
@@ -3040,6 +3802,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
rc = kvm_s390_vcpu_setup(vcpu);
if (rc)
goto out_ucontrol_uninit;
+
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
return 0;
out_ucontrol_uninit:
@@ -3052,6 +3816,7 @@ out_free_sie_block:
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
return kvm_s390_vcpu_has_irq(vcpu, 0);
}
@@ -3103,7 +3868,7 @@ void exit_sie(struct kvm_vcpu *vcpu)
/* Kick a guest cpu out of SIE to process a request synchronously */
void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
{
- kvm_make_request(req, vcpu);
+ __kvm_make_request(req, vcpu);
kvm_s390_vcpu_request(vcpu);
}
@@ -3113,7 +3878,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
unsigned long prefix;
- int i;
+ unsigned long i;
if (gmap_is_shadow(gmap))
return;
@@ -3126,7 +3891,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
start, end);
- kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
}
}
}
@@ -3135,7 +3900,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
/* do not poll with more than halt_poll_max_steal percent of steal time */
if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >=
- halt_poll_max_steal) {
+ READ_ONCE(halt_poll_max_steal)) {
vcpu->stat.halt_no_poll_steal++;
return true;
}
@@ -3277,7 +4042,6 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
kvm_s390_set_prefix(vcpu, 0);
kvm_s390_set_cpu_timer(vcpu, 0);
vcpu->arch.sie_block->ckc = 0;
- vcpu->arch.sie_block->todpr = 0;
memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr));
vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK;
vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK;
@@ -3295,9 +4059,17 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
vcpu->run->s.regs.pp = 0;
vcpu->run->s.regs.gbea = 1;
vcpu->run->s.regs.fpc = 0;
- vcpu->arch.sie_block->gbea = 1;
- vcpu->arch.sie_block->pp = 0;
- vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+ /*
+ * Do not reset these registers in the protected case, as some of
+ * them are overlaid and they are not accessible in this case
+ * anyway.
+ */
+ if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->gbea = 1;
+ vcpu->arch.sie_block->pp = 0;
+ vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+ vcpu->arch.sie_block->todpr = 0;
+ }
}
static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu)
@@ -3483,18 +4255,24 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
vcpu_load(vcpu);
/* user space knows about this interface - let it control the state */
- vcpu->kvm->arch.user_cpu_state_ctrl = 1;
+ kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm);
switch (mp_state->mp_state) {
case KVM_MP_STATE_STOPPED:
- kvm_s390_vcpu_stop(vcpu);
+ rc = kvm_s390_vcpu_stop(vcpu);
break;
case KVM_MP_STATE_OPERATING:
- kvm_s390_vcpu_start(vcpu);
+ rc = kvm_s390_vcpu_start(vcpu);
break;
case KVM_MP_STATE_LOAD:
+ if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+ rc = -ENXIO;
+ break;
+ }
+ rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD);
+ break;
case KVM_MP_STATE_CHECK_STOP:
- /* fall through - CHECK_STOP and LOAD are not supported yet */
+ fallthrough; /* CHECK_STOP is not supported yet */
default:
rc = -ENXIO;
}
@@ -3515,19 +4293,19 @@ retry:
if (!kvm_request_pending(vcpu))
return 0;
/*
- * We use MMU_RELOAD just to re-arm the ipte notifier for the
+ * If the guest prefix changed, re-arm the ipte notifier for the
* guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
* wants to set the blocking bit. Lets just retry the request loop.
*/
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
+ if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
int rc;
rc = gmap_mprotect_notify(vcpu->arch.gmap,
kvm_s390_get_prefix(vcpu),
PAGE_SIZE * 2, PROT_WRITE);
if (rc) {
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
return rc;
}
goto retry;
@@ -3580,30 +4358,26 @@ retry:
goto retry;
}
- /* nothing to do, just clear the request */
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
/* we left the vsie handler, nothing to do, just clear the request */
kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
return 0;
}
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod)
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
{
struct kvm_vcpu *vcpu;
- struct kvm_s390_tod_clock_ext htod;
- int i;
+ union tod_clock clk;
+ unsigned long i;
- mutex_lock(&kvm->lock);
preempt_disable();
- get_tod_clock_ext((char *)&htod);
+ store_tod_clock_ext(&clk);
- kvm->arch.epoch = gtod->tod - htod.tod;
+ kvm->arch.epoch = gtod->tod - clk.tod;
kvm->arch.epdx = 0;
if (test_kvm_facility(kvm, 139)) {
- kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
+ kvm->arch.epdx = gtod->epoch_idx - clk.ei;
if (kvm->arch.epoch > gtod->tod)
kvm->arch.epdx -= 1;
}
@@ -3616,7 +4390,15 @@ void kvm_s390_set_tod_clock(struct kvm *kvm,
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
+}
+
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ if (!mutex_trylock(&kvm->lock)) /* non-blocking attempt: never sleeps waiting for kvm->lock */
+ return 0; /* lock not obtained, guest TOD clock left untouched */
+ __kvm_s390_set_tod_clock(kvm, gtod);
mutex_unlock(&kvm->lock);
+ return 1; /* clock was set while holding kvm->lock */
}
/**
@@ -3652,11 +4434,13 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
}
}
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
__kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
+ /* The return value is always true here; no result of the injection helper is consulted. */
+ return true;
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
@@ -3672,7 +4456,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
/* s390 will always inject the page directly */
}
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
{
/*
* s390 will always inject the page directly,
@@ -3681,33 +4465,31 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
return true;
}
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
{ /* Returns false as soon as one precondition below fails, else the result of kvm_setup_async_pf(). */
hva_t hva;
struct kvm_arch_async_pf arch;
- int rc;
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
- return 0;
+ return false; /* no valid pfault token is set for this vcpu */
if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
vcpu->arch.pfault_compare)
- return 0;
+ return false; /* guest PSW mask filtered by pfault_select differs from pfault_compare */
if (psw_extint_disabled(vcpu))
- return 0;
+ return false; /* presumably: external interrupts are masked in the guest PSW */
if (kvm_s390_vcpu_has_irq(vcpu, 0))
- return 0;
+ return false; /* presumably: another interrupt is already pending for this vcpu */
if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
- return 0;
+ return false; /* service-signal submask bit is clear in guest control register 0 */
if (!vcpu->arch.gmap->pfault_enabled)
- return 0;
+ return false; /* pfault is not enabled on this vcpu's gmap */
hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
hva += current->thread.gmap_addr & ~PAGE_MASK;
if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
- return 0;
+ return false; /* the 8-byte read at the guest address held in pfault_token failed */
- rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
- return rc;
+ return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); /* result is passed through directly */
}
static int vcpu_pre_run(struct kvm_vcpu *vcpu)
@@ -3727,9 +4509,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
if (need_resched())
schedule();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
-
if (!kvm_is_ucontrol(vcpu->kvm)) {
rc = kvm_s390_deliver_pending_interrupts(vcpu);
if (rc)
@@ -3745,7 +4524,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
@@ -3839,27 +4618,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
current->thread.gmap_pfault = 0;
if (kvm_arch_setup_async_pf(vcpu))
return 0;
+ vcpu->stat.pfault_sync++;
return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
}
return vcpu_post_run_fault_in_sie(vcpu);
}
+#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int rc, exit_reason;
+ struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block;
/*
* We try to hold kvm->srcu during most of vcpu_run (except when run-
* ning the guest), so that memslots (and other stuff) are protected
*/
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
do {
rc = vcpu_pre_run(vcpu);
if (rc)
break;
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
* guest_enter and guest_exit should be no uaccess.
@@ -3868,23 +4650,46 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(sie_page->pv_grregs,
+ vcpu->run->s.regs.gprs,
+ sizeof(sie_page->pv_grregs));
+ }
+ if (test_cpu_flag(CIF_FPU))
+ load_fpu_regs();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(vcpu->run->s.regs.gprs,
+ sie_page->pv_grregs,
+ sizeof(sie_page->pv_grregs));
+ /*
+ * We're not allowed to inject interrupts on intercepts
+ * that leave the guest state in an "in-between" state
+ * where the next SIE entry will do a continuation.
+ * Fence interrupts in our "internal" PSW.
+ */
+ if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR ||
+ vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) {
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+ }
+ }
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
guest_exit_irqoff();
local_irq_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
} while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}
-static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void sync_regs_fmt2(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
struct runtime_instr_cb *riccb;
struct gs_cb *gscb;
@@ -3892,16 +4697,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
gscb = (struct gs_cb *) &kvm_run->s.regs.gscb;
vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
- if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
- kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
- if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
- memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
- /* some control register changes require a tlb flush */
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
- kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
- vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea;
@@ -3913,6 +4709,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
kvm_clear_async_pf_completion_queue(vcpu);
}
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) {
+ vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318;
+ vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc;
+ VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc);
+ }
/*
* If userspace sets the riccb (e.g. after migration) to a valid state,
* we should enable RI here instead of doing the lazy enablement.
@@ -3942,6 +4743,38 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
}
+ if (MACHINE_HAS_GS) {
+ preempt_disable();
+ __ctl_set_bit(2, 4);
+ if (current->thread.gs_cb) {
+ vcpu->arch.host_gscb = current->thread.gs_cb;
+ save_gs_cb(vcpu->arch.host_gscb);
+ }
+ if (vcpu->arch.gs_enabled) {
+ current->thread.gs_cb = (struct gs_cb *)
+ &vcpu->run->s.regs.gscb;
+ restore_gs_cb(current->thread.gs_cb);
+ }
+ preempt_enable();
+ }
+ /* SIE will load etoken directly from SDNX and therefore kvm_run */
+}
+
+static void sync_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
+ kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
+ memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
+ /* some control register changes require a tlb flush */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
+ kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
+ vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
+ }
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
/* save host (userspace) fprs/vrs */
@@ -3956,40 +4789,65 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (test_fp_ctl(current->thread.fpu.fpc))
/* User space provided an invalid FPC, let's clear it */
current->thread.fpu.fpc = 0;
+
+ /* Sync fmt2 only data */
+ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
+ sync_regs_fmt2(vcpu);
+ } else {
+ /*
+ * In several places we have to modify our internal view to
+ * not do things that are disallowed by the ultravisor. For
+ * example we must not inject interrupts after specific exits
+ * (e.g. 112 prefix page not secure). We do this by turning
+ * off the machine check, external and I/O interrupt bits
+ * of our PSW copy. To avoid getting validity intercepts, we
+ * do only accept the condition code from userspace.
+ */
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC;
+ vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask &
+ PSW_MASK_CC;
+ }
+
+ kvm_run->kvm_dirty_regs = 0;
+}
+
+static void store_regs_fmt2(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ /*
+ * Copy the "fmt2 only" register state back into kvm_run and switch the
+ * guarded-storage control block back to the host's. store_regs() calls
+ * this only when the vcpu is not protected.
+ */
+ kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
+ kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
+ kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
+ kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; /* 1 only if all FPF_BPBC bits are set */
+ kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
if (MACHINE_HAS_GS) {
preempt_disable();
__ctl_set_bit(2, 4);
- if (current->thread.gs_cb) {
- vcpu->arch.host_gscb = current->thread.gs_cb;
- save_gs_cb(vcpu->arch.host_gscb);
- }
- if (vcpu->arch.gs_enabled) {
- current->thread.gs_cb = (struct gs_cb *)
- &vcpu->run->s.regs.gscb;
- restore_gs_cb(current->thread.gs_cb);
- }
+ if (vcpu->arch.gs_enabled)
+ save_gs_cb(current->thread.gs_cb); /* save the current control block before the pointer is switched back */
+ current->thread.gs_cb = vcpu->arch.host_gscb;
+ restore_gs_cb(vcpu->arch.host_gscb);
+ if (!vcpu->arch.host_gscb)
+ __ctl_clear_bit(2, 4); /* no host control block was saved: undo the __ctl_set_bit(2, 4) above */
+ vcpu->arch.host_gscb = NULL;
preempt_enable();
}
- /* SIE will load etoken directly from SDNX and therefore kvm_run */
-
- kvm_run->kvm_dirty_regs = 0;
+ /* SIE will save etoken directly into SDNX and therefore kvm_run */
}
-static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void store_regs(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
+
kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
- kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
- kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
- kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
kvm_run->s.regs.pft = vcpu->arch.pfault_token;
kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
- kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
save_access_regs(vcpu->run->s.regs.acrs);
restore_access_regs(vcpu->arch.host_acrs);
/* Save guest register state */
@@ -3998,25 +4856,24 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
/* Restore will be done lazily at return */
current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
- if (MACHINE_HAS_GS) {
- __ctl_set_bit(2, 4);
- if (vcpu->arch.gs_enabled)
- save_gs_cb(current->thread.gs_cb);
- preempt_disable();
- current->thread.gs_cb = vcpu->arch.host_gscb;
- restore_gs_cb(vcpu->arch.host_gscb);
- preempt_enable();
- if (!vcpu->arch.host_gscb)
- __ctl_clear_bit(2, 4);
- vcpu->arch.host_gscb = NULL;
- }
- /* SIE will save etoken directly into SDNX and therefore kvm_run */
+ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
+ store_regs_fmt2(vcpu);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int rc;
+ /*
+ * Running a VM while dumping always has the potential to
+ * produce inconsistent dump data. But for PV vcpus a SIE
+ * entry while dumping could also lead to a fatal validity
+ * intercept which we absolutely want to avoid.
+ */
+ if (vcpu->kvm->arch.pv.dumping)
+ return -EINVAL;
+
if (kvm_run->immediate_exit)
return -EINTR;
@@ -4034,6 +4891,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_sigset_activate(vcpu);
+ /*
+ * no need to check the return value of vcpu_start as it can only have
+ * an error for protvirt, but protvirt means user cpu state
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
kvm_s390_vcpu_start(vcpu);
} else if (is_vcpu_stopped(vcpu)) {
@@ -4043,7 +4904,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- sync_regs(vcpu, kvm_run);
+ sync_regs(vcpu);
enable_cpu_timer_accounting(vcpu);
might_fault();
@@ -4065,7 +4926,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
disable_cpu_timer_accounting(vcpu);
- store_regs(vcpu, kvm_run);
+ store_regs(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -4155,7 +5016,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -4171,20 +5032,29 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
{
- int i, online_vcpus, started_vcpus = 0;
+ int i, online_vcpus, r = 0, started_vcpus = 0;
if (!is_vcpu_stopped(vcpu))
- return;
+ return 0;
trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
/* Only one cpu at a time may enter/leave the STOPPED state. */
spin_lock(&vcpu->kvm->arch.start_stop_lock);
online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+ /* Let's tell the UV that we want to change into the operating state */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR);
+ if (r) {
+ spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+ return r;
+ }
+ }
+
for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
+ if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i)))
started_vcpus++;
}
@@ -4195,44 +5065,67 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
- * oustanding ENABLE requests.
+ * outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
/*
+ * The real PSW might have changed due to a RESTART interpreted by the
+ * ultravisor. We block all interrupts and let the next sie exit
+ * refresh our view.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+ /*
* Another VCPU might have used IBS while we were offline.
* Let's play safe and flush the VCPU at startup.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
spin_unlock(&vcpu->kvm->arch.start_stop_lock);
- return;
+ return 0;
}
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
{
- int i, online_vcpus, started_vcpus = 0;
+ int i, online_vcpus, r = 0, started_vcpus = 0;
struct kvm_vcpu *started_vcpu = NULL;
if (is_vcpu_stopped(vcpu))
- return;
+ return 0;
trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
/* Only one cpu at a time may enter/leave the STOPPED state. */
spin_lock(&vcpu->kvm->arch.start_stop_lock);
online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
- /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
- kvm_s390_clear_stop_irq(vcpu);
+ /* Let's tell the UV that we want to change into the stopped state */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP);
+ if (r) {
+ spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+ return r;
+ }
+ }
+ /*
+ * Set the VCPU to STOPPED and THEN clear the interrupt flag,
+ * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders
+ * have been fully processed. This will ensure that the VCPU
+ * is kept BUSY if another VCPU is inquiring with SIGP SENSE.
+ */
kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED);
+ kvm_s390_clear_stop_irq(vcpu);
+
__disable_ibs_on_vcpu(vcpu);
for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
+ struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i);
+
+ if (!is_vcpu_stopped(tmp)) {
started_vcpus++;
- started_vcpu = vcpu->kvm->vcpus[i];
+ started_vcpu = tmp;
}
}
@@ -4245,7 +5138,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
}
spin_unlock(&vcpu->kvm->arch.start_stop_lock);
- return;
+ return 0;
}
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
@@ -4272,37 +5165,74 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
-static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
+static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ int r = 0;
+ /*
+ * Copy mop->size bytes between the userspace buffer mop->buf and the
+ * vcpu's SIDA, starting at mop->sida_offset. Returns 0 on success,
+ * -EINVAL / -E2BIG for rejected requests and -EFAULT if the user copy
+ * fails. No flags are accepted and a zero size is rejected.
+ */
+ if (mop->flags || !mop->size)
+ return -EINVAL;
+ if (mop->size + mop->sida_offset < mop->size)
+ return -EINVAL; /* size + offset wrapped around */
+ if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block))
+ return -E2BIG; /* requested range exceeds sida_size() */
+ if (!kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EINVAL; /* SIDA access is only allowed for protected vcpus */
+
+ switch (mop->op) { /* any other op value leaves r == 0 */
+ case KVM_S390_MEMOP_SIDA_READ:
+ if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) +
+ mop->sida_offset), mop->size))
+ r = -EFAULT;
+
+ break;
+ case KVM_S390_MEMOP_SIDA_WRITE:
+ if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) +
+ mop->sida_offset), uaddr, mop->size))
+ r = -EFAULT;
+ break;
+ }
+ return r;
+}
+
+static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
void *tmpbuf = NULL;
- int r, srcu_idx;
+ int r = 0;
const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
- | KVM_S390_MEMOP_F_CHECK_ONLY;
+ | KVM_S390_MEMOP_F_CHECK_ONLY
+ | KVM_S390_MEMOP_F_SKEY_PROTECTION;
if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size)
return -EINVAL;
-
if (mop->size > MEM_OP_MAX_SIZE)
return -E2BIG;
-
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EINVAL;
+ if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
+ if (access_key_invalid(mop->key))
+ return -EINVAL;
+ } else {
+ mop->key = 0;
+ }
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
return -ENOMEM;
}
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-
switch (mop->op) {
case KVM_S390_MEMOP_LOGICAL_READ:
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar,
- mop->size, GACC_FETCH);
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
+ GACC_FETCH, mop->key);
break;
}
- r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+ r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
+ mop->size, mop->key);
if (r == 0) {
if (copy_to_user(uaddr, tmpbuf, mop->size))
r = -EFAULT;
@@ -4310,22 +5240,19 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
break;
case KVM_S390_MEMOP_LOGICAL_WRITE:
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar,
- mop->size, GACC_STORE);
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
+ GACC_STORE, mop->key);
break;
}
if (copy_from_user(tmpbuf, uaddr, mop->size)) {
r = -EFAULT;
break;
}
- r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+ r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
+ mop->size, mop->key);
break;
- default:
- r = -EINVAL;
}
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-
if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
@@ -4333,6 +5260,31 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
return r;
}
+static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mem_op *mop)
+{
+ int r, srcu_idx;
+ /*
+ * Dispatch a vcpu memory operation by mop->op: logical read/write goes
+ * to kvm_s390_vcpu_mem_op(), SIDA read/write to kvm_s390_vcpu_sida_op(),
+ * anything else yields -EINVAL. kvm->srcu is read-locked around the
+ * whole dispatch.
+ */
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (mop->op) {
+ case KVM_S390_MEMOP_LOGICAL_READ:
+ case KVM_S390_MEMOP_LOGICAL_WRITE:
+ r = kvm_s390_vcpu_mem_op(vcpu, mop);
+ break;
+ case KVM_S390_MEMOP_SIDA_READ:
+ case KVM_S390_MEMOP_SIDA_WRITE:
+ /* we are locked against sida going away by the vcpu->mutex */
+ r = kvm_s390_vcpu_sida_op(vcpu, mop);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ return r;
+}
+
long kvm_arch_vcpu_async_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4361,6 +5313,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
return -ENOIOCTLCMD;
}
+static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
+ struct kvm_pv_cmd *cmd)
+{
+ struct kvm_s390_pv_dmp dmp;
+ void *data;
+ int ret;
+ /*
+ * Handle the KVM_PV_DUMP_CPU subcommand: read the request from
+ * cmd->data, dump the vcpu into a zeroed kernel buffer and copy that
+ * buffer to the user address dmp.buff_addr. cmd->rc and cmd->rrc are
+ * handed to kvm_s390_pv_dump_cpu() for its result codes.
+ * Returns 0, -EINVAL, -EFAULT or -ENOMEM.
+ */
+ /* Dump initialization is a prerequisite */
+ if (!vcpu->kvm->arch.pv.dumping)
+ return -EINVAL;
+
+ if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp)))
+ return -EFAULT;
+
+ /* We only handle this subcmd right now */
+ if (dmp.subcmd != KVM_PV_DUMP_CPU)
+ return -EINVAL;
+
+ /* CPU dump length is the same as create cpu storage donation. */
+ if (dmp.buff_len != uv_info.guest_cpu_stor_len)
+ return -EINVAL;
+
+ data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc);
+
+ VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x",
+ vcpu->vcpu_id, cmd->rc, cmd->rrc);
+
+ if (ret)
+ ret = -EINVAL; /* every kvm_s390_pv_dump_cpu() failure is reported as -EINVAL */
+
+ /* On success copy over the dump data */
+ if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len))
+ ret = -EFAULT;
+
+ kvfree(data); /* buffer is released on both the success and the error path */
+ return ret;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4368,6 +5362,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int idx;
long r;
+ u16 rc, rrc;
vcpu_load(vcpu);
@@ -4389,18 +5384,40 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_S390_CLEAR_RESET:
r = 0;
kvm_arch_vcpu_ioctl_clear_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
break;
case KVM_S390_INITIAL_RESET:
r = 0;
kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET_INITIAL,
+ &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
break;
case KVM_S390_NORMAL_RESET:
r = 0;
kvm_arch_vcpu_ioctl_normal_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET, &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
break;
case KVM_SET_ONE_REG:
case KVM_GET_ONE_REG: {
struct kvm_one_reg reg;
+ r = -EINVAL;
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ break;
r = -EFAULT;
if (copy_from_user(&reg, argp, sizeof(reg)))
break;
@@ -4463,7 +5480,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_s390_mem_op mem_op;
if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
- r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+ r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op);
else
r = -EFAULT;
break;
@@ -4502,6 +5519,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
irq_state.len);
break;
}
+ case KVM_S390_PV_CPU_COMMAND: {
+ struct kvm_pv_cmd cmd;
+
+ r = -EINVAL;
+ if (!is_prot_virt_host())
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ break;
+
+ r = -EINVAL;
+ if (cmd.flags)
+ break;
+
+ /* We only handle this cmd right now */
+ if (cmd.cmd != KVM_PV_DUMP)
+ break;
+
+ r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd);
+
+ /* Always copy over UV rc / rrc data */
+ if (copy_to_user((__u8 __user *)argp, &cmd.rc,
+ sizeof(cmd.rc) + sizeof(cmd.rrc)))
+ r = -EFAULT;
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -4523,38 +5567,41 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned long npages)
-{
- return 0;
-}
-
/* Section: memory related */
int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- const struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ gpa_t size;
+
+ /* When we are protected, we should not change the memory slots */
+ if (kvm_s390_pv_get_handle(kvm))
+ return -EINVAL;
+
+ if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
+ return 0; /* the checks on "new" below are skipped for DELETE and FLAGS_ONLY */
+
/* A few sanity checks. We can have memory slots which have to be
located/ended at a segment boundary (1MB). The memory in userland is
ok to be fragmented into various different vmas. It is okay to mmap()
and munmap() stuff in this slot after doing this call at any time */
- if (mem->userspace_addr & 0xffffful)
+ if (new->userspace_addr & 0xffffful)
return -EINVAL;
- if (mem->memory_size & 0xffffful)
+ size = new->npages * PAGE_SIZE; /* slot size in bytes, derived from the page count */
+ if (size & 0xffffful)
return -EINVAL;
- if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) /* guest-physical end of the slot must not exceed mem_limit */
return -EINVAL;
return 0;
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
- const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
@@ -4570,10 +5617,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
old->npages * PAGE_SIZE);
if (rc)
break;
- /* FALLTHROUGH */
+ fallthrough;
case KVM_MR_CREATE:
- rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
- mem->guest_phys_addr, mem->memory_size);
+ rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
+ new->base_gfn * PAGE_SIZE,
+ new->npages * PAGE_SIZE);
break;
case KVM_MR_FLAGS_ONLY:
break;
@@ -4592,11 +5640,6 @@ static inline unsigned long nonhyp_mask(int i)
return 0x0000ffffffffffffUL >> (nonhyp_fai << 4);
}
-void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu)
-{
- vcpu->valid_wakeup = false;
-}
-
static int __init kvm_s390_init(void)
{
int i;
@@ -4613,7 +5656,7 @@ static int __init kvm_s390_init(void)
for (i = 0; i < 16; i++)
kvm_s390_fac_base[i] |=
- S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
+ stfle_fac_list[i] & nonhyp_mask(i);
return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 6d9448dbd052..4755492dfabc 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -2,7 +2,7 @@
/*
* definition for kvm on s390
*
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -15,6 +15,7 @@
#include <linux/hrtimer.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/lockdep.h>
#include <asm/facility.h>
#include <asm/processor.h>
#include <asm/sclp.h>
@@ -25,6 +26,17 @@
#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
extern debug_info_t *kvm_s390_dbf;
+extern debug_info_t *kvm_s390_dbf_uv;
+
+#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
+do { /* log the event twice: per-VM buffer first, then kvm_s390_dbf_uv with userspace_pid prefixed */ \
+ debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \
+ d_args); \
+ debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \
+ "%d: " d_string "\n", (d_kvm)->userspace_pid, \
+ d_args); \
+} while (0) /* NOTE(review): ", d_args" appears to require at least one variadic argument - confirm */
+
#define KVM_EVENT(d_loglevel, d_string, d_args...)\
do { \
debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \
@@ -67,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -93,7 +105,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
prefix);
vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
}
static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
@@ -196,6 +208,76 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
return kvm->arch.user_cpu_state_ctrl != 0;
}
+static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm)
+{
+ if (kvm->arch.user_cpu_state_ctrl)
+ return; /* already enabled: do not log again */
+ /* First enablement only: emit the VM event once, then set the flag. */
+ VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control");
+ kvm->arch.user_cpu_state_ctrl = 1;
+}
+
+/* get the end gfn of the last (highest gfn) memslot */
+static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
+{
+ struct rb_node *node;
+ struct kvm_memory_slot *ms;
+ /* An empty memslot set triggers a WARN and yields 0. */
+ if (WARN_ON(kvm_memslots_empty(slots)))
+ return 0;
+ /* Take the last node of the gfn tree and map it back to its memslot. */
+ node = rb_last(&slots->gfn_tree);
+ ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]);
+ return ms->base_gfn + ms->npages; /* first gfn past the end of that slot */
+}
+
+static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
+{
+ u32 gd = (u32)(u64)kvm->arch.gisa_int.origin; /* GISA origin pointer truncated to 32 bits */
+ /* GISA_FORMAT1 is OR-ed in only when an origin exists and sclp.has_gisaf is set. */
+ if (gd && sclp.has_gisaf)
+ gd |= GISA_FORMAT1;
+ return gd; /* 0 when no GISA origin is set */
+}
+
+/* implemented in pv.c */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+ u16 *rrc);
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+ unsigned long tweak, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+ u16 *rc, u16 *rrc);
+
+static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
+{
+ return kvm->arch.pv.handle; /* VM-level pv handle; takes no lock and asserts none */
+}
+
+static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv.handle; /* per-vcpu pv handle; takes no lock and asserts none */
+}
+
+static inline bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock); /* caller is expected to hold kvm->lock */
+ return !!kvm_s390_pv_get_handle(kvm); /* true exactly when the VM's pv handle is non-zero */
+}
+
+static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_held(&vcpu->mutex); /* caller is expected to hold vcpu->mutex */
+ return !!kvm_s390_pv_cpu_get_handle(vcpu); /* true exactly when the vcpu's pv handle is non-zero */
+}
+
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
@@ -281,13 +363,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod);
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu);
@@ -297,13 +378,14 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
WARN_ON(!mutex_is_locked(&kvm->lock));
@@ -313,7 +395,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -373,6 +455,7 @@ void kvm_s390_destroy_adapters(struct kvm *kvm);
int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu);
extern struct kvm_device_ops kvm_flic_ops;
int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu);
+int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu);
void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu);
int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu,
void __user *buf, int len);
@@ -381,6 +464,8 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
void kvm_s390_gisa_init(struct kvm *kvm);
void kvm_s390_gisa_clear(struct kvm *kvm);
void kvm_s390_gisa_destroy(struct kvm *kvm);
+void kvm_s390_gisa_disable(struct kvm *kvm);
+void kvm_s390_gisa_enable(struct kvm *kvm);
int kvm_s390_gib_init(u8 nisc);
void kvm_s390_gib_destroy(void);
@@ -426,4 +511,22 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * kvm_s390_vcpu_pci_enable_interp
+ *
+ * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store
+ * interpretation as well as adapter interruption forwarding.
+ *
+ * @kvm: the KVM guest
+ */
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
#endif
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
new file mode 100644
index 000000000000..ded1af2ddae9
--- /dev/null
+++ b/arch/s390/kvm/pci.c
@@ -0,0 +1,702 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <asm/pci.h>
+#include <asm/pci_insn.h>
+#include <asm/pci_io.h>
+#include <asm/sclp.h>
+#include "pci.h"
+#include "kvm-s390.h"
+
+struct zpci_aift *aift;
+
+/*
+ * Call zpci_set_irq_ctrl() for a request that needs no information block
+ * contents: an all-zero block is handed over so the callee still receives a
+ * valid pointer. The callee's return value is passed through.
+ */
+static inline int __set_irq_noiib(u16 ctl, u8 isc)
+{
+	union zpci_sic_iib zero_iib = {{0}};
+	int cc;
+
+	cc = zpci_set_irq_ctrl(ctl, isc, &zero_iib);
+	return cc;
+}
+
+/*
+ * Detach the AIFT from the AEN state. Must be called with aift->aift_lock
+ * held.
+ */
+void kvm_s390_pci_aen_exit(void)
+{
+	struct kvm_zdev **stale_kzdev;
+	unsigned long irq_flags;
+
+	lockdep_assert_held(&aift->aift_lock);
+
+	/*
+	 * The contents of the aipb stay registered for the life of the host
+	 * kernel; that information is preserved in zpci_aipb and zpci_aif_sbv
+	 * in case the KVM module is inserted again later. Only the AIFT's own
+	 * references are cleared here, and anything that is not registered
+	 * with the underlying firmware is freed.
+	 */
+	spin_lock_irqsave(&aift->gait_lock, irq_flags);
+	stale_kzdev = aift->kzdev;
+	aift->kzdev = NULL;
+	aift->sbv = NULL;
+	aift->gait = NULL;
+	spin_unlock_irqrestore(&aift->gait_lock, irq_flags);
+
+	/* The array is no longer reachable through aift, release it now */
+	kfree(stale_kzdev);
+}
+
+/*
+ * First-time AEN setup: allocate the aipb, the summary bit vector and the
+ * GAIT, then register them via SIC_SET_AENI_CONTROLS.
+ *
+ * @nisc: forwarding ISC stored in the aipb
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails or -EIO if the
+ * registration is rejected. On failure everything allocated here is released
+ * and the pointers published in aift and in zpci_aipb/zpci_aif_sbv are reset
+ * to NULL, so no stale reference survives and a later attempt is not
+ * mistaken for "already enabled".
+ */
+static int zpci_setup_aipb(u8 nisc)
+{
+	struct page *page;
+	int order, rc;
+
+	zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL);
+	if (!zpci_aipb)
+		return -ENOMEM;
+
+	aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
+	if (!aift->sbv) {
+		rc = -ENOMEM;
+		goto free_aipb;
+	}
+	zpci_aif_sbv = aift->sbv;
+
+	/* One GAIT entry per possible device, rounded up to whole pages */
+	order = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES *
+				     sizeof(struct zpci_gaite)));
+	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!page) {
+		rc = -ENOMEM;
+		goto free_sbv;
+	}
+	aift->gait = (struct zpci_gaite *)page_to_virt(page);
+
+	/* The aipb carries physical addresses */
+	zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector);
+	zpci_aipb->aipb.gait = virt_to_phys(aift->gait);
+	zpci_aipb->aipb.afi = nisc;
+	zpci_aipb->aipb.faal = ZPCI_NR_DEVICES;
+
+	/* Setup Adapter Event Notification Interpretation */
+	if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) {
+		rc = -EIO;
+		goto free_gait;
+	}
+
+	return 0;
+
+free_gait:
+	free_pages((unsigned long)aift->gait, order);
+	aift->gait = NULL;
+free_sbv:
+	airq_iv_release(aift->sbv);
+	aift->sbv = NULL;
+	zpci_aif_sbv = NULL;
+free_aipb:
+	kfree(zpci_aipb);
+	zpci_aipb = NULL;
+
+	return rc;
+}
+
+/*
+ * Re-attach the AIFT to an aipb that was registered earlier.
+ *
+ * @nisc: forwarding ISC requested for this module load
+ *
+ * Returns 0 on success or -EINVAL if @nisc differs from the ISC recorded in
+ * the existing aipb.
+ */
+static int zpci_reset_aipb(u8 nisc)
+{
+	/*
+	 * AEN registration can only happen once per system boot. If
+	 * an aipb already exists then AEN was already registered and
+	 * we can re-use the aipb contents. This can only happen if
+	 * the KVM module was removed and re-inserted. However, we must
+	 * ensure that the same forwarding ISC is used as this is assigned
+	 * during KVM module load.
+	 */
+	if (zpci_aipb->aipb.afi != nisc)
+		return -EINVAL;
+
+	aift->sbv = zpci_aif_sbv;
+	/*
+	 * zpci_setup_aipb() stored the GAIT location with virt_to_phys(), so
+	 * it has to be translated back before it can be dereferenced.
+	 */
+	aift->gait = phys_to_virt(zpci_aipb->aipb.gait);
+
+	return 0;
+}
+
+/*
+ * Prepare the AIFT for adapter event notification and enable floating IRQs.
+ *
+ * @nisc: forwarding ISC
+ *
+ * Returns 0 on success, -EPERM if AEN is already set up, -ENOMEM on
+ * allocation failure, -EIO if enabling the interrupt mode fails, or the error
+ * reported by the aipb setup/reset helper.
+ */
+int kvm_s390_pci_aen_init(u8 nisc)
+{
+	int rc = 0;
+
+	mutex_lock(&aift->aift_lock);
+
+	/*
+	 * If already enabled for AEN, bail out now. The check is done under
+	 * aift_lock because that lock protects these fields.
+	 */
+	if (aift->gait || aift->sbv) {
+		rc = -EPERM;
+		goto unlock;
+	}
+
+	aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *),
+			      GFP_KERNEL);
+	if (!aift->kzdev) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	if (!zpci_aipb)
+		rc = zpci_setup_aipb(nisc);
+	else
+		rc = zpci_reset_aipb(nisc);
+	if (rc)
+		goto free_zdev;
+
+	/* Enable floating IRQs */
+	if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) {
+		rc = -EIO;
+		/* Clears the aift pointers and frees aift->kzdev */
+		kvm_s390_pci_aen_exit();
+	}
+
+	goto unlock;
+
+free_zdev:
+	kfree(aift->kzdev);
+	/* Do not leave a dangling pointer behind for later NULL checks */
+	aift->kzdev = NULL;
+unlock:
+	mutex_unlock(&aift->aift_lock);
+	return rc;
+}
+
+/* Modify PCI: Register floating adapter interruption forwarding */
+/*
+ * Build a format-0 FIB from the state saved for the device and issue the
+ * ZPCI_MOD_FC_REG_INT request. Returns 0 on success, -EIO otherwise.
+ */
+static int kvm_zpci_set_airq(struct zpci_dev *zdev)
+{
+	struct zpci_fib fib = {};
+	u64 request;
+	u8 status;
+
+	request = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
+
+	fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc;
+	fib.fmt0.sum = 1; /* enable summary notifications */
+	fib.fmt0.noi = airq_iv_end(zdev->aibv);
+	fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
+	fib.fmt0.aibvo = 0;
+	/*
+	 * NOTE(review): the offset is added to the vector pointer before the
+	 * address translation, so it is scaled by the element size of
+	 * ->vector — confirm this yields the intended byte offset.
+	 */
+	fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+	fib.fmt0.aisbo = zdev->aisb & 63;
+	fib.gd = zdev->gisa;
+
+	if (zpci_mod_fc(request, &fib, &status))
+		return -EIO;
+
+	return 0;
+}
+
+/* Modify PCI: Unregister floating adapter interruption forwarding */
+/*
+ * Issue the ZPCI_MOD_FC_DEREG_INT request for the device. Returns 0 on
+ * success or when there was nothing left to deregister, -EIO otherwise.
+ */
+static int kvm_zpci_clear_airq(struct zpci_dev *zdev)
+{
+	struct zpci_fib fib = {};
+	u64 request;
+	u8 status;
+	u8 cc;
+
+	request = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
+	fib.gd = zdev->gisa;
+
+	cc = zpci_mod_fc(request, &fib, &status);
+
+	/* Function already gone or IRQs already deregistered. */
+	if (cc == 3)
+		return 0;
+	if (cc == 1 && status == 24)
+		return 0;
+
+	if (cc)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Undo the accounting of @nr_pages pinned pages: subtract them from the
+ * current user's locked_vm and, if the task has an mm, from its pinned_vm.
+ */
+static inline void unaccount_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+
+	if (user) {
+		atomic_long_sub(nr_pages, &user->locked_vm);
+		/* Drop the reference taken by get_uid() above */
+		free_uid(user);
+	}
+	if (current->mm)
+		atomic64_sub(nr_pages, &current->mm->pinned_vm);
+}
+
+/*
+ * Account @nr_pages pinned pages against the current user's locked_vm and the
+ * current mm's pinned_vm.
+ *
+ * Returns 0 on success or -ENOMEM if the new total would exceed
+ * RLIMIT_MEMLOCK (in pages); nothing is accounted in that case.
+ */
+static inline int account_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+	unsigned long page_limit, cur_pages, new_pages;
+	int rc = 0;
+
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	/* Lock-free update: retry if locked_vm changed underneath us */
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+				     new_pages) != cur_pages);
+
+	atomic64_add(nr_pages, &current->mm->pinned_vm);
+
+out:
+	/* Drop the reference taken by get_uid() above on every exit path */
+	free_uid(user);
+	return rc;
+}
+
+/*
+ * Enable adapter interruption forwarding for an interpreted device.
+ *
+ * @zdev:   device to enable forwarding for; must already have a non-zero gisa
+ * @fib:    guest-supplied FIB values (isc, noi, aibv, and aisb/aisbo when
+ *          sum == 1); aibv, aisb, aisbo and isc are rewritten with host
+ *          values on success
+ * @assist: when true the GAIT entry points at the guest's GISA, otherwise
+ *          the host receives all alerts
+ *
+ * Pins the guest AIBV page (and the AISB page if one was specified), accounts
+ * the pinned pages, allocates a host summary bit, fills in the GAIT entry and
+ * finally registers the interrupt with kvm_zpci_set_airq().
+ *
+ * Returns 0 on success or a negative errno. Every failure before the final
+ * registration step undoes what was set up so far: summary bit, page
+ * accounting, page pins and the GISC registration.
+ */
+static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib,
+				   bool assist)
+{
+	struct page *pages[1], *aibv_page, *aisb_page = NULL;
+	unsigned int msi_vecs, idx;
+	struct zpci_gaite *gaite;
+	unsigned long hva, bit;
+	struct kvm *kvm;
+	phys_addr_t gaddr;
+	int rc = 0, gisc, npages, pcount = 0;
+
+	/*
+	 * Interrupt forwarding is only applicable if the device is already
+	 * enabled for interpretation
+	 */
+	if (zdev->gisa == 0)
+		return -EINVAL;
+
+	kvm = zdev->kzdev->kvm;
+	msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi);
+
+	/* Get the associated forwarding ISC - if invalid, return the error */
+	gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc);
+	if (gisc < 0)
+		return gisc;
+
+	/* Replace AIBV address */
+	idx = srcu_read_lock(&kvm->srcu);
+	hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv));
+	npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages);
+	srcu_read_unlock(&kvm->srcu, idx);
+	if (npages < 1) {
+		rc = -EIO;
+		goto out;
+	}
+	aibv_page = pages[0];
+	pcount++;
+	gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK);
+	fib->fmt0.aibv = gaddr;
+
+	/* Pin the guest AISB if one was specified */
+	if (fib->fmt0.sum == 1) {
+		idx = srcu_read_lock(&kvm->srcu);
+		hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb));
+		npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM,
+					     pages);
+		srcu_read_unlock(&kvm->srcu, idx);
+		if (npages < 1) {
+			rc = -EIO;
+			goto unpin1;
+		}
+		aisb_page = pages[0];
+		pcount++;
+	}
+
+	/* Account for pinned pages, roll back on failure */
+	rc = account_mem(pcount);
+	if (rc)
+		goto unpin2;
+
+	/* AISB must be allocated before we can fill in GAITE */
+	mutex_lock(&aift->aift_lock);
+	bit = airq_iv_alloc_bit(aift->sbv);
+	if (bit == -1UL) {
+		rc = -EIO;
+		goto unlock;
+	}
+	zdev->aisb = bit; /* store the summary bit number */
+	zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA |
+				    AIRQ_IV_BITLOCK |
+				    AIRQ_IV_GUESTVEC,
+				    phys_to_virt(fib->fmt0.aibv));
+	if (!zdev->aibv) {
+		rc = -ENOMEM;
+		goto free_bit;
+	}
+
+	spin_lock_irq(&aift->gait_lock);
+	/*
+	 * NOTE(review): aift->gait is already a struct zpci_gaite pointer, so
+	 * adding aisb * sizeof(struct zpci_gaite) scales the index twice in C
+	 * pointer arithmetic. Kept as-is because kvm_s390_pci_aif_disable()
+	 * locates the entry the same way — confirm against the other GAIT
+	 * users before changing it.
+	 */
+	gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+						   sizeof(struct zpci_gaite));
+
+	/* If assist not requested, host will get all alerts */
+	if (assist)
+		gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+	else
+		gaite->gisa = 0;
+
+	gaite->gisc = fib->fmt0.isc;
+	gaite->count++;
+	gaite->aisbo = fib->fmt0.aisbo;
+	/* Without a guest summary page there is no address to record */
+	if (aisb_page)
+		gaite->aisb = virt_to_phys(page_address(aisb_page) +
+					   (fib->fmt0.aisb & ~PAGE_MASK));
+	else
+		gaite->aisb = 0;
+	aift->kzdev[zdev->aisb] = zdev->kzdev;
+	spin_unlock_irq(&aift->gait_lock);
+
+	/* Update guest FIB for re-issue */
+	fib->fmt0.aisbo = zdev->aisb & 63;
+	fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+	fib->fmt0.isc = gisc;
+
+	/* Save some guest fib values in the host for later use */
+	zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc;
+	zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv;
+	mutex_unlock(&aift->aift_lock);
+
+	/* Issue the clp to setup the irq now */
+	rc = kvm_zpci_set_airq(zdev);
+	return rc;
+
+free_bit:
+	airq_iv_free_bit(aift->sbv, zdev->aisb);
+	zdev->aisb = 0;
+unlock:
+	mutex_unlock(&aift->aift_lock);
+	unaccount_mem(pcount);
+unpin2:
+	if (aisb_page)
+		unpin_user_page(aisb_page);
+unpin1:
+	unpin_user_page(aibv_page);
+out:
+	/* fib->fmt0.isc still holds the guest ISC on every error path */
+	kvm_s390_gisc_unregister(kvm, fib->fmt0.isc);
+	return rc;
+}
+
+/*
+ * Disable adapter interruption forwarding for a device.
+ *
+ * @zdev:  device to disable forwarding for
+ * @force: when true, continue with the cleanup even if deregistering the
+ *         interrupt fails
+ *
+ * Returns -EINVAL if the device has no gisa, otherwise the result of
+ * kvm_zpci_clear_airq() (also when the cleanup was forced after a failure).
+ * The whole operation runs under aift->aift_lock; the GAIT entry is modified
+ * under aift->gait_lock.
+ */
+static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force)
+{
+ struct kvm_zdev *kzdev = zdev->kzdev;
+ struct zpci_gaite *gaite;
+ struct page *vpage = NULL, *spage = NULL;
+ int rc, pcount = 0;
+ u8 isc;
+
+ if (zdev->gisa == 0)
+ return -EINVAL;
+
+ mutex_lock(&aift->aift_lock);
+
+ /*
+ * If the clear fails due to an error, leave now unless we know this
+ * device is about to go away (force) -- In that case clear the GAITE
+ * regardless.
+ */
+ rc = kvm_zpci_clear_airq(zdev);
+ if (rc && !force)
+ goto out;
+
+ /* A zero saved AIBV means forwarding was never enabled: nothing to undo */
+ if (zdev->kzdev->fib.fmt0.aibv == 0)
+ goto out;
+ spin_lock_irq(&aift->gait_lock);
+ /*
+ * NOTE(review): same entry lookup as in kvm_s390_pci_aif_enable(); the
+ * index is multiplied by the entry size on top of pointer scaling.
+ */
+ gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+ sizeof(struct zpci_gaite));
+ /* Remember the guest ISC before the entry is cleared */
+ isc = gaite->gisc;
+ gaite->count--;
+ if (gaite->count == 0) {
+ /* Release guest AIBV and AISB */
+ vpage = phys_to_page(kzdev->fib.fmt0.aibv);
+ if (gaite->aisb != 0)
+ spage = phys_to_page(gaite->aisb);
+ /* Clear the GAIT entry */
+ gaite->aisb = 0;
+ gaite->gisc = 0;
+ gaite->aisbo = 0;
+ gaite->gisa = 0;
+ aift->kzdev[zdev->aisb] = NULL;
+ /* Clear zdev info */
+ airq_iv_free_bit(aift->sbv, zdev->aisb);
+ airq_iv_release(zdev->aibv);
+ zdev->aisb = 0;
+ zdev->aibv = NULL;
+ }
+ spin_unlock_irq(&aift->gait_lock);
+ kvm_s390_gisc_unregister(kzdev->kvm, isc);
+ kzdev->fib.fmt0.isc = 0;
+ kzdev->fib.fmt0.aibv = 0;
+
+ /* Unpin outside of the spinlock and count what gets released */
+ if (vpage) {
+ unpin_user_page(vpage);
+ pcount++;
+ }
+ if (spage) {
+ unpin_user_page(spage);
+ pcount++;
+ }
+ if (pcount > 0)
+ unaccount_mem(pcount);
+out:
+ mutex_unlock(&aift->aift_lock);
+
+ return rc;
+}
+
+/*
+ * Allocate the KVM companion structure for a zPCI device and link the two
+ * together. Returns 0 on success or -ENOMEM.
+ */
+static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
+{
+	struct kvm_zdev *new_kzdev = kzalloc(sizeof(*new_kzdev), GFP_KERNEL);
+
+	if (!new_kzdev)
+		return -ENOMEM;
+
+	/* Cross-link device and companion */
+	zdev->kzdev = new_kzdev;
+	new_kzdev->zdev = zdev;
+
+	return 0;
+}
+
+/*
+ * Unlink and free the KVM companion structure of a zPCI device. Warns if the
+ * companion does not point back at @zdev.
+ */
+static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
+{
+	struct kvm_zdev *old_kzdev = zdev->kzdev;
+
+	WARN_ON(old_kzdev->zdev != zdev);
+
+	zdev->kzdev = NULL;
+	kfree(old_kzdev);
+}
+
+
+/*
+ * Register device with the specified KVM. If interpretation facilities are
+ * available, enable them and let userspace indicate whether or not they will
+ * be used (specify SHM bit to disable).
+ */
+static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
+{
+ struct zpci_dev *zdev = opaque;
+ int rc;
+
+ if (!zdev)
+ return -EINVAL;
+
+ /* Lock order: zdev->kzdev_lock first, then kvm->lock */
+ mutex_lock(&zdev->kzdev_lock);
+
+ /* Refuse a device that is already registered or a missing kvm */
+ if (zdev->kzdev || zdev->gisa != 0 || !kvm) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return -EINVAL;
+ }
+
+ /*
+ * Hold a reference on the kvm for as long as the device is registered;
+ * it is dropped on the error path below or by
+ * kvm_s390_pci_unregister_kvm().
+ */
+ kvm_get_kvm(kvm);
+
+ mutex_lock(&kvm->lock);
+
+ rc = kvm_s390_pci_dev_open(zdev);
+ if (rc)
+ goto err;
+
+ /*
+ * If interpretation facilities aren't available, add the device to
+ * the kzdev list but don't enable for interpretation.
+ */
+ if (!kvm_s390_pci_interp_allowed())
+ goto out;
+
+ /*
+ * If this is the first request to use an interpreted device, make the
+ * necessary vcpu changes
+ */
+ if (!kvm->arch.use_zpci_interp)
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+
+ /* The device is disabled and enabled again after the gisa is set */
+ if (zdev_enabled(zdev)) {
+ rc = zpci_disable_device(zdev);
+ if (rc)
+ goto err;
+ }
+
+ /*
+ * Store information about the identity of the kvm guest allowed to
+ * access this device via interpretation to be used by host CLP
+ */
+ zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ goto clear_gisa;
+
+ /* Re-register the IOMMU that was already created */
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table));
+ if (rc)
+ goto clear_gisa;
+
+out:
+ zdev->kzdev->kvm = kvm;
+
+ /* Make the device findable through the per-VM list */
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ return 0;
+
+clear_gisa:
+ zdev->gisa = 0;
+err:
+ /* kzdev is still NULL here if kvm_s390_pci_dev_open() failed */
+ if (zdev->kzdev)
+ kvm_s390_pci_dev_release(zdev);
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ kvm_put_kvm(kvm);
+ return rc;
+}
+
+/*
+ * Undo kvm_s390_pci_register_kvm(): turn off forwarding and interpretation
+ * if they were enabled, remove the device from the VM's kzdev list, free the
+ * kzdev and drop the kvm reference. Does nothing for a NULL @opaque and only
+ * warns if the device has no kzdev.
+ */
+static void kvm_s390_pci_unregister_kvm(void *opaque)
+{
+ struct zpci_dev *zdev = opaque;
+ struct kvm *kvm;
+
+ if (!zdev)
+ return;
+
+ /* Lock order: zdev->kzdev_lock first, then kvm->lock */
+ mutex_lock(&zdev->kzdev_lock);
+
+ if (WARN_ON(!zdev->kzdev)) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return;
+ }
+
+ kvm = zdev->kzdev->kvm;
+ mutex_lock(&kvm->lock);
+
+ /*
+ * A 0 gisa means interpretation was never enabled, just remove the
+ * device from the list.
+ */
+ if (zdev->gisa == 0)
+ goto out;
+
+ /* Forwarding must be turned off before interpretation */
+ if (zdev->kzdev->fib.fmt0.aibv != 0)
+ kvm_s390_pci_aif_disable(zdev, true);
+
+ /* Remove the host CLP guest designation */
+ zdev->gisa = 0;
+
+ /* Failures below are not reported; the list removal at out: still runs */
+ if (zdev_enabled(zdev)) {
+ if (zpci_disable_device(zdev))
+ goto out;
+ }
+
+ if (zpci_enable_device(zdev))
+ goto out;
+
+ /* Re-register the IOMMU that was already created */
+ zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table));
+
+out:
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_del(&zdev->kzdev->entry);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+ kvm_s390_pci_dev_release(zdev);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+
+ /* Pairs with the kvm_get_kvm() in kvm_s390_pci_register_kvm() */
+ kvm_put_kvm(kvm);
+}
+
+/* Initialise the per-VM kzdev list together with the spinlock guarding it. */
+void kvm_s390_pci_init_list(struct kvm *kvm)
+{
+	INIT_LIST_HEAD(&kvm->arch.kzdev_list);
+	spin_lock_init(&kvm->arch.kzdev_list_lock);
+}
+
+/*
+ * Sanity check at VM teardown: by now the kzdev list is expected to be empty,
+ * either via vfio device closures or kvm fd cleanup. Warn (once) if entries
+ * are left over; nothing is removed here.
+ */
+void kvm_s390_pci_clear_list(struct kvm *kvm)
+{
+	bool leftovers;
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	leftovers = !list_empty(&kvm->arch.kzdev_list);
+	WARN_ON_ONCE(leftovers);
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+}
+
+/*
+ * Search the VM's kzdev list for a device with function handle @fh.
+ * Returns the matching zpci device or NULL if none is registered.
+ */
+static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh)
+{
+	struct zpci_dev *match = NULL;
+	struct kvm_zdev *cur;
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_for_each_entry(cur, &kvm->arch.kzdev_list, entry) {
+		if (cur->zdev->fh != fh)
+			continue;
+
+		match = cur->zdev;
+		break;
+	}
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+
+	return match;
+}
+
+/*
+ * Translate the REG_AEN ioctl arguments into a FIB and enable forwarding.
+ * Firmware assist is requested unless KVM_S390_ZPCIOP_REGAEN_HOST is set.
+ */
+static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev,
+				     struct kvm_s390_zpci_op *args)
+{
+	const bool has_summary = args->u.reg_aen.sb != 0;
+	struct zpci_fib fib = {};
+	bool assist;
+
+	fib.fmt0.aibv = args->u.reg_aen.ibv;
+	fib.fmt0.isc = args->u.reg_aen.isc;
+	fib.fmt0.noi = args->u.reg_aen.noi;
+
+	/* Summary fields are only taken over when a summary bit was given */
+	if (has_summary) {
+		fib.fmt0.aisb = args->u.reg_aen.sb;
+		fib.fmt0.aisbo = args->u.reg_aen.sbo;
+		fib.fmt0.sum = 1;
+	} else {
+		fib.fmt0.aisb = 0;
+		fib.fmt0.aisbo = 0;
+		fib.fmt0.sum = 0;
+	}
+
+	assist = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST);
+
+	return kvm_s390_pci_aif_enable(zdev, &fib, assist);
+}
+
+/*
+ * Dispatch a zPCI operation requested for the device with handle args->fh.
+ *
+ * Returns -ENODEV if no such device is registered with @kvm or it has no
+ * kzdev, -EPERM if the device belongs to another VM, -EINVAL for unknown
+ * operations or flags, otherwise the result of the operation.
+ */
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args)
+{
+	struct kvm_zdev *kzdev;
+	struct zpci_dev *zdev;
+	int ret;
+
+	zdev = get_zdev_from_kvm_by_fh(kvm, args->fh);
+	if (!zdev)
+		return -ENODEV;
+
+	/* Lock order: zdev->kzdev_lock first, then kvm->lock */
+	mutex_lock(&zdev->kzdev_lock);
+	mutex_lock(&kvm->lock);
+
+	kzdev = zdev->kzdev;
+	if (!kzdev) {
+		ret = -ENODEV;
+	} else if (kzdev->kvm != kvm) {
+		ret = -EPERM;
+	} else if (args->op == KVM_S390_ZPCIOP_REG_AEN) {
+		/* Fail on unknown flags */
+		if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST)
+			ret = -EINVAL;
+		else
+			ret = kvm_s390_pci_zpci_reg_aen(zdev, args);
+	} else if (args->op == KVM_S390_ZPCIOP_DEREG_AEN) {
+		ret = kvm_s390_pci_aif_disable(zdev, false);
+	} else {
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&kvm->lock);
+	mutex_unlock(&zdev->kzdev_lock);
+	return ret;
+}
+
+/*
+ * Install the zPCI KVM hooks and, when interpretation is allowed, allocate
+ * and initialise the global aift. Returns 0 on success or -ENOMEM.
+ */
+int kvm_s390_pci_init(void)
+{
+	/* The hooks are installed regardless of interpretation support */
+	zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm;
+	zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm;
+
+	if (kvm_s390_pci_interp_allowed()) {
+		aift = kzalloc(sizeof(*aift), GFP_KERNEL);
+		if (!aift)
+			return -ENOMEM;
+
+		spin_lock_init(&aift->gait_lock);
+		mutex_init(&aift->aift_lock);
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the zPCI KVM hooks and, when interpretation is allowed, destroy the
+ * aift mutex and free the aift.
+ */
+void kvm_s390_pci_exit(void)
+{
+	zpci_kvm_hook.kvm_register = NULL;
+	zpci_kvm_hook.kvm_unregister = NULL;
+
+	/* The aift is only allocated when interpretation is allowed */
+	if (kvm_s390_pci_interp_allowed()) {
+		mutex_destroy(&aift->aift_lock);
+		kfree(aift);
+	}
+}
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
new file mode 100644
index 000000000000..486d06ef563f
--- /dev/null
+++ b/arch/s390/kvm/pci.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_PCI_H
+#define __KVM_S390_PCI_H
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <asm/airq.h>
+#include <asm/cpu.h>
+
+/* KVM-side companion of a zPCI device, allocated while it is registered. */
+struct kvm_zdev {
+ /* Back-pointer to the zPCI device this structure belongs to */
+ struct zpci_dev *zdev;
+ /* VM the device is registered with */
+ struct kvm *kvm;
+ /* Saved FIB values (isc, aibv) from enabling interrupt forwarding */
+ struct zpci_fib fib;
+ /* Link in the VM's kvm->arch.kzdev_list */
+ struct list_head entry;
+};
+
+/* One entry of the GAIT whose address is registered via the aipb. */
+struct zpci_gaite {
+ /* Physical address of the guest GISA (as u32), or 0 without assist */
+ u32 gisa;
+ /* Guest ISC taken from the guest FIB */
+ u8 gisc;
+ /* Use count: incremented on enable, decremented on disable */
+ u8 count;
+ u8 reserved;
+ /* Guest summary bit offset taken from the guest FIB */
+ u8 aisbo;
+ /* Host physical address of the pinned guest summary location */
+ u64 aisb;
+};
+
+/* Global bookkeeping for adapter interruption forwarding. */
+struct zpci_aift {
+ /* Table of struct zpci_gaite entries */
+ struct zpci_gaite *gait;
+ /* Summary bit vector from which per-device summary bits are allocated */
+ struct airq_iv *sbv;
+ /* Array of kvm_zdev pointers, indexed by summary bit number */
+ struct kvm_zdev **kzdev;
+ spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
+ struct mutex aift_lock; /* Protects the other structures in aift */
+};
+
+extern struct zpci_aift *aift;
+
+/*
+ * Map a summary bit number to the VM owning the device registered for it.
+ *
+ * @aift: forwarding bookkeeping to look the bit up in
+ * @si:   summary bit number, used as index into aift->kzdev
+ *
+ * Returns the VM, or NULL if CONFIG_VFIO_PCI_ZDEV_KVM is disabled, no kzdev
+ * array exists, or no device is registered for @si. @si is not range-checked
+ * here.
+ */
+static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
+						 unsigned long si)
+{
+	if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev ||
+	    !aift->kzdev[si])
+		return NULL;
+	return aift->kzdev[si]->kvm;
+}
+
+int kvm_s390_pci_aen_init(u8 nisc);
+void kvm_s390_pci_aen_exit(void);
+
+void kvm_s390_pci_init_list(struct kvm *kvm);
+void kvm_s390_pci_clear_list(struct kvm *kvm);
+
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
+
+int kvm_s390_pci_init(void);
+void kvm_s390_pci_exit(void);
+
+/*
+ * Tell whether zPCI interpretation may be used on this host: the machine type
+ * must not be one of the excluded models, CONFIG_VFIO_PCI_ZDEV_KVM must be
+ * enabled and all required SCLP facilities must be present.
+ */
+static inline bool kvm_s390_pci_interp_allowed(void)
+{
+	struct cpuid id;
+
+	get_cpu_id(&id);
+
+	/* No SHM on certain machines */
+	if (id.machine == 0x2817 || id.machine == 0x2818 ||
+	    id.machine == 0x2827 || id.machine == 0x2828 ||
+	    id.machine == 0x2964 || id.machine == 0x2965)
+		return false;
+
+	if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+		return false;
+
+	return sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi &&
+	       sclp.has_aisii;
+}
+
+#endif /* __KVM_S390_PCI_H */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index ed52ffa8d5d4..3335fa09b6f1 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -2,7 +2,7 @@
/*
* handling privileged instructions
*
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -11,8 +11,8 @@
#include <linux/kvm.h>
#include <linux/gfp.h>
#include <linux/errno.h>
-#include <linux/compat.h>
#include <linux/mm_types.h>
+#include <linux/pgtable.h>
#include <asm/asm-offsets.h>
#include <asm/facility.h>
@@ -20,9 +20,7 @@
#include <asm/debug.h>
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
-#include <asm/pgtable.h>
#include <asm/page-states.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/io.h>
#include <asm/ptrace.h>
@@ -103,7 +101,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_cond(vcpu, rc);
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod);
- kvm_s390_set_tod_clock(vcpu->kvm, &gtod);
+ /*
+ * To set the TOD clock the kvm lock must be taken, but the vcpu lock
+ * is already held in handle_set_clock. The usual lock order is the
+ * opposite. As SCK is deprecated and should not be used in several
+ * cases, for example when the multiple epoch facility or TOD clock
+ * steering facility is installed (see Principles of Operation), a
+ * slow path can be used. If the lock can not be taken via try_lock,
+ * the instruction will be retried via -EAGAIN at a later point in
+ * time.
+ */
+ if (!kvm_s390_try_set_tod_clock(vcpu->kvm, &gtod)) {
+ kvm_s390_retry_instr(vcpu);
+ return -EAGAIN;
+ }
kvm_s390_set_psw_cc(vcpu, 0);
return 0;
@@ -270,18 +281,18 @@ static int handle_iske(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = get_guest_storage_key(current->mm, vmaddr, &key);
if (rc) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
goto retry;
}
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc < 0)
@@ -317,17 +328,17 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = reset_guest_reference_bit(current->mm, vmaddr);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
goto retry;
}
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc < 0)
@@ -385,19 +396,21 @@ static int handle_sske(struct kvm_vcpu *vcpu)
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR,
m3 & SSKE_MC);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (rc == -EAGAIN)
+ continue;
if (rc < 0)
return rc;
start += PAGE_SIZE;
@@ -429,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_ipte_interlock++;
if (psw_bits(vcpu->arch.sie_block->gpsw).pstate)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
+ wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm));
kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
return 0;
@@ -611,6 +624,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
static int handle_pqap(struct kvm_vcpu *vcpu)
{
struct ap_queue_status status = {};
+ crypto_hook pqap_hook;
unsigned long reg0;
int ret;
uint8_t fc;
@@ -626,10 +640,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
* available for the guest are AQIC and TAPQ with the t bit set
* since we do not set IC.3 (FIII) we currently will only intercept
* the AQIC function code.
+ * Note: running nested under z/VM can result in intercepts for other
+ * function codes, e.g. PQAP(QCI). We do not support this and bail out.
*/
reg0 = vcpu->run->s.regs.gprs[0];
fc = (reg0 >> 24) & 0xff;
- if (WARN_ON_ONCE(fc != 0x03))
+ if (fc != 0x03)
return -EOPNOTSUPP;
/* PQAP instruction is allowed for guest kernel only */
@@ -653,18 +669,20 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
/*
- * Verify that the hook callback is registered, lock the owner
- * and call the hook.
+ * If the hook callback is registered, there will be a pointer to the
+ * hook function pointer in the kvm_s390_crypto structure. Lock the
+ * owner, retrieve the hook function pointer and call the hook.
*/
+ down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
if (vcpu->kvm->arch.crypto.pqap_hook) {
- if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner))
- return -EOPNOTSUPP;
- ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu);
- module_put(vcpu->kvm->arch.crypto.pqap_hook->owner);
+ pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook;
+ ret = pqap_hook(vcpu);
if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000)
kvm_s390_set_psw_cc(vcpu, 3);
+ up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
return ret;
}
+ up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
/*
* A vfio_driver must register a hook.
* No hook means no driver to enable the SIE CRYCB and no queues.
@@ -855,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- if (fc > 3) {
- kvm_s390_set_psw_cc(vcpu, 3);
- return 0;
- }
+ /* Bailout forbidden function codes */
+ if (fc > 3 && fc != 15)
+ goto out_no_data;
+
+ /*
+ * fc 15 is provided only with
+ * - PTF/CPU topology support through facility 11
+ * - KVM_CAP_S390_USER_STSI
+ */
+ if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) ||
+ !vcpu->kvm->arch.user_stsi))
+ goto out_no_data;
if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
|| vcpu->run->s.regs.gprs[1] & 0xffff0000)
@@ -872,13 +898,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
- if (operand2 & 0xfff)
+ if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
switch (fc) {
case 1: /* same handling for 1 and 2 */
case 2:
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
if (stsi((void *) mem, fc, sel1, sel2))
@@ -887,14 +913,23 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
case 3:
if (sel1 != 2 || sel2 != 2)
goto out_no_data;
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
handle_stsi_3_2_2(vcpu, (void *) mem);
break;
+ case 15: /* fc 15 is fully handled in userspace */
+ insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
+ return -EREMOTE;
+ }
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
+ PAGE_SIZE);
+ rc = 0;
+ } else {
+ rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
}
-
- rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
if (rc) {
rc = kvm_s390_inject_prog_cond(vcpu, rc);
goto out;
@@ -1084,15 +1119,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr,
key, NULL, nq, mr, mc);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc == -EAGAIN)
@@ -1115,7 +1150,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
/*
- * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
+ * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu)
*/
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
@@ -1213,9 +1248,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
* already correct, we do nothing and avoid the lock.
*/
if (vcpu->kvm->mm->context.uses_cmm == 0) {
- down_write(&vcpu->kvm->mm->mmap_sem);
+ mmap_write_lock(vcpu->kvm->mm);
vcpu->kvm->mm->context.uses_cmm = 1;
- up_write(&vcpu->kvm->mm->mmap_sem);
+ mmap_write_unlock(vcpu->kvm->mm);
}
/*
* If we are here, we are supposed to have CMMA enabled in
@@ -1232,11 +1267,11 @@ static int handle_essa(struct kvm_vcpu *vcpu)
} else {
int srcu_idx;
- down_read(&vcpu->kvm->mm->mmap_sem);
+ mmap_read_lock(vcpu->kvm->mm);
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
i = __do_essa(vcpu, orc);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
- up_read(&vcpu->kvm->mm->mmap_sem);
+ mmap_read_unlock(vcpu->kvm->mm);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
@@ -1244,10 +1279,10 @@ static int handle_essa(struct kvm_vcpu *vcpu)
}
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (i = 0; i < entries; ++i)
__gmap_zap(gmap, cbrlo[i]);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return 0;
}
@@ -1432,10 +1467,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
static int handle_tprot(struct kvm_vcpu *vcpu)
{
- u64 address1, address2;
- unsigned long hva, gpa;
- int ret = 0, cc = 0;
+ u64 address, operand2;
+ unsigned long gpa;
+ u8 access_key;
bool writable;
+ int ret, cc;
u8 ar;
vcpu->stat.instruction_tprot++;
@@ -1443,45 +1479,48 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL);
+ kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL);
+ access_key = (operand2 & 0xf0) >> 4;
- /* we only handle the Linux memory detection case:
- * access key == 0
- * everything else goes to userspace. */
- if (address2 & 0xf0)
- return -EOPNOTSUPP;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_lock(vcpu);
- ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE);
- if (ret == PGM_PROTECTION) {
+ ipte_lock(vcpu->kvm);
+
+ ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
+ GACC_STORE, access_key);
+ if (ret == 0) {
+ gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable);
+ } else if (ret == PGM_PROTECTION) {
+ writable = false;
/* Write protected? Try again with read-only... */
- cc = 1;
- ret = guest_translate_address(vcpu, address1, ar, &gpa,
- GACC_FETCH);
+ ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
+ GACC_FETCH, access_key);
}
- if (ret) {
- if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
- ret = kvm_s390_inject_program_int(vcpu, ret);
- } else if (ret > 0) {
- /* Translation not available */
- kvm_s390_set_psw_cc(vcpu, 3);
+ if (ret >= 0) {
+ cc = -1;
+
+ /* Fetching permitted; storing permitted */
+ if (ret == 0 && writable)
+ cc = 0;
+ /* Fetching permitted; storing not permitted */
+ else if (ret == 0 && !writable)
+ cc = 1;
+ /* Fetching not permitted; storing not permitted */
+ else if (ret == PGM_PROTECTION)
+ cc = 2;
+ /* Translation not available */
+ else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC)
+ cc = 3;
+
+ if (cc != -1) {
+ kvm_s390_set_psw_cc(vcpu, cc);
ret = 0;
+ } else {
+ ret = kvm_s390_inject_program_int(vcpu, ret);
}
- goto out_unlock;
}
- hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable);
- if (kvm_is_error_hva(hva)) {
- ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- } else {
- if (!writable)
- cc = 1; /* Write not permitted ==> read-only */
- kvm_s390_set_psw_cc(vcpu, cc);
- /* Note: CC2 only occurs for storage keys (not supported yet) */
- }
-out_unlock:
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
return ret;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
new file mode 100644
index 000000000000..7cb7799a0acb
--- /dev/null
+++ b/arch/s390/kvm/pv.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hosting Protected Virtual Machines
+ *
+ * Copyright IBM Corp. 2019, 2020
+ * Author(s): Janosch Frank <frankja@linux.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/minmax.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+#include <asm/mman.h>
+#include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_notifier.h>
+#include "kvm-s390.h"
+
+/*
+ * Reset the per-VM protected-virtualization bookkeeping in kvm->arch.pv.
+ *
+ * Only individual fields are cleared; other members of kvm->arch.pv (for
+ * example the mmu_notifier) are deliberately left untouched.
+ */
+static void kvm_s390_clear_pv_state(struct kvm *kvm)
+{
+	kvm->arch.pv.handle = 0;
+	kvm->arch.pv.guest_len = 0;
+	kvm->arch.pv.stor_base = 0;
+	kvm->arch.pv.stor_var = NULL;
+	/*
+	 * kvm_s390_pv_dump_complete() documents that tearing down the VM also
+	 * leaves dump mode. Without this reset, a dump that was started but
+	 * never completed would leave the flag set after the secure
+	 * configuration is gone.
+	 * NOTE(review): confirm no caller relies on the flag surviving teardown.
+	 */
+	kvm->arch.pv.dumping = false;
+}
+
+/*
+ * Destroy the secure CPU backing @vcpu and release its host-side resources.
+ *
+ * @vcpu: the vcpu whose secure CPU is torn down
+ * @rc:   handed to uv_cmd_nodata() for the Ultravisor return code
+ * @rrc:  handed to uv_cmd_nodata() for the Ultravisor return reason code
+ *
+ * Neither @rc nor @rrc is touched on the early-return path (no cpu handle).
+ *
+ * Returns 0 if there was no secure CPU or it was destroyed, -EIO if the
+ * Ultravisor call reported a failure.
+ */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+	int cc;
+
+	/* Nothing to do for a vcpu that has no secure CPU handle. */
+	if (!kvm_s390_pv_cpu_get_handle(vcpu))
+		return 0;
+
+	cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
+
+	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
+		     vcpu->vcpu_id, *rc, *rrc);
+	WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc);
+
+	/* Intended memory leak for something that should never happen. */
+	if (!cc)
+		free_pages(vcpu->arch.pv.stor_base,
+			   get_order(uv_info.guest_cpu_stor_len));
+
+	free_page(sida_origin(vcpu->arch.sie_block));
+	vcpu->arch.sie_block->pv_handle_cpu = 0;
+	vcpu->arch.sie_block->pv_handle_config = 0;
+	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
+	vcpu->arch.sie_block->sdf = 0;
+	/*
+	 * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0).
+	 * Use the reset value of gbea to avoid leaking the kernel pointer of
+	 * the just freed sida.
+	 */
+	vcpu->arch.sie_block->gbea = 1;
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+	/*
+	 * Report failure as a negative errno, like kvm_s390_pv_deinit_vm()
+	 * and kvm_s390_pv_create_cpu() do; a positive EIO could be mistaken
+	 * for a non-error value by a caller that checks for "< 0".
+	 */
+	return cc ? -EIO : 0;
+}
+
+/*
+ * Create a secure CPU for @vcpu via the Ultravisor.
+ *
+ * Allocates the per-cpu storage handed to the Ultravisor and the SIDA page,
+ * issues UVC_CMD_CREATE_SEC_CPU and, on success, records the returned cpu
+ * handle in both vcpu->arch.pv and the SIE block.
+ *
+ * @rc/@rrc receive the Ultravisor return / return reason code once the call
+ * has been issued; they are not written on the early -EINVAL/-ENOMEM exits.
+ *
+ * Returns 0 on success, -EINVAL if the vcpu already has a secure CPU handle,
+ * -ENOMEM if an allocation fails, -EIO if the Ultravisor call fails.
+ */
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_csc uvcb = {
+ .header.cmd = UVC_CMD_CREATE_SEC_CPU,
+ .header.len = sizeof(uvcb),
+ };
+ int cc;
+
+ /* A non-zero handle means a secure CPU already exists for this vcpu. */
+ if (kvm_s390_pv_cpu_get_handle(vcpu))
+ return -EINVAL;
+
+ vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
+ get_order(uv_info.guest_cpu_stor_len));
+ if (!vcpu->arch.pv.stor_base)
+ return -ENOMEM;
+
+ /* Input */
+ uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
+ uvcb.num = vcpu->arch.sie_block->icpua;
+ uvcb.state_origin = (u64)vcpu->arch.sie_block;
+ uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
+
+ /* Alloc Secure Instruction Data Area Designation */
+ vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vcpu->arch.sie_block->sidad) {
+ /* Undo the stor_base allocation made above. */
+ free_pages(vcpu->arch.pv.stor_base,
+ get_order(uv_info.guest_cpu_stor_len));
+ return -ENOMEM;
+ }
+
+ cc = uv_call(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ KVM_UV_EVENT(vcpu->kvm, 3,
+ "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
+ vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
+ uvcb.header.rrc);
+
+ if (cc) {
+ u16 dummy;
+
+ /*
+ * NOTE(review): vcpu->arch.pv.handle is only assigned under
+ * "Output" below, and kvm_s390_pv_destroy_cpu() returns early
+ * when kvm_s390_pv_cpu_get_handle() is zero. If that helper
+ * reads vcpu->arch.pv.handle, stor_base and the sida page are
+ * not released on this path - confirm.
+ */
+ kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
+ return -EIO;
+ }
+
+ /* Output */
+ vcpu->arch.pv.handle = uvcb.cpu_handle;
+ vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
+ vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
+ vcpu->arch.sie_block->sdf = 2;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ return 0;
+}
+
+/*
+ * Release the base and variable storage allocated for the VM and reset the
+ * PV bookkeeping. Only call this once the destroy was successful.
+ */
+static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
+{
+	const int base_order = get_order(uv_info.guest_base_stor_len);
+
+	vfree(kvm->arch.pv.stor_var);
+	free_pages(kvm->arch.pv.stor_base, base_order);
+	kvm_s390_clear_pv_state(kvm);
+}
+
+/*
+ * Allocate the base and variable storage needed to create a secure
+ * configuration for @kvm and record the guest length in kvm->arch.pv.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails; in the latter
+ * case nothing stays allocated.
+ */
+static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
+{
+	const unsigned long base_len = uv_info.guest_base_stor_len;
+	const unsigned long var_per_mb = uv_info.guest_virt_var_stor_len;
+	unsigned long nr_pages, var_len;
+
+	kvm->arch.pv.stor_var = NULL;
+	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
+						  get_order(base_len));
+	if (!kvm->arch.pv.stor_base)
+		return -ENOMEM;
+
+	/*
+	 * The variable storage scales with the current guest storage size
+	 * (counted in HPAGE_SIZE units), so look up where the guest ends.
+	 *
+	 * Slots are sorted by GFN
+	 */
+	mutex_lock(&kvm->slots_lock);
+	nr_pages = kvm_s390_get_gfn_end(kvm_memslots(kvm));
+	mutex_unlock(&kvm->slots_lock);
+
+	kvm->arch.pv.guest_len = nr_pages * PAGE_SIZE;
+
+	/* Per-chunk part rounded up to pages, plus the fixed base part. */
+	var_len = ALIGN(var_per_mb * ((nr_pages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
+	var_len += uv_info.guest_virt_base_stor_len;
+
+	kvm->arch.pv.stor_var = vzalloc(var_len);
+	if (kvm->arch.pv.stor_var)
+		return 0;
+
+	/* Variable storage unavailable: give the base storage back. */
+	kvm_s390_pv_dealloc_vm(kvm);
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the secure configuration of @kvm.
+ *
+ * @rc/@rrc are handed to uv_cmd_nodata() for the Ultravisor return and
+ * return reason codes.
+ *
+ * Returns 0 on success, -EIO if the Ultravisor call failed.
+ *
+ * this should not fail, but if it does, we must not free the donated memory
+ */
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ int cc;
+
+ cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ /* The gmap's guest handle is cleared regardless of the call's outcome. */
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ /*
+ * if the mm still has a mapping, make all its pages accessible
+ * before destroying the guest
+ */
+ if (mmget_not_zero(kvm->mm)) {
+ s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ mmput(kvm->mm);
+ }
+
+ if (!cc) {
+ /* Balances the atomic_inc() done in kvm_s390_pv_init_vm(). */
+ atomic_dec(&kvm->mm->context.protected_count);
+ kvm_s390_pv_dealloc_vm(kvm);
+ } else {
+ /* Intended memory leak on "impossible" error */
+ s390_replace_asce(kvm->arch.gmap);
+ }
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
+ WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
+
+ return cc ? -EIO : 0;
+}
+
+/*
+ * mmu_notifier .release callback: converts the vcpus of the owning VM out of
+ * protected mode by calling kvm_s390_cpus_from_pv(). The Ultravisor return
+ * codes are discarded. @mm is unused; the VM is recovered from @subscription.
+ */
+static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
+ u16 dummy;
+
+ /*
+ * No locking is needed since this is the last thread of the last user of this
+ * struct mm.
+ * When the struct kvm gets deinitialized, this notifier is also
+ * unregistered. This means that if this notifier runs, then the
+ * struct kvm is still valid.
+ */
+ kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+}
+
+/* Only .release is hooked; registered from kvm_s390_pv_init_vm(). */
+static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
+ .release = kvm_s390_pv_mmu_notifier_release,
+};
+
+/*
+ * Create the secure configuration for @kvm.
+ *
+ * Allocates the storage to be handed to the Ultravisor, issues
+ * UVC_CMD_CREATE_SEC_CONF and, on success, stores the returned guest handle
+ * in kvm->arch.pv and in the gmap, and registers the PV mmu notifier (once).
+ *
+ * @rc/@rrc receive the Ultravisor return / return reason code once the call
+ * has been issued; they are not written if the allocation fails.
+ *
+ * Returns 0 on success, the error from kvm_s390_pv_alloc_vm() if allocation
+ * fails, or -EIO if the Ultravisor call fails.
+ */
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_cgc uvcb = {
+ .header.cmd = UVC_CMD_CREATE_SEC_CONF,
+ .header.len = sizeof(uvcb)
+ };
+ int cc, ret;
+ u16 dummy;
+
+ ret = kvm_s390_pv_alloc_vm(kvm);
+ if (ret)
+ return ret;
+
+ /* Inputs */
+ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
+ uvcb.guest_stor_len = kvm->arch.pv.guest_len;
+ uvcb.guest_asce = kvm->arch.gmap->asce;
+ uvcb.guest_sca = (unsigned long)kvm->arch.sca;
+ uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
+ uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
+
+ cc = uv_call_sched(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
+ uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
+
+ /* Outputs */
+ kvm->arch.pv.handle = uvcb.guest_handle;
+
+ /*
+ * Incremented before the error check so that the deinit path below,
+ * which decrements on success, stays balanced.
+ */
+ atomic_inc(&kvm->mm->context.protected_count);
+ if (cc) {
+ if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
+ /* The failed create still has to be undone by a destroy call. */
+ kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+ } else {
+ /* No destroy needed: undo the count and free the storage. */
+ atomic_dec(&kvm->mm->context.protected_count);
+ kvm_s390_pv_dealloc_vm(kvm);
+ }
+ return -EIO;
+ }
+ kvm->arch.gmap->guest_handle = uvcb.guest_handle;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ }
+ return 0;
+}
+
+/*
+ * Hand the secure header @hdr of @length bytes to the Ultravisor for the
+ * secure configuration of @kvm.
+ *
+ * @rc/@rrc receive the Ultravisor return / return reason code.
+ *
+ * Returns 0 on success, -EINVAL if the Ultravisor call fails.
+ */
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+			      u16 *rrc)
+{
+	struct uv_cb_ssc uvcb = {
+		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
+		.header.len = sizeof(uvcb),
+		.sec_header_origin = (u64)hdr,
+		.sec_header_len = length,
+		.guest_handle = kvm_s390_pv_get_handle(kvm),
+	};
+	int cc;
+
+	cc = uv_call(0, (u64)&uvcb);
+
+	/* Always report the UV codes, whether or not the call succeeded. */
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
+		     *rc, *rrc);
+
+	if (cc)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Unpack a single page at guest address @addr through gmap_make_secure(),
+ * using @tweak and @offset as the two tweak words of the request.
+ *
+ * @rc/@rrc receive the Ultravisor return / return reason code.
+ *
+ * Returns the result of gmap_make_secure(); failures other than -EAGAIN are
+ * logged.
+ */
+static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
+		      u64 offset, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_unp uvcb = {
+		.header.cmd = UVC_CMD_UNPACK_IMG,
+		.header.len = sizeof(uvcb),
+		.guest_handle = kvm_s390_pv_get_handle(kvm),
+		.gaddr = addr,
+		.tweak[0] = tweak,
+		.tweak[1] = offset,
+	};
+	int err;
+
+	err = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+
+	/* Success and "try again" are not worth a log entry. */
+	if (!err || err == -EAGAIN)
+		return err;
+
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
+		     uvcb.gaddr, *rc, *rrc);
+	return err;
+}
+
+/*
+ * Unpack @size bytes of guest memory starting at @addr, one page at a time.
+ *
+ * @addr must be page aligned; @size must be a non-zero multiple of the page
+ * size. @tweak is passed unchanged for every page, together with the byte
+ * offset of that page from the start address.
+ * @rc/@rrc receive the Ultravisor codes of the most recent page request.
+ *
+ * Returns 0 on success, -EINVAL for bad alignment/size, otherwise the error
+ * returned by unpack_one() (including -EAGAIN if a fatal signal interrupted
+ * the retry loop).
+ */
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+ unsigned long tweak, u16 *rc, u16 *rrc)
+{
+ u64 offset = 0;
+ int ret = 0;
+
+ if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
+ return -EINVAL;
+
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
+ addr, size);
+
+ while (offset < size) {
+ ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
+ if (ret == -EAGAIN) {
+ /* Retry the same page, unless the task is being killed. */
+ cond_resched();
+ if (fatal_signal_pending(current))
+ break;
+ continue;
+ }
+ if (ret)
+ break;
+ addr += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ if (!ret)
+ KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
+ return ret;
+}
+
+/*
+ * Ask the Ultravisor to put the secure CPU of @vcpu into @state.
+ *
+ * Returns 0 on success, -EINVAL if the Ultravisor call fails. The UV return
+ * codes are only logged, not passed back to the caller.
+ */
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
+{
+	struct uv_cb_cpu_set_state uvcb = {
+		.header.cmd = UVC_CMD_CPU_SET_STATE,
+		.header.len = sizeof(uvcb),
+		.cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu),
+		.state = state,
+	};
+	int cc = uv_call(0, (u64)&uvcb);
+
+	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x",
+		     vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc);
+
+	return cc ? -EINVAL : 0;
+}
+
+/*
+ * Issue UVC_CMD_DUMP_CPU for the secure CPU of @vcpu, with @buff as the
+ * dump area.
+ *
+ * @rc/@rrc receive the Ultravisor return / return reason code.
+ *
+ * Returns the raw result of uv_call_sched(); unlike its siblings this
+ * function does not translate it into an errno.
+ */
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_cpu uvcb = {
+		.header.cmd = UVC_CMD_DUMP_CPU,
+		.header.len = sizeof(uvcb),
+		.cpu_handle = vcpu->arch.pv.handle,
+		.dump_area_origin = (u64)buff,
+	};
+	int cc = uv_call_sched(0, (u64)&uvcb);
+
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+
+	return cc;
+}
+
+/* Size of the cache for the storage state dump data. 1MB for now */
+#define DUMP_BUFF_LEN HPAGE_SIZE
+
+/**
+ * kvm_s390_pv_dump_stor_state
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @gaddr: Starting absolute guest address for which the storage state
+ * is requested.
+ * @buff_user_len: Length of the buff_user buffer
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Stores buff_user_len bytes of tweak component values to buff_user
+ * starting with the 1MB block specified by the absolute guest address
+ * (gaddr). Once the dump loop has run, *gaddr is updated to the address
+ * following the last 1MB block for which the UV call succeeded; it is
+ * left untouched if the function bails out before the loop (parameter
+ * check or allocation failure). buff_user
+ * might be written to even if an error rc is returned. For instance
+ * if we encounter a fault after writing the first page of data.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the cache fails
+ * -EINVAL if gaddr is not aligned to 1MB
+ * -EINVAL if buff_user_len is zero or not aligned to
+ * uv_info.conf_dump_storage_state_len
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_stor_state uvcb = {
+ .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
+ .header.len = sizeof(uvcb),
+ .config_handle = kvm->arch.pv.handle,
+ .gaddr = *gaddr,
+ .dump_area_origin = 0,
+ };
+ /* Bytes of dump data accounted for per successful UV call. */
+ const u64 increment_len = uv_info.conf_dump_storage_state_len;
+ size_t buff_kvm_size;
+ /* Bytes currently buffered in buff_kvm and not yet copied out. */
+ size_t size_done = 0;
+ u8 *buff_kvm = NULL;
+ int cc, ret;
+
+ ret = -EINVAL;
+ /* UV call processes 1MB guest storage chunks at a time */
+ if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
+ goto out;
+
+ /*
+ * We provide the storage state for 1MB chunks of guest
+ * storage. The buffer will need to be aligned to
+ * conf_dump_storage_state_len so we don't end on a partial
+ * chunk.
+ */
+ if (!buff_user_len ||
+ !IS_ALIGNED(buff_user_len, increment_len))
+ goto out;
+
+ /*
+ * Allocate a buffer from which we will later copy to the user
+ * process. We don't want userspace to dictate our buffer size
+ * so we limit it to DUMP_BUFF_LEN.
+ *
+ * NOTE(review): the flush condition below tests size_done ==
+ * DUMP_BUFF_LEN, which presumably relies on increment_len evenly
+ * dividing DUMP_BUFF_LEN - confirm.
+ */
+ ret = -ENOMEM;
+ buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
+ buff_kvm = vzalloc(buff_kvm_size);
+ if (!buff_kvm)
+ goto out;
+
+ ret = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ /* We will loop until the user buffer is filled or an error occurs */
+ do {
+ /* Get 1MB worth of guest storage state data */
+ cc = uv_call_sched(0, (u64)&uvcb);
+
+ /* All or nothing */
+ if (cc) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Advance the write position, the remaining length and the guest address. */
+ size_done += increment_len;
+ uvcb.dump_area_origin += increment_len;
+ buff_user_len -= increment_len;
+ uvcb.gaddr += HPAGE_SIZE;
+
+ /* KVM Buffer full, time to copy to the process */
+ if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
+ if (copy_to_user(buff_user, buff_kvm, size_done)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ /* Start refilling the kernel buffer from its beginning. */
+ buff_user += size_done;
+ size_done = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ }
+ } while (buff_user_len);
+
+ /* Report back where we ended dumping */
+ *gaddr = uvcb.gaddr;
+
+ /* Lets only log errors, we don't want to spam */
+out:
+ if (ret)
+ KVM_UV_EVENT(kvm, 3,
+ "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
+ uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ vfree(buff_kvm);
+
+ return ret;
+}
+
+/**
+ * kvm_s390_pv_dump_complete
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Completes the dumping operation and writes the completion data to
+ * user space. On a successful UV call the VM leaves dump mode and all
+ * vcpus are unblocked before the data is copied out, so a failing copy
+ * does not put the VM back into dump mode.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the completion buffer fails
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+ u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_complete complete = {
+ .header.len = sizeof(complete),
+ .header.cmd = UVC_CMD_DUMP_COMPLETE,
+ .config_handle = kvm_s390_pv_get_handle(kvm),
+ };
+ u64 *compl_data;
+ int ret;
+
+ /* Allocate dump area */
+ compl_data = vzalloc(uv_info.conf_dump_finalize_len);
+ if (!compl_data)
+ return -ENOMEM;
+ complete.dump_area_origin = (u64)compl_data;
+
+ ret = uv_call_sched(0, (u64)&complete);
+ *rc = complete.header.rc;
+ *rrc = complete.header.rrc;
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
+ complete.header.rc, complete.header.rrc);
+
+ if (!ret) {
+ /*
+ * kvm_s390_pv_dealloc_vm() will also (mem)set
+ * this to false on a reboot or other destroy
+ * operation for this vm.
+ */
+ kvm->arch.pv.dumping = false;
+ kvm_s390_vcpu_unblock_all(kvm);
+ /* copy_to_user() returns the number of bytes NOT copied. */
+ ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
+ if (ret)
+ ret = -EFAULT;
+ }
+ vfree(compl_data);
+ /* If the UVC returned an error, translate it to -EINVAL */
+ if (ret > 0)
+ ret = -EINVAL;
+ return ret;
+}
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 683036c1c92a..cb747bf6c798 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu,
static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter,
u64 *status_reg)
{
- unsigned int i;
- struct kvm_vcpu *v;
- bool all_stopped = true;
-
- kvm_for_each_vcpu(i, v, vcpu->kvm) {
- if (v == vcpu)
- continue;
- if (!is_vcpu_stopped(v))
- all_stopped = false;
- }
-
*status_reg &= 0xffffffff00000000UL;
/* Reject set arch order, with czam we're always in z/Arch mode. */
- *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER :
- SIGP_STATUS_INCORRECT_STATE);
+ *status_reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
}
@@ -288,6 +276,34 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
if (!dst_vcpu)
return SIGP_CC_NOT_OPERATIONAL;
+ /*
+ * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders
+ * are processed asynchronously. Until the affected VCPU finishes
+ * its work and calls back into KVM to clear the (RESTART or STOP)
+ * interrupt, we need to return any new non-reset orders "busy".
+ *
+ * This is important because a single VCPU could issue:
+ * 1) SIGP STOP $DESTINATION
+ * 2) SIGP SENSE $DESTINATION
+ *
+ * If the SIGP SENSE would not be rejected as "busy", it could
+ * return an incorrect answer as to whether the VCPU is STOPPED
+ * or OPERATING.
+ */
+ if (order_code != SIGP_INITIAL_CPU_RESET &&
+ order_code != SIGP_CPU_RESET) {
+ /*
+ * Lockless check. Both SIGP STOP and SIGP (RE)START
+ * properly synchronize everything while processing
+ * their orders, while the guest cannot observe a
+ * difference when issuing other orders from two
+ * different VCPUs.
+ */
+ if (kvm_s390_is_stop_irq_pending(dst_vcpu) ||
+ kvm_s390_is_restart_irq_pending(dst_vcpu))
+ return SIGP_CC_BUSY;
+ }
+
switch (order_code) {
case SIGP_SENSE:
vcpu->stat.instruction_sigp_sense++;
@@ -464,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
struct kvm_vcpu *dest_vcpu;
u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
- trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
-
if (order_code == SIGP_EXTERNAL_CALL) {
+ trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
+
dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr);
BUG_ON(dest_vcpu == NULL);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 076090f9e666..94138f8f0c1c 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -18,6 +18,7 @@
#include <asm/sclp.h>
#include <asm/nmi.h>
#include <asm/dis.h>
+#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -416,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
@@ -507,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Host-protection-interruption introduced with ESOP */
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
+ /*
+ * CPU Topology
+ * This facility only uses the utility field of the SCA and none of
+ * the cpu entries that are problematic with the other interpretation
+ * facilities so we can pass it through
+ */
+ if (test_kvm_facility(vcpu->kvm, 11))
+ scb_s->ecb |= scb_o->ecb & ECB_PTF;
/* transactional execution */
if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
/* remap the prefix is tx is toggled on */
@@ -514,6 +518,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
prefix_unmapped(vsie_page);
scb_s->ecb |= ECB_TE;
}
+ /* specification exception interpretation */
+ scb_s->ecb |= scb_o->ecb & ECB_SPECI;
/* branch prediction */
if (test_kvm_facility(vcpu->kvm, 82))
scb_s->fpf |= scb_o->fpf & FPF_BPBC;
@@ -548,6 +554,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
scb_s->hpid = HPID_VSIE;
+ scb_s->cpnc = scb_o->cpnc;
prepare_ibc(vcpu, vsie_page);
rc = shadow_crycb(vcpu, vsie_page);
@@ -618,10 +625,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -912,7 +919,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@@ -934,7 +941,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
@@ -982,6 +989,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+	/* only the low 4 bits select a register, so no error handling needed */
+	const u8 gpr = reg & 0xf;
+
+	/* r14 and r15 are read from vsie_page->scb_s, the rest from vcpu->run */
+	if (gpr == 15)
+		return vsie_page->scb_s.gg15;
+	if (gpr == 14)
+		return vsie_page->scb_s.gg14;
+	return vcpu->run->s.regs.gprs[gpr];
+}
+
+/*
+ * Handle a partial-execution intercept for the instruction with ipa 0xb254
+ * (the caller only dispatches here for that opcode).
+ *
+ * Both operand addresses are taken from the registers selected by the ipb
+ * field of the shadow SCB and are faulted into the shadow gmap.
+ *
+ * Returns:
+ *  -EAGAIN if the instruction should simply be retried,
+ *  another negative value if shadowing failed with a host error,
+ *  1 if an exception was injected or the intercept is forwarded with the
+ *    PEI block filled in,
+ *  otherwise the result of inject_fault().
+ */
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ /* The two PEI words are stored starting at the mcic field of scb_o. */
+ u64 *pei_block = &vsie_page->scb_o->mcic;
+ int edat, rc_dest, rc_src;
+ union ctlreg0 cr0;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+ prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+ /*
+ * Register numbers come from ipb: bits 20.. select the destination,
+ * bits 16.. the source (vsie_get_register() keeps only the low 4 bits).
+ */
+ dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+ src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+ rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+ rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ /*
+ * Either everything went well, or something non-critical went wrong
+ * e.g. because of a race. In either case, simply retry.
+ */
+ if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+ retry_vsie_icpt(vsie_page);
+ return -EAGAIN;
+ }
+ /* Something more serious went wrong, propagate the error */
+ if (rc_dest < 0)
+ return rc_dest;
+ if (rc_src < 0)
+ return rc_src;
+
+ /* The only possible suppressing exception: just deliver it */
+ if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+ clear_vsie_icpt(vsie_page);
+ rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+ WARN_ON_ONCE(rc_dest);
+ return 1;
+ }
+
+ /*
+ * Forward the PEI intercept to the guest if it was a page fault, or
+ * also for segment and region table faults if EDAT applies.
+ *
+ * After this filter a non-zero rc_dest/rc_src means "must be injected
+ * by the host"; zero means "may be forwarded".
+ */
+ if (edat) {
+ rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+ rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+ } else {
+ rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+ rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+ }
+ if (!rc_dest && !rc_src) {
+ pei_block[0] = pei_dest;
+ pei_block[1] = pei_src;
+ return 1;
+ }
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * The host has edat, and the guest does not, or it was an ASCE type
+ * exception. The host needs to inject the appropriate DAT interrupts
+ * into the guest.
+ */
+ if (rc_dest)
+ return inject_fault(vcpu, rc_dest, dest, 1);
+ return inject_fault(vcpu, rc_src, src, 0);
+}
+
+/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
*
@@ -1000,12 +1099,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
handle_last_fault(vcpu, vsie_page);
- if (need_resched())
- schedule();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/* save current guest state of bp isolation override */
guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
@@ -1032,6 +1126,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*/
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
barrier();
+ if (test_cpu_flag(CIF_FPU))
+ load_fpu_regs();
if (!kvm_s390_vcpu_sie_inhibited(vcpu))
rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
barrier();
@@ -1045,7 +1141,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (rc == -EINTR) {
VCPU_EVENT(vcpu, 3, "%s", "machine check");
@@ -1072,6 +1168,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
@@ -1185,6 +1285,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
kvm_s390_vcpu_has_irq(vcpu, 0) ||
kvm_s390_vcpu_sie_inhibited(vcpu))
break;
+ cond_resched();
}
if (rc == -EFAULT) {
@@ -1202,6 +1303,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->iprcc = PGM_ADDRESSING;
scb_s->pgmilc = 4;
scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ rc = 1;
}
return rc;
}
@@ -1236,7 +1338,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_lock(&kvm->arch.vsie.mutex);
if (kvm->arch.vsie.page_count < nr_vcpus) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
if (!page) {
mutex_unlock(&kvm->arch.vsie.mutex);
return ERR_PTR(-ENOMEM);
@@ -1338,7 +1440,7 @@ out_put:
void kvm_s390_vsie_init(struct kvm *kvm)
{
mutex_init(&kvm->arch.vsie.mutex);
- INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
}
/* Destroy the vsie data structures. To be called when a vm is destroyed. */
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 28fd66d558ff..580d2e3265cb 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -7,6 +7,8 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o
obj-y += mem.o xor.o
lib-$(CONFIG_KPROBES) += probes.o
lib-$(CONFIG_UPROBES) += probes.o
+obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o
+test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o
# Instrumenting memory accesses to __user data (in different address space)
# produce false positives
@@ -14,3 +16,10 @@ KASAN_SANITIZE_uaccess.o := n
obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o
CFLAGS_test_unwind.o += -fno-optimize-sibling-calls
+
+obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o
+obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
+
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+
+obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/
diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c
index d4aa10795605..be14c58cb989 100644
--- a/arch/s390/lib/delay.c
+++ b/arch/s390/lib/delay.c
@@ -4,126 +4,42 @@
*
* Copyright IBM Corp. 1999, 2008
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
-#include <linux/sched.h>
+#include <linux/processor.h>
#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/export.h>
-#include <linux/irqflags.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <asm/vtimer.h>
#include <asm/div64.h>
-#include <asm/idle.h>
+#include <asm/timex.h>
void __delay(unsigned long loops)
{
- /*
- * To end the bloody studid and useless discussion about the
- * BogoMips number I took the liberty to define the __delay
- * function in a way that that resulting BogoMips number will
- * yield the megahertz number of the cpu. The important function
- * is udelay and that is done using the tod clock. -- martin.
- */
+ /*
+ * Loop 'loops' times. Callers must not assume a specific
+ * amount of time passes before this function returns.
+ */
asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1));
}
EXPORT_SYMBOL(__delay);
-static void __udelay_disabled(unsigned long long usecs)
+static void delay_loop(unsigned long delta)
{
- unsigned long cr0, cr0_new, psw_mask;
- struct s390_idle_data idle;
- u64 end;
+ unsigned long end;
- end = get_tod_clock() + (usecs << 12);
- __ctl_store(cr0, 0, 0);
- cr0_new = cr0 & ~CR0_IRQ_SUBCLASS_MASK;
- cr0_new |= (1UL << (63 - 52)); /* enable clock comparator irq */
- __ctl_load(cr0_new, 0, 0);
- psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT;
- set_clock_comparator(end);
- set_cpu_flag(CIF_IGNORE_IRQ);
- psw_idle(&idle, psw_mask);
- clear_cpu_flag(CIF_IGNORE_IRQ);
- set_clock_comparator(S390_lowcore.clock_comparator);
- __ctl_load(cr0, 0, 0);
-}
-
-static void __udelay_enabled(unsigned long long usecs)
-{
- u64 clock_saved, end;
-
- end = get_tod_clock_fast() + (usecs << 12);
- do {
- clock_saved = 0;
- if (tod_after(S390_lowcore.clock_comparator, end)) {
- clock_saved = local_tick_disable();
- set_clock_comparator(end);
- }
- enabled_wait();
- if (clock_saved)
- local_tick_enable(clock_saved);
- } while (get_tod_clock_fast() < end);
+ end = get_tod_clock_monotonic() + delta;
+ while (!tod_after(get_tod_clock_monotonic(), end))
+ cpu_relax();
}
-/*
- * Waits for 'usecs' microseconds using the TOD clock comparator.
- */
-void __udelay(unsigned long long usecs)
+void __udelay(unsigned long usecs)
{
- unsigned long flags;
-
- preempt_disable();
- local_irq_save(flags);
- if (in_irq()) {
- __udelay_disabled(usecs);
- goto out;
- }
- if (in_softirq()) {
- if (raw_irqs_disabled_flags(flags))
- __udelay_disabled(usecs);
- else
- __udelay_enabled(usecs);
- goto out;
- }
- if (raw_irqs_disabled_flags(flags)) {
- local_bh_disable();
- __udelay_disabled(usecs);
- _local_bh_enable();
- goto out;
- }
- __udelay_enabled(usecs);
-out:
- local_irq_restore(flags);
- preempt_enable();
+ delay_loop(usecs << 12);
}
EXPORT_SYMBOL(__udelay);
-/*
- * Simple udelay variant. To be used on startup and reboot
- * when the interrupt handler isn't working.
- */
-void udelay_simple(unsigned long long usecs)
-{
- u64 end;
-
- end = get_tod_clock_fast() + (usecs << 12);
- while (get_tod_clock_fast() < end)
- cpu_relax();
-}
-
-void __ndelay(unsigned long long nsecs)
+/*
+ * Busy-wait for 'nsecs' nanoseconds. delay_loop() counts in the same units
+ * __udelay() uses (usecs << 12, i.e. 4096 units per microsecond), so the
+ * nanosecond value is scaled by 512 / 125 = 4.096 before being passed on.
+ */
+void __ndelay(unsigned long nsecs)
{
- u64 end;
-
nsecs <<= 9;
do_div(nsecs, 125);
- end = get_tod_clock_fast() + nsecs;
- if (nsecs & ~0xfffUL)
- __udelay(nsecs >> 12);
- while (get_tod_clock_fast() < end)
- barrier();
+ delay_loop(nsecs);
}
EXPORT_SYMBOL(__ndelay);
diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c
new file mode 100644
index 000000000000..8c9d4da87eef
--- /dev/null
+++ b/arch/s390/lib/error-inject.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <asm/ptrace.h>
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ /*
+ * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some
+ * kernel function.
+ */
+ regs->psw.addr = regs->gprs[14];
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile
new file mode 100644
index 000000000000..854631d9cb03
--- /dev/null
+++ b/arch/s390/lib/expoline/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += expoline.o
diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline/expoline.S
new file mode 100644
index 000000000000..92ed8409a7a4
--- /dev/null
+++ b/arch/s390/lib/expoline/expoline.S
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/nospec-insn.h>
+#include <linux/linkage.h>
+
+.macro GEN_ALL_BR_THUNK_EXTERN
+ .irp r1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ GEN_BR_THUNK_EXTERN %r\r1
+ .endr
+.endm
+
+GEN_ALL_BR_THUNK_EXTERN
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 9b2dab5a69f9..04d4c6cf898e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -26,7 +26,7 @@ static int __init spin_retry_init(void)
}
early_initcall(spin_retry_init);
-/**
+/*
* spin_retry= parameter
*/
static int __init spin_retry_setup(char *str)
@@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock)
int owner;
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */
" l %0,%1\n"
: "=d" (owner) : "Q" (*lock) : "memory");
return owner;
@@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
int expected = old;
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0080", 49) /* NIAI 8 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */
" cs %0,%3,%1\n"
: "=d" (old), "=Q" (*lock)
: "0" (old), "d" (new), "Q" (*lock)
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index 0e30e6e43b0c..7d8741818239 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -8,6 +8,9 @@
*/
#define IN_ARCH_STRING_C 1
+#ifndef __NO_FORTIFY
+# define __NO_FORTIFY
+#endif
#include <linux/types.h>
#include <linux/string.h>
@@ -18,23 +21,30 @@
*/
static inline char *__strend(const char *s)
{
- register unsigned long r0 asm("0") = 0;
-
- asm volatile ("0: srst %0,%1\n"
- " jo 0b"
- : "+d" (r0), "+a" (s) : : "cc", "memory");
- return (char *) r0;
+ unsigned long e = 0;
+
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[e],%[s]\n"
+ " jo 0b\n"
+ : [e] "+&a" (e), [s] "+&a" (s)
+ :
+ : "cc", "memory", "0");
+ return (char *)e;
}
static inline char *__strnend(const char *s, size_t n)
{
- register unsigned long r0 asm("0") = 0;
const char *p = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b"
- : "+d" (p), "+a" (s) : "d" (r0) : "cc", "memory");
- return (char *) p;
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[p],%[s]\n"
+ " jo 0b\n"
+ : [p] "+&d" (p), [s] "+&a" (s)
+ :
+ : "cc", "memory", "0");
+ return (char *)p;
}
/**
@@ -76,45 +86,21 @@ EXPORT_SYMBOL(strnlen);
#ifdef __HAVE_ARCH_STRCPY
char *strcpy(char *dest, const char *src)
{
- register int r0 asm("0") = 0;
char *ret = dest;
- asm volatile ("0: mvst %0,%1\n"
- " jo 0b"
- : "+&a" (dest), "+&a" (src) : "d" (r0)
- : "cc", "memory" );
+ asm volatile(
+ " lghi 0,0\n"
+ "0: mvst %[dest],%[src]\n"
+ " jo 0b\n"
+ : [dest] "+&a" (dest), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcpy);
#endif
/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-#ifdef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
- size_t ret = __strend(src) - src;
-
- if (size) {
- size_t len = (ret >= size) ? size-1 : ret;
- dest[len] = '\0';
- memcpy(dest, src, len);
- }
- return ret;
-}
-EXPORT_SYMBOL(strlcpy);
-#endif
-
-/**
* strncpy - Copy a length-limited, %NUL-terminated string
* @dest: Where to copy the string to
* @src: Where to copy the string from
@@ -144,16 +130,18 @@ EXPORT_SYMBOL(strncpy);
#ifdef __HAVE_ARCH_STRCAT
char *strcat(char *dest, const char *src)
{
- register int r0 asm("0") = 0;
- unsigned long dummy;
+ unsigned long dummy = 0;
char *ret = dest;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- "1: mvst %0,%2\n"
- " jo 1b"
- : "=&a" (dummy), "+a" (dest), "+a" (src)
- : "d" (r0), "0" (0UL) : "cc", "memory" );
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[dummy],%[dest]\n"
+ " jo 0b\n"
+ "1: mvst %[dummy],%[src]\n"
+ " jo 1b\n"
+ : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcat);
@@ -221,58 +209,40 @@ EXPORT_SYMBOL(strncat);
#ifdef __HAVE_ARCH_STRCMP
int strcmp(const char *s1, const char *s2)
{
- register int r0 asm("0") = 0;
int ret = 0;
- asm volatile ("0: clst %2,%3\n"
- " jo 0b\n"
- " je 1f\n"
- " ic %0,0(%2)\n"
- " ic %1,0(%3)\n"
- " sr %0,%1\n"
- "1:"
- : "+d" (ret), "+d" (r0), "+a" (s1), "+a" (s2)
- : : "cc", "memory");
+ asm volatile(
+ " lghi 0,0\n"
+ "0: clst %[s1],%[s2]\n"
+ " jo 0b\n"
+ " je 1f\n"
+ " ic %[ret],0(%[s1])\n"
+ " ic 0,0(%[s2])\n"
+ " sr %[ret],0\n"
+ "1:"
+ : [ret] "+&d" (ret), [s1] "+&a" (s1), [s2] "+&a" (s2)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcmp);
#endif
-/**
- * strrchr - Find the last occurrence of a character in a string
- * @s: The string to be searched
- * @c: The character to search for
- */
-#ifdef __HAVE_ARCH_STRRCHR
-char *strrchr(const char *s, int c)
-{
- size_t len = __strend(s) - s;
-
- if (len)
- do {
- if (s[len] == (char) c)
- return (char *) s + len;
- } while (--len > 0);
- return NULL;
-}
-EXPORT_SYMBOL(strrchr);
-#endif
-
static inline int clcle(const char *s1, unsigned long l1,
const char *s2, unsigned long l2)
{
- register unsigned long r2 asm("2") = (unsigned long) s1;
- register unsigned long r3 asm("3") = (unsigned long) l1;
- register unsigned long r4 asm("4") = (unsigned long) s2;
- register unsigned long r5 asm("5") = (unsigned long) l2;
+ union register_pair r1 = { .even = (unsigned long)s1, .odd = l1, };
+ union register_pair r3 = { .even = (unsigned long)s2, .odd = l2, };
int cc;
- asm volatile ("0: clcle %1,%3,0\n"
- " jo 0b\n"
- " ipm %0\n"
- " srl %0,28"
- : "=&d" (cc), "+a" (r2), "+a" (r3),
- "+a" (r4), "+a" (r5) : : "cc", "memory");
+ asm volatile(
+ "0: clcle %[r1],%[r3],0\n"
+ " jo 0b\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair)
+ :
+ : "cc", "memory");
return cc;
}
@@ -315,15 +285,18 @@ EXPORT_SYMBOL(strstr);
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- " jl 1f\n"
- " la %0,0\n"
- "1:"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ /*
+ * SRST takes the search character from bits 56-63 of register 0 and
+ * requires bits 32-55 to be zero, so reduce c to a single byte before
+ * loading it (the previous code did this via the (char) cast).
+ */
+ asm volatile(
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
+ " jo 0b\n"
+ " jl 1f\n"
+ " la %[ret],0\n"
+ "1:"
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c & 0xff)
+ : "cc", "memory", "0");
return (void *) ret;
}
EXPORT_SYMBOL(memchr);
@@ -333,7 +306,7 @@ EXPORT_SYMBOL(memchr);
* memcmp - Compare two areas of memory
* @s1: One area of memory
* @s2: Another area of memory
- * @count: The size of the area.
+ * @n: The size of the area.
*/
#ifdef __HAVE_ARCH_MEMCMP
int memcmp(const void *s1, const void *s2, size_t n)
@@ -360,13 +333,16 @@ EXPORT_SYMBOL(memcmp);
#ifdef __HAVE_ARCH_MEMSCAN
void *memscan(void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
- return (void *) ret;
+ /*
+ * SRST takes the search character from bits 56-63 of register 0 and
+ * requires bits 32-55 to be zero, so reduce c to a single byte before
+ * loading it (the previous code did this via the (char) cast).
+ */
+ asm volatile(
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
+ " jo 0b\n"
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c & 0xff)
+ : "cc", "memory", "0");
+ return (void *)ret;
}
EXPORT_SYMBOL(memscan);
#endif
diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c
new file mode 100644
index 000000000000..9e62d62812e5
--- /dev/null
+++ b/arch/s390/lib/test_kprobes.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+#include <kunit/test.h>
+#include "test_kprobes.h"
+
+static struct kprobe kp;
+
+static void setup_kprobe(struct kunit *test, struct kprobe *kp,
+ const char *symbol, int offset)
+{
+ kp->offset = offset;
+ kp->addr = NULL;
+ kp->symbol_name = symbol;
+}
+
+/*
+ * Check that a probe at the start of @target can be registered, and that a
+ * probe placed @offset bytes into @target is rejected with -EINVAL.
+ */
+static void test_kprobe_offset(struct kunit *test, struct kprobe *kp,
+ const char *target, int offset)
+{
+ int ret;
+
+ /* Offset 0: registration is expected to succeed. */
+ setup_kprobe(test, kp, target, 0);
+ ret = register_kprobe(kp);
+ if (!ret)
+ unregister_kprobe(kp);
+ KUNIT_EXPECT_EQ(test, 0, ret);
+ /* Requested offset: registration is expected to fail. */
+ setup_kprobe(test, kp, target, offset);
+ ret = register_kprobe(kp);
+ KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+ /* Do not leave a probe behind if it was unexpectedly accepted. */
+ if (!ret)
+ unregister_kprobe(kp);
+}
+
+static void test_kprobe_odd(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_odd",
+ kprobes_target_odd_offs);
+}
+
+static void test_kprobe_in_insn4(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn4",
+ kprobes_target_in_insn4_offs);
+}
+
+static void test_kprobe_in_insn6_lo(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo",
+ kprobes_target_in_insn6_lo_offs);
+}
+
+static void test_kprobe_in_insn6_hi(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi",
+ kprobes_target_in_insn6_hi_offs);
+}
+
+static struct kunit_case kprobes_testcases[] = {
+ KUNIT_CASE(test_kprobe_odd),
+ KUNIT_CASE(test_kprobe_in_insn4),
+ KUNIT_CASE(test_kprobe_in_insn6_lo),
+ KUNIT_CASE(test_kprobe_in_insn6_hi),
+ {}
+};
+
+static struct kunit_suite kprobes_test_suite = {
+ .name = "kprobes_test_s390",
+ .test_cases = kprobes_testcases,
+};
+
+kunit_test_suites(&kprobes_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h
new file mode 100644
index 000000000000..2b4c9bc337f1
--- /dev/null
+++ b/arch/s390/lib/test_kprobes.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_KPROBES_H
+#define TEST_KPROBES_H
+
+extern unsigned long kprobes_target_odd_offs;
+extern unsigned long kprobes_target_in_insn4_offs;
+extern unsigned long kprobes_target_in_insn6_lo_offs;
+extern unsigned long kprobes_target_in_insn6_hi_offs;
+
+#endif
diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S
new file mode 100644
index 000000000000..ade7a3042334
--- /dev/null
+++ b/arch/s390/lib/test_kprobes_asm.S
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+/*
+ * Open a kprobe target function: emit the function start symbol followed by
+ * the FTRACE_GEN_NOP_ASM() expansion for it.
+ */
+#define KPROBES_TARGET_START(name) \
+ SYM_FUNC_START(name); \
+ FTRACE_GEN_NOP_ASM(name)
+
+/*
+ * Close a kprobe target function and emit a data symbol <name>_offs holding
+ * the distance from the function start to the preceding local label "1".
+ */
+#define KPROBES_TARGET_END(name) \
+ SYM_FUNC_END(name); \
+ SYM_DATA(name##_offs, .quad 1b - name)
+
+KPROBES_TARGET_START(kprobes_target_in_insn4)
+ .word 0x4700 // bc 0,0
+1: .word 0x0000
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn4)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_lo)
+ .word 0xe310 // ly 1,0
+1: .word 0x0000
+ .word 0x0058
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_lo)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_hi)
+ .word 0xe310 // ly 1,0
+ .word 0x0000
+1: .word 0x0058
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_hi)
+
+KPROBES_TARGET_START(kprobes_target_bp)
+ nop
+ .word 0x0000
+ nop
+1: br %r14
+KPROBES_TARGET_END(kprobes_target_bp)
+
+KPROBES_TARGET_START(kprobes_target_odd)
+ .byte 0x07
+1: .byte 0x07
+ br %r14
+KPROBES_TARGET_END(kprobes_target_odd)
diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c
new file mode 100644
index 000000000000..9894009fc1f2
--- /dev/null
+++ b/arch/s390/lib/test_modules.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <kunit/test.h>
+#include <linux/module.h>
+
+#include "test_modules.h"
+
+/*
+ * Test that modules with many relocations are loaded properly.
+ *
+ * Calls each of the 10000 test_modules_return_<i>() helpers once and sums
+ * the results. Helper i returns i, so the expected total is
+ * 0 + 1 + ... + 9999 = 49995000.
+ */
+static void test_modules_many_vmlinux_relocs(struct kunit *test)
+{
+ int result = 0;
+
+#define CALL_RETURN(i) result += test_modules_return_ ## i()
+ REPEAT_10000(CALL_RETURN);
+ KUNIT_ASSERT_EQ(test, result, 49995000);
+}
+
+static struct kunit_case modules_testcases[] = {
+ KUNIT_CASE(test_modules_many_vmlinux_relocs),
+ {}
+};
+
+static struct kunit_suite modules_test_suite = {
+ .name = "modules_test_s390",
+ .test_cases = modules_testcases,
+};
+
+kunit_test_suites(&modules_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h
new file mode 100644
index 000000000000..6371fcf17684
--- /dev/null
+++ b/arch/s390/lib/test_modules.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_MODULES_H
+#define TEST_MODULES_H
+
+#define __REPEAT_10000_3(f, x) \
+ f(x ## 0); \
+ f(x ## 1); \
+ f(x ## 2); \
+ f(x ## 3); \
+ f(x ## 4); \
+ f(x ## 5); \
+ f(x ## 6); \
+ f(x ## 7); \
+ f(x ## 8); \
+ f(x ## 9)
+#define __REPEAT_10000_2(f, x) \
+ __REPEAT_10000_3(f, x ## 0); \
+ __REPEAT_10000_3(f, x ## 1); \
+ __REPEAT_10000_3(f, x ## 2); \
+ __REPEAT_10000_3(f, x ## 3); \
+ __REPEAT_10000_3(f, x ## 4); \
+ __REPEAT_10000_3(f, x ## 5); \
+ __REPEAT_10000_3(f, x ## 6); \
+ __REPEAT_10000_3(f, x ## 7); \
+ __REPEAT_10000_3(f, x ## 8); \
+ __REPEAT_10000_3(f, x ## 9)
+#define __REPEAT_10000_1(f, x) \
+ __REPEAT_10000_2(f, x ## 0); \
+ __REPEAT_10000_2(f, x ## 1); \
+ __REPEAT_10000_2(f, x ## 2); \
+ __REPEAT_10000_2(f, x ## 3); \
+ __REPEAT_10000_2(f, x ## 4); \
+ __REPEAT_10000_2(f, x ## 5); \
+ __REPEAT_10000_2(f, x ## 6); \
+ __REPEAT_10000_2(f, x ## 7); \
+ __REPEAT_10000_2(f, x ## 8); \
+ __REPEAT_10000_2(f, x ## 9)
+#define REPEAT_10000(f) \
+ __REPEAT_10000_1(f, 0); \
+ __REPEAT_10000_1(f, 1); \
+ __REPEAT_10000_1(f, 2); \
+ __REPEAT_10000_1(f, 3); \
+ __REPEAT_10000_1(f, 4); \
+ __REPEAT_10000_1(f, 5); \
+ __REPEAT_10000_1(f, 6); \
+ __REPEAT_10000_1(f, 7); \
+ __REPEAT_10000_1(f, 8); \
+ __REPEAT_10000_1(f, 9)
+
+#define DECLARE_RETURN(i) int test_modules_return_ ## i(void)
+REPEAT_10000(DECLARE_RETURN);
+
+#endif
diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c
new file mode 100644
index 000000000000..1670349a03eb
--- /dev/null
+++ b/arch/s390/lib/test_modules_helpers.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/export.h>
+
+#include "test_modules.h"
+
+/*
+ * Define and export test_modules_return_<i>(), which returns the numeric
+ * value of i. REPEAT_10000() passes i as a four-digit token that may have
+ * leading zeros (e.g. 0042); pasting it after a leading 1 and subtracting
+ * 10000 yields its decimal value instead of an octal literal.
+ */
+#define DEFINE_RETURN(i) \
+ int test_modules_return_ ## i(void) \
+ { \
+ return 1 ## i - 10000; \
+ } \
+ EXPORT_SYMBOL_GPL(test_modules_return_ ## i)
+REPEAT_10000(DEFINE_RETURN);
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index 32b7a30b2485..5a053b393d5c 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -3,20 +3,28 @@
* Test module for unwind_for_each_frame
*/
-#define pr_fmt(fmt) "test_unwind: " fmt
+#include <kunit/test.h>
#include <asm/unwind.h>
#include <linux/completion.h>
#include <linux/kallsyms.h>
#include <linux/kthread.h>
+#include <linux/ftrace.h>
#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
#include <linux/string.h>
#include <linux/kprobes.h>
#include <linux/wait.h>
#include <asm/irq.h>
-#include <asm/delay.h>
+
+static struct kunit *current_test;
#define BT_BUF_SIZE (PAGE_SIZE * 4)
+static bool force_bt;
+module_param_named(backtrace, force_bt, bool, 0444);
+MODULE_PARM_DESC(backtrace, "print backtraces for all tests");
+
/*
* To avoid printk line limit split backtrace by lines
*/
@@ -28,7 +36,7 @@ static void print_backtrace(char *bt)
p = strsep(&bt, "\n");
if (!p)
break;
- pr_err("%s\n", p);
+ kunit_err(current_test, "%s\n", p);
}
}
@@ -39,7 +47,7 @@ static void print_backtrace(char *bt)
static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
unsigned long sp)
{
- int frame_count, prev_is_func2, seen_func2_func1;
+ int frame_count, prev_is_func2, seen_func2_func1, seen_kretprobe_trampoline;
const int max_frames = 128;
struct unwind_state state;
size_t bt_pos = 0;
@@ -48,13 +56,14 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC);
if (!bt) {
- pr_err("failed to allocate backtrace buffer\n");
+ kunit_err(current_test, "failed to allocate backtrace buffer\n");
return -ENOMEM;
}
/* Unwind. */
frame_count = 0;
prev_is_func2 = 0;
seen_func2_func1 = 0;
+ seen_kretprobe_trampoline = 0;
unwind_for_each_frame(&state, task, regs, sp) {
unsigned long addr = unwind_get_return_address(&state);
char sym[KSYM_SYMBOL_LEN];
@@ -62,8 +71,9 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
if (frame_count++ == max_frames)
break;
if (state.reliable && !addr) {
- pr_err("unwind state reliable but addr is 0\n");
- return -EINVAL;
+ kunit_err(current_test, "unwind state reliable but addr is 0\n");
+ ret = -EINVAL;
+ break;
}
sprint_symbol(sym, addr);
if (bt_pos < BT_BUF_SIZE) {
@@ -73,28 +83,34 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
stack_type_name(state.stack_info.type),
(void *)state.sp, (void *)state.ip);
if (bt_pos >= BT_BUF_SIZE)
- pr_err("backtrace buffer is too small\n");
+ kunit_err(current_test, "backtrace buffer is too small\n");
}
frame_count += 1;
if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1"))
seen_func2_func1 = 1;
prev_is_func2 = str_has_prefix(sym, "unwindme_func2");
+ if (str_has_prefix(sym, "__kretprobe_trampoline+0x0/"))
+ seen_kretprobe_trampoline = 1;
}
/* Check the results. */
if (unwind_error(&state)) {
- pr_err("unwind error\n");
+ kunit_err(current_test, "unwind error\n");
ret = -EINVAL;
}
if (!seen_func2_func1) {
- pr_err("unwindme_func2 and unwindme_func1 not found\n");
+ kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n");
ret = -EINVAL;
}
if (frame_count == max_frames) {
- pr_err("Maximum number of frames exceeded\n");
+ kunit_err(current_test, "Maximum number of frames exceeded\n");
ret = -EINVAL;
}
- if (ret)
+ if (seen_kretprobe_trampoline) {
+ kunit_err(current_test, "__kretprobe_trampoline+0x0 in unwinding results\n");
+ ret = -EINVAL;
+ }
+ if (ret || force_bt)
print_backtrace(bt);
kfree(bt);
return ret;
@@ -118,31 +134,187 @@ static struct unwindme *unwindme;
#define UWM_REGS 0x2 /* Pass regs to test_unwind(). */
#define UWM_SP 0x4 /* Pass sp to test_unwind(). */
#define UWM_CALLER 0x8 /* Unwind starting from caller. */
-#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */
+#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */
#define UWM_IRQ 0x20 /* Unwind from irq context. */
-#define UWM_PGM 0x40 /* Unwind from program check handler. */
+#define UWM_PGM 0x40 /* Unwind from program check handler */
+#define UWM_KPROBE_ON_FTRACE 0x80 /* Unwind from kprobe handler called via ftrace. */
+#define UWM_FTRACE 0x100 /* Unwind from ftrace handler. */
+#define UWM_KRETPROBE 0x200 /* Unwind through kretprobed function. */
+#define UWM_KRETPROBE_HANDLER 0x400 /* Unwind from kretprobe handler. */
-static __always_inline unsigned long get_psw_addr(void)
+static __always_inline struct pt_regs fake_pt_regs(void)
{
- unsigned long psw_addr;
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+ regs.gprs[15] = current_stack_pointer;
asm volatile(
"basr %[psw_addr],0\n"
- : [psw_addr] "=d" (psw_addr));
- return psw_addr;
+ : [psw_addr] "=d" (regs.psw.addr));
+ return regs;
}
-#ifdef CONFIG_KPROBES
-static int pgm_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int kretprobe_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct unwindme *u = unwindme;
+ if (!(u->flags & UWM_KRETPROBE_HANDLER))
+ return 0;
+
u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL,
(u->flags & UWM_SP) ? u->sp : 0);
+
return 0;
}
+
+static noinline notrace int test_unwind_kretprobed_func(struct unwindme *u)
+{
+ struct pt_regs regs;
+
+ if (!(u->flags & UWM_KRETPROBE))
+ return 0;
+
+ regs = fake_pt_regs();
+ return test_unwind(NULL, (u->flags & UWM_REGS) ? &regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+}
+
+static noinline int test_unwind_kretprobed_func_caller(struct unwindme *u)
+{
+ return test_unwind_kretprobed_func(u);
+}
+
+static int test_unwind_kretprobe(struct unwindme *u)
+{
+ int ret;
+ struct kretprobe my_kretprobe;
+
+ if (!IS_ENABLED(CONFIG_KPROBES))
+ kunit_skip(current_test, "requires CONFIG_KPROBES");
+
+ u->ret = -1; /* make sure kprobe is called */
+ unwindme = u;
+
+ memset(&my_kretprobe, 0, sizeof(my_kretprobe));
+ my_kretprobe.handler = kretprobe_ret_handler;
+ my_kretprobe.maxactive = 1;
+ my_kretprobe.kp.addr = (kprobe_opcode_t *)test_unwind_kretprobed_func;
+
+ ret = register_kretprobe(&my_kretprobe);
+
+ if (ret < 0) {
+ kunit_err(current_test, "register_kretprobe failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = test_unwind_kretprobed_func_caller(u);
+ unregister_kretprobe(&my_kretprobe);
+ unwindme = NULL;
+ if (u->flags & UWM_KRETPROBE_HANDLER)
+ ret = u->ret;
+ return ret;
+}
+
+static int kprobe_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct unwindme *u = unwindme;
+
+ u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+ return 0;
+}
+
+extern const char test_unwind_kprobed_insn[];
+
+static noinline void test_unwind_kprobed_func(void)
+{
+ asm volatile(
+ " nopr %%r7\n"
+ "test_unwind_kprobed_insn:\n"
+ " nopr %%r7\n"
+ :);
+}
+
+static int test_unwind_kprobe(struct unwindme *u)
+{
+ struct kprobe kp;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_KPROBES))
+ kunit_skip(current_test, "requires CONFIG_KPROBES");
+ if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE) && u->flags & UWM_KPROBE_ON_FTRACE)
+ kunit_skip(current_test, "requires CONFIG_KPROBES_ON_FTRACE");
+
+ u->ret = -1; /* make sure kprobe is called */
+ unwindme = u;
+ memset(&kp, 0, sizeof(kp));
+ kp.pre_handler = kprobe_pre_handler;
+ kp.addr = u->flags & UWM_KPROBE_ON_FTRACE ?
+ (kprobe_opcode_t *)test_unwind_kprobed_func :
+ (kprobe_opcode_t *)test_unwind_kprobed_insn;
+ ret = register_kprobe(&kp);
+ if (ret < 0) {
+ kunit_err(current_test, "register_kprobe failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ test_unwind_kprobed_func();
+ unregister_kprobe(&kp);
+ unwindme = NULL;
+ return u->ret;
+}
+
+static void notrace __used test_unwind_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct ftrace_regs *fregs)
+{
+ struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2];
+
+ u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+}
+
+static noinline int test_unwind_ftraced_func(struct unwindme *u)
+{
+ return READ_ONCE(u)->ret;
+}
+
+/*
+ * Unwind from an ftrace handler: attach test_unwind_ftrace_handler() to
+ * test_unwind_ftraced_func(), call that function once, and return the
+ * result the handler stored in u->ret. Returns non-zero on setup failure.
+ */
+static int test_unwind_ftrace(struct unwindme *u)
+{
+ int ret;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ struct ftrace_ops *fops;
+
+ fops = kunit_kzalloc(current_test, sizeof(*fops), GFP_KERNEL);
+ /* Allocation can fail; do not dereference a NULL ops pointer. */
+ if (!fops) {
+ kunit_err(current_test, "failed to allocate ftrace_ops\n");
+ return -ENOMEM;
+ }
+ fops->func = test_unwind_ftrace_handler;
+ fops->flags = FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_RECURSION |
+ FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_PERMANENT;
+#else
+ kunit_skip(current_test, "requires CONFIG_DYNAMIC_FTRACE");
+#endif
+ /* Restrict the handler to the single traced test function. */
+ ret = ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 0, 0);
+ if (ret) {
+ kunit_err(current_test, "failed to set ftrace filter (%d)\n", ret);
+ return -1;
+ }
+
+ ret = register_ftrace_function(fops);
+ if (!ret) {
+ ret = test_unwind_ftraced_func(u);
+ unregister_ftrace_function(fops);
+ } else {
+ kunit_err(current_test, "failed to register ftrace handler (%d)\n", ret);
+ }
+
+ /* Remove the filter entry again, whether or not registration worked. */
+ ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 1, 0);
+ return ret;
+}
+
/* This function may or may not appear in the backtrace. */
static noinline int unwindme_func4(struct unwindme *u)
{
@@ -153,40 +325,15 @@ static noinline int unwindme_func4(struct unwindme *u)
wait_event(u->task_wq, kthread_should_park());
kthread_parkme();
return 0;
-#ifdef CONFIG_KPROBES
- } else if (u->flags & UWM_PGM) {
- struct kprobe kp;
- int ret;
-
- unwindme = u;
- memset(&kp, 0, sizeof(kp));
- kp.symbol_name = "do_report_trap";
- kp.pre_handler = pgm_pre_handler;
- ret = register_kprobe(&kp);
- if (ret < 0) {
- pr_err("register_kprobe failed %d\n", ret);
- return -EINVAL;
- }
-
- /*
- * trigger specification exception
- */
- asm volatile(
- " mvcl %%r1,%%r1\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b, 0b)
- :);
-
- unregister_kprobe(&kp);
- unwindme = NULL;
- return u->ret;
-#endif
+ } else if (u->flags & (UWM_PGM | UWM_KPROBE_ON_FTRACE)) {
+ return test_unwind_kprobe(u);
+ } else if (u->flags & (UWM_KRETPROBE | UWM_KRETPROBE_HANDLER)) {
+ return test_unwind_kretprobe(u);
+ } else if (u->flags & UWM_FTRACE) {
+ return test_unwind_ftrace(u);
} else {
- struct pt_regs regs;
+ struct pt_regs regs = fake_pt_regs();
- memset(&regs, 0, sizeof(regs));
- regs.psw.addr = get_psw_addr();
- regs.gprs[15] = current_stack_pointer();
return test_unwind(NULL,
(u->flags & UWM_REGS) ? &regs : NULL,
(u->flags & UWM_SP) ? u->sp : 0);
@@ -203,12 +350,16 @@ static noinline int unwindme_func3(struct unwindme *u)
/* This function must appear in the backtrace. */
static noinline int unwindme_func2(struct unwindme *u)
{
+ unsigned long flags;
int rc;
if (u->flags & UWM_SWITCH_STACK) {
- preempt_disable();
- rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u);
- preempt_enable();
+ local_irq_save(flags);
+ local_mcck_disable();
+ rc = call_on_stack(1, S390_lowcore.nodat_stack,
+ int, unwindme_func3, struct unwindme *, u);
+ local_mcck_enable();
+ local_irq_restore(flags);
return rc;
} else {
return unwindme_func3(u);
@@ -221,31 +372,27 @@ static noinline int unwindme_func1(void *u)
return unwindme_func2((struct unwindme *)u);
}
-static void unwindme_irq_handler(struct ext_code ext_code,
- unsigned int param32,
- unsigned long param64)
+static void unwindme_timer_fn(struct timer_list *unused)
{
struct unwindme *u = READ_ONCE(unwindme);
- if (u && u->task == current) {
+ if (u) {
unwindme = NULL;
u->task = NULL;
u->ret = unwindme_func1(u);
+ complete(&u->task_ready);
}
}
+static struct timer_list unwind_timer;
+
static int test_unwind_irq(struct unwindme *u)
{
- preempt_disable();
- if (register_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler)) {
- pr_info("Couldn't register external interrupt handler");
- return -1;
- }
- u->task = current;
unwindme = u;
- udelay(1);
- unregister_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler);
- preempt_enable();
+ init_completion(&u->task_ready);
+ timer_setup(&unwind_timer, unwindme_timer_fn, 0);
+ mod_timer(&unwind_timer, jiffies + 1);
+ wait_for_completion(&u->task_ready);
return u->ret;
}
@@ -265,7 +412,7 @@ static int test_unwind_task(struct unwindme *u)
*/
task = kthread_run(unwindme_func1, u, "%s", __func__);
if (IS_ERR(task)) {
- pr_err("kthread_run() failed\n");
+ kunit_err(current_test, "kthread_run() failed\n");
return PTR_ERR(task);
}
/*
@@ -280,68 +427,96 @@ static int test_unwind_task(struct unwindme *u)
return ret;
}
-static int test_unwind_flags(int flags)
+struct test_params {
+ int flags;
+ char *name;
+};
+
+/*
+ * Create required parameter list for tests
+ */
+#define TEST_WITH_FLAGS(f) { .flags = f, .name = #f }
+static const struct test_params param_list[] = {
+ TEST_WITH_FLAGS(UWM_DEFAULT),
+ TEST_WITH_FLAGS(UWM_SP),
+ TEST_WITH_FLAGS(UWM_REGS),
+ TEST_WITH_FLAGS(UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_THREAD),
+ TEST_WITH_FLAGS(UWM_THREAD | UWM_SP),
+ TEST_WITH_FLAGS(UWM_THREAD | UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_PGM),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_SP),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_FTRACE),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP | UWM_REGS),
+};
+
+/*
+ * Parameter description generator: required for KUNIT_ARRAY_PARAM()
+ */
+static void get_desc(const struct test_params *params, char *desc)
+{
+ strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE);
+}
+
+/*
+ * Create test_unwind_gen_params
+ */
+KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc);
+
+static void test_unwind_flags(struct kunit *test)
{
struct unwindme u;
+ const struct test_params *params;
- u.flags = flags;
+ current_test = test;
+ params = (const struct test_params *)test->param_value;
+ u.flags = params->flags;
if (u.flags & UWM_THREAD)
- return test_unwind_task(&u);
+ KUNIT_EXPECT_EQ(test, 0, test_unwind_task(&u));
else if (u.flags & UWM_IRQ)
- return test_unwind_irq(&u);
+ KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u));
else
- return unwindme_func1(&u);
+ KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u));
}
-static int test_unwind_init(void)
-{
- int ret = 0;
-
-#define TEST(flags) \
-do { \
- pr_info("[ RUN ] " #flags "\n"); \
- if (!test_unwind_flags((flags))) { \
- pr_info("[ OK ] " #flags "\n"); \
- } else { \
- pr_err("[ FAILED ] " #flags "\n"); \
- ret = -EINVAL; \
- } \
-} while (0)
-
- TEST(UWM_DEFAULT);
- TEST(UWM_SP);
- TEST(UWM_REGS);
- TEST(UWM_SWITCH_STACK);
- TEST(UWM_SP | UWM_REGS);
- TEST(UWM_CALLER | UWM_SP);
- TEST(UWM_CALLER | UWM_SP | UWM_REGS);
- TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
- TEST(UWM_THREAD);
- TEST(UWM_THREAD | UWM_SP);
- TEST(UWM_THREAD | UWM_CALLER | UWM_SP);
- TEST(UWM_IRQ);
- TEST(UWM_IRQ | UWM_SWITCH_STACK);
- TEST(UWM_IRQ | UWM_SP);
- TEST(UWM_IRQ | UWM_REGS);
- TEST(UWM_IRQ | UWM_SP | UWM_REGS);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
-#ifdef CONFIG_KPROBES
- TEST(UWM_PGM);
- TEST(UWM_PGM | UWM_SP);
- TEST(UWM_PGM | UWM_REGS);
- TEST(UWM_PGM | UWM_SP | UWM_REGS);
-#endif
-#undef TEST
+static struct kunit_case unwind_test_cases[] = {
+ KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params),
+ {}
+};
- return ret;
-}
+static struct kunit_suite test_unwind_suite = {
+ .name = "test_unwind",
+ .test_cases = unwind_test_cases,
+};
-static void test_unwind_exit(void)
-{
-}
+kunit_test_suites(&test_unwind_suite);
-module_init(test_unwind_init);
-module_exit(test_unwind_exit);
MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index c4f8039a35e8..720036fb1924 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -8,104 +8,44 @@
* Gerald Schaefer (gerald.schaefer@de.ibm.com)
*/
-#include <linux/jump_label.h>
#include <linux/uaccess.h>
#include <linux/export.h>
-#include <linux/errno.h>
#include <linux/mm.h>
-#include <asm/mmu_context.h>
-#include <asm/facility.h>
+#include <asm/asm-extable.h>
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
-static DEFINE_STATIC_KEY_FALSE(have_mvcos);
-
-static int __init uaccess_init(void)
-{
- if (test_facility(27))
- static_branch_enable(&have_mvcos);
- return 0;
-}
-early_initcall(uaccess_init);
-
-static inline int copy_with_mvcos(void)
-{
- if (static_branch_likely(&have_mvcos))
- return 1;
- return 0;
-}
-#else
-static inline int copy_with_mvcos(void)
-{
- return 1;
-}
-#endif
-
-void set_fs(mm_segment_t fs)
-{
- current->thread.mm_segment = fs;
- if (fs == USER_DS) {
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
- } else {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- if (fs & 1) {
- if (fs == USER_DS_SACF)
- __ctl_load(S390_lowcore.user_asce, 7, 7);
- else
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
- set_cpu_flag(CIF_ASCE_SECONDARY);
- }
-}
-EXPORT_SYMBOL(set_fs);
-
-mm_segment_t enable_sacf_uaccess(void)
+#ifdef CONFIG_DEBUG_ENTRY
+void debug_user_asce(int exit)
{
- mm_segment_t old_fs;
- unsigned long asce, cr;
+ unsigned long cr1, cr7;
- old_fs = current->thread.mm_segment;
- if (old_fs & 1)
- return old_fs;
- current->thread.mm_segment |= 1;
- asce = S390_lowcore.kernel_asce;
- if (likely(old_fs == USER_DS)) {
- __ctl_store(cr, 1, 1);
- if (cr != S390_lowcore.kernel_asce) {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- asce = S390_lowcore.user_asce;
- }
- __ctl_store(cr, 7, 7);
- if (cr != asce) {
- __ctl_load(asce, 7, 7);
- set_cpu_flag(CIF_ASCE_SECONDARY);
- }
- return old_fs;
-}
-EXPORT_SYMBOL(enable_sacf_uaccess);
+ __ctl_store(cr1, 1, 1);
+ __ctl_store(cr7, 7, 7);
+ if (cr1 == S390_lowcore.kernel_asce && cr7 == S390_lowcore.user_asce)
+ return;
+ panic("incorrect ASCE on kernel %s\n"
+ "cr1: %016lx cr7: %016lx\n"
+ "kernel: %016llx user: %016llx\n",
+ exit ? "exit" : "entry", cr1, cr7,
+ S390_lowcore.kernel_asce, S390_lowcore.user_asce);
-void disable_sacf_uaccess(mm_segment_t old_fs)
-{
- current->thread.mm_segment = old_fs;
- if (old_fs == USER_DS && test_facility(27)) {
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
- }
}
-EXPORT_SYMBOL(disable_sacf_uaccess);
+#endif /* CONFIG_DEBUG_ENTRY */
-static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr,
- unsigned long size)
+static unsigned long raw_copy_from_user_key(void *to, const void __user *from,
+ unsigned long size, unsigned long key)
{
- register unsigned long reg0 asm("0") = 0x01UL;
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac2.key = key,
+ .oac2.as = PSW_BITS_AS_SECONDARY,
+ .oac2.k = 1,
+ .oac2.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n"
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%2),0(%1),%0\n"
"6: jz 4f\n"
"1: algr %0,%3\n"
" slgr %1,%3\n"
@@ -116,71 +56,56 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
" slgr %4,%1\n"
" clgr %0,%4\n" /* copy crosses next page boundary? */
" jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n"
+ "3: mvcos 0(%2),0(%1),%4\n"
"7: slgr %0,%4\n"
" j 5f\n"
"4: slgr %0,%0\n"
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
+ : "+a" (size), "+a" (from), "+a" (to), "+a" (tmp1), "=a" (tmp2)
+ : [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
-static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
- unsigned long size)
+unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
- unsigned long tmp1, tmp2;
- mm_segment_t old_fs;
-
- old_fs = enable_sacf_uaccess();
- tmp1 = -256UL;
- asm volatile(
- " sacf 0\n"
- "0: mvcp 0(%0,%2),0(%1),%3\n"
- "7: jz 5f\n"
- "1: algr %0,%3\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "2: mvcp 0(%0,%2),0(%1),%3\n"
- "8: jnz 1b\n"
- " j 5f\n"
- "3: la %4,255(%1)\n" /* %4 = ptr + 255 */
- " lghi %3,-4096\n"
- " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 6f\n"
- "4: mvcp 0(%4,%2),0(%1),%3\n"
- "9: slgr %0,%4\n"
- " j 6f\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b)
- EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
- return size;
+ return raw_copy_from_user_key(to, from, n, 0);
}
+EXPORT_SYMBOL(raw_copy_from_user);
-unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+unsigned long _copy_from_user_key(void *to, const void __user *from,
+ unsigned long n, unsigned long key)
{
- if (copy_with_mvcos())
- return copy_from_user_mvcos(to, from, n);
- return copy_from_user_mvcp(to, from, n);
+ unsigned long res = n;
+
+ might_fault();
+ if (!should_fail_usercopy()) {
+ instrument_copy_from_user_before(to, from, n);
+ res = raw_copy_from_user_key(to, from, n, key);
+ instrument_copy_from_user_after(to, from, n, res);
+ }
+ if (unlikely(res))
+ memset(to + (n - res), 0, res);
+ return res;
}
-EXPORT_SYMBOL(raw_copy_from_user);
+EXPORT_SYMBOL(_copy_from_user_key);
-static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
- unsigned long size)
+static unsigned long raw_copy_to_user_key(void __user *to, const void *from,
+ unsigned long size, unsigned long key)
{
- register unsigned long reg0 asm("0") = 0x010000UL;
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac1.key = key,
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.k = 1,
+ .oac1.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%1),0(%2),%0\n"
"6: jz 4f\n"
"1: algr %0,%3\n"
" slgr %1,%3\n"
@@ -191,136 +116,48 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
" slgr %4,%1\n"
" clgr %0,%4\n" /* copy crosses next page boundary? */
" jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n"
+ "3: mvcos 0(%1),0(%2),%4\n"
"7: slgr %0,%4\n"
" j 5f\n"
"4: slgr %0,%0\n"
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
-}
-
-static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
- unsigned long size)
-{
- unsigned long tmp1, tmp2;
- mm_segment_t old_fs;
-
- old_fs = enable_sacf_uaccess();
- tmp1 = -256UL;
- asm volatile(
- " sacf 0\n"
- "0: mvcs 0(%0,%1),0(%2),%3\n"
- "7: jz 5f\n"
- "1: algr %0,%3\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "2: mvcs 0(%0,%1),0(%2),%3\n"
- "8: jnz 1b\n"
- " j 5f\n"
- "3: la %4,255(%1)\n" /* %4 = ptr + 255 */
- " lghi %3,-4096\n"
- " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 6f\n"
- "4: mvcs 0(%4,%1),0(%2),%3\n"
- "9: slgr %0,%4\n"
- " j 6f\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b)
- EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
+ : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
+ : [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (copy_with_mvcos())
- return copy_to_user_mvcos(to, from, n);
- return copy_to_user_mvcs(to, from, n);
+ return raw_copy_to_user_key(to, from, n, 0);
}
EXPORT_SYMBOL(raw_copy_to_user);
-static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from,
- unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0x010001UL;
- unsigned long tmp1, tmp2;
-
- tmp1 = -4096UL;
- /* FIXME: copy with reduced length. */
- asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
- " jz 2f\n"
- "1: algr %0,%3\n"
- " slgr %1,%3\n"
- " slgr %2,%3\n"
- " j 0b\n"
- "2:slgr %0,%0\n"
- "3: \n"
- EX_TABLE(0b,3b)
- : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
-}
-
-static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from,
- unsigned long size)
-{
- mm_segment_t old_fs;
- unsigned long tmp1;
-
- old_fs = enable_sacf_uaccess();
- asm volatile(
- " sacf 256\n"
- " aghi %0,-1\n"
- " jo 5f\n"
- " bras %3,3f\n"
- "0: aghi %0,257\n"
- "1: mvc 0(1,%1),0(%2)\n"
- " la %1,1(%1)\n"
- " la %2,1(%2)\n"
- " aghi %0,-1\n"
- " jnz 1b\n"
- " j 5f\n"
- "2: mvc 0(256,%1),0(%2)\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "3: aghi %0,-256\n"
- " jnm 2b\n"
- "4: ex %0,1b-0b(%3)\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
- : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
- return size;
-}
-
-unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
+unsigned long _copy_to_user_key(void __user *to, const void *from,
+ unsigned long n, unsigned long key)
{
- if (copy_with_mvcos())
- return copy_in_user_mvcos(to, from, n);
- return copy_in_user_mvc(to, from, n);
+ might_fault();
+ if (should_fail_usercopy())
+ return n;
+ instrument_copy_to_user(to, from, n);
+ return raw_copy_to_user_key(to, from, n, key);
}
-EXPORT_SYMBOL(raw_copy_in_user);
+EXPORT_SYMBOL(_copy_to_user_key);
-static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
+unsigned long __clear_user(void __user *to, unsigned long size)
{
- register unsigned long reg0 asm("0") = 0x010000UL;
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n"
- " jz 4f\n"
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%1),0(%4),%0\n"
+ "6: jz 4f\n"
"1: algr %0,%2\n"
" slgr %1,%2\n"
" j 0b\n"
@@ -329,116 +166,15 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size
" slgr %3,%1\n"
" clgr %0,%3\n" /* copy crosses next page boundary? */
" jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n"
- " slgr %0,%3\n"
+ "3: mvcos 0(%1),0(%4),%3\n"
+ "7: slgr %0,%3\n"
" j 5f\n"
"4: slgr %0,%0\n"
"5:\n"
- EX_TABLE(0b,2b) EX_TABLE(3b,5b)
+ EX_TABLE(0b,2b) EX_TABLE(6b,2b) EX_TABLE(3b,5b) EX_TABLE(7b,5b)
: "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2)
- : "a" (empty_zero_page), "d" (reg0) : "cc", "memory");
+ : "a" (empty_zero_page), [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
-
-static inline unsigned long clear_user_xc(void __user *to, unsigned long size)
-{
- mm_segment_t old_fs;
- unsigned long tmp1, tmp2;
-
- old_fs = enable_sacf_uaccess();
- asm volatile(
- " sacf 256\n"
- " aghi %0,-1\n"
- " jo 5f\n"
- " bras %3,3f\n"
- " xc 0(1,%1),0(%1)\n"
- "0: aghi %0,257\n"
- " la %2,255(%1)\n" /* %2 = ptr + 255 */
- " srl %2,12\n"
- " sll %2,12\n" /* %2 = (ptr + 255) & -4096 */
- " slgr %2,%1\n"
- " clgr %0,%2\n" /* clear crosses next page boundary? */
- " jnh 5f\n"
- " aghi %2,-1\n"
- "1: ex %2,0(%3)\n"
- " aghi %2,1\n"
- " slgr %0,%2\n"
- " j 5f\n"
- "2: xc 0(256,%1),0(%1)\n"
- " la %1,256(%1)\n"
- "3: aghi %0,-256\n"
- " jnm 2b\n"
- "4: ex %0,0(%3)\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
- : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
- return size;
-}
-
-unsigned long __clear_user(void __user *to, unsigned long size)
-{
- if (copy_with_mvcos())
- return clear_user_mvcos(to, size);
- return clear_user_xc(to, size);
-}
EXPORT_SYMBOL(__clear_user);
-
-static inline unsigned long strnlen_user_srst(const char __user *src,
- unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0;
- unsigned long tmp1, tmp2;
-
- asm volatile(
- " la %2,0(%1)\n"
- " la %3,0(%0,%1)\n"
- " slgr %0,%0\n"
- " sacf 256\n"
- "0: srst %3,%2\n"
- " jo 0b\n"
- " la %0,1(%3)\n" /* strnlen_user results includes \0 */
- " slgr %0,%1\n"
- "1: sacf 768\n"
- EX_TABLE(0b,1b)
- : "+a" (size), "+a" (src), "=a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
-}
-
-unsigned long __strnlen_user(const char __user *src, unsigned long size)
-{
- mm_segment_t old_fs;
- unsigned long len;
-
- if (unlikely(!size))
- return 0;
- old_fs = enable_sacf_uaccess();
- len = strnlen_user_srst(src, size);
- disable_sacf_uaccess(old_fs);
- return len;
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long __strncpy_from_user(char *dst, const char __user *src, long size)
-{
- size_t done, len, offset, len_str;
-
- if (unlikely(size <= 0))
- return 0;
- done = 0;
- do {
- offset = (size_t)src & (L1_CACHE_BYTES - 1);
- len = min(size - done, L1_CACHE_BYTES - offset);
- if (copy_from_user(dst, src, len))
- return -EFAULT;
- len_str = strnlen(dst, len);
- done += len_str;
- src += len_str;
- dst += len_str;
- } while ((len_str == len) && (done < size));
- return done;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
index 29d9470dbceb..fb924a8041dc 100644
--- a/arch/s390/lib/xor.c
+++ b/arch/s390/lib/xor.c
@@ -11,7 +11,8 @@
#include <linux/raid/xor.h>
#include <asm/xor.h>
-static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
asm volatile(
" larl 1,2f\n"
@@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
: "0", "1", "cc", "memory");
}
-static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
asm volatile(
" larl 1,2f\n"
@@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
-static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
asm volatile(
" larl 1,2f\n"
@@ -88,12 +92,12 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
-static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
- /* Get around a gcc oddity */
- register unsigned long *reg7 asm ("7") = p5;
-
asm volatile(
" larl 1,2f\n"
" aghi %0,-1\n"
@@ -122,7 +126,7 @@ static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
" xc 0(1,%1),0(%5)\n"
"3:\n"
: "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4),
- "+a" (reg7)
+ "+a" (p5)
: : "0", "1", "cc", "memory");
}
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 3175413186b9..57e4f3a24829 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -4,11 +4,11 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
-obj-y += page-states.o pageattr.o pgtable.o pgalloc.o
+obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a51c892f14f3..9141ed4c52e9 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -14,15 +14,13 @@
#include <linux/moduleparam.h>
#include <linux/gfp.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysctl.h>
-#include <linux/ctype.h>
#include <linux/swap.h>
#include <linux/kthread.h>
#include <linux/oom.h>
-#include <linux/suspend.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/diag.h>
#ifdef CONFIG_CMM_IUCV
@@ -49,7 +47,6 @@ static volatile long cmm_pages_target;
static volatile long cmm_timed_pages_target;
static long cmm_timeout_pages;
static long cmm_timeout_seconds;
-static int cmm_suspended;
static struct cmm_page_array *cmm_page_list;
static struct cmm_page_array *cmm_timed_page_list;
@@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(addr >> PAGE_SHIFT, 1);
+ diag10_range(virt_to_pfn(addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
@@ -151,9 +148,9 @@ static int cmm_thread(void *dummy)
while (1) {
rc = wait_event_interruptible(cmm_thread_wait,
- (!cmm_suspended && (cmm_pages != cmm_pages_target ||
- cmm_timed_pages != cmm_timed_pages_target)) ||
- kthread_should_stop());
+ cmm_pages != cmm_pages_target ||
+ cmm_timed_pages != cmm_timed_pages_target ||
+ kthread_should_stop());
if (kthread_should_stop() || rc == -ERESTARTSYS) {
cmm_pages_target = cmm_pages;
cmm_timed_pages_target = cmm_timed_pages;
@@ -191,7 +188,7 @@ static void cmm_set_timer(void)
del_timer(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+ mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp)
}
static int cmm_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
struct ctl_table ctl_entry = {
@@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
long nr = cmm_get_timed_pages();
@@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timeout_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
long nr, seconds;
@@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
if (write) {
len = min(*lenp, sizeof(buf));
- if (copy_from_user(buf, buffer, len))
- return -EFAULT;
+ memcpy(buf, buffer, len);
buf[len - 1] = '\0';
cmm_skip_blanks(buf, &p);
nr = simple_strtoul(p, &p, 0);
@@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
cmm_timeout_pages, cmm_timeout_seconds);
if (len > *lenp)
len = *lenp;
- if (copy_to_user(buffer, buf, len))
- return -EFAULT;
+ memcpy(buffer, buf, len);
*lenp = len;
*ppos += len;
}
@@ -390,38 +385,6 @@ static void cmm_smsg_target(const char *from, char *msg)
static struct ctl_table_header *cmm_sysctl_header;
-static int cmm_suspend(void)
-{
- cmm_suspended = 1;
- cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
- cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
- return 0;
-}
-
-static int cmm_resume(void)
-{
- cmm_suspended = 0;
- cmm_kick_thread();
- return 0;
-}
-
-static int cmm_power_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- switch (event) {
- case PM_POST_HIBERNATION:
- return cmm_resume();
- case PM_HIBERNATION_PREPARE:
- return cmm_suspend();
- default:
- return NOTIFY_DONE;
- }
-}
-
-static struct notifier_block cmm_power_notifier = {
- .notifier_call = cmm_power_event,
-};
-
static int __init cmm_init(void)
{
int rc = -ENOMEM;
@@ -431,13 +394,10 @@ static int __init cmm_init(void)
goto out_sysctl;
#ifdef CONFIG_CMM_IUCV
/* convert sender to uppercase characters */
- if (sender) {
- int len = strlen(sender);
- while (len--)
- sender[len] = toupper(sender[len]);
- } else {
+ if (sender)
+ string_upper(sender, sender);
+ else
sender = cmm_default_sender;
- }
rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
if (rc < 0)
@@ -446,16 +406,11 @@ static int __init cmm_init(void)
rc = register_oom_notifier(&cmm_oom_nb);
if (rc < 0)
goto out_oom_notify;
- rc = register_pm_notifier(&cmm_power_notifier);
- if (rc)
- goto out_pm;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (!IS_ERR(cmm_thread_ptr))
return 0;
rc = PTR_ERR(cmm_thread_ptr);
- unregister_pm_notifier(&cmm_power_notifier);
-out_pm:
unregister_oom_notifier(&cmm_oom_nb);
out_oom_notify:
#ifdef CONFIG_CMM_IUCV
@@ -475,7 +430,6 @@ static void __exit cmm_exit(void)
#ifdef CONFIG_CMM_IUCV
smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
#endif
- unregister_pm_notifier(&cmm_power_notifier);
unregister_oom_notifier(&cmm_oom_nb);
kthread_stop(cmm_thread_ptr);
del_timer_sync(&cmm_timer);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 5d67b81c704a..9953819d7959 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -1,12 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/set_memory.h>
+#include <linux/ptdump.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
-#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kfence.h>
#include <linux/kasan.h>
+#include <asm/ptdump.h>
#include <asm/kasan.h>
+#include <asm/abs_lowcore.h>
+#include <asm/nospec-branch.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
+#include <asm/maccess.h>
static unsigned long max_addr;
@@ -16,266 +21,267 @@ struct addr_marker {
};
enum address_markers_idx {
- IDENTITY_NR = 0,
+ IDENTITY_BEFORE_NR = 0,
+ IDENTITY_BEFORE_END_NR,
+ AMODE31_START_NR,
+ AMODE31_END_NR,
KERNEL_START_NR,
KERNEL_END_NR,
+#ifdef CONFIG_KFENCE
+ KFENCE_START_NR,
+ KFENCE_END_NR,
+#endif
+ IDENTITY_AFTER_NR,
+ IDENTITY_AFTER_END_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
VMEMMAP_NR,
+ VMEMMAP_END_NR,
VMALLOC_NR,
+ VMALLOC_END_NR,
MODULES_NR,
+ MODULES_END_NR,
+ ABS_LOWCORE_NR,
+ ABS_LOWCORE_END_NR,
+ MEMCPY_REAL_NR,
+ MEMCPY_REAL_END_NR,
};
static struct addr_marker address_markers[] = {
- [IDENTITY_NR] = {0, "Identity Mapping"},
+ [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
+ [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
+ [AMODE31_START_NR] = {0, "Amode31 Area Start"},
+ [AMODE31_END_NR] = {0, "Amode31 Area End"},
[KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
[KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
+#ifdef CONFIG_KFENCE
+ [KFENCE_START_NR] = {0, "KFence Pool Start"},
+ [KFENCE_END_NR] = {0, "KFence Pool End"},
+#endif
+ [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"},
+ [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"},
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"},
[KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"},
#endif
- [VMEMMAP_NR] = {0, "vmemmap Area"},
- [VMALLOC_NR] = {0, "vmalloc Area"},
- [MODULES_NR] = {0, "Modules Area"},
+ [VMEMMAP_NR] = {0, "vmemmap Area Start"},
+ [VMEMMAP_END_NR] = {0, "vmemmap Area End"},
+ [VMALLOC_NR] = {0, "vmalloc Area Start"},
+ [VMALLOC_END_NR] = {0, "vmalloc Area End"},
+ [MODULES_NR] = {0, "Modules Area Start"},
+ [MODULES_END_NR] = {0, "Modules Area End"},
+ [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"},
+ [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"},
+ [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"},
+ [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"},
{ -1, NULL }
};
struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
int level;
unsigned int current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
};
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt, ##args); \
+})
+
+#define pt_dump_seq_puts(m, fmt) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt); \
+})
+
static void print_prot(struct seq_file *m, unsigned int pr, int level)
{
static const char * const level_name[] =
{ "ASCE", "PGD", "PUD", "PMD", "PTE" };
- seq_printf(m, "%s ", level_name[level]);
+ pt_dump_seq_printf(m, "%s ", level_name[level]);
if (pr & _PAGE_INVALID) {
- seq_printf(m, "I\n");
+ pt_dump_seq_printf(m, "I\n");
return;
}
- seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
- seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
+ pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
+ pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
}
-static void note_page(struct seq_file *m, struct pg_state *st,
- unsigned int new_prot, int level)
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+#ifdef CONFIG_DEBUG_WX
+ if (!st->check_wx)
+ return;
+ if (st->current_prot & _PAGE_INVALID)
+ return;
+ if (st->current_prot & _PAGE_PROTECT)
+ return;
+ if (st->current_prot & _PAGE_NOEXEC)
+ return;
+ /*
+ * The first lowcore page is W+X if spectre mitigations are using
+ * trampolines or the BEAR enhancements facility is not installed,
+ * in which case we have two lpswe instructions in lowcore that need
+ * to be executable.
+ */
+ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
+ return;
+ WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+#endif /* CONFIG_DEBUG_WX */
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
- static const char units[] = "KMGTPE";
int width = sizeof(unsigned long) * 2;
+ static const char units[] = "KMGTPE";
const char *unit = units;
- unsigned int prot, cur;
unsigned long delta;
+ struct pg_state *st;
+ struct seq_file *m;
+ unsigned int prot;
- /*
- * If we have a "break" in the series, we need to flush the state
- * that we have now. "break" is either changing perms, levels or
- * address space marker.
- */
- prot = new_prot;
- cur = st->current_prot;
-
- if (!st->level) {
- /* First entry */
- st->current_prot = new_prot;
+ st = container_of(pt_st, struct pg_state, ptdump);
+ m = st->seq;
+ prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC);
+ if (level == 4 && (val & _PAGE_INVALID))
+ prot = _PAGE_INVALID;
+ /* For pmd_none() & friends val gets passed as zero. */
+ if (level != 4 && !val)
+ prot = _PAGE_INVALID;
+ /* Final flush from generic code. */
+ if (level == -1)
+ addr = max_addr;
+ if (st->level == -1) {
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
- st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
- /* Print the actual finished series */
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
- delta = (st->current_address - st->start_address) >> 10;
+ } else if (prot != st->current_prot || level != st->level ||
+ addr >= st->marker[1].start_address) {
+ note_prot_wx(st, addr);
+ pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, addr);
+ delta = (addr - st->start_address) >> 10;
while (!(delta & 0x3ff) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
+ pt_dump_seq_printf(m, "%9lu%c ", delta, *unit);
print_prot(m, st->current_prot, st->level);
- while (st->current_address >= st->marker[1].start_address) {
+ while (addr >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
- st->start_address = st->current_address;
- st->current_prot = new_prot;
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
}
}
-#ifdef CONFIG_KASAN
-static void note_kasan_early_shadow_page(struct seq_file *m,
- struct pg_state *st)
-{
- unsigned int prot;
-
- prot = pte_val(*kasan_early_shadow_pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
-}
-#endif
-
-/*
- * The actual page table walker functions. In order to keep the
- * implementation of print_prot() short, we only check and pass
- * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region,
- * segment or page table entry is invalid or read-only.
- * After all it's just a hint that the current level being walked
- * contains an invalid or read-only entry.
- */
-static void walk_pte_level(struct seq_file *m, struct pg_state *st,
- pmd_t *pmd, unsigned long addr)
-{
- unsigned int prot;
- pte_t *pte;
- int i;
-
- for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
- st->current_address = addr;
- pte = pte_offset_kernel(pmd, addr);
- prot = pte_val(*pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
- addr += PAGE_SIZE;
- }
-}
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
- pud_t *pud, unsigned long addr)
-{
- unsigned int prot;
- pmd_t *pmd;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- pmd = pmd_offset(pud, addr);
- for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) {
- st->current_address = addr;
- if (!pmd_none(*pmd)) {
- if (pmd_large(*pmd)) {
- prot = pmd_val(*pmd) &
- (_SEGMENT_ENTRY_PROTECT |
- _SEGMENT_ENTRY_NOEXEC);
- note_page(m, st, prot, 3);
- } else
- walk_pte_level(m, st, pmd, addr);
- } else
- note_page(m, st, _PAGE_INVALID, 3);
- addr += PMD_SIZE;
- }
-}
-
-static void walk_pud_level(struct seq_file *m, struct pg_state *st,
- p4d_t *p4d, unsigned long addr)
+#ifdef CONFIG_DEBUG_WX
+void ptdump_check_wx(void)
{
- unsigned int prot;
- pud_t *pud;
- int i;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = NULL,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = true,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = (struct addr_marker[]) {
+ { .start_address = 0, .name = NULL},
+ { .start_address = -1, .name = NULL},
+ },
+ };
-#ifdef CONFIG_KASAN
- if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) {
- note_kasan_early_shadow_page(m, st);
+ if (!MACHINE_HAS_NX)
return;
- }
-#endif
-
- pud = pud_offset(p4d, addr);
- for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) {
- st->current_address = addr;
- if (!pud_none(*pud))
- if (pud_large(*pud)) {
- prot = pud_val(*pud) &
- (_REGION_ENTRY_PROTECT |
- _REGION_ENTRY_NOEXEC);
- note_page(m, st, prot, 2);
- } else
- walk_pmd_level(m, st, pud, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += PUD_SIZE;
- }
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
+ (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
+ "unexpected " : "");
}
+#endif /* CONFIG_DEBUG_WX */
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
- pgd_t *pgd, unsigned long addr)
+#ifdef CONFIG_PTDUMP_DEBUGFS
+static int ptdump_show(struct seq_file *m, void *v)
{
- p4d_t *p4d;
- int i;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = m,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = false,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = address_markers,
+ };
-#ifdef CONFIG_KASAN
- if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- p4d = p4d_offset(pgd, addr);
- for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) {
- st->current_address = addr;
- if (!p4d_none(*p4d))
- walk_pud_level(m, st, p4d, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += P4D_SIZE;
- }
+ get_online_mems();
+ mutex_lock(&cpa_mutex);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ mutex_unlock(&cpa_mutex);
+ put_online_mems();
+ return 0;
}
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
-static void walk_pgd_level(struct seq_file *m)
+/*
+ * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple
+ * insertion sort to preserve the original order of markers with the same
+ * start address.
+ */
+static void sort_address_markers(void)
{
- unsigned long addr = 0;
- struct pg_state st;
- pgd_t *pgd;
- int i;
+ struct addr_marker tmp;
+ int i, j;
- memset(&st, 0, sizeof(st));
- for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
- st.current_address = addr;
- pgd = pgd_offset_k(addr);
- if (!pgd_none(*pgd))
- walk_p4d_level(m, &st, pgd, addr);
- else
- note_page(m, &st, _PAGE_INVALID, 1);
- addr += PGDIR_SIZE;
- cond_resched();
+ for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) {
+ tmp = address_markers[i];
+ for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--)
+ address_markers[j + 1] = address_markers[j];
+ address_markers[j + 1] = tmp;
}
- /* Flush out the last page */
- st.current_address = max_addr;
- note_page(m, &st, 0, 0);
-}
-
-static int ptdump_show(struct seq_file *m, void *v)
-{
- walk_pgd_level(m);
- return 0;
-}
-
-static int ptdump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show, NULL);
}
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int pt_dump_init(void)
{
+#ifdef CONFIG_KFENCE
+ unsigned long kfence_start = (unsigned long)__kfence_pool;
+#endif
/*
* Figure out the maximum virtual address being accessible with the
* kernel ASCE. We need this to keep the page table walker functions
@@ -283,10 +289,27 @@ static int pt_dump_init(void)
*/
max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
+ address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size;
+ address_markers[AMODE31_START_NR].start_address = __samode31;
+ address_markers[AMODE31_END_NR].start_address = __eamode31;
address_markers[MODULES_NR].start_address = MODULES_VADDR;
+ address_markers[MODULES_END_NR].start_address = MODULES_END;
+ address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
+ address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
+ address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+ address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+#ifdef CONFIG_KFENCE
+ address_markers[KFENCE_START_NR].start_address = kfence_start;
+ address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE;
+#endif
+ sort_address_markers();
+#ifdef CONFIG_PTDUMP_DEBUGFS
debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
return 0;
}
device_initcall(pt_dump_init);
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
new file mode 100644
index 000000000000..1e4d2187541a
--- /dev/null
+++ b/arch/s390/mm/extable.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitfield.h>
+#include <linux/extable.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/panic.h>
+#include <asm/asm-extable.h>
+#include <asm/extable.h>
+
+/*
+ * Look up the exception table entry for @addr. The result of
+ * search_exception_tables() is preferred; only if that finds nothing is the
+ * table delimited by __start_amode31_ex_table/__stop_amode31_ex_table
+ * searched. Returns whatever the last lookup performed returned.
+ */
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+	const struct exception_table_entry *entry = search_exception_tables(addr);
+
+	if (!entry) {
+		size_t count = __stop_amode31_ex_table - __start_amode31_ex_table;
+
+		entry = search_extable(__start_amode31_ex_table, count, addr);
+	}
+	return entry;
+}
+
+/*
+ * Handler used by fixup_exception() for EX_TYPE_FIXUP entries: redirect the
+ * PSW address in @regs to the value extable_fixup() yields for @ex.
+ * Always returns true.
+ */
+static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+/*
+ * Handler used by fixup_exception() for EX_TYPE_UA_STORE entries: store
+ * -EFAULT in the general purpose register selected by the EX_DATA_REG_ERR
+ * field of ex->data and continue at the entry's fixup address.
+ * Always returns true.
+ */
+static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+	unsigned int err_reg;
+
+	err_reg = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+	regs->gprs[err_reg] = -EFAULT;
+	regs->psw.addr = extable_fixup(ex);
+	return true;
+}
+
+/*
+ * Handler used by fixup_exception() for EX_TYPE_UA_LOAD_MEM entries.
+ * ex->data encodes two register numbers and a length: the error register
+ * receives -EFAULT, then EX_DATA_LEN bytes at the address held in the
+ * address register are zeroed, and execution continues at the entry's
+ * fixup address. Always returns true.
+ */
+static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+	unsigned int err_reg = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+	unsigned int addr_reg = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+	size_t nbytes = FIELD_GET(EX_DATA_LEN, ex->data);
+	void *dst;
+
+	/* Write the error code first, then read the destination address. */
+	regs->gprs[err_reg] = -EFAULT;
+	dst = (void *)regs->gprs[addr_reg];
+	memset(dst, 0, nbytes);
+	regs->psw.addr = extable_fixup(ex);
+	return true;
+}
+
+/*
+ * Handler used by fixup_exception() for EX_TYPE_UA_LOAD_REG entries: the
+ * register named by EX_DATA_REG_ERR receives -EFAULT, then the register
+ * named by EX_DATA_REG_ADDR is cleared, and execution continues at the
+ * entry's fixup address. Always returns true.
+ */
+static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+	unsigned int err_reg, zero_reg;
+
+	err_reg = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+	zero_reg = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+
+	/* Keep this order: the zeroing store is performed last. */
+	regs->gprs[err_reg] = -EFAULT;
+	regs->gprs[zero_reg] = 0;
+	regs->psw.addr = extable_fixup(ex);
+	return true;
+}
+
+/*
+ * Try to recover from an exception by means of the exception tables.
+ *
+ * The entry for the instruction pointer in @regs is looked up with
+ * s390_search_extables(). Returns false when there is no entry; otherwise
+ * the handler matching the entry's type is run and its result returned.
+ * An entry with a type not listed below ends in panic().
+ */
+bool fixup_exception(struct pt_regs *regs)
+{
+	const struct exception_table_entry *entry = s390_search_extables(instruction_pointer(regs));
+
+	if (!entry)
+		return false;
+
+	switch (entry->type) {
+	case EX_TYPE_FIXUP:
+		return ex_handler_fixup(entry, regs);
+	case EX_TYPE_BPF:
+		return ex_handler_bpf(entry, regs);
+	case EX_TYPE_UA_STORE:
+		return ex_handler_ua_store(entry, regs);
+	case EX_TYPE_UA_LOAD_MEM:
+		return ex_handler_ua_load_mem(entry, regs);
+	case EX_TYPE_UA_LOAD_REG:
+		return ex_handler_ua_load_reg(entry, regs);
+	}
+
+	/* Only reached for a type without a case label above. */
+	panic("invalid exception table entry");
+}
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index fd0dae9d10f4..5060956b8e7d 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -20,9 +20,9 @@
#include <linux/ctype.h>
#include <linux/ioport.h>
#include <linux/refcount.h>
+#include <linux/pgtable.h>
#include <asm/diag.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/ebcdic.h>
#include <asm/errno.h>
#include <asm/extmem.h>
@@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
goto out_free;
}
- rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
-
- if (rc)
- goto out_free;
-
seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (seg->res == NULL) {
rc = -ENOMEM;
- goto out_shared;
+ goto out_free;
}
seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
seg->res->start = seg->start_addr;
@@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
if (rc == SEG_TYPE_SC ||
((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
seg->res->flags |= IORESOURCE_READONLY;
+
+ /* Check for overlapping resources before adding the mapping. */
if (request_resource(&iomem_resource, seg->res)) {
rc = -EBUSY;
- kfree(seg->res);
- goto out_shared;
+ goto out_free_resource;
}
+ rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ if (rc)
+ goto out_resource;
+
if (do_nonshared)
diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
&start_addr, &end_addr);
@@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
rc = diag_cc;
- goto out_resource;
+ goto out_mapping;
}
if (diag_cc > 1) {
pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
rc = dcss_diag_translate_rc(end_addr);
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
- goto out_resource;
+ goto out_mapping;
}
seg->start_addr = start_addr;
seg->end = end_addr;
@@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
(void*) seg->end, segtype_string[seg->vm_segtype]);
}
goto out;
+ out_mapping:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_resource:
release_resource(seg->res);
+ out_free_resource:
kfree(seg->res);
- out_shared:
- vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_free:
kfree(seg);
out:
@@ -400,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
* -EIO : could not perform query or load diagnose
* -ENOENT : no such segment
* -EOPNOTSUPP: multi-part segment cannot be used with linux
- * -ENOSPC : segment cannot be used (overlaps with storage)
- * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * -EBUSY : segment cannot be used (overlaps with dcss or storage)
* -ERANGE : segment cannot be used (exceeds kernel mapping range)
* -EPERM : segment is currently loaded with incompatible permissions
* -ENOMEM : out of memory
@@ -626,10 +626,6 @@ void segment_warning(int rc, char *seg_name)
pr_err("DCSS %s has multiple page ranges and cannot be "
"loaded or queried\n", seg_name);
break;
- case -ENOSPC:
- pr_err("DCSS %s overlaps with used storage and cannot "
- "be loaded\n", seg_name);
- break;
case -EBUSY:
pr_err("%s needs used memory resources and cannot be "
"loaded or queried\n", seg_name);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb475c166..9649d9382e0a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,29 +31,30 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
+#include <linux/kfence.h>
+#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
+#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000)
enum fault_type {
KERNEL_FAULT,
USER_FAULT,
- VDSO_FAULT,
GMAP_FAULT,
};
@@ -77,22 +78,16 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
trans_exc_code = regs->int_parm_long & 3;
if (likely(trans_exc_code == 0)) {
/* primary space exception */
- if (IS_ENABLED(CONFIG_PGSTE) &&
- test_pt_regs_flag(regs, PIF_GUEST_FAULT))
- return GMAP_FAULT;
- if (current->thread.mm_segment == USER_DS)
+ if (user_mode(regs))
return USER_FAULT;
- return KERNEL_FAULT;
- }
- if (trans_exc_code == 2) {
- /* secondary space exception */
- if (current->thread.mm_segment & 1) {
- if (current->thread.mm_segment == USER_DS_SACF)
- return USER_FAULT;
+ if (!IS_ENABLED(CONFIG_PGSTE))
return KERNEL_FAULT;
- }
- return VDSO_FAULT;
+ if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+ return GMAP_FAULT;
+ return KERNEL_FAULT;
}
+ if (trans_exc_code == 2)
+ return USER_FAULT;
if (trans_exc_code == 1) {
/* access register mode, not used in the kernel */
return USER_FAULT;
@@ -105,7 +100,7 @@ static int bad_address(void *p)
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -121,8 +116,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R1:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
if (bad_address(table))
@@ -130,8 +125,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R2:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
if (bad_address(table))
@@ -139,8 +134,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R3:%016lx ", *table);
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (bad_address(table))
@@ -148,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("S:%016lx ", *table);
if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
if (bad_address(table))
@@ -188,10 +183,6 @@ static void dump_fault_info(struct pt_regs *regs)
asce = S390_lowcore.user_asce;
pr_cont("user ");
break;
- case VDSO_FAULT:
- asce = S390_lowcore.vdso_asce;
- pr_cont("vdso ");
- break;
case GMAP_FAULT:
asce = ((struct gmap *) S390_lowcore.gmap)->asce;
pr_cont("gmap ");
@@ -237,29 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-const struct exception_table_entry *s390_search_extables(unsigned long addr)
-{
- const struct exception_table_entry *fixup;
-
- fixup = search_extable(__start_dma_ex_table,
- __stop_dma_ex_table - __start_dma_ex_table,
- addr);
- if (!fixup)
- fixup = search_exception_tables(addr);
- return fixup;
-}
-
static noinline void do_no_context(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
-
- /* Are we prepared to handle this kernel fault? */
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ if (fixup_exception(regs))
return;
- }
-
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
@@ -272,7 +244,6 @@ static noinline void do_no_context(struct pt_regs *regs)
" in virtual user address space\n");
dump_fault_info(regs);
die(regs, "Oops");
- do_exit(SIGKILL);
}
static noinline void do_low_address(struct pt_regs *regs)
@@ -282,7 +253,6 @@ static noinline void do_low_address(struct pt_regs *regs)
if (regs->psw.mask & PSW_MASK_PSTATE) {
/* Low-address protection hit in user mode 'cannot happen'. */
die (regs, "Low-address protection");
- do_exit(SIGKILL);
}
do_no_context(regs);
@@ -298,36 +268,12 @@ static noinline void do_sigbus(struct pt_regs *regs)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-static noinline int signal_return(struct pt_regs *regs)
-{
- u16 instruction;
- int rc;
-
- rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
- if (rc)
- return rc;
- if (instruction == 0x0a77) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x00040077;
- return 0;
- } else if (instruction == 0x0aad) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x000400ad;
- return 0;
- }
- return -EACCES;
-}
-
-static noinline void do_fault_error(struct pt_regs *regs, int access,
- vm_fault_t fault)
+static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
int si_code;
switch (fault) {
case VM_FAULT_BADACCESS:
- if (access == VM_EXEC && signal_return(regs) == 0)
- break;
- /* fallthrough */
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
@@ -337,9 +283,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
do_sigsegv(regs, si_code);
break;
}
- /* fallthrough */
+ fallthrough;
case VM_FAULT_BADCONTEXT:
- /* fallthrough */
case VM_FAULT_PFAULT:
do_no_context(regs);
break;
@@ -377,7 +322,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
+ * 04 Protection -> Write-Protection (suppression)
* 10 Segment translation -> Not present (nullification)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
@@ -393,19 +338,22 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
unsigned long address;
unsigned int flags;
vm_fault_t fault;
+ bool is_write;
tsk = current;
/*
* The instruction that caused the program check has
* been nullified. Don't signal single step via SIGTRAP.
*/
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
if (kprobe_page_fault(regs, 14))
return 0;
mm = tsk->mm;
trans_exc_code = regs->int_parm_long;
+ address = trans_exc_code & __FAIL_ADDR_MASK;
+ is_write = (trans_exc_code & store_indication) == 0x400;
/*
* Verify that the fault happened in user space, that
@@ -416,9 +364,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
type = get_fault_type(regs);
switch (type) {
case KERNEL_FAULT:
- goto out;
- case VDSO_FAULT:
- fault = VM_FAULT_BADMAP;
+ if (kfence_handle_page_fault(address, is_write, regs))
+ return 0;
goto out;
case USER_FAULT:
case GMAP_FAULT:
@@ -427,14 +374,15 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
break;
}
- address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -472,57 +420,49 @@ retry:
if (unlikely(!(vma->vm_flags & access)))
goto out_up;
- if (is_vm_hugetlb_page(vma))
- address &= HPAGE_MASK;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto out_gmap;
+ }
+ fault = 0;
+ goto out;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
- flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
- goto retry;
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+ * not been released
+ */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
@@ -537,7 +477,7 @@ retry:
}
fault = 0;
out_up:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return fault;
}
@@ -575,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs)
fault = do_exception(regs, access);
}
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);
@@ -584,10 +524,10 @@ void do_dat_exception(struct pt_regs *regs)
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);
@@ -737,7 +677,7 @@ static void pfault_interrupt(struct ext_code ext_code,
* interrupt since it must be a leftover of a PFAULT
* CANCEL operation which didn't remove all pending
* completion interrupts. */
- if (tsk->state == TASK_RUNNING)
+ if (task_is_running(tsk))
tsk->thread.pfault_wait = -1;
}
} else {
@@ -816,3 +756,130 @@ out_extint:
early_initcall(pfault_irq_init);
#endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+
+/*
+ * Handler for a secure storage access exception (the panic message below
+ * calls it PGM 0x3d).
+ *
+ * The failing address is taken from regs->int_parm_long. Depending on the
+ * fault type the backing page is looked up and handed to
+ * arch_make_page_accessible(). A failure there raises SIGSEGV for user and
+ * guest (gmap) faults and ends in BUG() for kernel faults.
+ */
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ struct gmap *gmap;
+ int rc;
+
+ /*
+ * bit 61 tells us if the address is valid, if it's not we
+ * have a major problem and should stop the kernel or send a
+ * SIGSEGV to the process. Unfortunately bit 61 is not
+ * reliable without the misc UV feature so we need to check
+ * for that as well.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+ !test_bit_inv(61, &regs->int_parm_long)) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
+ }
+
+ /*
+ * The kernel should never run into this case and we
+ * have no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+
+ switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ /*
+ * Replace addr with the result of __gmap_translate() - presumably
+ * the host address backing the guest address; TODO confirm. An
+ * error value is reported as a bad mapping, otherwise handling
+ * continues in the USER_FAULT case with the translated address.
+ */
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr)) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ fallthrough;
+ case USER_FAULT:
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ /* The page obtained with FOLL_GET is released by put_page() below. */
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ /* No page could be looked up: return without raising a signal. */
+ if (IS_ERR_OR_NULL(page)) {
+ mmap_read_unlock(mm);
+ break;
+ }
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ /* addr is used as a physical address here (phys_to_page()). */
+ page = phys_to_page(addr);
+ /* Silently give up if no reference to the page can be taken. */
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ if (rc)
+ BUG();
+ break;
+ default:
+ /* Any other fault type is unexpected: report it and warn once. */
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+/*
+ * Handler for a non-secure storage access exception. Only guest (gmap)
+ * faults are expected: anything else is reported as a bad mapping plus a
+ * one-time warning. For a guest fault the failing guest address is passed
+ * to gmap_convert_to_secure(); an -EINVAL result raises SIGSEGV for the
+ * current task.
+ */
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+	unsigned long guest_addr;
+
+	if (get_fault_type(regs) != GMAP_FAULT) {
+		do_fault_error(regs, VM_FAULT_BADMAP);
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	guest_addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	if (gmap_convert_to_secure(gmap, guest_addr) == -EINVAL)
+		send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+/*
+ * Handler for a secure storage violation exception. First try to get rid
+ * of the offending page with gmap_destroy_page(); if that does not succeed,
+ * log a rate-limited warning and send SIGSEGV to the current task.
+ */
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+
+	/*
+	 * If the VM has been rebooted, its address space might still contain
+	 * secure pages from the previous boot.
+	 * Clear the page so it can be reused.
+	 */
+	if (gmap_destroy_page(gmap, gaddr)) {
+		/*
+		 * Either KVM messed up the secure guest mapping or the same
+		 * page is mapped into multiple secure guests.
+		 *
+		 * This exception is only triggered when a guest 2 is running
+		 * and can therefore never occur in kernel context.
+		 */
+		printk_ratelimited(KERN_WARNING
+				   "Secure storage violation in task: %s, pid %d\n",
+				   current->comm, current->pid);
+		send_sig(SIGSEGV, current, 0);
+	}
+}
+
+#endif /* CONFIG_PGSTE */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index edcdca97e85e..02d15c8dc92e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2,7 +2,7 @@
/*
* KVM guest address space mapping code
*
- * Copyright IBM Corp. 2007, 2016, 2018
+ * Copyright IBM Corp. 2007, 2020
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
@@ -17,8 +17,8 @@
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
@@ -27,7 +27,6 @@
/**
* gmap_alloc - allocate and initialize a guest address space
- * @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
@@ -56,19 +55,19 @@ static struct gmap *gmap_alloc(unsigned long limit)
atype = _ASCE_TYPE_REGION1;
etype = _REGION1_ENTRY_EMPTY;
}
- gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+ gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
INIT_LIST_HEAD(&gmap->pt_list);
- INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
- INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
- INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
refcount_set(&gmap->ref_count, 1);
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
goto out_free;
page->index = 0;
@@ -300,7 +299,7 @@ struct gmap *gmap_get_enabled(void)
EXPORT_SYMBOL_GPL(gmap_get_enabled);
/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
+ * gmap_alloc_table is assumed to be called with mmap_lock held
*/
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long init, unsigned long gaddr)
@@ -309,7 +308,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long *new;
/* since we dont free the gmap table until gmap_free we can unlock */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
@@ -405,10 +404,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE)
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
return 0;
@@ -438,7 +437,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE) {
/* Remove old translation */
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
@@ -448,7 +447,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
(void *) from + off))
break;
}
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
if (off >= len)
@@ -466,7 +465,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* Returns user space address which corresponds to the guest address or
* -EFAULT if no such mapping exists.
* This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*
* Note: Can also be called for shadow gmaps.
@@ -495,16 +494,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
unsigned long rc;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = __gmap_translate(gmap, gaddr);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
+ * @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
* @vmaddr: vm address associated with the host page table
*/
@@ -527,14 +526,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
unsigned long gaddr);
/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
+ * __gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
* @gaddr: guest address
* @vmaddr: vm address
*
* Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
* if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*/
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
@@ -594,7 +593,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
return rc;
ptl = pmd_lock(mm, pmd);
@@ -640,7 +639,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
int rc;
bool unlocked;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
retry:
unlocked = false;
@@ -649,13 +648,13 @@ retry:
rc = vmaddr;
goto out_up;
}
- if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+ if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
&unlocked)) {
rc = -EFAULT;
goto out_up;
}
/*
- * In the case that fixup_user_fault unlocked the mmap_sem during
+ * In the case that fixup_user_fault unlocked the mmap_lock during
* faultin redo __gmap_translate to not race with a map/unmap_segment.
*/
if (unlocked)
@@ -663,16 +662,17 @@ retry:
rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
/*
- * this function is assumed to be called with mmap_sem held
+ * this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
+ struct vm_area_struct *vma;
unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
@@ -682,11 +682,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
+
+ vma = vma_lookup(gmap->mm, vmaddr);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return;
+
/* Get pointer to the page table entry */
ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep))
+ if (likely(ptep)) {
ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(ptep, ptl);
+ }
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
@@ -696,7 +702,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
unsigned long gaddr, vmaddr, size;
struct vm_area_struct *vma;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (gaddr = from; gaddr < to;
gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
/* Find the vm address for the guest address */
@@ -719,7 +725,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
zap_page_range(vma, vmaddr, size);
}
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
@@ -787,16 +793,20 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
unsigned long gaddr, int level)
{
- unsigned long *table;
+ const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
+ unsigned long *table = gmap->table;
- if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
- return NULL;
if (gmap_is_shadow(gmap) && gmap->removed)
return NULL;
- if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+
+ if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
return NULL;
- table = gmap->table;
- switch (gmap->asce & _ASCE_TYPE_MASK) {
+
+ if (asce_type != _ASCE_TYPE_REGION1 &&
+ gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
+ return NULL;
+
+ switch (asce_type) {
case _ASCE_TYPE_REGION1:
table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
if (level == 4)
@@ -804,7 +814,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
if (level == 3)
@@ -812,7 +822,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
if (level == 2)
@@ -820,7 +830,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (level == 1)
@@ -875,10 +885,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
BUG_ON(gmap_is_shadow(gmap));
fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
return -EFAULT;
if (unlocked)
- /* lost mmap_sem, caller has to retry __gmap_translate */
+ /* lost mmap_lock, caller has to retry __gmap_translate */
return 0;
/* Connect the page tables */
return __gmap_link(gmap, gaddr, vmaddr);
@@ -949,7 +959,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
* -EAGAIN if a fixup is needed
* -EINVAL if unsupported notifier bits have been specified
*
- * Expected to be called with sg->mm->mmap_sem in read and
+ * Expected to be called with sg->mm->mmap_lock in read and
* guest_table_lock held.
*/
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
@@ -964,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
return -EAGAIN;
if (prot == PROT_NONE && !pmd_i) {
- pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (prot == PROT_READ && !pmd_p) {
- pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
- pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (bits & GMAP_NOTIFY_MPROT)
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
/* Shadow GMAP protection needs split PMDs */
if (bits & GMAP_NOTIFY_SHADOW)
@@ -995,7 +1005,7 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EAGAIN if a fixup is needed.
*
- * Expected to be called with sg->mm->mmap_sem in read
+ * Expected to be called with sg->mm->mmap_lock in read
*/
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pmd_t *pmdp, int prot, unsigned long bits)
@@ -1031,7 +1041,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
@@ -1102,9 +1112,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
return -EINVAL;
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
@@ -1120,7 +1130,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
* if reading using the virtual address failed. -EINVAL if called on a gmap
* shadow.
*
- * Called with gmap->mm->mmap_sem in read.
+ * Called with gmap->mm->mmap_lock in read.
*/
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
@@ -1141,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
address = pte_val(pte) & PAGE_MASK;
address += gaddr & ~PAGE_MASK;
*val = *(unsigned long *) address;
- pte_val(*ptep) |= _PAGE_YOUNG;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
}
@@ -1173,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table);
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
struct gmap_rmap *rmap)
{
+ struct gmap_rmap *temp;
void __rcu **slot;
BUG_ON(!gmap_is_shadow(sg));
@@ -1180,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
+ for (temp = rmap->next; temp; temp = temp->next) {
+ if (temp->raddr == rmap->raddr) {
+ kfree(rmap);
+ return;
+ }
+ }
radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
@@ -1214,11 +1231,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
vmaddr = __gmap_translate(parent, paddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = raddr;
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc) {
kfree(rmap);
return rc;
@@ -1268,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
asm volatile(
- " .insn rrf,0xb98e0000,%0,%1,0,0"
+ " idte %0,0,%1"
: : "a" (asce), "a" (vaddr) : "cc", "memory");
}
@@ -1692,11 +1709,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
}
spin_unlock(&parent->shadow_lock);
/* protect after insertion, so it will get properly invalidated */
- down_read(&parent->mm->mmap_sem);
+ mmap_read_lock(parent->mm);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
PROT_READ, GMAP_NOTIFY_SHADOW);
- up_read(&parent->mm->mmap_sem);
+ mmap_read_unlock(parent->mm);
spin_lock(&parent->shadow_lock);
new->initialized = true;
if (rc) {
@@ -1725,7 +1742,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake)
@@ -1737,7 +1754,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r2t & _REGION_ENTRY_ORIGIN;
@@ -1809,7 +1826,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake)
@@ -1821,7 +1838,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r3t & _REGION_ENTRY_ORIGIN;
@@ -1840,6 +1857,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
goto out_free;
} else if (*table & _REGION_ENTRY_ORIGIN) {
rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
}
crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
@@ -1892,7 +1910,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake)
@@ -1904,7 +1922,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
/* Allocate a shadow segment table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = sgt & _REGION_ENTRY_ORIGIN;
@@ -1966,7 +1984,7 @@ out_free:
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
/**
- * gmap_shadow_lookup_pgtable - find a shadow page table
+ * gmap_shadow_pgt_lookup - find a shadow page table
* @sg: pointer to the shadow guest address space structure
* @saddr: the address in the shadow aguest address space
* @pgt: parent gmap address of the page table to get shadowed
@@ -1976,7 +1994,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
* Returns 0 if the shadow page table was found and -EAGAIN if the page
* table was not found.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
@@ -2016,7 +2034,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
* shadow table structure is incomplete, -ENOMEM if out of memory,
* -EFAULT if an address in the parent gmap could not be resolved and
*
- * Called with gmap->mm->mmap_sem in read
+ * Called with gmap->mm->mmap_lock in read
*/
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake)
@@ -2095,7 +2113,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
@@ -2111,7 +2129,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
parent = sg->parent;
prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
@@ -2123,7 +2141,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rc = vmaddr;
break;
}
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
break;
rc = -EAGAIN;
@@ -2160,7 +2178,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);
-/**
+/*
* gmap_shadow_notify - handle notifications for shadow gmap
*
* Called with sg->parent->shadow_lock.
@@ -2220,7 +2238,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
+ * @vmaddr: virtual address in the process address space
* @pte: pointer to the page table entry
* @bits: bits from the pgste that caused the notify call
*
@@ -2264,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify);
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
unsigned long gaddr)
{
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}
@@ -2283,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
{
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
- pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
@@ -2291,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
}
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
@@ -2313,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
_SEGMENT_ENTRY_GMAP_UC));
if (purge)
__pmdp_csp(pmdp);
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2436,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
return false;
/* Clear UC indication and reset protection */
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
return true;
}
@@ -2480,23 +2498,37 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ split_huge_pmd(vma, pmd, addr);
+ return 0;
+}
+
+static const struct mm_walk_ops thp_split_walk_ops = {
+ .pmd_entry = thp_split_walk_pmd_entry,
+};
+
static inline void thp_split_mm(struct mm_struct *mm)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma;
- unsigned long addr;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE)
- follow_page(vma, addr, FOLL_SPLIT);
+ for_each_vma(vmi, vma) {
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
+ walk_page_vma(vma, &thp_split_walk_ops, NULL);
}
mm->def_flags |= VM_NOHUGEPAGE;
-#endif
}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Remove all empty zero pages from the mapping for lazy refaulting
@@ -2538,16 +2570,34 @@ int s390_enable_sie(void)
/* Fail if the page tables are 2K */
if (!mm_alloc_pgste(mm))
return -EINVAL;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
+int gmap_mark_unmergeable(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int ret;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ for_each_vma(vmi, vma) {
+ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags);
+ if (ret)
+ return ret;
+ }
+ mm->def_flags &= ~VM_MERGEABLE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2560,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
return 0;
}
+/*
+ * Give a chance to schedule after setting a key to 256 pages.
+ * We only hold the mm lock, which is a rwsem and the kvm srcu.
+ * Both can sleep.
+ */
+static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ cond_resched();
+ return 0;
+}
+
static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
unsigned long hmask, unsigned long next,
struct mm_walk *walk)
@@ -2582,39 +2644,35 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
end = start + HPAGE_SIZE - 1;
__storage_key_init_range(start, end);
set_bit(PG_arch_1, &page->flags);
+ cond_resched();
return 0;
}
static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
+ .pmd_entry = __s390_enable_skey_pmd,
};
int s390_enable_skey(void)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
int rc = 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (mm_uses_skeys(mm))
goto out_up;
mm->context.uses_skeys = 1;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags)) {
- mm->context.uses_skeys = 0;
- rc = -ENOMEM;
- goto out_up;
- }
+ rc = gmap_mark_unmergeable();
+ if (rc) {
+ mm->context.uses_skeys = 0;
+ goto out_up;
}
- mm->def_flags &= ~VM_MERGEABLE;
-
walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
out_up:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
@@ -2635,8 +2693,174 @@ static const struct mm_walk_ops reset_cmma_walk_ops = {
void s390_reset_cmma(struct mm_struct *mm)
{
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+#define GATHER_GET_PAGES 32
+
+struct reset_walk_state {
+ unsigned long next;
+ unsigned long count;
+ unsigned long pfns[GATHER_GET_PAGES];
+};
+
+static int s390_gather_pages(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct reset_walk_state *p = walk->private;
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ /* we have a reference from the mapping, take an extra one */
+ get_page(phys_to_page(pte_val(pte)));
+ p->pfns[p->count] = phys_to_pfn(pte_val(pte));
+ p->next = next;
+ p->count++;
+ }
+ return p->count >= GATHER_GET_PAGES;
+}
+
+static const struct mm_walk_ops gather_pages_ops = {
+ .pte_entry = s390_gather_pages,
+};
+
+/*
+ * Call the Destroy secure page UVC on each page in the given array of PFNs.
+ * Each page needs to have an extra reference, which will be released here.
+ */
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
+{
+ unsigned long i;
+
+ for (i = 0; i < count; i++) {
+ /* we always have an extra reference */
+ uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ /* get rid of the extra reference */
+ put_page(pfn_to_page(pfns[i]));
+ cond_resched();
+ }
+}
+EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
+
+/**
+ * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
+ * in the given range of the given address space.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ * @interruptible: if not 0, stop when a fatal signal is received
+ *
+ * Walk the given range of the given address space and call the destroy
+ * secure page UVC on each page. Optionally exit early if a fatal signal is
+ * pending.
+ *
+ * Return: 0 on success, -EINTR if the function stopped before completing
+ */
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible)
+{
+ struct reset_walk_state state = { .next = start };
+ int r = 1;
+
+ while (r > 0) {
+ state.count = 0;
+ mmap_read_lock(mm);
+ r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
+ mmap_read_unlock(mm);
+ cond_resched();
+ s390_uv_destroy_pfns(state.count, state.pfns);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
+
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are now not needed anymore.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+ struct page *old;
+
+ old = virt_to_page(gmap->table);
+ spin_lock(&gmap->guest_table_lock);
+ list_del(&old->lru);
+ /*
+ * Sometimes the topmost page might need to be "removed" multiple
+ * times, for example if the VM is rebooted into secure mode several
+ * times concurrently, or if s390_replace_asce fails after calling
+ * s390_unlist_old_asce and is attempted again later. In that case
+ * the old asce has been removed from the list, and therefore it
+ * will not be freed when the VM terminates, but the ASCE is still
+ * in use and still pointed to.
+ * A subsequent call to replace_asce will follow the pointer and try
+ * to remove the same page from the list again.
+ * Therefore it's necessary that the page of the ASCE has valid
+ * pointers, so list_del can work (and do nothing) without
+ * dereferencing stale or invalid pointers.
+ */
+ INIT_LIST_HEAD(&old->lru);
+ spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+ unsigned long asce;
+ struct page *page;
+ void *table;
+
+ s390_unlist_old_asce(gmap);
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ table = page_to_virt(page);
+ memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
+ /*
+ * The caller has to deal with the old ASCE, but here we make sure
+ * the new one is properly added to the CRST list, so that
+ * it will be freed when the VM is torn down.
+ */
+ spin_lock(&gmap->guest_table_lock);
+ list_add(&page->lru, &gmap->crst_list);
+ spin_unlock(&gmap->guest_table_lock);
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+ WRITE_ONCE(gmap->asce, asce);
+ WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+ WRITE_ONCE(gmap->table, table);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(s390_replace_asce);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 5674710a4841..c299a18273ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -9,6 +9,7 @@
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
@@ -72,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
+ unsigned long pteval;
int present;
- pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
present = pud_present(__pud(rste));
@@ -101,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste)
* u unused, l large
*/
if (present) {
- pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
- pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
- _PAGE_READ);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
- _PAGE_WRITE);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
- _PAGE_INVALID);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
- _PAGE_PROTECT);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
- _PAGE_DIRTY);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
- _PAGE_YOUNG);
+ pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ pteval |= _PAGE_LARGE | _PAGE_PRESENT;
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
- _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC,
- _PAGE_NOEXEC);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
} else
- pte_val(pte) = _PAGE_INVALID;
- return pte;
+ pteval = _PAGE_INVALID;
+ return __pte(pteval);
}
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
@@ -159,12 +152,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste &= ~_SEGMENT_ENTRY_NOEXEC;
/* Set correct table type for 2G hugepages */
- if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
- else
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ if (likely(pte_present(pte)))
+ rste |= _REGION3_ENTRY_LARGE;
+ rste |= _REGION_ENTRY_TYPE_R3;
+ } else if (likely(pte_present(pte)))
rste |= _SEGMENT_ENTRY_LARGE;
+
clear_huge_pte_skeys(mm, rste);
- pte_val(*ptep) = rste;
+ set_pte(ptep, __pte(rste));
}
pte_t huge_ptep_get(pte_t *ptep)
@@ -186,7 +182,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
@@ -241,35 +237,15 @@ int pud_huge(pud_t pud)
return pud_large(pud);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- if (flags & FOLL_GET)
- return NULL;
-
- return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-}
-
-static __init int setup_hugepagesz(char *opt)
-{
- unsigned long size;
- char *string = opt;
-
- size = memparse(opt, &opt);
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz= specifies an unsupported page size %s\n",
- string);
- return 0;
- }
- return 1;
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ return true;
+ else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
@@ -326,7 +302,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- int rc;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -353,15 +328,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
else
addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
- return addr;
+ return check_asce_limit(mm, addr, len);
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..97d66a3e60fb 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -33,10 +33,11 @@
#include <linux/dma-direct.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/kfence.h>
+#include <asm/ptdump.h>
#include <asm/dma.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -46,15 +47,18 @@
#include <asm/kasan.h>
#include <asm/dma-mapping.h>
#include <asm/uv.h>
+#include <linux/virtio_anchor.h>
+#include <linux/virtio_config.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
+static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
+
+unsigned long s390_invalid_asce;
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(zero_page_mask);
-bool initmem_freed;
-
static void __init setup_zero_pages(void)
{
unsigned int order;
@@ -91,6 +95,9 @@ void __init paging_init(void)
unsigned long pgd_type, asce_bits;
psw_t psw;
+ s390_invalid_asce = (unsigned long)invalid_pg_dir;
+ s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
init_mm.pgd = swapper_pg_dir;
if (VMALLOC_END > _REGION2_SIZE) {
asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
@@ -101,14 +108,14 @@ void __init paging_init(void)
}
init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
S390_lowcore.kernel_asce = init_mm.context.asce;
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
+ S390_lowcore.user_asce = s390_invalid_asce;
crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
vmem_map_init();
- kasan_copy_shadow(init_mm.pgd);
+ kasan_copy_shadow_mapping();
/* enable virtual mapping in kernel mode */
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
__ctl_load(S390_lowcore.kernel_asce, 13, 13);
psw.mask = __extract_psw();
psw_bits(psw).dat = 1;
@@ -116,13 +123,12 @@ void __init paging_init(void)
__load_psw_mask(psw.mask);
kasan_free_early_identity();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
void mark_rodata_ro(void)
@@ -131,6 +137,7 @@ void mark_rodata_ro(void)
set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
+ debug_checkwx();
}
int set_memory_encrypted(unsigned long addr, int numpages)
@@ -168,10 +175,11 @@ static void pv_init(void)
if (!is_prot_virt_guest())
return;
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+
/* make sure bounce buffers are shared */
- swiotlb_init(1);
+ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
- swiotlb_force = SWIOTLB_FORCE;
}
void __init mem_init(void)
@@ -183,7 +191,7 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
pv_init();
-
+ kfence_split_mapping();
/* Setup guest page hinting */
cmma_init();
@@ -192,16 +200,16 @@ void __init mem_init(void)
setup_zero_pages(); /* Setup zeroed pages. */
cmma_init_nodat();
-
- mem_init_print_info(NULL);
}
void free_initmem(void)
{
- initmem_freed = true;
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RW | SET_MEMORY_NX);
+ free_reserved_area(sclp_early_sccb,
+ sclp_early_sccb + EXT_SCCB_READ_SCP,
+ POISON_FREE_INITMEM, "unused early sccb");
free_initmem_default(POISON_FREE_INITMEM);
}
@@ -268,27 +276,30 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(restrictions->altmap))
+ if (WARN_ON_ONCE(params->altmap))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
+ VM_BUG_ON(!mhp_range_allowed(start, size, true));
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+ rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
index 06345616a646..9f988d4582ed 100644
--- a/arch/s390/mm/kasan_init.c
+++ b/arch/s390/mm/kasan_init.c
@@ -2,8 +2,8 @@
#include <linux/kasan.h>
#include <linux/sched/task.h>
#include <linux/memblock.h>
+#include <linux/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/kasan.h>
#include <asm/mem_detect.h>
#include <asm/processor.h>
@@ -11,6 +11,7 @@
#include <asm/facility.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/uv.h>
static unsigned long segment_pos __initdata;
static unsigned long segment_low __initdata;
@@ -85,7 +86,7 @@ enum populate_mode {
POPULATE_ZERO_SHADOW,
POPULATE_SHALLOW
};
-static void __init kasan_early_vmemmap_populate(unsigned long address,
+static void __init kasan_early_pgtable_populate(unsigned long address,
unsigned long end,
enum populate_mode mode)
{
@@ -99,9 +100,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO);
if (!has_nx)
pgt_prot_zero &= ~_PAGE_NOEXEC;
- pgt_prot = pgprot_val(PAGE_KERNEL_EXEC);
- sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC);
+ pgt_prot = pgprot_val(PAGE_KERNEL);
+ sgt_prot = pgprot_val(SEGMENT_KERNEL);
+ if (!has_nx || mode == POPULATE_ONE2ONE) {
+ pgt_prot &= ~_PAGE_NOEXEC;
+ sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ }
+ /*
+ * The first 1MB of 1:1 mapping is mapped with 4KB pages
+ */
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -117,8 +125,7 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pgd_populate(&init_mm, pg_dir, p4_dir);
}
- if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
- mode == POPULATE_SHALLOW) {
+ if (mode == POPULATE_SHALLOW) {
address = (address + P4D_SIZE) & P4D_MASK;
continue;
}
@@ -137,12 +144,6 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
p4d_populate(&init_mm, p4_dir, pu_dir);
}
- if (!IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
- mode == POPULATE_SHALLOW) {
- address = (address + PUD_SIZE) & PUD_MASK;
- continue;
- }
-
pu_dir = pud_offset(p4_dir, address);
if (pud_none(*pu_dir)) {
if (mode == POPULATE_ZERO_SHADOW &&
@@ -159,30 +160,26 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pm_dir = pmd_offset(pu_dir, address);
if (pmd_none(*pm_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, PMD_SIZE) &&
+ if (IS_ALIGNED(address, PMD_SIZE) &&
end - address >= PMD_SIZE) {
- pmd_populate(&init_mm, pm_dir,
- kasan_early_shadow_pte);
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
- /* the first megabyte of 1:1 is mapped with 4k pages */
- if (has_edat && address && end - address >= PMD_SIZE &&
- mode != POPULATE_ZERO_SHADOW) {
- void *page;
-
- if (mode == POPULATE_ONE2ONE) {
- page = (void *)address;
- } else {
- page = kasan_early_alloc_segment();
- memset(page, 0, _SEGMENT_SIZE);
+ if (mode == POPULATE_ZERO_SHADOW) {
+ pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
+ } else if (has_edat && address) {
+ void *page;
+
+ if (mode == POPULATE_ONE2ONE) {
+ page = (void *)address;
+ } else {
+ page = kasan_early_alloc_segment();
+ memset(page, 0, _SEGMENT_SIZE);
+ }
+ set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot));
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
}
- pmd_val(*pm_dir) = __pa(page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
}
-
pt_dir = kasan_early_pte_alloc();
pmd_populate(&init_mm, pm_dir, pt_dir);
} else if (pmd_large(*pm_dir)) {
@@ -197,16 +194,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
switch (mode) {
case POPULATE_ONE2ONE:
page = (void *)address;
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_MAP:
page = kasan_early_alloc_pages(0);
memset(page, 0, PAGE_SIZE);
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_ZERO_SHADOW:
page = kasan_early_shadow_page;
- pte_val(*pt_dir) = __pa(page) | pgt_prot_zero;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero));
break;
case POPULATE_SHALLOW:
/* should never happen */
@@ -254,12 +251,9 @@ static void __init kasan_early_detect_facilities(void)
void __init kasan_early_init(void)
{
- unsigned long untracked_mem_end;
unsigned long shadow_alloc_size;
unsigned long initrd_end;
- unsigned long asce_type;
unsigned long memsize;
- unsigned long vmax;
unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO);
pte_t pte_z;
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
@@ -274,30 +268,23 @@ void __init kasan_early_init(void)
memsize = get_mem_detect_end();
if (!memsize)
kasan_early_panic("cannot detect physical memory size\n");
- /* respect mem= cmdline parameter */
- if (memory_end_set && memsize > memory_end)
- memsize = memory_end;
- if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE)
- memsize = min(memsize, OLDMEM_SIZE);
- memsize = min(memsize, KASAN_SHADOW_START);
-
- if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) {
- /* 4 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION2_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION1_SIZE;
- asce_type = _ASCE_TYPE_REGION2;
- } else {
- /* 3 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION3_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION2_SIZE;
- asce_type = _ASCE_TYPE_REGION3;
- }
+ /*
+ * Kasan currently supports standby memory but only if it follows
+ * online memory (default allocation), i.e. no memory holes.
+ * - memsize represents end of online memory
+ * - ident_map_size represents online + standby and memory limits
+ * accounted.
+ * Kasan maps "memsize" right away.
+ * [0, memsize] - as identity mapping
+ * [__sha(0), __sha(memsize)] - shadow memory for identity mapping
+ * The rest [memsize, ident_map_size] if memsize < ident_map_size
+ * could be mapped/unmapped dynamically later during memory hotplug.
+ */
+ memsize = min(memsize, ident_map_size);
+
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
+ crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY);
/* init kasan zero shadow */
crst_table_init((unsigned long *)kasan_early_shadow_p4d,
@@ -312,7 +299,7 @@ void __init kasan_early_init(void)
pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
initrd_end =
- round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE);
+ round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE);
pgalloc_low = max(pgalloc_low, initrd_end);
}
@@ -363,24 +350,25 @@ void __init kasan_early_init(void)
* +-----------------+ +- shadow end ---+
*/
/* populate kasan shadow (for identity mapping and zero page mapping) */
- kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP);
- if (IS_ENABLED(CONFIG_MODULES))
- untracked_mem_end = vmax - MODULES_LEN;
+ kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
- untracked_mem_end = vmax - vmalloc_size - MODULES_LEN;
/* shallowly populate kasan shadow for vmalloc and modules */
- kasan_early_vmemmap_populate(__sha(untracked_mem_end),
- __sha(vmax), POPULATE_SHALLOW);
+ kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END),
+ POPULATE_SHALLOW);
}
/* populate kasan shadow for untracked memory */
- kasan_early_vmemmap_populate(__sha(max_physmem_end),
- __sha(untracked_mem_end),
+ kasan_early_pgtable_populate(__sha(ident_map_size),
+ IS_ENABLED(CONFIG_KASAN_VMALLOC) ?
+ __sha(VMALLOC_START) :
+ __sha(MODULES_VADDR),
+ POPULATE_ZERO_SHADOW);
+ kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE),
POPULATE_ZERO_SHADOW);
/* memory allocated for identity mapping structs will be freed later */
pgalloc_freeable = pgalloc_pos;
/* populate identity mapping */
- kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE);
- kasan_set_pgd(early_pg_dir, asce_type);
+ kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE);
+ kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2);
kasan_enable_dat();
/* enable kasan */
init_task.kasan_depth = 0;
@@ -388,7 +376,7 @@ void __init kasan_early_init(void)
sclp_early_printk("KernelAddressSanitizer initialized\n");
}
-void __init kasan_copy_shadow(pgd_t *pg_dir)
+void __init kasan_copy_shadow_mapping(void)
{
/*
* At this point we are still running on early pages setup early_pg_dir,
@@ -400,27 +388,16 @@ void __init kasan_copy_shadow(pgd_t *pg_dir)
pgd_t *pg_dir_dst;
p4d_t *p4_dir_src;
p4d_t *p4_dir_dst;
- pud_t *pu_dir_src;
- pud_t *pu_dir_dst;
pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START);
- pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START);
+ pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START);
p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START);
p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START);
- if (!p4d_folded(*p4_dir_src)) {
- /* 4 level paging */
- memcpy(p4_dir_dst, p4_dir_src,
- (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
- return;
- }
- /* 3 level paging */
- pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START);
- pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START);
- memcpy(pu_dir_dst, pu_dir_src,
- (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t));
+ memcpy(p4_dir_dst, p4_dir_src,
+ (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
}
void __init kasan_free_early_identity(void)
{
- memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
+ memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
}
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index de7ca4b6718f..1571cdcb0c50 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -4,8 +4,6 @@
*
* Copyright IBM Corp. 2009, 2015
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/uaccess.h>
@@ -14,9 +12,17 @@
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/asm-extable.h>
#include <asm/ctl_reg.h>
#include <asm/io.h>
+#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/maccess.h>
+
+unsigned long __bootdata_preserved(__memcpy_real_area);
+static __ro_after_init pte_t *memcpy_real_ptep;
+static DEFINE_MUTEX(memcpy_real_mutex);
static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
{
@@ -55,155 +61,94 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
*/
static DEFINE_SPINLOCK(s390_kernel_write_lock);
-void notrace s390_kernel_write(void *dst, const void *src, size_t size)
+notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
{
+ void *tmp = dst;
unsigned long flags;
long copied;
spin_lock_irqsave(&s390_kernel_write_lock, flags);
- while (size) {
- copied = s390_kernel_write_odd(dst, src, size);
- dst += copied;
- src += copied;
- size -= copied;
+ if (!(flags & PSW_MASK_DAT)) {
+ memcpy(dst, src, size);
+ } else {
+ while (size) {
+ copied = s390_kernel_write_odd(tmp, src, size);
+ tmp += copied;
+ src += copied;
+ size -= copied;
+ }
}
spin_unlock_irqrestore(&s390_kernel_write_lock, flags);
-}
-
-static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count)
-{
- register unsigned long _dest asm("2") = (unsigned long) dest;
- register unsigned long _len1 asm("3") = (unsigned long) count;
- register unsigned long _src asm("4") = (unsigned long) src;
- register unsigned long _len2 asm("5") = (unsigned long) count;
- int rc = -EFAULT;
-
- asm volatile (
- "0: mvcle %1,%2,0x0\n"
- "1: jo 0b\n"
- " lhi %0,0x0\n"
- "2:\n"
- EX_TABLE(1b,2b)
- : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
- "+d" (_len2), "=m" (*((long *) dest))
- : "m" (*((long *) src))
- : "cc", "memory");
- return rc;
-}
-static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
- unsigned long src,
- unsigned long count)
-{
- int irqs_disabled, rc;
- unsigned long flags;
-
- if (!count)
- return 0;
- flags = arch_local_irq_save();
- irqs_disabled = arch_irqs_disabled_flags(flags);
- if (!irqs_disabled)
- trace_hardirqs_off();
- __arch_local_irq_stnsm(0xf8); // disable DAT
- rc = __memcpy_real((void *) dest, (void *) src, (size_t) count);
- if (flags & PSW_MASK_DAT)
- __arch_local_irq_stosm(0x04); // enable DAT
- if (!irqs_disabled)
- trace_hardirqs_on();
- __arch_local_irq_ssm(flags);
- return rc;
+ return dst;
}
-/*
- * Copy memory in real mode (kernel to kernel)
- */
-int memcpy_real(void *dest, void *src, size_t count)
+void __init memcpy_real_init(void)
{
- int rc;
-
- if (S390_lowcore.nodat_stack != 0) {
- preempt_disable();
- rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
- dest, src, count);
- preempt_enable();
- return rc;
- }
- /*
- * This is a really early memcpy_real call, the stacks are
- * not set up yet. Just call _memcpy_real on the early boot
- * stack
- */
- return _memcpy_real((unsigned long) dest,(unsigned long) src,
- (unsigned long) count);
+ memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true);
+ if (!memcpy_real_ptep)
+ panic("Couldn't setup memcpy real area");
}
-/*
- * Copy memory in absolute mode (kernel to kernel)
- */
-void memcpy_absolute(void *dest, void *src, size_t count)
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long cr0, flags, prefix;
-
- flags = arch_local_irq_save();
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- prefix = store_prefix();
- if (prefix) {
- local_mcck_disable();
- set_prefix(0);
- memcpy(dest, src, count);
- set_prefix(prefix);
- local_mcck_enable();
- } else {
- memcpy(dest, src, count);
+ size_t len, copied, res = 0;
+ unsigned long phys, offset;
+ void *chunk;
+ pte_t pte;
+
+ while (count) {
+ phys = src & PAGE_MASK;
+ offset = src & ~PAGE_MASK;
+ chunk = (void *)(__memcpy_real_area + offset);
+ len = min(count, PAGE_SIZE - offset);
+ pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
+
+ mutex_lock(&memcpy_real_mutex);
+ if (pte_val(pte) != pte_val(*memcpy_real_ptep)) {
+ __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL);
+ set_pte(memcpy_real_ptep, pte);
+ }
+ copied = copy_to_iter(chunk, len, iter);
+ mutex_unlock(&memcpy_real_mutex);
+
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- __ctl_load(cr0, 0, 0);
- arch_local_irq_restore(flags);
+ return res;
}
-/*
- * Copy memory from kernel (real) to user (virtual)
- */
-int copy_to_user_real(void __user *dest, void *src, unsigned long count)
+int memcpy_real(void *dest, unsigned long src, size_t count)
{
- int offs = 0, size, rc;
- char *buf;
-
- buf = (char *) __get_free_page(GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- rc = -EFAULT;
- while (offs < count) {
- size = min(PAGE_SIZE, count - offs);
- if (memcpy_real(buf, src + offs, size))
- goto out;
- if (copy_to_user(dest + offs, buf, size))
- goto out;
- offs += size;
- }
- rc = 0;
-out:
- free_page((unsigned long) buf);
- return rc;
+ struct iov_iter iter;
+ struct kvec kvec;
+
+ kvec.iov_base = dest;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+ if (memcpy_real_iter(&iter, src, count) < count)
+ return -EFAULT;
+ return 0;
}
/*
- * Check if physical address is within prefix or zero page
+ * Find CPU that owns swapped prefix page
*/
-static int is_swapped(unsigned long addr)
+static int get_swapped_owner(phys_addr_t addr)
{
- unsigned long lc;
+ phys_addr_t lc;
int cpu;
- if (addr < sizeof(struct lowcore))
- return 1;
for_each_online_cpu(cpu) {
- lc = (unsigned long) lowcore_ptr[cpu];
+ lc = virt_to_phys(lowcore_ptr[cpu]);
if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
continue;
- return 1;
+ return cpu;
}
- return 0;
+ return -1;
}
/*
@@ -214,27 +159,46 @@ static int is_swapped(unsigned long addr)
*/
void *xlate_dev_mem_ptr(phys_addr_t addr)
{
- void *bounce = (void *) addr;
+ void *ptr = phys_to_virt(addr);
+ void *bounce = ptr;
+ struct lowcore *abs_lc;
+ unsigned long flags;
unsigned long size;
+ int this_cpu, cpu;
- get_online_cpus();
- preempt_disable();
- if (is_swapped(addr)) {
- size = PAGE_SIZE - (addr & ~PAGE_MASK);
- bounce = (void *) __get_free_page(GFP_ATOMIC);
- if (bounce)
- memcpy_absolute(bounce, (void *) addr, size);
+ cpus_read_lock();
+ this_cpu = get_cpu();
+ if (addr >= sizeof(struct lowcore)) {
+ cpu = get_swapped_owner(addr);
+ if (cpu < 0)
+ goto out;
}
- preempt_enable();
- put_online_cpus();
+ bounce = (void *)__get_free_page(GFP_ATOMIC);
+ if (!bounce)
+ goto out;
+ size = PAGE_SIZE - (addr & ~PAGE_MASK);
+ if (addr < sizeof(struct lowcore)) {
+ abs_lc = get_abs_lowcore(&flags);
+ ptr = (void *)abs_lc + addr;
+ memcpy(bounce, ptr, size);
+ put_abs_lowcore(abs_lc, flags);
+ } else if (cpu == this_cpu) {
+ ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu]));
+ memcpy(bounce, ptr, size);
+ } else {
+ memcpy(bounce, ptr, size);
+ }
+out:
+ put_cpu();
+ cpus_read_unlock();
return bounce;
}
/*
* Free converted buffer for /dev/mem access (if necessary)
*/
-void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf)
+void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr)
{
- if ((void *) addr != buf)
- free_page((unsigned long) buf);
+ if (addr != virt_to_phys(ptr))
+ free_page((unsigned long)ptr);
}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index cbc718ba6d78..3327c47bc181 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,7 +17,6 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack)
unsigned long arch_mmap_rnd(void)
{
- return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT;
}
static unsigned long mmap_base_legacy(unsigned long rnd)
@@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd,
/*
* Top of mmap area (just below the process stack).
- * Leave at least a ~32 MB hole.
+ * Leave at least a ~128 MB hole.
*/
- gap_min = 32 * 1024 * 1024UL;
+ gap_min = SZ_128M;
gap_max = (STACK_TOP / 6) * 5;
if (gap < gap_min)
@@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd,
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- int rc;
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- int rc;
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
}
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
/*
@@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+static const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_RO,
+ [VM_WRITE] = PAGE_RO,
+ [VM_WRITE | VM_READ] = PAGE_RO,
+ [VM_EXEC] = PAGE_RX,
+ [VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_EXEC | VM_WRITE] = PAGE_RX,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_RO,
+ [VM_SHARED | VM_WRITE] = PAGE_RW,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
+ [VM_SHARED | VM_EXEC] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
+};
+DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc141893d028..d5ea09d78938 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -14,6 +14,7 @@
#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/init.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
#include <asm/page-states.h>
@@ -31,17 +32,17 @@ __setup("cmma=", cmma);
static inline int cmma_test_essa(void)
{
- register unsigned long tmp asm("0") = 0;
- register int rc asm("1");
+ unsigned long tmp = 0;
+ int rc = -EOPNOTSUPP;
/* test ESSA_GET_STATE */
asm volatile(
- " .insn rrf,0xb9ab0000,%1,%1,%2,0\n"
- "0: la %0,0\n"
+ " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n"
+ "0: la %[rc],0\n"
"1:\n"
EX_TABLE(0b,1b)
- : "=&d" (rc), "+&d" (tmp)
- : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP));
+ : [rc] "+&d" (rc), [tmp] "+&d" (tmp)
+ : [cmd] "i" (ESSA_GET_STATE));
return rc;
}
@@ -112,7 +113,7 @@ static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end)
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || pmd_large(*pmd))
continue;
- page = virt_to_page(pmd_val(*pmd));
+ page = phys_to_page(pmd_val(*pmd));
set_bit(PG_arch_1, &page->flags);
} while (pmd++, addr = next, addr != end);
}
@@ -130,7 +131,7 @@ static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end)
if (pud_none(*pud) || pud_large(*pud))
continue;
if (!pud_folded(*pud)) {
- page = virt_to_page(pud_val(*pud));
+ page = phys_to_page(pud_val(*pud));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -151,7 +152,7 @@ static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end)
if (p4d_none(*p4d))
continue;
if (!p4d_folded(*p4d)) {
- page = virt_to_page(p4d_val(*p4d));
+ page = phys_to_page(p4d_val(*p4d));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -173,7 +174,7 @@ static void mark_kernel_pgd(void)
if (pgd_none(*pgd))
continue;
if (!pgd_folded(*pgd)) {
- page = virt_to_page(pgd_val(*pgd));
+ page = phys_to_page(pgd_val(*pgd));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -183,9 +184,9 @@ static void mark_kernel_pgd(void)
void __init cmma_init_nodat(void)
{
- struct memblock_region *reg;
struct page *page;
unsigned long start, end, ix;
+ int i;
if (cmma_flag < 2)
return;
@@ -193,9 +194,7 @@ void __init cmma_init_nodat(void)
mark_kernel_pgd();
/* Set all kernel pages not used for page tables to stable/no-dat */
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
- end = memblock_region_memory_end_pfn(reg);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
page = pfn_to_page(start);
for (ix = start; ix < end; ix++, page++) {
if (__test_and_clear_bit(PG_arch_1, &page->flags))
@@ -230,46 +229,3 @@ void arch_set_page_dat(struct page *page, int order)
return;
set_page_stable_dat(page, order);
}
-
-void arch_set_page_nodat(struct page *page, int order)
-{
- if (cmma_flag < 2)
- return;
- set_page_stable_nodat(page, order);
-}
-
-int arch_test_page_nodat(struct page *page)
-{
- unsigned char state;
-
- if (cmma_flag < 2)
- return 0;
- state = get_page_state(page);
- return !!(state & 0x20);
-}
-
-void arch_set_page_states(int make_stable)
-{
- unsigned long flags, order, t;
- struct list_head *l;
- struct page *page;
- struct zone *zone;
-
- if (!cmma_flag)
- return;
- if (make_stable)
- drain_local_pages(NULL);
- for_each_populated_zone(zone) {
- spin_lock_irqsave(&zone->lock, flags);
- for_each_migratetype_order(order, t) {
- list_for_each(l, &zone->free_area[order].free_list[t]) {
- page = list_entry(l, struct page, lru);
- if (make_stable)
- set_page_stable_dat(page, order);
- else
- set_page_unused(page, order);
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f8c6faab41f4..85195c18b2e8 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -7,8 +7,8 @@
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/kfence.h>
#include <asm/page.h>
#include <asm/set_memory.h>
@@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m)
static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
unsigned long dtt)
{
- unsigned long table, mask;
+ unsigned long *table, mask;
mask = 0;
if (MACHINE_HAS_EDAT2) {
@@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
break;
}
- table = (unsigned long)old & mask;
+ table = (unsigned long *)((unsigned long)old & mask);
crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
} else if (MACHINE_HAS_IDTE) {
cspg(old, *old, new);
@@ -86,7 +86,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
{
pte_t *ptep, new;
- ptep = pte_offset(pmdp, addr);
+ if (flags == SET_MEMORY_4K)
+ return 0;
+ ptep = pte_offset_kernel(pmdp, addr);
do {
new = *ptep;
if (pte_none(new))
@@ -96,9 +98,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
else if (flags & SET_MEMORY_RW)
new = pte_mkwrite(pte_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pte_val(new) |= _PAGE_NOEXEC;
+ new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC));
else if (flags & SET_MEMORY_X)
- pte_val(new) &= ~_PAGE_NOEXEC;
+ new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
addr += PAGE_SIZE;
@@ -125,11 +127,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
prot &= ~_PAGE_NOEXEC;
ptep = pt_dir;
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_val(*ptep) = pte_addr | prot;
+ set_pte(ptep, __pte(pte_addr | prot));
pte_addr += PAGE_SIZE;
ptep++;
}
- pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY);
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
update_page_count(PG_DIRECT_MAP_1M, -1);
@@ -146,9 +148,9 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pmd_mkwrite(pmd_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -156,6 +158,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pmd_t *pmdp;
int rc = 0;
@@ -165,7 +168,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
return -EINVAL;
next = pmd_addr_end(addr, end);
if (pmd_large(*pmdp)) {
- if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PMD_MASK);
+ need_split |= !!(addr + PMD_SIZE > next);
+ if (need_split) {
rc = split_pmd_page(pmdp, addr);
if (rc)
return rc;
@@ -202,11 +208,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr)
prot &= ~_SEGMENT_ENTRY_NOEXEC;
pmdp = pm_dir;
for (i = 0; i < PTRS_PER_PMD; i++) {
- pmd_val(*pmdp) = pmd_addr | prot;
+ set_pmd(pmdp, __pmd(pmd_addr | prot));
pmd_addr += PMD_SIZE;
pmdp++;
}
- pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ new = __pud(__pa(pm_dir) | _REGION3_ENTRY);
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
update_page_count(PG_DIRECT_MAP_2G, -1);
@@ -223,9 +229,9 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pud_mkwrite(pud_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pud_val(new) |= _REGION_ENTRY_NOEXEC;
+ new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
+ new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -233,6 +239,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pud_t *pudp;
int rc = 0;
@@ -242,7 +249,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
return -EINVAL;
next = pud_addr_end(addr, end);
if (pud_large(*pudp)) {
- if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PUD_MASK);
+ need_split |= !!(addr + PUD_SIZE > next);
+ if (need_split) {
rc = split_pud_page(pudp, addr);
if (rc)
break;
@@ -279,7 +289,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end,
return rc;
}
-static DEFINE_MUTEX(cpa_mutex);
+DEFINE_MUTEX(cpa_mutex);
static int change_page_attr(unsigned long addr, unsigned long end,
unsigned long flags)
@@ -317,7 +327,7 @@ int __set_memory(unsigned long addr, int numpages, unsigned long flags)
return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static void ipte_range(pte_t *pte, unsigned long address, int nr)
{
@@ -337,50 +347,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr)
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long address;
+ pte_t *ptep, pte;
int nr, i, j;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
for (i = 0; i < numpages;) {
- address = page_to_phys(page + i);
- pgd = pgd_offset_k(address);
- p4d = p4d_offset(pgd, address);
- pud = pud_offset(p4d, address);
- pmd = pmd_offset(pud, address);
- pte = pte_offset_kernel(pmd, address);
- nr = (unsigned long)pte >> ilog2(sizeof(long));
+ address = (unsigned long)page_to_virt(page + i);
+ ptep = virt_to_kpte(address);
+ nr = (unsigned long)ptep >> ilog2(sizeof(long));
nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
address += PAGE_SIZE;
- pte++;
+ ptep++;
}
} else {
- ipte_range(pte, address, nr);
+ ipte_range(ptep, address, nr);
}
i += nr;
}
}
-#ifdef CONFIG_HIBERNATION
-bool kernel_page_present(struct page *page)
-{
- unsigned long addr;
- int cc;
-
- addr = page_to_phys(page);
- asm volatile(
- " lra %1,0(%1)\n"
- " ipm %0\n"
- " srl %0,28"
- : "=d" (cc), "+a" (addr) : : "cc");
- return cc == 0;
-}
-#endif /* CONFIG_HIBERNATION */
-
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 3dd253f81a77..2de48b2c1b04 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -53,91 +53,92 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, 2);
+ struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (!page)
return NULL;
- arch_set_page_dat(page, 2);
- return (unsigned long *) page_to_phys(page);
+ arch_set_page_dat(page, CRST_ALLOC_ORDER);
+ return (unsigned long *) page_to_virt(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long) table, 2);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
static void __crst_table_upgrade(void *arg)
{
struct mm_struct *mm = arg;
- if (current->active_mm == mm)
- set_user_asce(mm);
+ /* change all active ASCEs to avoid the creation of new TLBs */
+ if (current->active_mm == mm) {
+ S390_lowcore.user_asce = mm->context.asce;
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
+ }
__tlb_flush_local();
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
- unsigned long *table, *pgd;
- int rc, notify;
+ unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
+ unsigned long asce_limit = mm->context.asce_limit;
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
- VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
- rc = 0;
- notify = 0;
- while (mm->context.asce_limit < end) {
- table = crst_table_alloc(mm);
- if (!table) {
- rc = -ENOMEM;
- break;
- }
- spin_lock_bh(&mm->page_table_lock);
- pgd = (unsigned long *) mm->pgd;
- if (mm->context.asce_limit == _REGION2_SIZE) {
- crst_table_init(table, _REGION2_ENTRY_EMPTY);
- p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = _REGION1_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
- mm_inc_nr_puds(mm);
- } else {
- crst_table_init(table, _REGION1_ENTRY_EMPTY);
- pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = -PAGE_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
- }
- notify = 1;
- spin_unlock_bh(&mm->page_table_lock);
- }
- if (notify)
- on_each_cpu(__crst_table_upgrade, mm, 0);
- return rc;
-}
+ VM_BUG_ON(asce_limit < _REGION2_SIZE);
-void crst_table_downgrade(struct mm_struct *mm)
-{
- pgd_t *pgd;
+ if (end <= asce_limit)
+ return 0;
- /* downgrade should only happen from 3 to 2 levels (compat only) */
- VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
+ if (asce_limit == _REGION2_SIZE) {
+ p4d = crst_table_alloc(mm);
+ if (unlikely(!p4d))
+ goto err_p4d;
+ crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ }
+ if (end > _REGION1_SIZE) {
+ pgd = crst_table_alloc(mm);
+ if (unlikely(!pgd))
+ goto err_pgd;
+ crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ }
- if (current->active_mm == mm) {
- clear_user_asce();
- __tlb_flush_mm(mm);
+ spin_lock_bh(&mm->page_table_lock);
+
+ /*
+ * This routine gets called with mmap_lock held and there is
+ * no reason to optimize for the case where it is not. However, if
+ * that should ever change, the check below will let us know.
+ */
+ VM_BUG_ON(asce_limit != mm->context.asce_limit);
+
+ if (p4d) {
+ __pgd = (unsigned long *) mm->pgd;
+ p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
+ mm->pgd = (pgd_t *) p4d;
+ mm->context.asce_limit = _REGION1_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm_inc_nr_puds(mm);
+ }
+ if (pgd) {
+ __pgd = (unsigned long *) mm->pgd;
+ pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
+ mm->pgd = (pgd_t *) pgd;
+ mm->context.asce_limit = TASK_SIZE_MAX;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
}
- pgd = mm->pgd;
- mm_dec_nr_pmds(mm);
- mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
- mm->context.asce_limit = _REGION3_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
- crst_table_free(mm, (unsigned long *) pgd);
+ spin_unlock_bh(&mm->page_table_lock);
+
+ on_each_cpu(__crst_table_upgrade, mm, 0);
- if (current->active_mm == mm)
- set_user_asce(mm);
+ return 0;
+
+err_pgd:
+ crst_table_free(mm, p4d);
+err_p4d:
+ return -ENOMEM;
}
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
@@ -160,7 +161,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm)
page = alloc_page(GFP_KERNEL);
if (page) {
- table = (u64 *)page_to_phys(page);
+ table = (u64 *)page_to_virt(page);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
@@ -175,7 +176,75 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */
/*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from a fully allocated parent page, that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from a partially allocated parent page, that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ * PP AA
+ * 01234567 upper byte (bits 24-31) of struct page::_refcount
+ * || ||
+ * || |+--- upper 2KB-pgtable is allocated
+ * || +---- lower 2KB-pgtable is allocated
+ * |+------- upper 2KB-pgtable is pending for removal
+ * +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still unallocated;
+ * - removed from mm_context_t::pgtable_list in case both halves of the
+ * parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still allocated;
+ * - removed from mm_context_t::pgtable_list in case the second half of
+ * the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is safe to release
+ * the page therefore.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
@@ -191,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
mask = atomic_read(&page->_refcount) >> 24;
- mask = (mask | (mask >> 4)) & 3;
- if (mask != 3) {
- table = (unsigned long *) page_to_phys(page);
+ /*
+ * The pending removal bits must also be checked.
+ * Failure to do so might lead to an impossible
+ * value (i.e. 0x13 or 0x23) written to _refcount.
+ * Such values violate the assumption that pending and
+ * allocation bits are mutually exclusive, and the rest
+ * of the code derails as a result. That could lead to
+ * a whole bunch of races and corruptions.
+ */
+ mask = (mask | (mask >> 4)) & 0x03U;
+ if (mask != 0x03U) {
+ table = (unsigned long *) page_to_virt(page);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
atomic_xor_bits(&page->_refcount,
- 1U << (bit + 24));
+ 0x01U << (bit + 24));
list_del(&page->lru);
}
}
@@ -216,15 +294,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
}
arch_set_page_dat(page, 0);
/* Initialize page table */
- table = (unsigned long *) page_to_phys(page);
+ table = (unsigned long *) page_to_virt(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 3 << 24);
+ atomic_xor_bits(&page->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 1 << 24);
+ atomic_xor_bits(&page->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@@ -233,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
return table;
}
+static void page_table_release_check(struct page *page, void *table,
+ unsigned int half, unsigned int mask)
+{
+ char msg[128];
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+ return;
+ snprintf(msg, sizeof(msg),
+ "Invalid pgtable %p release half 0x%02x mask 0x%02x",
+ table, half, mask);
+ dump_page(page, msg);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
+ unsigned int mask, bit, half;
struct page *page;
- unsigned int bit, mask;
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ page = virt_to_page(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
- bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
+ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ /*
+ * Mark the page for delayed release. The actual release
+ * will happen outside of the critical section from this
+ * function or from __tlb_remove_table()
+ */
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- if (mask != 0)
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
+ if (mask != 0x00U)
return;
+ half = 0x01U << bit;
} else {
- atomic_xor_bits(&page->_refcount, 3U << 24);
+ half = 0x03U;
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
}
+ page_table_release_check(page, table, half, mask);
pgtable_pte_page_dtor(page);
__free_page(page);
}
@@ -268,50 +370,57 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
unsigned int bit, mask;
mm = tlb->mm;
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
- table = (unsigned long *) (__pa(table) | 3);
+ table = (unsigned long *) ((unsigned long)table | 0x03U);
tlb_remove_table(tlb, table);
return;
}
- bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
+ bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
+ /*
+ * Mark the page for delayed release. The actual release will happen
+ * outside of the critical section from __tlb_remove_table() or from
+ * page_table_free()
+ */
mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- table = (unsigned long *) (__pa(table) | (1U << bit));
+ table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
}
void __tlb_remove_table(void *_table)
{
- unsigned int mask = (unsigned long) _table & 3;
+ unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ struct page *page = virt_to_page(table);
- switch (mask) {
- case 0: /* pmd, pud, or p4d */
- free_pages((unsigned long) table, 2);
- break;
- case 1: /* lower 2K of a 4K page table */
- case 2: /* higher 2K of a 4K page table */
+ switch (half) {
+ case 0x00U: /* pmd, pud, or p4d */
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ return;
+ case 0x01U: /* lower 2K of a 4K page table */
+ case 0x02U: /* higher 2K of a 4K page table */
mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
mask >>= 24;
- if (mask != 0)
- break;
- /* fallthrough */
- case 3: /* 4K page table with pgstes */
- if (mask & 3)
- atomic_xor_bits(&page->_refcount, 3 << 24);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (mask != 0x00U)
+ return;
+ break;
+ case 0x03U: /* 4K page table with pgstes */
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
break;
}
+
+ page_table_release_check(page, table, half, mask);
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
}
/*
@@ -321,34 +430,34 @@ void __tlb_remove_table(void *_table)
static struct kmem_cache *base_pgt_cache;
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
{
- u64 *table;
+ unsigned long *table;
table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
if (table)
- memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
- return (unsigned long) table;
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ return table;
}
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
{
- kmem_cache_free(base_pgt_cache, (void *) table);
+ kmem_cache_free(base_pgt_cache, table);
}
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
{
- unsigned long table;
+ unsigned long *table;
- table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (table)
- crst_table_init((unsigned long *)table, val);
+ crst_table_init(table, val);
return table;
}
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
{
- free_pages(table, CRST_ALLOC_ORDER);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
@@ -376,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address)
return real;
}
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
unsigned long *pte, next;
if (!alloc)
return 0;
- pte = (unsigned long *) origin;
+ pte = origin;
pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
@@ -392,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *ste, next, table;
+ unsigned long *ste, next, *table;
int rc;
- ste = (unsigned long *) origin;
+ ste = origin;
ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
do {
next = base_segment_addr_end(addr, end);
@@ -408,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
table = base_pgt_alloc();
if (!table)
return -ENOMEM;
- *ste = table | _SEGMENT_ENTRY;
+ *ste = __pa(table) | _SEGMENT_ENTRY;
}
- table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
rc = base_page_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -421,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rtte, next, table;
+ unsigned long *rtte, next, *table;
int rc;
- rtte = (unsigned long *) origin;
+ rtte = origin;
rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
do {
next = base_region3_addr_end(addr, end);
@@ -437,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rtte = table | _REGION3_ENTRY;
+ *rtte = __pa(table) | _REGION3_ENTRY;
}
- table = *rtte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rtte & _REGION_ENTRY_ORIGIN);
rc = base_segment_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -449,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rste, next, table;
+ unsigned long *rste, next, *table;
int rc;
- rste = (unsigned long *) origin;
+ rste = origin;
rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
do {
next = base_region2_addr_end(addr, end);
@@ -465,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rste = table | _REGION2_ENTRY;
+ *rste = __pa(table) | _REGION2_ENTRY;
}
- table = *rste & _REGION_ENTRY_ORIGIN;
+ table = __va(*rste & _REGION_ENTRY_ORIGIN);
rc = base_region3_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -477,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rfte, next, table;
+ unsigned long *rfte, next, *table;
int rc;
- rfte = (unsigned long *) origin;
+ rfte = origin;
rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
do {
next = base_region1_addr_end(addr, end);
@@ -493,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rfte = table | _REGION1_ENTRY;
+ *rfte = __pa(table) | _REGION1_ENTRY;
}
- table = *rfte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rfte & _REGION_ENTRY_ORIGIN);
rc = base_region2_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -514,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
*/
void base_asce_free(unsigned long asce)
{
- unsigned long table = asce & _ASCE_ORIGIN;
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
if (!asce)
return;
@@ -529,7 +638,7 @@ void base_asce_free(unsigned long asce)
base_region2_walk(table, 0, _REGION1_SIZE, 0);
break;
case _ASCE_TYPE_REGION1:
- base_region1_walk(table, 0, -_PAGE_SIZE, 0);
+ base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
break;
}
base_crst_free(table);
@@ -566,7 +675,7 @@ static int base_pgt_cache_init(void)
*/
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
- unsigned long asce, table, end;
+ unsigned long asce, *table, end;
int rc;
if (base_pgt_cache_init())
@@ -577,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
if (!table)
return 0;
rc = base_segment_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION2_SIZE) {
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region3_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION1_SIZE) {
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region2_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
} else {
table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region1_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
}
if (rc) {
base_asce_free(asce);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9ebd01219812..4909dcd762e8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -19,13 +19,31 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, int nodat)
{
@@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pte_val(*ptep) |= _PAGE_INVALID;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
mm->context.flush_mm = 1;
} else
ptep_ipte_global(mm, addr, ptep, nodat);
@@ -206,15 +224,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
*/
- pte_val(entry) |= _PAGE_DIRTY;
- pte_val(entry) &= ~_PAGE_PROTECT;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
if (!(pte_val(entry) & _PAGE_PROTECT))
/* This pte allows write access, set user-dirty */
pgste_val(pgste) |= PGSTE_UC_BIT;
}
#endif
- *ptep = entry;
+ set_pte(ptep, entry);
return pgste;
}
@@ -257,12 +275,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
pgste = pgste_update_all(old, pgste, mm);
if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
_PGSTE_GPS_USAGE_UNUSED)
- pte_val(old) |= _PAGE_UNUSED;
+ old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
}
pgste = pgste_set_pte(ptep, pgste, new);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = new;
+ set_pte(ptep, new);
}
return old;
}
@@ -327,14 +345,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
- pte_val(pte) &= ~_PAGE_NOEXEC;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = pte;
+ set_pte(ptep, pte);
}
preempt_enable();
}
@@ -399,7 +417,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
@@ -411,22 +429,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
}
#ifdef CONFIG_PGSTE
-static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
+ struct vm_area_struct *vma;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd;
+
+ /* We need a valid VMA, otherwise this is clearly a fault. */
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ return -EFAULT;
pgd = pgd_offset(mm, addr);
- p4d = p4d_alloc(mm, pgd, addr);
- if (!p4d)
- return NULL;
- pud = pud_alloc(mm, p4d, addr);
- if (!pud)
- return NULL;
- pmd = pmd_alloc(mm, pud, addr);
- return pmd;
+ if (!pgd_present(*pgd))
+ return -ENOENT;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return -ENOENT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return -ENOENT;
+
+ /* Large PUDs are not supported yet. */
+ if (pud_large(*pud))
+ return -EFAULT;
+
+ *pmdp = pmd_offset(pud, addr);
+ return 0;
}
#endif
@@ -437,7 +469,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -450,7 +482,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -507,7 +539,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pudp_flush_direct(mm, addr, pudp);
- *pudp = new;
+ set_pud(pudp, new);
preempt_enable();
return old;
}
@@ -547,9 +579,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
list_del(lh);
}
ptep = (pte_t *) pgtable;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
ptep++;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -614,12 +646,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
if (prot == PROT_NONE && !pte_i) {
ptep_flush_direct(mm, addr, ptep, nodat);
pgste = pgste_update_all(entry, pgste, mm);
- pte_val(entry) |= _PAGE_INVALID;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
}
if (prot == PROT_READ && !pte_p) {
ptep_flush_direct(mm, addr, ptep, nodat);
- pte_val(entry) &= ~_PAGE_INVALID;
- pte_val(entry) |= _PAGE_PROTECT;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
+ entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
pgste_val(pgste) |= bit;
pgste = pgste_set_pte(ptep, pgste, entry);
@@ -643,8 +675,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
!(pte_val(pte) & _PAGE_PROTECT))) {
pgste_val(spgste) |= PGSTE_VSIE_BIT;
tpgste = pgste_get_lock(tptep);
- pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
- (pte_val(pte) & _PAGE_PROTECT);
+ tpte = __pte((pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT));
/* don't touch the storage key - it belongs to parent pgste */
tpgste = pgste_set_pte(tptep, tpgste, tpte);
pgste_set_unlock(tptep, tpgste);
@@ -673,7 +705,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = migration_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(entry);
dec_mm_counter(mm, mm_counter(page));
}
@@ -716,7 +748,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
ptev = pte_val(*ptep);
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
- page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -741,10 +773,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
ptep_ipte_global(mm, addr, ptep, nodat);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) |= _PAGE_PROTECT;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
else
- pte_val(pte) |= _PAGE_INVALID;
- *ptep = pte;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
}
pgste_set_unlock(ptep, pgste);
return dirty;
@@ -760,14 +792,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * we can ignore attempts to set the key to 0, because it already is 0.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return key ? -EFAULT : 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return key ? -EFAULT : 0;
}
if (pmd_large(*pmdp)) {
@@ -783,10 +824,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
PGSTE_ACC_BITS | PGSTE_FP_BIT);
@@ -816,7 +854,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(set_guest_storage_key);
-/**
+/*
* Conditionally set a guest storage key (handling csske).
* oldkey will be updated when either mr or mc is set and a pointer is given.
*
@@ -849,7 +887,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
-/**
+/*
* Reset a guest reference bit (rrbe), returning the reference and changed bit.
*
* Returns < 0 in case of error, otherwise the cc to be reported to the guest.
@@ -863,14 +901,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pte_t *ptep;
int cc = 0;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0 and there is nothing for us to do.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return 0;
}
if (pmd_large(*pmdp)) {
@@ -882,10 +929,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
new = old = pgste_get_lock(ptep);
/* Reset guest reference bit only */
pgste_val(new) &= ~PGSTE_GR_BIT;
@@ -917,15 +961,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0.
+ */
+ *key = 0;
+
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
- /* Not yet mapped memory has a zero key */
spin_unlock(ptl);
- *key = 0;
return 0;
}
@@ -938,10 +991,7 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
pgste = pgste_get_lock(ptep);
*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
paddr = pte_val(*ptep) & PAGE_MASK;
@@ -970,6 +1020,7 @@ EXPORT_SYMBOL(get_guest_storage_key);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste)
{
+ struct vm_area_struct *vma;
unsigned long pgstev;
spinlock_t *ptl;
pgste_t pgste;
@@ -979,6 +1030,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
WARN_ON_ONCE(orc > ESSA_MAX);
if (unlikely(orc > ESSA_MAX))
return -EINVAL;
+
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1071,10 +1126,14 @@ EXPORT_SYMBOL(pgste_perform_essa);
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
unsigned long bits, unsigned long value)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pgste_t new;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1099,9 +1158,13 @@ EXPORT_SYMBOL(set_pgste_bits);
*/
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b403fa14847d..ee1a97078527 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
+#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
@@ -12,8 +12,8 @@
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
+#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -21,21 +21,22 @@
static DEFINE_MUTEX(vmem_mutex);
-struct memory_segment {
- struct list_head list;
- unsigned long start;
- unsigned long size;
-};
-
-static LIST_HEAD(mem_segs);
-
static void __ref *vmem_alloc_pages(unsigned int order)
{
unsigned long size = PAGE_SIZE << order;
if (slab_is_available())
return (void *)__get_free_pages(GFP_KERNEL, order);
- return (void *) memblock_phys_alloc(size, size);
+ return memblock_alloc(size, size);
+}
+
+static void vmem_free_pages(unsigned long addr, int order)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
+ return;
+ free_pages(addr, order);
}
void *vmem_crst_alloc(unsigned long val)
@@ -56,341 +57,604 @@ pte_t __ref *vmem_pte_alloc(void)
if (slab_is_available())
pte = (pte_t *) page_table_alloc(&init_mm);
else
- pte = (pte_t *) memblock_phys_alloc(size, size);
+ pte = (pte_t *) memblock_alloc(size, size);
if (!pte)
return NULL;
memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
return pte;
}
+static void vmem_pte_free(unsigned long *table)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(table))))
+ return;
+ page_table_free(&init_mm, table);
+}
+
+#define PAGE_UNUSED 0xFD
+
/*
- * Add a physical memory range to the 1:1 mapping.
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
+ * from unused_sub_pmd_start to the next PMD_SIZE boundary.
*/
-static int vmem_add_mem(unsigned long start, unsigned long size)
-{
- unsigned long pgt_prot, sgt_prot, r3_prot;
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
- int ret = -ENOMEM;
+static unsigned long unused_sub_pmd_start;
+
+static void vmemmap_flush_unused_sub_pmd(void)
+{
+ if (!unused_sub_pmd_start)
+ return;
+ memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
+ unused_sub_pmd_start = 0;
+}
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- r3_prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- r3_prot &= ~_REGION_ENTRY_NOEXEC;
+static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed (just in case the memmap never gets initialized,
+ * e.g., because the memory block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_sub_pmd_start == start) {
+ unused_sub_pmd_start = end;
+ if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
+ unused_sub_pmd_start = 0;
+ return;
}
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
- pu_dir = pud_offset(p4_dir, address);
- if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = address | r3_prot;
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
- goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = address | sgt_prot;
- address += PMD_SIZE;
- pages1m++;
+ vmemmap_flush_unused_sub_pmd();
+ vmemmap_mark_sub_pmd_used(start, end);
+}
+
+static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+
+ /* Could be our memmap page is filled with PAGE_UNUSED already ... */
+ vmemmap_mark_sub_pmd_used(start, end);
+
+ /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)page, PAGE_UNUSED, start - page);
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD the last
+ * unused range in the populated PMD.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_sub_pmd_start = end;
+}
+
+/* Returns true if the PMD is completely unused and can be freed. */
+static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+ memset((void *)start, PAGE_UNUSED, end - start);
+ return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
+}
+
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long prot, pages = 0;
+ int ret = -ENOMEM;
+ pte_t *pte;
+
+ prot = pgprot_val(PAGE_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_PAGE_NOEXEC;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (!add) {
+ if (pte_none(*pte))
+ continue;
+ if (!direct)
+ vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+ pte_clear(&init_mm, addr, pte);
+ } else if (pte_none(*pte)) {
+ if (!direct) {
+ void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+
+ if (!new_page)
+ goto out;
+ set_pte(pte, __pte(__pa(new_page) | prot));
+ } else {
+ set_pte(pte, __pte(__pa(addr) | prot));
+ }
+ } else {
continue;
}
- if (pmd_none(*pm_dir)) {
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
- goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- }
-
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = address | pgt_prot;
- address += PAGE_SIZE;
- pages4k++;
+ pages++;
}
ret = 0;
out:
- update_page_count(PG_DIRECT_MAP_4K, pages4k);
- update_page_count(PG_DIRECT_MAP_1M, pages1m);
- update_page_count(PG_DIRECT_MAP_2G, pages2g);
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
return ret;
}
-/*
- * Remove a physical memory range from the 1:1 mapping.
- * Currently only invalidates page table entries.
- */
-static void vmem_remove_range(unsigned long start, unsigned long size)
+static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
-
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- address += PGDIR_SIZE;
- continue;
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- address += P4D_SIZE;
- continue;
- }
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- address += PUD_SIZE;
- continue;
- }
- if (pud_large(*pu_dir)) {
- pud_clear(pu_dir);
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- address += PMD_SIZE;
- continue;
- }
- if (pmd_large(*pm_dir)) {
- pmd_clear(pm_dir);
- address += PMD_SIZE;
- pages1m++;
- continue;
- }
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_clear(&init_mm, address, pt_dir);
- address += PAGE_SIZE;
- pages4k++;
+ pte_t *pte;
+ int i;
+
+ /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
+ pte = pte_offset_kernel(pmd, start);
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(*pte))
+ return;
}
- flush_tlb_kernel_range(start, end);
- update_page_count(PG_DIRECT_MAP_4K, -pages4k);
- update_page_count(PG_DIRECT_MAP_1M, -pages1m);
- update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+ vmem_pte_free((unsigned long *) pmd_deref(*pmd));
+ pmd_clear(pmd);
}
-/*
- * Add a backed mem_map array to the virtual mem_map array.
- */
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
-{
- unsigned long pgt_prot, sgt_prot;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
+ pmd_t *pmd;
+ pte_t *pte;
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- }
- for (address = start; address < end;) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
+ prot = pgprot_val(SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_SEGMENT_ENTRY_NOEXEC;
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (!add) {
+ if (pmd_none(*pmd))
+ continue;
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ pages++;
+ } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ }
+ continue;
+ }
+ } else if (pmd_none(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE) &&
+ MACHINE_HAS_EDAT1 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pmd(pmd, __pmd(__pa(addr) | prot));
+ pages++;
+ continue;
+ } else if (!direct && MACHINE_HAS_EDAT1) {
+ void *new_page;
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
+ /*
+ * Use 1MB frames for vmemmap if available. We
+ * always use large frames even if they are only
+ * partially used. Otherwise we would have also
+ * page tables since vmemmap_populate gets
+ * called for each section separately.
+ */
+ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ if (new_page) {
+ set_pmd(pmd, __pmd(__pa(new_page) | prot));
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ vmemmap_use_new_sub_pmd(addr, next);
+ }
+ continue;
+ }
+ }
+ pte = vmem_pte_alloc();
+ if (!pte)
goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ if (!direct)
+ vmemmap_use_sub_pmd(addr, next);
+ continue;
}
+ ret = modify_pte_table(pmd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pte_table(pmd, addr & PMD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+ return ret;
+}
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- /* Use 1MB frames for vmemmap if available. We always
- * use large frames even if they are only partially
- * used.
- * Otherwise we would have also page tables since
- * vmemmap_populate gets called for each section
- * separately. */
- if (MACHINE_HAS_EDAT1) {
- void *new_page;
+static void try_free_pmd_table(pud_t *pud, unsigned long start)
+{
+ const unsigned long end = start + PUD_SIZE;
+ pmd_t *pmd;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+ pmd = pmd_offset(pud, start);
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
+ if (!pmd_none(*pmd))
+ return;
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ pud_clear(pud);
+}
- new_page = vmemmap_alloc_block(PMD_SIZE, node);
- if (!new_page)
- goto out;
- pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
+static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
+ int ret = -ENOMEM;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ prot = pgprot_val(REGION3_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_REGION_ENTRY_NOEXEC;
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (!add) {
+ if (pud_none(*pud))
+ continue;
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+ pages++;
+ }
+ continue;
+ }
+ } else if (pud_none(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE) &&
+ MACHINE_HAS_EDAT2 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pud(pud, __pud(__pa(addr) | prot));
+ pages++;
continue;
}
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- } else if (pmd_large(*pm_dir)) {
- address = (address + PMD_SIZE) & PMD_MASK;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
continue;
}
+ ret = modify_pmd_table(pud, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pmd_table(pud, addr & PUD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+ return ret;
+}
- pt_dir = pte_offset_kernel(pm_dir, address);
- if (pte_none(*pt_dir)) {
- void *new_page;
+static void try_free_pud_table(p4d_t *p4d, unsigned long start)
+{
+ const unsigned long end = start + P4D_SIZE;
+ pud_t *pud;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ pud = pud_offset(p4d, start);
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ if (!pud_none(*pud))
+ return;
+ }
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ p4d_clear(p4d);
+}
- new_page = vmemmap_alloc_block(PAGE_SIZE, node);
- if (!new_page)
+static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next;
+ int ret = -ENOMEM;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (!add) {
+ if (p4d_none(*p4d))
+ continue;
+ } else if (p4d_none(*p4d)) {
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
goto out;
- pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
+ p4d_populate(&init_mm, p4d, pud);
}
- address += PAGE_SIZE;
+ ret = modify_pud_table(p4d, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pud_table(p4d, addr & P4D_MASK);
}
ret = 0;
out:
return ret;
}
-void vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
+ const unsigned long end = start + PGDIR_SIZE;
+ p4d_t *p4d;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ p4d = p4d_offset(pgd, start);
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ if (!p4d_none(*p4d))
+ return;
+ }
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ pgd_clear(pgd);
}
-/*
- * Add memory segment to the segment list if it doesn't overlap with
- * an already present segment.
- */
-static int insert_memory_segment(struct memory_segment *seg)
+static int modify_pagetable(unsigned long start, unsigned long end, bool add,
+ bool direct)
{
- struct memory_segment *tmp;
+ unsigned long addr, next;
+ int ret = -ENOMEM;
+ pgd_t *pgd;
+ p4d_t *p4d;
- if (seg->start + seg->size > VMEM_MAX_PHYS ||
- seg->start + seg->size < seg->start)
- return -ERANGE;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
+ return -EINVAL;
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
- list_for_each_entry(tmp, &mem_segs, list) {
- if (seg->start >= tmp->start + tmp->size)
- continue;
- if (seg->start + seg->size <= tmp->start)
- continue;
- return -ENOSPC;
+ if (!add) {
+ if (pgd_none(*pgd))
+ continue;
+ } else if (pgd_none(*pgd)) {
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ ret = modify_p4d_table(pgd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_p4d_table(pgd, addr & PGDIR_MASK);
}
- list_add(&seg->list, &mem_segs);
- return 0;
+ ret = 0;
+out:
+ if (!add)
+ flush_tlb_kernel_range(start, end);
+ return ret;
+}
+
+static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, true, direct);
+}
+
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, false, direct);
}
/*
- * Remove memory segment from the segment list.
+ * Add a physical memory range to the 1:1 mapping.
*/
-static void remove_memory_segment(struct memory_segment *seg)
+static int vmem_add_range(unsigned long start, unsigned long size)
{
- list_del(&seg->list);
+ return add_pagetable(start, start + size, true);
}
-static void __remove_shared_memory(struct memory_segment *seg)
+/*
+ * Remove a physical memory range from the 1:1 mapping.
+ */
+static void vmem_remove_range(unsigned long start, unsigned long size)
{
- remove_memory_segment(seg);
- vmem_remove_range(seg->start, seg->size);
+ remove_pagetable(start, start + size, true);
}
-int vmem_remove_mapping(unsigned long start, unsigned long size)
+/*
+ * Add a backed mem_map array to the virtual mem_map array.
+ */
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
- struct memory_segment *seg;
int ret;
mutex_lock(&vmem_mutex);
+ /* We don't care about the node, just use NUMA_NO_NODE on allocations */
+ ret = add_pagetable(start, end, false);
+ if (ret)
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
- ret = -ENOENT;
- list_for_each_entry(seg, &mem_segs, list) {
- if (seg->start == start && seg->size == size)
- break;
- }
-
- if (seg->start != start || seg->size != size)
- goto out;
+void vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ mutex_lock(&vmem_mutex);
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+}
- ret = 0;
- __remove_shared_memory(seg);
- kfree(seg);
-out:
+void vmem_remove_mapping(unsigned long start, unsigned long size)
+{
+ mutex_lock(&vmem_mutex);
+ vmem_remove_range(start, size);
mutex_unlock(&vmem_mutex);
- return ret;
+}
+
+struct range arch_get_mappable_range(void)
+{
+ struct range mhp_range;
+
+ mhp_range.start = 0;
+ mhp_range.end = VMEM_MAX_PHYS - 1;
+ return mhp_range;
}
int vmem_add_mapping(unsigned long start, unsigned long size)
{
- struct memory_segment *seg;
+ struct range range = arch_get_mappable_range();
int ret;
- mutex_lock(&vmem_mutex);
- ret = -ENOMEM;
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- goto out;
- seg->start = start;
- seg->size = size;
+ if (start < range.start ||
+ start + size > range.end + 1 ||
+ start + size < start)
+ return -ERANGE;
- ret = insert_memory_segment(seg);
+ mutex_lock(&vmem_mutex);
+ ret = vmem_add_range(start, size);
if (ret)
- goto out_free;
+ vmem_remove_range(start, size);
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
- ret = vmem_add_mem(start, size);
- if (ret)
- goto out_remove;
- goto out;
+/*
+ * Allocate new or return existing page-table entry, but do not map it
+ * to any physical address. If missing, allocate segment- and region-
+ * table entries along. Meeting a large segment- or region-table entry
+ * while traversing is an error, since the function is expected to be
+ * called against virtual regions reserved for 4KB mappings only.
+ */
+pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
+{
+ pte_t *ptep = NULL;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
-out_remove:
- __remove_shared_memory(seg);
-out_free:
- kfree(seg);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ if (!alloc)
+ goto out;
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ if (!alloc)
+ goto out;
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
+ goto out;
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ if (!alloc)
+ goto out;
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
+ goto out;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (WARN_ON_ONCE(pud_large(*pud))) {
+ goto out;
+ }
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ if (!alloc)
+ goto out;
+ pte = vmem_pte_alloc();
+ if (!pte)
+ goto out;
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+ goto out;
+ }
+ ptep = pte_offset_kernel(pmd, addr);
out:
+ return ptep;
+}
+
+int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
+{
+ pte_t *ptep, pte;
+
+ if (!IS_ALIGNED(addr, PAGE_SIZE))
+ return -EINVAL;
+ ptep = vmem_get_alloc_pte(addr, alloc);
+ if (!ptep)
+ return -ENOMEM;
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte = mk_pte_phys(phys, prot);
+ set_pte(ptep, pte);
+ return 0;
+}
+
+int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
+{
+ int rc;
+
+ mutex_lock(&vmem_mutex);
+ rc = __vmem_map_4k_page(addr, phys, prot, true);
+ mutex_unlock(&vmem_mutex);
+ return rc;
+}
+
+void vmem_unmap_4k_page(unsigned long addr)
+{
+ pte_t *ptep;
+
+ mutex_lock(&vmem_mutex);
+ ptep = virt_to_kpte(addr);
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte_clear(&init_mm, addr, ptep);
mutex_unlock(&vmem_mutex);
- return ret;
}
/*
@@ -400,10 +664,11 @@ out:
*/
void __init vmem_map_init(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg)
- vmem_add_mem(reg->base, reg->size);
+ for_each_mem_range(i, &base, &end)
+ vmem_add_range(base, end - base);
__set_memory((unsigned long)_stext,
(unsigned long)(_etext - _stext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
@@ -413,32 +678,16 @@ void __init vmem_map_init(void)
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT,
+ __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- pr_info("Write protected kernel read-only data: %luk\n",
- (unsigned long)(__end_rodata - _stext) >> 10);
-}
-/*
- * Convert memblock.memory to a memory segment list so there is a single
- * list that contains all memory segments.
- */
-static int __init vmem_convert_memory_chunk(void)
-{
- struct memblock_region *reg;
- struct memory_segment *seg;
+ /* lowcore requires 4k mapping for real addresses / prefixing */
+ set_memory_4k(0, LC_PAGES);
- mutex_lock(&vmem_mutex);
- for_each_memblock(memory, reg) {
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- panic("Out of memory...\n");
- seg->start = reg->base;
- seg->size = reg->size;
- insert_memory_segment(seg);
- }
- mutex_unlock(&vmem_mutex);
- return 0;
-}
+ /* lowcore must be executable for LPSWE */
+ if (!static_key_enabled(&cpu_has_bear))
+ set_memory_x(0, 1);
-core_initcall(vmem_convert_memory_chunk);
+ pr_info("Write protected kernel read-only data: %luk\n",
+ (unsigned long)(__end_rodata - _stext) >> 10);
+}
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 8d2134136290..af35052d06ed 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -7,7 +7,6 @@
* - HAVE_MARCH_Z196_FEATURES: laal, laalg
* - HAVE_MARCH_Z10_FEATURES: msfi, cgrj, clgrj
* - HAVE_MARCH_Z9_109_FEATURES: alfi, llilf, clfi, oilf, nilf
- * - PACK_STACK
* - 64BIT
*
* Copyright IBM Corp. 2012,2015
@@ -26,6 +25,7 @@
#include <linux/mm.h>
#include <linux/kernel.h>
#include <asm/cacheflush.h>
+#include <asm/extable.h>
#include <asm/dis.h>
#include <asm/facility.h>
#include <asm/nospec-branch.h>
@@ -49,7 +49,7 @@ struct bpf_jit {
int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */
int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */
int tail_call_start; /* Tail call start offset */
- int labels[1]; /* Labels for local jumps */
+ int excnt; /* Number of exception table entries */
};
#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */
@@ -112,7 +112,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
{
u32 r1 = reg2hex[b1];
- if (!jit->seen_reg[r1] && r1 >= 6 && r1 <= 15)
+ if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1])
jit->seen_reg[r1] = 1;
}
@@ -228,18 +228,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
REG_SET_SEEN(b3); \
})
-#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \
+#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \
({ \
- int rel = (jit->labels[label] - jit->prg) >> 1; \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
_EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \
(op2) | (mask) << 12); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
-#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \
+#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \
({ \
- int rel = (jit->labels[label] - jit->prg) >> 1; \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
_EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \
(rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \
REG_SET_SEEN(b1); \
@@ -248,8 +248,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \
({ \
- /* Branch instruction needs 6 bytes */ \
- int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\
+ int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \
_EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
@@ -489,6 +488,24 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
} while (re <= last);
}
+static void bpf_skip(struct bpf_jit *jit, int size)
+{
+ if (size >= 6 && !is_valid_rel(size)) {
+ /* brcl 0xf,size */
+ EMIT6_PCREL_RIL(0xc0f4000000, size);
+ size -= 6;
+ } else if (size >= 4 && is_valid_rel(size)) {
+ /* brc 0xf,size */
+ EMIT4_PCREL(0xa7f40000, size);
+ size -= 4;
+ }
+ while (size >= 2) {
+ /* bcr 0,%0 */
+ _EMIT2(0x0700);
+ size -= 2;
+ }
+}
+
/*
* Emit function prologue
*
@@ -501,9 +518,11 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
} else {
- /* j tail_call_start: NOP if no tail calls are used */
- EMIT4_PCREL(0xa7f40000, 6);
- _EMIT2(0);
+ /*
+ * There are no tail calls. Insert nops in order to have
+ * tail_call_start at a predictable offset.
+ */
+ bpf_skip(jit, 6);
}
/* Tail calls have to skip above initialization */
jit->tail_call_start = jit->prg;
@@ -548,45 +567,101 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
EMIT4(0xb9040000, REG_2, BPF_REG_0);
/* Restore registers */
save_restore_regs(jit, REGS_RESTORE, stack_depth);
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
+ if (nospec_uses_trampoline()) {
jit->r14_thunk_ip = jit->prg;
/* Generate __s390_indirect_jump_r14 thunk */
- if (test_facility(35)) {
- /* exrl %r0,.+10 */
- EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
- } else {
- /* larl %r1,.+14 */
- EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14);
- /* ex 0,0(%r1) */
- EMIT4_DISP(0x44000000, REG_0, REG_1, 0);
- }
+ /* exrl %r0,.+10 */
+ EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
/* j . */
EMIT4_PCREL(0xa7f40000, 0);
}
/* br %r14 */
_EMIT2(0x07fe);
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable &&
+ if ((nospec_uses_trampoline()) &&
(is_first_pass(jit) || (jit->seen & SEEN_FUNC))) {
jit->r1_thunk_ip = jit->prg;
/* Generate __s390_indirect_jump_r1 thunk */
- if (test_facility(35)) {
- /* exrl %r0,.+10 */
- EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
- /* j . */
- EMIT4_PCREL(0xa7f40000, 0);
- /* br %r1 */
- _EMIT2(0x07f1);
- } else {
- /* ex 0,S390_lowcore.br_r1_tampoline */
- EMIT4_DISP(0x44000000, REG_0, REG_0,
- offsetof(struct lowcore, br_r1_trampoline));
- /* j . */
- EMIT4_PCREL(0xa7f40000, 0);
- }
+ /* exrl %r0,.+10 */
+ EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
+ /* j . */
+ EMIT4_PCREL(0xa7f40000, 0);
+ /* br %r1 */
+ _EMIT2(0x07f1);
}
}
+static int get_probe_mem_regno(const u8 *insn)
+{
+ /*
+ * insn must point to llgc, llgh, llgf or lg, which have destination
+ * register at the same position.
+ */
+ if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */
+ return -1;
+ if (insn[5] != 0x90 && /* llgc */
+ insn[5] != 0x91 && /* llgh */
+ insn[5] != 0x16 && /* llgf */
+ insn[5] != 0x04) /* lg */
+ return -1;
+ return insn[1] >> 4;
+}
+
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(x);
+ regs->gprs[x->data] = 0;
+ return true;
+}
+
+static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp,
+ int probe_prg, int nop_prg)
+{
+ struct exception_table_entry *ex;
+ int reg, prg;
+ s64 delta;
+ u8 *insn;
+ int i;
+
+ if (!fp->aux->extable)
+ /* Do nothing during early JIT passes. */
+ return 0;
+ insn = jit->prg_buf + probe_prg;
+ reg = get_probe_mem_regno(insn);
+ if (WARN_ON_ONCE(reg < 0))
+ /* JIT bug - unexpected probe instruction. */
+ return -1;
+ if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg))
+ /* JIT bug - gap between probe and nop instructions. */
+ return -1;
+ for (i = 0; i < 2; i++) {
+ if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries))
+ /* Verifier bug - not enough entries. */
+ return -1;
+ ex = &fp->aux->extable[jit->excnt];
+ /* Add extable entries for probe and nop instructions. */
+ prg = i == 0 ? probe_prg : nop_prg;
+ delta = jit->prg_buf + prg - (u8 *)&ex->insn;
+ if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+ /* JIT bug - code and extable must be close. */
+ return -1;
+ ex->insn = delta;
+ /*
+ * Always land on the nop. Note that extable infrastructure
+ * ignores fixup field, it is handled by ex_handler_bpf().
+ */
+ delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup;
+ if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+ /* JIT bug - landing pad and extable must be close. */
+ return -1;
+ ex->fixup = delta;
+ ex->type = EX_TYPE_BPF;
+ ex->data = reg;
+ jit->excnt++;
+ }
+ return 0;
+}
+
/*
* Compile one eBPF instruction into s390x code
*
@@ -594,7 +669,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
* stack space for the large switch statement.
*/
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
- int i, bool extra_pass)
+ int i, bool extra_pass, u32 stack_depth)
{
struct bpf_insn *insn = &fp->insnsi[i];
u32 dst_reg = insn->dst_reg;
@@ -603,7 +678,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
u32 *addrs = jit->addrs;
s32 imm = insn->imm;
s16 off = insn->off;
+ int probe_prg = -1;
unsigned int mask;
+ int nop_prg;
+ int err;
+
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM)
+ probe_prg = jit->prg;
switch (insn->code) {
/*
@@ -656,10 +738,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9080000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */
- if (!imm)
- break;
- /* alfi %dst,imm */
- EMIT6_IMM(0xc20b0000, dst_reg, imm);
+ if (imm != 0) {
+ /* alfi %dst,imm */
+ EMIT6_IMM(0xc20b0000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */
@@ -681,17 +763,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9090000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */
- if (!imm)
- break;
- /* alfi %dst,-imm */
- EMIT6_IMM(0xc20b0000, dst_reg, -imm);
+ if (imm != 0) {
+ /* alfi %dst,-imm */
+ EMIT6_IMM(0xc20b0000, dst_reg, -imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */
if (!imm)
break;
- /* agfi %dst,-imm */
- EMIT6_IMM(0xc2080000, dst_reg, -imm);
+ if (imm == -0x80000000) {
+ /* algfi %dst,0x80000000 */
+ EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000);
+ } else {
+ /* agfi %dst,-imm */
+ EMIT6_IMM(0xc2080000, dst_reg, -imm);
+ }
break;
/*
* BPF_MUL
@@ -706,10 +793,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb90c0000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */
- if (imm == 1)
- break;
- /* msfi %r5,imm */
- EMIT6_IMM(0xc2010000, dst_reg, imm);
+ if (imm != 1) {
+ /* msfi %dst,imm */
+ EMIT6_IMM(0xc2010000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */
@@ -762,6 +849,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
if (BPF_OP(insn->code) == BPF_MOD)
/* lhgi %dst,0 */
EMIT4_IMM(0xa7090000, dst_reg, 0);
+ else
+ EMIT_ZERO(dst_reg);
break;
}
/* lhi %w0,0 */
@@ -894,10 +983,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9820000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */
- if (!imm)
- break;
- /* xilf %dst,imm */
- EMIT6_IMM(0xc0070000, dst_reg, imm);
+ if (imm != 0) {
+ /* xilf %dst,imm */
+ EMIT6_IMM(0xc0070000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */
@@ -928,10 +1017,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */
- if (imm == 0)
- break;
- /* sll %dst,imm(%r0) */
- EMIT4_DISP(0x89000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* sll %dst,imm(%r0) */
+ EMIT4_DISP(0x89000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */
@@ -953,10 +1042,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */
- if (imm == 0)
- break;
- /* srl %dst,imm(%r0) */
- EMIT4_DISP(0x88000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* srl %dst,imm(%r0) */
+ EMIT4_DISP(0x88000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */
@@ -978,10 +1067,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */
- if (imm == 0)
- break;
- /* sra %dst,imm(%r0) */
- EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* sra %dst,imm(%r0) */
+ EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */
@@ -1049,6 +1138,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
}
break;
/*
+ * BPF_NOSPEC (speculation barrier)
+ */
+ case BPF_ST | BPF_NOSPEC:
+ break;
+ /*
* BPF_ST(X)
*/
case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */
@@ -1100,24 +1194,76 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
jit->seen |= SEEN_MEM;
break;
/*
- * BPF_STX XADD (atomic_add)
+ * BPF_ATOMIC
*/
- case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */
- /* laal %w0,%src,off(%dst) */
- EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg,
- dst_reg, off);
- jit->seen |= SEEN_MEM;
- break;
- case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */
- /* laalg %w0,%src,off(%dst) */
- EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg,
- dst_reg, off);
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ {
+ bool is32 = BPF_SIZE(insn->code) == BPF_W;
+
+ switch (insn->imm) {
+/* {op32|op64} {%w0|%src},%src,off(%dst) */
+#define EMIT_ATOMIC(op32, op64) do { \
+ EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \
+ (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \
+ src_reg, dst_reg, off); \
+ if (is32 && (insn->imm & BPF_FETCH)) \
+ EMIT_ZERO(src_reg); \
+} while (0)
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ /* {laal|laalg} */
+ EMIT_ATOMIC(0x00fa, 0x00ea);
+ break;
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ /* {lan|lang} */
+ EMIT_ATOMIC(0x00f4, 0x00e4);
+ break;
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ /* {lao|laog} */
+ EMIT_ATOMIC(0x00f6, 0x00e6);
+ break;
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ /* {lax|laxg} */
+ EMIT_ATOMIC(0x00f7, 0x00e7);
+ break;
+#undef EMIT_ATOMIC
+ case BPF_XCHG:
+ /* {ly|lg} %w0,off(%dst) */
+ EMIT6_DISP_LH(0xe3000000,
+ is32 ? 0x0058 : 0x0004, REG_W0, REG_0,
+ dst_reg, off);
+ /* 0: {csy|csg} %w0,%src,off(%dst) */
+ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030,
+ REG_W0, src_reg, dst_reg, off);
+ /* brc 4,0b */
+ EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6);
+ /* {llgfr|lgr} %src,%w0 */
+ EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0);
+ if (is32 && insn_is_zext(&insn[1]))
+ insn_count = 2;
+ break;
+ case BPF_CMPXCHG:
+ /* 0: {csy|csg} %b0,%src,off(%dst) */
+ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030,
+ BPF_REG_0, src_reg, dst_reg, off);
+ break;
+ default:
+ pr_err("Unknown atomic operation %02x\n", insn->imm);
+ return -1;
+ }
+
jit->seen |= SEEN_MEM;
break;
+ }
/*
* BPF_LDX
*/
case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_B:
/* llgc %dst,0(off,%src) */
EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off);
jit->seen |= SEEN_MEM;
@@ -1125,6 +1271,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
insn_count = 2;
break;
case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_H:
/* llgh %dst,0(off,%src) */
EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off);
jit->seen |= SEEN_MEM;
@@ -1132,6 +1279,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
insn_count = 2;
break;
case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_W:
/* llgf %dst,off(%src) */
jit->seen |= SEEN_MEM;
EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off);
@@ -1139,6 +1287,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
insn_count = 2;
break;
case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
/* lg %dst,0(off,%src) */
jit->seen |= SEEN_MEM;
EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off);
@@ -1161,7 +1310,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
jit->seen |= SEEN_FUNC;
/* lgrl %w1,func */
EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func));
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
+ if (nospec_uses_trampoline()) {
/* brasl %r14,__s390_indirect_jump_r1 */
EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
} else {
@@ -1172,7 +1321,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9040000, BPF_REG_0, REG_2);
break;
}
- case BPF_JMP | BPF_TAIL_CALL:
+ case BPF_JMP | BPF_TAIL_CALL: {
+ int patch_1_clrj, patch_2_clij, patch_3_brc;
+
/*
* Implicit input:
* B1: pointer to ctx
@@ -1190,40 +1341,28 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
offsetof(struct bpf_array, map.max_entries));
/* if ((u32)%b3 >= (u32)%w1) goto out; */
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* clrj %b3,%w1,0xa,label0 */
- EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3,
- REG_W1, 0, 0xa);
- } else {
- /* clr %b3,%w1 */
- EMIT2(0x1500, BPF_REG_3, REG_W1);
- /* brcl 0xa,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]);
- }
+ /* clrj %b3,%w1,0xa,out */
+ patch_1_clrj = jit->prg;
+ EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa,
+ jit->prg);
/*
- * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
if (jit->seen & SEEN_STACK)
- off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth;
+ off = STK_OFF_TCCNT + STK_OFF + stack_depth;
else
off = STK_OFF_TCCNT;
/* lhi %w0,1 */
EMIT4_IMM(0xa7080000, REG_W0, 1);
/* laal %w1,%w0,off(%r15) */
EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
- EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
- MAX_TAIL_CALL_CNT, 0, 0x2);
- } else {
- /* clfi %w1,MAX_TAIL_CALL_CNT */
- EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT);
- /* brcl 0x2,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]);
- }
+ /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */
+ patch_2_clij = jit->prg;
+ EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1,
+ 2, jit->prg);
/*
* prog = array->ptrs[index];
@@ -1238,18 +1377,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/* ltg %r1,prog(%b2,%r1) */
EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2,
REG_1, offsetof(struct bpf_array, ptrs));
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* brc 0x8,label0 */
- EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]);
- } else {
- /* brcl 0x8,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]);
- }
+ /* brc 0x8,out */
+ patch_3_brc = jit->prg;
+ EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg);
/*
* Restore registers before calling function
*/
- save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth);
+ save_restore_regs(jit, REGS_RESTORE, stack_depth);
/*
* goto *(prog->bpf_func + tail_call_start);
@@ -1261,14 +1396,26 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/* bc 0xf,tail_call_start(%r1) */
_EMIT4(0x47f01000 + jit->tail_call_start);
/* out: */
- jit->labels[0] = jit->prg;
+ if (jit->prg_buf) {
+ *(u16 *)(jit->prg_buf + patch_1_clrj + 2) =
+ (jit->prg - patch_1_clrj) >> 1;
+ *(u16 *)(jit->prg_buf + patch_2_clij + 2) =
+ (jit->prg - patch_2_clij) >> 1;
+ *(u16 *)(jit->prg_buf + patch_3_brc + 2) =
+ (jit->prg - patch_3_brc) >> 1;
+ }
break;
+ }
case BPF_JMP | BPF_EXIT: /* return b0 */
last = (i == fp->len - 1) ? 1 : 0;
if (last)
break;
- /* j <exit> */
- EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg);
+ if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip))
+ /* brc 0xf, <exit> */
+ EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip);
+ else
+ /* brcl 0xf, <exit> */
+ EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip);
break;
/*
* Branch relative (number of skipped instructions) to offset on
@@ -1416,21 +1563,10 @@ branch_ks:
}
break;
branch_ku:
- is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* clfi or clgfi %dst,imm */
- EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000,
- dst_reg, imm);
- if (!is_first_pass(jit) &&
- can_use_rel(jit, addrs[i + off + 1])) {
- /* brc mask,off */
- EMIT4_PCREL_RIC(0xa7040000,
- mask >> 12, addrs[i + off + 1]);
- } else {
- /* brcl mask,off */
- EMIT6_PCREL_RILC(0xc0040000,
- mask >> 12, addrs[i + off + 1]);
- }
- break;
+ /* lgfi %w1,imm (load sign extend imm) */
+ src_reg = REG_1;
+ EMIT6_IMM(0xc0010000, src_reg, imm);
+ goto branch_xu;
branch_xs:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (!is_first_pass(jit) &&
@@ -1484,6 +1620,23 @@ branch_oc:
pr_err("Unknown opcode %02x\n", insn->code);
return -1;
}
+
+ if (probe_prg != -1) {
+ /*
+ * Handlers of certain exceptions leave psw.addr pointing to
+ * the instruction directly after the failing one. Therefore,
+ * create two exception table entries and also add a nop in
+ * case two probing instructions come directly after each
+ * other.
+ */
+ nop_prg = jit->prg;
+ /* bcr 0,%0 */
+ _EMIT2(0x0700);
+ err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg);
+ if (err < 0)
+ return err;
+ }
+
return insn_count;
}
@@ -1509,7 +1662,14 @@ static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i)
*/
static int bpf_set_addr(struct bpf_jit *jit, int i)
{
- if (!bpf_is_new_addr_sane(jit, i))
+ int delta;
+
+ if (is_codegen_pass(jit)) {
+ delta = jit->prg - jit->addrs[i];
+ if (delta < 0)
+ bpf_skip(jit, -delta);
+ }
+ if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i)))
return -1;
jit->addrs[i] = jit->prg;
return 0;
@@ -1519,26 +1679,27 @@ static int bpf_set_addr(struct bpf_jit *jit, int i)
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
- bool extra_pass)
+ bool extra_pass, u32 stack_depth)
{
int i, insn_count, lit32_size, lit64_size;
jit->lit32 = jit->lit32_start;
jit->lit64 = jit->lit64_start;
jit->prg = 0;
+ jit->excnt = 0;
- bpf_jit_prologue(jit, fp->aux->stack_depth);
+ bpf_jit_prologue(jit, stack_depth);
if (bpf_set_addr(jit, 0) < 0)
return -1;
for (i = 0; i < fp->len; i += insn_count) {
- insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
+ insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth);
if (insn_count < 0)
return -1;
/* Next instruction address */
if (bpf_set_addr(jit, i + insn_count) < 0)
return -1;
}
- bpf_jit_epilogue(jit, fp->aux->stack_depth);
+ bpf_jit_epilogue(jit, stack_depth);
lit32_size = jit->lit32 - jit->lit32_start;
lit64_size = jit->lit64 - jit->lit64_start;
@@ -1550,6 +1711,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
jit->lit64_start = ALIGN(jit->lit64_start, 8);
jit->size = jit->lit64_start + lit64_size;
jit->size_prg = jit->prg;
+
+ if (WARN_ON_ONCE(fp->aux->extable &&
+ jit->excnt != fp->aux->num_exentries))
+ /* Verifier bug - too many entries. */
+ return -1;
+
return 0;
}
@@ -1564,11 +1731,35 @@ struct s390_jit_data {
int pass;
};
+static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit,
+ struct bpf_prog *fp)
+{
+ struct bpf_binary_header *header;
+ u32 extable_size;
+ u32 code_size;
+
+ /* We need two entries per insn. */
+ fp->aux->num_exentries *= 2;
+
+ code_size = roundup(jit->size,
+ __alignof__(struct exception_table_entry));
+ extable_size = fp->aux->num_exentries *
+ sizeof(struct exception_table_entry);
+ header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf,
+ 8, jit_fill_hole);
+ if (!header)
+ return NULL;
+ fp->aux->extable = (struct exception_table_entry *)
+ (jit->prg_buf + code_size);
+ return header;
+}
+
/*
* Compile eBPF program "fp"
*/
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
+ u32 stack_depth = round_up(fp->aux->stack_depth, 8);
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
struct s390_jit_data *jit_data;
@@ -1613,15 +1804,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL) {
fp = orig_fp;
- goto out;
+ goto free_addrs;
}
/*
* Three initial passes:
* - 1/2: Determine clobbered registers
- * - 3: Calculate program size and addrs arrray
+ * - 3: Calculate program size and addrs array
*/
for (pass = 1; pass <= 3; pass++) {
- if (bpf_jit_prog(&jit, fp, extra_pass)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
fp = orig_fp;
goto free_addrs;
}
@@ -1629,13 +1820,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
/*
* Final pass: Allocate and generate program
*/
- header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole);
+ header = bpf_jit_alloc(&jit, fp);
if (!header) {
fp = orig_fp;
goto free_addrs;
}
skip_init_ctx:
- if (bpf_jit_prog(&jit, fp, extra_pass)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
deleted file mode 100644
index 66c2dff74895..000000000000
--- a/arch/s390/numa/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-y += numa.o
-obj-y += toptree.o
-obj-$(CONFIG_NUMA_EMU) += mode_emu.o
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
deleted file mode 100644
index 72d742bb2d17..000000000000
--- a/arch/s390/numa/mode_emu.c
+++ /dev/null
@@ -1,577 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
- * without using real topology information about the physical memory of the
- * machine.
- *
- * It distributes the available CPUs to nodes while respecting the original
- * machine topology information. This is done by trying to avoid to separate
- * CPUs which reside on the same book or even on the same MC.
- *
- * Because the current Linux scheduler code requires a stable cpu to node
- * mapping, cores are pinned to nodes when the first CPU thread is set online.
- *
- * Copyright IBM Corp. 2015
- */
-
-#define KMSG_COMPONENT "numa_emu"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/cpumask.h>
-#include <linux/memblock.h>
-#include <linux/node.h>
-#include <linux/memory.h>
-#include <linux/slab.h>
-#include <asm/smp.h>
-#include <asm/topology.h>
-#include "numa_mode.h"
-#include "toptree.h"
-
-/* Distances between the different system components */
-#define DIST_EMPTY 0
-#define DIST_CORE 1
-#define DIST_MC 2
-#define DIST_BOOK 3
-#define DIST_DRAWER 4
-#define DIST_MAX 5
-
-/* Node distance reported to common code */
-#define EMU_NODE_DIST 10
-
-/* Node ID for free (not yet pinned) cores */
-#define NODE_ID_FREE -1
-
-/* Different levels of toptree */
-enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY};
-
-/* The two toptree IDs */
-enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
-
-/* Number of NUMA nodes */
-static int emu_nodes = 1;
-/* NUMA stripe size */
-static unsigned long emu_size;
-
-/*
- * Node to core pinning information updates are protected by
- * "sched_domains_mutex".
- */
-static struct {
- s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */
- int total; /* Total number of pinned cores */
- int per_node_target; /* Cores per node without extra cores */
- int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */
-} *emu_cores;
-
-/*
- * Pin a core to a node
- */
-static void pin_core_to_node(int core_id, int node_id)
-{
- if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
- emu_cores->per_node[node_id]++;
- emu_cores->to_node_id[core_id] = node_id;
- emu_cores->total++;
- } else {
- WARN_ON(emu_cores->to_node_id[core_id] != node_id);
- }
-}
-
-/*
- * Number of pinned cores of a node
- */
-static int cores_pinned(struct toptree *node)
-{
- return emu_cores->per_node[node->id];
-}
-
-/*
- * ID of the node where the core is pinned (or NODE_ID_FREE)
- */
-static int core_pinned_to_node_id(struct toptree *core)
-{
- return emu_cores->to_node_id[core->id];
-}
-
-/*
- * Number of cores in the tree that are not yet pinned
- */
-static int cores_free(struct toptree *tree)
-{
- struct toptree *core;
- int count = 0;
-
- toptree_for_each(core, tree, CORE) {
- if (core_pinned_to_node_id(core) == NODE_ID_FREE)
- count++;
- }
- return count;
-}
-
-/*
- * Return node of core
- */
-static struct toptree *core_node(struct toptree *core)
-{
- return core->parent->parent->parent->parent;
-}
-
-/*
- * Return drawer of core
- */
-static struct toptree *core_drawer(struct toptree *core)
-{
- return core->parent->parent->parent;
-}
-
-/*
- * Return book of core
- */
-static struct toptree *core_book(struct toptree *core)
-{
- return core->parent->parent;
-}
-
-/*
- * Return mc of core
- */
-static struct toptree *core_mc(struct toptree *core)
-{
- return core->parent;
-}
-
-/*
- * Distance between two cores
- */
-static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
-{
- if (core_drawer(core1)->id != core_drawer(core2)->id)
- return DIST_DRAWER;
- if (core_book(core1)->id != core_book(core2)->id)
- return DIST_BOOK;
- if (core_mc(core1)->id != core_mc(core2)->id)
- return DIST_MC;
- /* Same core or sibling on same MC */
- return DIST_CORE;
-}
-
-/*
- * Distance of a node to a core
- */
-static int dist_node_to_core(struct toptree *node, struct toptree *core)
-{
- struct toptree *core_node;
- int dist_min = DIST_MAX;
-
- toptree_for_each(core_node, node, CORE)
- dist_min = min(dist_min, dist_core_to_core(core_node, core));
- return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
-}
-
-/*
- * Unify will delete empty nodes, therefore recreate nodes.
- */
-static void toptree_unify_tree(struct toptree *tree)
-{
- int nid;
-
- toptree_unify(tree);
- for (nid = 0; nid < emu_nodes; nid++)
- toptree_get_child(tree, nid);
-}
-
-/*
- * Find the best/nearest node for a given core and ensure that no node
- * gets more than "emu_cores->per_node_target + extra" cores.
- */
-static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
- int extra)
-{
- struct toptree *node, *node_best = NULL;
- int dist_cur, dist_best, cores_target;
-
- cores_target = emu_cores->per_node_target + extra;
- dist_best = DIST_MAX;
- node_best = NULL;
- toptree_for_each(node, numa, NODE) {
- /* Already pinned cores must use their nodes */
- if (core_pinned_to_node_id(core) == node->id) {
- node_best = node;
- break;
- }
- /* Skip nodes that already have enough cores */
- if (cores_pinned(node) >= cores_target)
- continue;
- dist_cur = dist_node_to_core(node, core);
- if (dist_cur < dist_best) {
- dist_best = dist_cur;
- node_best = node;
- }
- }
- return node_best;
-}
-
-/*
- * Find the best node for each core with respect to "extra" core count
- */
-static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
- int extra)
-{
- struct toptree *node, *core, *tmp;
-
- toptree_for_each_safe(core, tmp, phys, CORE) {
- node = node_for_core(numa, core, extra);
- if (!node)
- return;
- toptree_move(core, node);
- pin_core_to_node(core->id, node->id);
- }
-}
-
-/*
- * Move structures of given level to specified NUMA node
- */
-static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
- enum toptree_level level, bool perfect)
-{
- int cores_free, cores_target = emu_cores->per_node_target;
- struct toptree *cur, *tmp;
-
- toptree_for_each_safe(cur, tmp, phys, level) {
- cores_free = cores_target - toptree_count(node, CORE);
- if (perfect) {
- if (cores_free == toptree_count(cur, CORE))
- toptree_move(cur, node);
- } else {
- if (cores_free >= toptree_count(cur, CORE))
- toptree_move(cur, node);
- }
- }
-}
-
-/*
- * Move structures of a given level to NUMA nodes. If "perfect" is specified
- * move only perfectly fitting structures. Otherwise move also smaller
- * than needed structures.
- */
-static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
- enum toptree_level level, bool perfect)
-{
- struct toptree *node;
-
- toptree_for_each(node, numa, NODE)
- move_level_to_numa_node(node, phys, level, perfect);
-}
-
-/*
- * For the first run try to move the big structures
- */
-static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
-{
- struct toptree *core;
-
- /* Always try to move perfectly fitting structures first */
- move_level_to_numa(numa, phys, DRAWER, true);
- move_level_to_numa(numa, phys, DRAWER, false);
- move_level_to_numa(numa, phys, BOOK, true);
- move_level_to_numa(numa, phys, BOOK, false);
- move_level_to_numa(numa, phys, MC, true);
- move_level_to_numa(numa, phys, MC, false);
- /* Now pin all the moved cores */
- toptree_for_each(core, numa, CORE)
- pin_core_to_node(core->id, core_node(core)->id);
-}
-
-/*
- * Allocate new topology and create required nodes
- */
-static struct toptree *toptree_new(int id, int nodes)
-{
- struct toptree *tree;
- int nid;
-
- tree = toptree_alloc(TOPOLOGY, id);
- if (!tree)
- goto fail;
- for (nid = 0; nid < nodes; nid++) {
- if (!toptree_get_child(tree, nid))
- goto fail;
- }
- return tree;
-fail:
- panic("NUMA emulation could not allocate topology");
-}
-
-/*
- * Allocate and initialize core to node mapping
- */
-static void __ref create_core_to_node_map(void)
-{
- int i;
-
- emu_cores = memblock_alloc(sizeof(*emu_cores), 8);
- if (!emu_cores)
- panic("%s: Failed to allocate %zu bytes align=0x%x\n",
- __func__, sizeof(*emu_cores), 8);
- for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
- emu_cores->to_node_id[i] = NODE_ID_FREE;
-}
-
-/*
- * Move cores from physical topology into NUMA target topology
- * and try to keep as much of the physical topology as possible.
- */
-static struct toptree *toptree_to_numa(struct toptree *phys)
-{
- static int first = 1;
- struct toptree *numa;
- int cores_total;
-
- cores_total = emu_cores->total + cores_free(phys);
- emu_cores->per_node_target = cores_total / emu_nodes;
- numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
- if (first) {
- toptree_to_numa_first(numa, phys);
- first = 0;
- }
- toptree_to_numa_single(numa, phys, 0);
- toptree_to_numa_single(numa, phys, 1);
- toptree_unify_tree(numa);
-
- WARN_ON(cpumask_weight(&phys->mask));
- return numa;
-}
-
-/*
- * Create a toptree out of the physical topology that we got from the hypervisor
- */
-static struct toptree *toptree_from_topology(void)
-{
- struct toptree *phys, *node, *drawer, *book, *mc, *core;
- struct cpu_topology_s390 *top;
- int cpu;
-
- phys = toptree_new(TOPTREE_ID_PHYS, 1);
-
- for_each_cpu(cpu, &cpus_with_topology) {
- top = &cpu_topology[cpu];
- node = toptree_get_child(phys, 0);
- drawer = toptree_get_child(node, top->drawer_id);
- book = toptree_get_child(drawer, top->book_id);
- mc = toptree_get_child(book, top->socket_id);
- core = toptree_get_child(mc, smp_get_base_cpu(cpu));
- if (!drawer || !book || !mc || !core)
- panic("NUMA emulation could not allocate memory");
- cpumask_set_cpu(cpu, &core->mask);
- toptree_update_mask(mc);
- }
- return phys;
-}
-
-/*
- * Add toptree core to topology and create correct CPU masks
- */
-static void topology_add_core(struct toptree *core)
-{
- struct cpu_topology_s390 *top;
- int cpu;
-
- for_each_cpu(cpu, &core->mask) {
- top = &cpu_topology[cpu];
- cpumask_copy(&top->thread_mask, &core->mask);
- cpumask_copy(&top->core_mask, &core_mc(core)->mask);
- cpumask_copy(&top->book_mask, &core_book(core)->mask);
- cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask);
- cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
- top->node_id = core_node(core)->id;
- }
-}
-
-/*
- * Apply toptree to topology and create CPU masks
- */
-static void toptree_to_topology(struct toptree *numa)
-{
- struct toptree *core;
- int i;
-
- /* Clear all node masks */
- for (i = 0; i < MAX_NUMNODES; i++)
- cpumask_clear(&node_to_cpumask_map[i]);
-
- /* Rebuild all masks */
- toptree_for_each(core, numa, CORE)
- topology_add_core(core);
-}
-
-/*
- * Show the node to core mapping
- */
-static void print_node_to_core_map(void)
-{
- int nid, cid;
-
- if (!numa_debug_enabled)
- return;
- printk(KERN_DEBUG "NUMA node to core mapping\n");
- for (nid = 0; nid < emu_nodes; nid++) {
- printk(KERN_DEBUG " node %3d: ", nid);
- for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
- if (emu_cores->to_node_id[cid] == nid)
- printk(KERN_CONT "%d ", cid);
- }
- printk(KERN_CONT "\n");
- }
-}
-
-static void pin_all_possible_cpus(void)
-{
- int core_id, node_id, cpu;
- static int initialized;
-
- if (initialized)
- return;
- print_node_to_core_map();
- node_id = 0;
- for_each_possible_cpu(cpu) {
- core_id = smp_get_base_cpu(cpu);
- if (emu_cores->to_node_id[core_id] != NODE_ID_FREE)
- continue;
- pin_core_to_node(core_id, node_id);
- cpu_topology[cpu].node_id = node_id;
- node_id = (node_id + 1) % emu_nodes;
- }
- print_node_to_core_map();
- initialized = 1;
-}
-
-/*
- * Transfer physical topology into a NUMA topology and modify CPU masks
- * according to the NUMA topology.
- *
- * Must be called with "sched_domains_mutex" lock held.
- */
-static void emu_update_cpu_topology(void)
-{
- struct toptree *phys, *numa;
-
- if (emu_cores == NULL)
- create_core_to_node_map();
- phys = toptree_from_topology();
- numa = toptree_to_numa(phys);
- toptree_free(phys);
- toptree_to_topology(numa);
- toptree_free(numa);
- pin_all_possible_cpus();
-}
-
-/*
- * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
- * alignment (needed for memory hotplug).
- */
-static unsigned long emu_setup_size_adjust(unsigned long size)
-{
- unsigned long size_new;
-
- size = size ? : CONFIG_EMU_SIZE;
- size_new = roundup(size, memory_block_size_bytes());
- if (size_new == size)
- return size;
- pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
- size >> 20, size_new >> 20);
- return size_new;
-}
-
-/*
- * If we have not enough memory for the specified nodes, reduce the node count.
- */
-static int emu_setup_nodes_adjust(int nodes)
-{
- int nodes_max;
-
- nodes_max = memblock.memory.total_size / emu_size;
- nodes_max = max(nodes_max, 1);
- if (nodes_max >= nodes)
- return nodes;
- pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
- return nodes_max;
-}
-
-/*
- * Early emu setup
- */
-static void emu_setup(void)
-{
- int nid;
-
- emu_size = emu_setup_size_adjust(emu_size);
- emu_nodes = emu_setup_nodes_adjust(emu_nodes);
- for (nid = 0; nid < emu_nodes; nid++)
- node_set(nid, node_possible_map);
- pr_info("Creating %d nodes with memory stripe size %ld MB\n",
- emu_nodes, emu_size >> 20);
-}
-
-/*
- * Return node id for given page number
- */
-static int emu_pfn_to_nid(unsigned long pfn)
-{
- return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
-}
-
-/*
- * Return stripe size
- */
-static unsigned long emu_align(void)
-{
- return emu_size;
-}
-
-/*
- * Return distance between two nodes
- */
-static int emu_distance(int node1, int node2)
-{
- return (node1 != node2) * EMU_NODE_DIST;
-}
-
-/*
- * Define callbacks for generic s390 NUMA infrastructure
- */
-const struct numa_mode numa_mode_emu = {
- .name = "emu",
- .setup = emu_setup,
- .update_cpu_topology = emu_update_cpu_topology,
- .__pfn_to_nid = emu_pfn_to_nid,
- .align = emu_align,
- .distance = emu_distance,
-};
-
-/*
- * Kernel parameter: emu_nodes=<n>
- */
-static int __init early_parse_emu_nodes(char *p)
-{
- int count;
-
- if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0)
- return 0;
- emu_nodes = min(count, MAX_NUMNODES);
- return 0;
-}
-early_param("emu_nodes", early_parse_emu_nodes);
-
-/*
- * Kernel parameter: emu_size=[<n>[k|M|G|T]]
- */
-static int __init early_parse_emu_size(char *p)
-{
- if (p)
- emu_size = memparse(p, NULL);
- return 0;
-}
-early_param("emu_size", early_parse_emu_size);
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
deleted file mode 100644
index d2910fa834c8..000000000000
--- a/arch/s390/numa/numa.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * Implement NUMA core code.
- *
- * Copyright IBM Corp. 2015
- */
-
-#define KMSG_COMPONENT "numa"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/cpumask.h>
-#include <linux/memblock.h>
-#include <linux/slab.h>
-#include <linux/node.h>
-
-#include <asm/numa.h>
-#include "numa_mode.h"
-
-pg_data_t *node_data[MAX_NUMNODES];
-EXPORT_SYMBOL(node_data);
-
-cpumask_t node_to_cpumask_map[MAX_NUMNODES];
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-static void plain_setup(void)
-{
- node_set(0, node_possible_map);
-}
-
-const struct numa_mode numa_mode_plain = {
- .name = "plain",
- .setup = plain_setup,
-};
-
-static const struct numa_mode *mode = &numa_mode_plain;
-
-int numa_pfn_to_nid(unsigned long pfn)
-{
- return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0;
-}
-
-void numa_update_cpu_topology(void)
-{
- if (mode->update_cpu_topology)
- mode->update_cpu_topology();
-}
-
-int __node_distance(int a, int b)
-{
- return mode->distance ? mode->distance(a, b) : 0;
-}
-EXPORT_SYMBOL(__node_distance);
-
-int numa_debug_enabled;
-
-/*
- * numa_setup_memory() - Assign bootmem to nodes
- *
- * The memory is first added to memblock without any respect to nodes.
- * This is fixed before remaining memblock memory is handed over to the
- * buddy allocator.
- * An important side effect is that large bootmem allocations might easily
- * cross node boundaries, which can be needed for large allocations with
- * smaller memory stripes in each node (i.e. when using NUMA emulation).
- *
- * Memory defines nodes:
- * Therefore this routine also sets the nodes online with memory.
- */
-static void __init numa_setup_memory(void)
-{
- unsigned long cur_base, align, end_of_dram;
- int nid = 0;
-
- end_of_dram = memblock_end_of_DRAM();
- align = mode->align ? mode->align() : ULONG_MAX;
-
- /*
- * Step through all available memory and assign it to the nodes
- * indicated by the mode implementation.
- * All nodes which are seen here will be set online.
- */
- cur_base = 0;
- do {
- nid = numa_pfn_to_nid(PFN_DOWN(cur_base));
- node_set_online(nid);
- memblock_set_node(cur_base, align, &memblock.memory, nid);
- cur_base += align;
- } while (cur_base < end_of_dram);
-
- /* Allocate and fill out node_data */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
- if (!NODE_DATA(nid))
- panic("%s: Failed to allocate %zu bytes align=0x%x\n",
- __func__, sizeof(pg_data_t), 8);
- }
-
- for_each_online_node(nid) {
- unsigned long start_pfn, end_pfn;
- unsigned long t_start, t_end;
- int i;
-
- start_pfn = ULONG_MAX;
- end_pfn = 0;
- for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) {
- if (t_start < start_pfn)
- start_pfn = t_start;
- if (t_end > end_pfn)
- end_pfn = t_end;
- }
- NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
- NODE_DATA(nid)->node_id = nid;
- }
-}
-
-/*
- * numa_setup() - Earliest initialization
- *
- * Assign the mode and call the mode's setup routine.
- */
-void __init numa_setup(void)
-{
- pr_info("NUMA mode: %s\n", mode->name);
- nodes_clear(node_possible_map);
- /* Initially attach all possible CPUs to node 0. */
- cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask);
- if (mode->setup)
- mode->setup();
- numa_setup_memory();
- memblock_dump_all();
-}
-
-/*
- * numa_init_late() - Initialization initcall
- *
- * Register NUMA nodes.
- */
-static int __init numa_init_late(void)
-{
- int nid;
-
- for_each_online_node(nid)
- register_one_node(nid);
- return 0;
-}
-arch_initcall(numa_init_late);
-
-static int __init parse_debug(char *parm)
-{
- numa_debug_enabled = 1;
- return 0;
-}
-early_param("numa_debug", parse_debug);
-
-static int __init parse_numa(char *parm)
-{
- if (!parm)
- return 1;
- if (strcmp(parm, numa_mode_plain.name) == 0)
- mode = &numa_mode_plain;
-#ifdef CONFIG_NUMA_EMU
- if (strcmp(parm, numa_mode_emu.name) == 0)
- mode = &numa_mode_emu;
-#endif
- return 0;
-}
-early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
deleted file mode 100644
index dfd3e2784081..000000000000
--- a/arch/s390/numa/numa_mode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * NUMA support for s390
- *
- * Define declarations used for communication between NUMA mode
- * implementations and NUMA core functionality.
- *
- * Copyright IBM Corp. 2015
- */
-#ifndef __S390_NUMA_MODE_H
-#define __S390_NUMA_MODE_H
-
-struct numa_mode {
- char *name; /* Name of mode */
- void (*setup)(void); /* Initizalize mode */
- void (*update_cpu_topology)(void); /* Called by topology code */
- int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */
- unsigned long (*align)(void); /* Minimum node alignment */
- int (*distance)(int a, int b); /* Distance between two nodes */
-};
-
-extern const struct numa_mode numa_mode_plain;
-extern const struct numa_mode numa_mode_emu;
-
-#endif /* __S390_NUMA_MODE_H */
diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c
deleted file mode 100644
index 71a608cd4f61..000000000000
--- a/arch/s390/numa/toptree.c
+++ /dev/null
@@ -1,351 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * A tree structure used for machine topology mangling
- *
- * Copyright IBM Corp. 2015
- */
-
-#include <linux/kernel.h>
-#include <linux/memblock.h>
-#include <linux/cpumask.h>
-#include <linux/list.h>
-#include <linux/list_sort.h>
-#include <linux/slab.h>
-#include <asm/numa.h>
-
-#include "toptree.h"
-
-/**
- * toptree_alloc - Allocate and initialize a new tree node.
- * @level: The node's vertical level; level 0 contains the leaves.
- * @id: ID number, explicitly not unique beyond scope of node's siblings
- *
- * Allocate a new tree node and initialize it.
- *
- * RETURNS:
- * Pointer to the new tree node or NULL on error
- */
-struct toptree __ref *toptree_alloc(int level, int id)
-{
- struct toptree *res;
-
- if (slab_is_available())
- res = kzalloc(sizeof(*res), GFP_KERNEL);
- else
- res = memblock_alloc(sizeof(*res), 8);
- if (!res)
- return res;
-
- INIT_LIST_HEAD(&res->children);
- INIT_LIST_HEAD(&res->sibling);
- cpumask_clear(&res->mask);
- res->level = level;
- res->id = id;
- return res;
-}
-
-/**
- * toptree_remove - Remove a tree node from a tree
- * @cand: Pointer to the node to remove
- *
- * The node is detached from its parent node. The parent node's
- * masks will be updated to reflect the loss of the child.
- */
-static void toptree_remove(struct toptree *cand)
-{
- struct toptree *oldparent;
-
- list_del_init(&cand->sibling);
- oldparent = cand->parent;
- cand->parent = NULL;
- toptree_update_mask(oldparent);
-}
-
-/**
- * toptree_free - discard a tree node
- * @cand: Pointer to the tree node to discard
- *
- * Checks if @cand is attached to a parent node. Detaches it
- * cleanly using toptree_remove. Possible children are freed
- * recursively. In the end @cand itself is freed.
- */
-void __ref toptree_free(struct toptree *cand)
-{
- struct toptree *child, *tmp;
-
- if (cand->parent)
- toptree_remove(cand);
- toptree_for_each_child_safe(child, tmp, cand)
- toptree_free(child);
- if (slab_is_available())
- kfree(cand);
- else
- memblock_free_early((unsigned long)cand, sizeof(*cand));
-}
-
-/**
- * toptree_update_mask - Update node bitmasks
- * @cand: Pointer to a tree node
- *
- * The node's cpumask will be updated by combining all children's
- * masks. Then toptree_update_mask is called recursively for the
- * parent if applicable.
- *
- * NOTE:
- * This must not be called on leaves. If called on a leaf, its
- * CPU mask is cleared and lost.
- */
-void toptree_update_mask(struct toptree *cand)
-{
- struct toptree *child;
-
- cpumask_clear(&cand->mask);
- list_for_each_entry(child, &cand->children, sibling)
- cpumask_or(&cand->mask, &cand->mask, &child->mask);
- if (cand->parent)
- toptree_update_mask(cand->parent);
-}
-
-/**
- * toptree_insert - Insert a tree node into tree
- * @cand: Pointer to the node to insert
- * @target: Pointer to the node to which @cand will added as a child
- *
- * Insert a tree node into a tree. Masks will be updated automatically.
- *
- * RETURNS:
- * 0 on success, -1 if NULL is passed as argument or the node levels
- * don't fit.
- */
-static int toptree_insert(struct toptree *cand, struct toptree *target)
-{
- if (!cand || !target)
- return -1;
- if (target->level != (cand->level + 1))
- return -1;
- list_add_tail(&cand->sibling, &target->children);
- cand->parent = target;
- toptree_update_mask(target);
- return 0;
-}
-
-/**
- * toptree_move_children - Move all child nodes of a node to a new place
- * @cand: Pointer to the node whose children are to be moved
- * @target: Pointer to the node to which @cand's children will be attached
- *
- * Take all child nodes of @cand and move them using toptree_move.
- */
-static void toptree_move_children(struct toptree *cand, struct toptree *target)
-{
- struct toptree *child, *tmp;
-
- toptree_for_each_child_safe(child, tmp, cand)
- toptree_move(child, target);
-}
-
-/**
- * toptree_unify - Merge children with same ID
- * @cand: Pointer to node whose direct children should be made unique
- *
- * When mangling the tree it is possible that a node has two or more children
- * which have the same ID. This routine merges these children into one and
- * moves all children of the merged nodes into the unified node.
- */
-void toptree_unify(struct toptree *cand)
-{
- struct toptree *child, *tmp, *cand_copy;
-
- /* Threads cannot be split, cores are not split */
- if (cand->level < 2)
- return;
-
- cand_copy = toptree_alloc(cand->level, 0);
- toptree_for_each_child_safe(child, tmp, cand) {
- struct toptree *tmpchild;
-
- if (!cpumask_empty(&child->mask)) {
- tmpchild = toptree_get_child(cand_copy, child->id);
- toptree_move_children(child, tmpchild);
- }
- toptree_free(child);
- }
- toptree_move_children(cand_copy, cand);
- toptree_free(cand_copy);
-
- toptree_for_each_child(child, cand)
- toptree_unify(child);
-}
-
-/**
- * toptree_move - Move a node to another context
- * @cand: Pointer to the node to move
- * @target: Pointer to the node where @cand should go
- *
- * In the easiest case @cand is exactly on the level below @target
- * and will be immediately moved to the target.
- *
- * If @target's level is not the direct parent level of @cand,
- * nodes for the missing levels are created and put between
- * @cand and @target. The "stacking" nodes' IDs are taken from
- * @cand's parents.
- *
- * After this it is likely to have redundant nodes in the tree
- * which are addressed by means of toptree_unify.
- */
-void toptree_move(struct toptree *cand, struct toptree *target)
-{
- struct toptree *stack_target, *real_insert_point, *ptr, *tmp;
-
- if (cand->level + 1 == target->level) {
- toptree_remove(cand);
- toptree_insert(cand, target);
- return;
- }
-
- real_insert_point = NULL;
- ptr = cand;
- stack_target = NULL;
-
- do {
- tmp = stack_target;
- stack_target = toptree_alloc(ptr->level + 1,
- ptr->parent->id);
- toptree_insert(tmp, stack_target);
- if (!real_insert_point)
- real_insert_point = stack_target;
- ptr = ptr->parent;
- } while (stack_target->level < (target->level - 1));
-
- toptree_remove(cand);
- toptree_insert(cand, real_insert_point);
- toptree_insert(stack_target, target);
-}
-
-/**
- * toptree_get_child - Access a tree node's child by its ID
- * @cand: Pointer to tree node whose child is to access
- * @id: The desired child's ID
- *
- * @cand's children are searched for a child with matching ID.
- * If no match can be found, a new child with the desired ID
- * is created and returned.
- */
-struct toptree *toptree_get_child(struct toptree *cand, int id)
-{
- struct toptree *child;
-
- toptree_for_each_child(child, cand)
- if (child->id == id)
- return child;
- child = toptree_alloc(cand->level-1, id);
- toptree_insert(child, cand);
- return child;
-}
-
-/**
- * toptree_first - Find the first descendant on specified level
- * @context: Pointer to tree node whose descendants are to be used
- * @level: The level of interest
- *
- * RETURNS:
- * @context's first descendant on the specified level, or NULL
- * if there is no matching descendant
- */
-struct toptree *toptree_first(struct toptree *context, int level)
-{
- struct toptree *child, *tmp;
-
- if (context->level == level)
- return context;
-
- if (!list_empty(&context->children)) {
- list_for_each_entry(child, &context->children, sibling) {
- tmp = toptree_first(child, level);
- if (tmp)
- return tmp;
- }
- }
- return NULL;
-}
-
-/**
- * toptree_next_sibling - Return next sibling
- * @cur: Pointer to a tree node
- *
- * RETURNS:
- * If @cur has a parent and is not the last in the parent's children list,
- * the next sibling is returned. Or NULL when there are no siblings left.
- */
-static struct toptree *toptree_next_sibling(struct toptree *cur)
-{
- if (cur->parent == NULL)
- return NULL;
-
- if (cur == list_last_entry(&cur->parent->children,
- struct toptree, sibling))
- return NULL;
- return (struct toptree *) list_next_entry(cur, sibling);
-}
-
-/**
- * toptree_next - Tree traversal function
- * @cur: Pointer to current element
- * @context: Pointer to the root node of the tree or subtree to
- * be traversed.
- * @level: The level of interest.
- *
- * RETURNS:
- * Pointer to the next node on level @level
- * or NULL when there is no next node.
- */
-struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
- int level)
-{
- struct toptree *cur_context, *tmp;
-
- if (!cur)
- return NULL;
-
- if (context->level == level)
- return NULL;
-
- tmp = toptree_next_sibling(cur);
- if (tmp != NULL)
- return tmp;
-
- cur_context = cur;
- while (cur_context->level < context->level - 1) {
- /* Step up */
- cur_context = cur_context->parent;
- /* Step aside */
- tmp = toptree_next_sibling(cur_context);
- if (tmp != NULL) {
- /* Step down */
- tmp = toptree_first(tmp, level);
- if (tmp != NULL)
- return tmp;
- }
- }
- return NULL;
-}
-
-/**
- * toptree_count - Count descendants on specified level
- * @context: Pointer to node whose descendants are to be considered
- * @level: Only descendants on the specified level will be counted
- *
- * RETURNS:
- * Number of descendants on the specified level
- */
-int toptree_count(struct toptree *context, int level)
-{
- struct toptree *cur;
- int cnt = 0;
-
- toptree_for_each(cur, context, level)
- cnt++;
- return cnt;
-}
diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h
deleted file mode 100644
index 5246371ec713..000000000000
--- a/arch/s390/numa/toptree.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * NUMA support for s390
- *
- * A tree structure used for machine topology mangling
- *
- * Copyright IBM Corp. 2015
- */
-#ifndef S390_TOPTREE_H
-#define S390_TOPTREE_H
-
-#include <linux/cpumask.h>
-#include <linux/list.h>
-
-struct toptree {
- int level;
- int id;
- cpumask_t mask;
- struct toptree *parent;
- struct list_head sibling;
- struct list_head children;
-};
-
-struct toptree *toptree_alloc(int level, int id);
-void toptree_free(struct toptree *cand);
-void toptree_update_mask(struct toptree *cand);
-void toptree_unify(struct toptree *cand);
-struct toptree *toptree_get_child(struct toptree *cand, int id);
-void toptree_move(struct toptree *cand, struct toptree *target);
-int toptree_count(struct toptree *context, int level);
-
-struct toptree *toptree_first(struct toptree *context, int level);
-struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
- int level);
-
-#define toptree_for_each_child(child, ptree) \
- list_for_each_entry(child, &ptree->children, sibling)
-
-#define toptree_for_each_child_safe(child, ptmp, ptree) \
- list_for_each_entry_safe(child, ptmp, &ptree->children, sibling)
-
-#define toptree_is_last(ptree) \
- ((ptree->parent == NULL) || \
- (ptree->parent->children.prev == &ptree->sibling))
-
-#define toptree_for_each(ptree, cont, ttype) \
- for (ptree = toptree_first(cont, ttype); \
- ptree != NULL; \
- ptree = toptree_next(ptree, cont, ttype))
-
-#define toptree_for_each_safe(ptree, tmp, cont, ttype) \
- for (ptree = toptree_first(cont, ttype), \
- tmp = toptree_next(ptree, cont, ttype); \
- ptree != NULL; \
- ptree = tmp, \
- tmp = toptree_next(ptree, cont, ttype))
-
-#define toptree_for_each_sibling(ptree, start) \
- toptree_for_each(ptree, start->parent, start->level)
-
-#endif /* S390_TOPTREE_H */
diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
deleted file mode 100644
index 36261f9d360b..000000000000
--- a/arch/s390/oprofile/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
- oprof.o cpu_buffer.o buffer_sync.o \
- event_buffer.o oprofile_files.o \
- oprofilefs.o oprofile_stats.o \
- timer_int.o )
-
-oprofile-y := $(DRIVER_OBJS) init.o
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
deleted file mode 100644
index 7441857df51b..000000000000
--- a/arch/s390/oprofile/init.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * S390 Version
- * Copyright IBM Corp. 2002, 2011
- * Author(s): Thomas Spatzier (tspat@de.ibm.com)
- * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com)
- * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com)
- * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
- *
- * @remark Copyright 2002-2011 OProfile authors
- */
-
-#include <linux/oprofile.h>
-#include <linux/init.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
-
-static void s390_backtrace(struct pt_regs *regs, unsigned int depth)
-{
- struct unwind_state state;
-
- unwind_for_each_frame(&state, current, regs, 0) {
- if (depth-- == 0)
- break;
- oprofile_add_trace(state.ip);
- }
-}
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
- ops->backtrace = s390_backtrace;
- return 0;
-}
-
-void oprofile_arch_exit(void)
-{
-}
diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile
index 748626a33028..5ae31ca9dd44 100644
--- a/arch/s390/pci/Makefile
+++ b/arch/s390/pci/Makefile
@@ -4,4 +4,6 @@
#
obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \
- pci_event.o pci_debug.o pci_insn.o pci_mmio.o
+ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \
+ pci_bus.o pci_kvm_hook.o
+obj-$(CONFIG_PCI_IOV) += pci_iov.o
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 60716d18ce5a..73cdc5539384 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -36,17 +36,22 @@
#include <asm/pci_clp.h>
#include <asm/pci_dma.h>
+#include "pci_bus.h"
+#include "pci_iov.h"
+
/* list of all detected zpci devices */
static LIST_HEAD(zpci_list);
static DEFINE_SPINLOCK(zpci_list_lock);
-static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES);
+static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE);
static DEFINE_SPINLOCK(zpci_domain_lock);
#define ZPCI_IOMAP_ENTRIES \
min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \
ZPCI_IOMAP_MAX_ENTRIES)
+unsigned int s390_pci_no_rid;
+
static DEFINE_SPINLOCK(zpci_iomap_lock);
static unsigned long *zpci_iomap_bitmap;
struct zpci_iomap_entry *zpci_iomap_start;
@@ -56,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio);
static struct kmem_cache *zdev_fmb_cache;
+/* AEN structures that must be preserved over KVM module re-insertion */
+union zpci_sic_iib *zpci_aipb;
+EXPORT_SYMBOL_GPL(zpci_aipb);
+struct airq_iv *zpci_aif_sbv;
+EXPORT_SYMBOL_GPL(zpci_aif_sbv);
+
struct zpci_dev *get_zdev_by_fid(u32 fid)
{
struct zpci_dev *tmp, *zdev = NULL;
@@ -64,6 +75,7 @@ struct zpci_dev *get_zdev_by_fid(u32 fid)
list_for_each_entry(tmp, &zpci_list, entry) {
if (tmp->fid == fid) {
zdev = tmp;
+ zpci_zdev_get(zdev);
break;
}
}
@@ -87,17 +99,12 @@ void zpci_remove_reserved_devices(void)
spin_unlock(&zpci_list_lock);
list_for_each_entry_safe(zdev, tmp, &remove, entry)
- zpci_remove_device(zdev);
-}
-
-static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus)
-{
- return (bus && bus->sysdata) ? (struct zpci_dev *) bus->sysdata : NULL;
+ zpci_device_reserved(zdev);
}
int pci_domain_nr(struct pci_bus *bus)
{
- return ((struct zpci_dev *) bus->sysdata)->domain;
+ return ((struct zpci_bus *) bus->sysdata)->domain_nr;
}
EXPORT_SYMBOL_GPL(pci_domain_nr);
@@ -113,14 +120,19 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT);
struct zpci_fib fib = {0};
- u8 status;
+ u8 cc, status;
WARN_ON_ONCE(iota & 0x3fff);
fib.pba = base;
fib.pal = limit;
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
- return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
+ fib.gd = zdev->gisa;
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc)
+ zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
+ return cc;
}
+EXPORT_SYMBOL_GPL(zpci_register_ioat);
/* Modify PCI: Unregister I/O address translation parameters */
int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
@@ -129,10 +141,12 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
- if (cc == 3) /* Function already gone. */
- cc = 0;
- return cc ? -EIO : 0;
+ if (cc)
+ zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
+ return cc;
}
/* Modify PCI: Set PCI function measurement parameters */
@@ -156,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
atomic64_set(&zdev->unmapped_pages, 0);
fib.fmb_addr = virt_to_phys(zdev->fmb);
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc) {
kmem_cache_free(zdev_fmb_cache, zdev->fmb);
@@ -174,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
if (!zdev->fmb)
return -EINVAL;
+ fib.gd = zdev->gisa;
+
/* Function measurement is disabled if fmb address is zero */
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3) /* Function already gone. */
@@ -227,34 +244,58 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
zpci_memcpy_toio(to, from, count);
}
-void __iomem *ioremap(unsigned long ioaddr, unsigned long size)
+static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot)
{
+ unsigned long offset, vaddr;
struct vm_struct *area;
- unsigned long offset;
+ phys_addr_t last_addr;
- if (!size)
+ last_addr = addr + size - 1;
+ if (!size || last_addr < addr)
return NULL;
if (!static_branch_unlikely(&have_mio))
- return (void __iomem *) ioaddr;
+ return (void __iomem *) addr;
- offset = ioaddr & ~PAGE_MASK;
- ioaddr &= PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ addr &= PAGE_MASK;
size = PAGE_ALIGN(size + offset);
area = get_vm_area(size, VM_IOREMAP);
if (!area)
return NULL;
- if (ioremap_page_range((unsigned long) area->addr,
- (unsigned long) area->addr + size,
- ioaddr, PAGE_KERNEL)) {
- vunmap(area->addr);
+ vaddr = (unsigned long) area->addr;
+ if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) {
+ free_vm_area(area);
return NULL;
}
return (void __iomem *) ((unsigned long) area->addr + offset);
}
+
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+{
+ return __ioremap(addr, size, __pgprot(prot));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+void __iomem *ioremap(phys_addr_t addr, size_t size)
+{
+ return __ioremap(addr, size, PAGE_KERNEL);
+}
EXPORT_SYMBOL(ioremap);
+void __iomem *ioremap_wc(phys_addr_t addr, size_t size)
+{
+ return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL));
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_wt(phys_addr_t addr, size_t size)
+{
+ return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
void iounmap(volatile void __iomem *addr)
{
if (static_branch_likely(&have_mio))
@@ -372,29 +413,17 @@ EXPORT_SYMBOL(pci_iounmap);
static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
int size, u32 *val)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
- int ret;
+ struct zpci_dev *zdev = zdev_from_bus(bus, devfn);
- if (!zdev || devfn != ZPCI_DEVFN)
- ret = -ENODEV;
- else
- ret = zpci_cfg_load(zdev, where, val, size);
-
- return ret;
+ return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV;
}
static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
int size, u32 val)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
- int ret;
+ struct zpci_dev *zdev = zdev_from_bus(bus, devfn);
- if (!zdev || devfn != ZPCI_DEVFN)
- ret = -ENODEV;
- else
- ret = zpci_cfg_store(zdev, where, val, size);
-
- return ret;
+ return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV;
}
static struct pci_ops pci_root_ops = {
@@ -402,15 +431,6 @@ static struct pci_ops pci_root_ops = {
.write = pci_write,
};
-#ifdef CONFIG_PCI_IOV
-static struct resource iov_res = {
- .name = "PCI IOV res",
- .start = 0,
- .end = -1,
- .flags = IORESOURCE_MEM,
-};
-#endif
-
static void zpci_map_resources(struct pci_dev *pdev)
{
struct zpci_dev *zdev = to_zpci(pdev);
@@ -431,16 +451,7 @@ static void zpci_map_resources(struct pci_dev *pdev)
pdev->resource[i].end = pdev->resource[i].start + len - 1;
}
-#ifdef CONFIG_PCI_IOV
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- int bar = i + PCI_IOV_RESOURCES;
-
- len = pci_resource_len(pdev, bar);
- if (!len)
- continue;
- pdev->resource[bar].parent = &iov_res;
- }
-#endif
+ zpci_iov_map_resources(pdev);
}
static void zpci_unmap_resources(struct pci_dev *pdev)
@@ -484,6 +495,34 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry)
spin_unlock(&zpci_iomap_lock);
}
+static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh)
+{
+ int bar, idx;
+
+ spin_lock(&zpci_iomap_lock);
+ for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+ if (!zdev->bars[bar].size)
+ continue;
+ idx = zdev->bars[bar].map_idx;
+ if (!zpci_iomap_start[idx].count)
+ continue;
+ WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh);
+ }
+ spin_unlock(&zpci_iomap_lock);
+}
+
+void zpci_update_fh(struct zpci_dev *zdev, u32 fh)
+{
+ if (!fh || zdev->fh == fh)
+ return;
+
+ zdev->fh = fh;
+ if (zpci_use_mio(zdev))
+ return;
+ if (zdev->has_resources && zdev_enabled(zdev))
+ zpci_do_update_iomap_fh(zdev, fh);
+}
+
static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
unsigned long size, unsigned long flags)
{
@@ -505,15 +544,15 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
return r;
}
-static int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources)
+int zpci_setup_bus_resources(struct zpci_dev *zdev,
+ struct list_head *resources)
{
unsigned long addr, size, flags;
struct resource *res;
int i, entry;
snprintf(zdev->res_name, sizeof(zdev->res_name),
- "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR);
+ "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR);
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
if (!zdev->bars[i].size)
@@ -544,6 +583,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev,
zdev->bars[i].res = res;
pci_add_resource(resources, res);
}
+ zdev->has_resources = 1;
return 0;
}
@@ -560,13 +600,17 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
release_resource(zdev->bars[i].res);
kfree(zdev->bars[i].res);
}
+ zdev->has_resources = 0;
}
-int pcibios_add_device(struct pci_dev *pdev)
+int pcibios_device_add(struct pci_dev *pdev)
{
+ struct zpci_dev *zdev = to_zpci(pdev);
struct resource *res;
int i;
+ /* The pdev has a reference to the zdev via its bus */
+ zpci_zdev_get(zdev);
if (pdev->is_physfn)
pdev->no_vf_scan = 1;
@@ -586,7 +630,10 @@ int pcibios_add_device(struct pci_dev *pdev)
void pcibios_release_device(struct pci_dev *pdev)
{
+ struct zpci_dev *zdev = to_zpci(pdev);
+
zpci_unmap_resources(pdev);
+ zpci_zdev_put(zdev);
}
int pcibios_enable_device(struct pci_dev *pdev, int mask)
@@ -607,210 +654,348 @@ void pcibios_disable_device(struct pci_dev *pdev)
zpci_debug_exit_device(zdev);
}
-#ifdef CONFIG_HIBERNATE_CALLBACKS
-static int zpci_restore(struct device *dev)
+static int __zpci_register_domain(int domain)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct zpci_dev *zdev = to_zpci(pdev);
- int ret = 0;
-
- if (zdev->state != ZPCI_FN_STATE_ONLINE)
- goto out;
-
- ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES);
- if (ret)
- goto out;
-
- zpci_map_resources(pdev);
- zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- (u64) zdev->dma_table);
-
-out:
- return ret;
+ spin_lock(&zpci_domain_lock);
+ if (test_bit(domain, zpci_domain)) {
+ spin_unlock(&zpci_domain_lock);
+ pr_err("Domain %04x is already assigned\n", domain);
+ return -EEXIST;
+ }
+ set_bit(domain, zpci_domain);
+ spin_unlock(&zpci_domain_lock);
+ return domain;
}
-static int zpci_freeze(struct device *dev)
+static int __zpci_alloc_domain(void)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct zpci_dev *zdev = to_zpci(pdev);
+ int domain;
- if (zdev->state != ZPCI_FN_STATE_ONLINE)
- return 0;
-
- zpci_unregister_ioat(zdev, 0);
- zpci_unmap_resources(pdev);
- return clp_disable_fh(zdev);
+ spin_lock(&zpci_domain_lock);
+ /*
+ * We can always auto allocate domains below ZPCI_NR_DEVICES.
+ * There is either a free domain or we have reached the maximum in
+ * which case we would have bailed earlier.
+ */
+ domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES);
+ set_bit(domain, zpci_domain);
+ spin_unlock(&zpci_domain_lock);
+ return domain;
}
-struct dev_pm_ops pcibios_pm_ops = {
- .thaw_noirq = zpci_restore,
- .freeze_noirq = zpci_freeze,
- .restore_noirq = zpci_restore,
- .poweroff_noirq = zpci_freeze,
-};
-#endif /* CONFIG_HIBERNATE_CALLBACKS */
-
-static int zpci_alloc_domain(struct zpci_dev *zdev)
+int zpci_alloc_domain(int domain)
{
if (zpci_unique_uid) {
- zdev->domain = (u16) zdev->uid;
- if (zdev->domain >= ZPCI_NR_DEVICES)
- return 0;
-
- spin_lock(&zpci_domain_lock);
- if (test_bit(zdev->domain, zpci_domain)) {
- spin_unlock(&zpci_domain_lock);
- pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n",
- zdev->fid, zdev->domain);
- return -EEXIST;
- }
- set_bit(zdev->domain, zpci_domain);
- spin_unlock(&zpci_domain_lock);
- return 0;
+ if (domain)
+ return __zpci_register_domain(domain);
+ pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n");
+ update_uid_checking(false);
}
-
- spin_lock(&zpci_domain_lock);
- zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES);
- if (zdev->domain == ZPCI_NR_DEVICES) {
- spin_unlock(&zpci_domain_lock);
- pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n",
- zdev->fid, ZPCI_NR_DEVICES);
- return -ENOSPC;
- }
- set_bit(zdev->domain, zpci_domain);
- spin_unlock(&zpci_domain_lock);
- return 0;
+ return __zpci_alloc_domain();
}
-static void zpci_free_domain(struct zpci_dev *zdev)
+void zpci_free_domain(int domain)
{
- if (zdev->domain >= ZPCI_NR_DEVICES)
- return;
-
spin_lock(&zpci_domain_lock);
- clear_bit(zdev->domain, zpci_domain);
+ clear_bit(domain, zpci_domain);
spin_unlock(&zpci_domain_lock);
}
-void pcibios_remove_bus(struct pci_bus *bus)
+
+int zpci_enable_device(struct zpci_dev *zdev)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
+ u32 fh = zdev->fh;
+ int rc = 0;
- zpci_exit_slot(zdev);
- zpci_cleanup_bus_resources(zdev);
- zpci_destroy_iommu(zdev);
- zpci_free_domain(zdev);
+ if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES))
+ rc = -EIO;
+ else
+ zpci_update_fh(zdev, fh);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(zpci_enable_device);
- spin_lock(&zpci_list_lock);
- list_del(&zdev->entry);
- spin_unlock(&zpci_list_lock);
+int zpci_disable_device(struct zpci_dev *zdev)
+{
+ u32 fh = zdev->fh;
+ int cc, rc = 0;
- zpci_dbg(3, "rem fid:%x\n", zdev->fid);
- kfree(zdev);
+ cc = clp_disable_fh(zdev, &fh);
+ if (!cc) {
+ zpci_update_fh(zdev, fh);
+ } else if (cc == CLP_RC_SETPCIFN_ALRDY) {
+ pr_info("Disabling PCI function %08x had no effect as it was already disabled\n",
+ zdev->fid);
+ /* Function is already disabled - update handle */
+ rc = clp_refresh_fh(zdev->fid, &fh);
+ if (!rc) {
+ zpci_update_fh(zdev, fh);
+ rc = -EINVAL;
+ }
+ } else {
+ rc = -EIO;
+ }
+ return rc;
}
+EXPORT_SYMBOL_GPL(zpci_disable_device);
-static int zpci_scan_bus(struct zpci_dev *zdev)
+/**
+ * zpci_hot_reset_device - perform a reset of the given zPCI function
+ * @zdev: the slot which should be reset
+ *
+ * Performs a low level reset of the zPCI function. The reset is low level in
+ * the sense that the zPCI function can be reset without detaching it from the
+ * common PCI subsystem. The reset may be performed while under control of
+ * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation
+ * table is reinstated at the end of the reset.
+ *
+ * After the reset the function's internal state is reset to an initial state
+ * equivalent to its state during boot when first probing a driver.
+ * Consequently after reset the PCI function requires re-initialization via the
+ * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors()
+ * and enabling the function via e.g. pci_enable_device_flags(). The caller
+ * must guard against concurrent reset attempts.
+ *
+ * In most cases this function should not be called directly but through
+ * pci_reset_function() or pci_reset_bus() which handle the save/restore and
+ * locking.
+ *
+ * Return: 0 on success and an error value otherwise
+ */
+int zpci_hot_reset_device(struct zpci_dev *zdev)
{
- LIST_HEAD(resources);
- int ret;
+ int rc;
- ret = zpci_setup_bus_resources(zdev, &resources);
- if (ret)
- goto error;
+ zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh);
+ if (zdev_enabled(zdev)) {
+ /* Disables device access, DMAs and IRQs (reset state) */
+ rc = zpci_disable_device(zdev);
+ /*
+ * Due to a z/VM vs LPAR inconsistency in the error state, the
+ * FH may indicate an enabled device while disable reports the
+ * device as already disabled; don't treat that as an error here.
+ */
+ if (rc == -EINVAL)
+ rc = 0;
+ if (rc)
+ return rc;
+ }
- zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops,
- zdev, &resources);
- if (!zdev->bus) {
- ret = -EIO;
- goto error;
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ return rc;
+
+ if (zdev->dma_table)
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table));
+ else
+ rc = zpci_dma_init_device(zdev);
+ if (rc) {
+ zpci_disable_device(zdev);
+ return rc;
}
- zdev->bus->max_bus_speed = zdev->max_bus_speed;
- pci_bus_add_devices(zdev->bus);
- return 0;
-error:
- zpci_cleanup_bus_resources(zdev);
- pci_free_resource_list(&resources);
- return ret;
+ return 0;
}
-int zpci_enable_device(struct zpci_dev *zdev)
+/**
+ * zpci_create_device() - Create a new zpci_dev and add it to the zbus
+ * @fid: Function ID of the device to be created
+ * @fh: Current Function Handle of the device to be created
+ * @state: Initial state after creation either Standby or Configured
+ *
+ * Creates a new zpci device and adds it to its, possibly newly created, zbus
+ * as well as zpci_list.
+ *
+ * Returns: the zdev on success or an error pointer otherwise
+ */
+struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
{
+ struct zpci_dev *zdev;
int rc;
- rc = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES);
+ zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state);
+ zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
+ if (!zdev)
+ return ERR_PTR(-ENOMEM);
+
+ /* FID and Function Handle are the static/dynamic identifiers */
+ zdev->fid = fid;
+ zdev->fh = fh;
+
+ /* Query function properties and update zdev */
+ rc = clp_query_pci_fn(zdev);
if (rc)
- goto out;
+ goto error;
+ zdev->state = state;
- rc = zpci_dma_init_device(zdev);
+ kref_init(&zdev->kref);
+ mutex_init(&zdev->lock);
+ mutex_init(&zdev->kzdev_lock);
+
+ rc = zpci_init_iommu(zdev);
if (rc)
- goto out_dma;
+ goto error;
- zdev->state = ZPCI_FN_STATE_ONLINE;
- return 0;
+ rc = zpci_bus_device_register(zdev, &pci_root_ops);
+ if (rc)
+ goto error_destroy_iommu;
-out_dma:
- clp_disable_fh(zdev);
-out:
- return rc;
+ spin_lock(&zpci_list_lock);
+ list_add_tail(&zdev->entry, &zpci_list);
+ spin_unlock(&zpci_list_lock);
+
+ return zdev;
+
+error_destroy_iommu:
+ zpci_destroy_iommu(zdev);
+error:
+ zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
+ kfree(zdev);
+ return ERR_PTR(rc);
}
-EXPORT_SYMBOL_GPL(zpci_enable_device);
-int zpci_disable_device(struct zpci_dev *zdev)
+bool zpci_is_device_configured(struct zpci_dev *zdev)
{
- zpci_dma_exit_device(zdev);
- return clp_disable_fh(zdev);
+ enum zpci_state state = zdev->state;
+
+ return state != ZPCI_FN_STATE_RESERVED &&
+ state != ZPCI_FN_STATE_STANDBY;
}
-EXPORT_SYMBOL_GPL(zpci_disable_device);
-int zpci_create_device(struct zpci_dev *zdev)
+/**
+ * zpci_scan_configured_device() - Scan a freshly configured zpci_dev
+ * @zdev: The zpci_dev to be configured
+ * @fh: The general function handle supplied by the platform
+ *
+ * Given a device in the configuration state Configured, enables, scans and
+ * adds it to the common code PCI subsystem if possible. If the PCI device is
+ * parked because we can not yet create a PCI bus because we have not seen
+ * function 0, it is ignored but will be scanned once function 0 appears.
+ * If any failure occurs, the zpci_dev is left disabled.
+ *
+ * Return: 0 on success, or an error code otherwise
+ */
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh)
{
int rc;
- rc = zpci_alloc_domain(zdev);
- if (rc)
- goto out;
+ zpci_update_fh(zdev, fh);
+ /* the PCI function will be scanned once function 0 appears */
+ if (!zdev->zbus->bus)
+ return 0;
- rc = zpci_init_iommu(zdev);
- if (rc)
- goto out_free;
+ /* For function 0 on a multi-function bus scan whole bus as we might
+ * have to pick up existing functions waiting for it to allow creating
+ * the PCI bus
+ */
+ if (zdev->devfn == 0 && zdev->zbus->multifunction)
+ rc = zpci_bus_scan_bus(zdev->zbus);
+ else
+ rc = zpci_bus_scan_device(zdev);
- mutex_init(&zdev->lock);
- if (zdev->state == ZPCI_FN_STATE_CONFIGURED) {
- rc = zpci_enable_device(zdev);
+ return rc;
+}
+
+/**
+ * zpci_deconfigure_device() - Deconfigure a zpci_dev
+ * @zdev: The zpci_dev to deconfigure
+ *
+ * Deconfigure a zPCI function that is currently configured and possibly known
+ * to the common code PCI subsystem.
+ * If any failure occurs the device is left as is.
+ *
+ * Return: 0 on success, or an error code otherwise
+ */
+int zpci_deconfigure_device(struct zpci_dev *zdev)
+{
+ int rc;
+
+ if (zdev->zbus->bus)
+ zpci_bus_remove_device(zdev, false);
+
+ if (zdev->dma_table) {
+ rc = zpci_dma_exit_device(zdev);
+ if (rc)
+ return rc;
+ }
+ if (zdev_enabled(zdev)) {
+ rc = zpci_disable_device(zdev);
if (rc)
- goto out_destroy_iommu;
+ return rc;
}
- rc = zpci_scan_bus(zdev);
+
+ rc = sclp_pci_deconfigure(zdev->fid);
+ zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, rc);
if (rc)
- goto out_disable;
+ return rc;
+ zdev->state = ZPCI_FN_STATE_STANDBY;
+
+ return 0;
+}
+/**
+ * zpci_device_reserved() - Mark device as reserved
+ * @zdev: the zpci_dev that was reserved
+ *
+ * Handle the case that a given zPCI function was reserved by another system.
+ * After a call to this function the zpci_dev can not be found via
+ * get_zdev_by_fid() anymore but may still be accessible via existing
+ * references though it will not be functional anymore.
+ */
+void zpci_device_reserved(struct zpci_dev *zdev)
+{
+ if (zdev->has_hp_slot)
+ zpci_exit_slot(zdev);
+ /*
+ * Remove device from zpci_list as it is going away. This also
+ * makes sure we ignore subsequent zPCI events for this device.
+ */
spin_lock(&zpci_list_lock);
- list_add_tail(&zdev->entry, &zpci_list);
+ list_del(&zdev->entry);
spin_unlock(&zpci_list_lock);
+ zdev->state = ZPCI_FN_STATE_RESERVED;
+ zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
+ zpci_zdev_put(zdev);
+}
- zpci_init_slot(zdev);
+void zpci_release_device(struct kref *kref)
+{
+ struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
+ int ret;
- return 0;
+ if (zdev->zbus->bus)
+ zpci_bus_remove_device(zdev, false);
-out_disable:
- if (zdev->state == ZPCI_FN_STATE_ONLINE)
+ if (zdev->dma_table)
+ zpci_dma_exit_device(zdev);
+ if (zdev_enabled(zdev))
zpci_disable_device(zdev);
-out_destroy_iommu:
- zpci_destroy_iommu(zdev);
-out_free:
- zpci_free_domain(zdev);
-out:
- return rc;
-}
-
-void zpci_remove_device(struct zpci_dev *zdev)
-{
- if (!zdev->bus)
- return;
- pci_stop_root_bus(zdev->bus);
- pci_remove_root_bus(zdev->bus);
+ switch (zdev->state) {
+ case ZPCI_FN_STATE_CONFIGURED:
+ ret = sclp_pci_deconfigure(zdev->fid);
+ zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret);
+ fallthrough;
+ case ZPCI_FN_STATE_STANDBY:
+ if (zdev->has_hp_slot)
+ zpci_exit_slot(zdev);
+ spin_lock(&zpci_list_lock);
+ list_del(&zdev->entry);
+ spin_unlock(&zpci_list_lock);
+ zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
+ fallthrough;
+ case ZPCI_FN_STATE_RESERVED:
+ if (zdev->has_resources)
+ zpci_cleanup_bus_resources(zdev);
+ zpci_bus_device_unregister(zdev);
+ zpci_destroy_iommu(zdev);
+ fallthrough;
+ default:
+ break;
+ }
+ zpci_dbg(3, "rem fid:%x\n", zdev->fid);
+ kfree(zdev);
}
int zpci_report_error(struct pci_dev *pdev,
@@ -822,6 +1007,59 @@ int zpci_report_error(struct pci_dev *pdev,
}
EXPORT_SYMBOL(zpci_report_error);
+/**
+ * zpci_clear_error_state() - Clears the zPCI error state of the device
+ * @zdev: The zdev for which the zPCI error state should be reset
+ *
+ * Clear the zPCI error state of the device. If clearing the zPCI error state
+ * fails the device is left in the error state. In this case it may make sense
+ * to call zpci_io_perm_failure() on the associated pdev if it exists.
+ *
+ * Returns: 0 on success, -EIO otherwise
+ */
+int zpci_clear_error_state(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR);
+ struct zpci_fib fib = {0};
+ u8 status;
+ int cc;
+
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc) {
+ zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * zpci_reset_load_store_blocked() - Re-enables L/S from error state
+ * @zdev: The zdev for which to unblock load/store access
+ *
+ * Re-enables load/store access for a PCI function in the error state while
+ * keeping DMA blocked. In this state drivers can poke MMIO space to determine
+ * if error recovery is possible while catching any rogue DMA access from the
+ * device.
+ *
+ * Returns: 0 on success, -EIO otherwise
+ */
+int zpci_reset_load_store_blocked(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK);
+ struct zpci_fib fib = {0};
+ u8 status;
+ int cc;
+
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc) {
+ zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status);
+ return -EIO;
+ }
+
+ return 0;
+}
+
static int zpci_mem_init(void)
{
BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) ||
@@ -842,6 +1080,9 @@ static int zpci_mem_init(void)
if (!zpci_iomap_bitmap)
goto error_iomap_bitmap;
+ if (static_branch_likely(&have_mio))
+ clp_setup_writeback_mio();
+
return 0;
error_iomap_bitmap:
kfree(zpci_iomap_start);
@@ -859,7 +1100,6 @@ static void zpci_mem_exit(void)
}
static unsigned int s390_pci_probe __initdata = 1;
-static unsigned int s390_pci_no_mio __initdata;
unsigned int s390_pci_force_floating __initdata;
static unsigned int s390_pci_initialized;
@@ -870,13 +1110,17 @@ char * __init pcibios_setup(char *str)
return NULL;
}
if (!strcmp(str, "nomio")) {
- s390_pci_no_mio = 1;
+ S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO;
return NULL;
}
if (!strcmp(str, "force_floating")) {
s390_pci_force_floating = 1;
return NULL;
}
+ if (!strcmp(str, "norid")) {
+ s390_pci_no_rid = 1;
+ return NULL;
+ }
return str;
}
@@ -892,10 +1136,12 @@ static int __init pci_base_init(void)
if (!s390_pci_probe)
return 0;
- if (!test_facility(69) || !test_facility(71))
+ if (!test_facility(69) || !test_facility(71)) {
+ pr_info("PCI is not supported because CPU facilities 69 or 71 are not available\n");
return 0;
+ }
- if (test_facility(153) && !s390_pci_no_mio) {
+ if (MACHINE_HAS_PCI_MIO) {
static_branch_enable(&have_mio);
ctl_set_bit(2, 5);
}
@@ -919,6 +1165,7 @@ static int __init pci_base_init(void)
rc = clp_scan_pci_devices();
if (rc)
goto out_find;
+ zpci_bus_scan_busses();
s390_pci_initialized = 1;
return 0;
@@ -935,9 +1182,3 @@ out:
return rc;
}
subsys_initcall_sync(pci_base_init);
-
-void zpci_rescan(void)
-{
- if (zpci_is_enabled())
- clp_rescan_pci_devices_simple(NULL);
-}
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
new file mode 100644
index 000000000000..6a8da1b742ae
--- /dev/null
+++ b/arch/s390/pci/pci_bus.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Pierre Morel <pmorel@linux.ibm.com>
+ *
+ */
+
+#define KMSG_COMPONENT "zpci"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/jump_label.h>
+#include <linux/pci.h>
+#include <linux/printk.h>
+
+#include <asm/pci_clp.h>
+#include <asm/pci_dma.h>
+
+#include "pci_bus.h"
+#include "pci_iov.h"
+
+static LIST_HEAD(zbus_list);
+static DEFINE_MUTEX(zbus_list_lock);
+static int zpci_nb_devices;
+
+/* zpci_bus_prepare_device - Prepare a zPCI function for scanning
+ * @zdev: the zPCI function to be prepared
+ *
+ * The PCI resources for the function are set up and added to its zbus and the
+ * function is enabled. The function must be added to a zbus which must have
+ * a PCI bus created. If an error occurs the zPCI function is not enabled.
+ *
+ * Return: 0 on success, an error code otherwise
+ */
+static int zpci_bus_prepare_device(struct zpci_dev *zdev)
+{
+	struct resource_entry *entry, *tmp;
+	int err;
+
+	/* Bring the function up first; undo the enable if DMA setup fails. */
+	if (!zdev_enabled(zdev)) {
+		err = zpci_enable_device(zdev);
+		if (err)
+			return err;
+
+		err = zpci_dma_init_device(zdev);
+		if (err) {
+			zpci_disable_device(zdev);
+			return err;
+		}
+	}
+
+	/* Resources were already handed to the bus on an earlier call. */
+	if (zdev->has_resources)
+		return 0;
+
+	zpci_setup_bus_resources(zdev, &zdev->zbus->resources);
+	resource_list_for_each_entry_safe(entry, tmp, &zdev->zbus->resources)
+		pci_bus_add_resource(zdev->zbus->bus, entry->res, 0);
+
+	return 0;
+}
+
+/* zpci_bus_scan_device - Scan a single device adding it to the PCI core
+ * @zdev: the zdev to be scanned
+ *
+ * Scans the PCI function making it available to the common PCI code.
+ * The function is prepared with zpci_bus_prepare_device() first; if that
+ * fails its error code is returned unchanged and nothing is scanned.
+ *
+ * Return: 0 on success, an error value otherwise; -ENODEV if
+ * pci_scan_single_device() did not yield a device
+ */
+int zpci_bus_scan_device(struct zpci_dev *zdev)
+{
+ struct pci_dev *pdev;
+ int rc;
+
+ rc = zpci_bus_prepare_device(zdev);
+ if (rc)
+ return rc;
+
+ pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn);
+ if (!pdev)
+ return -ENODEV;
+
+ pci_bus_add_device(pdev); /* note: called before the rescan/remove lock is taken */
+ pci_lock_rescan_remove();
+ pci_bus_add_devices(zdev->zbus->bus);
+ pci_unlock_rescan_remove();
+
+ return 0;
+}
+
+/* zpci_bus_remove_device - Removes the given zdev from the PCI core
+ * @zdev: the zdev to be removed from the PCI core
+ * @set_error: if true the device's error state is set to permanent failure
+ *
+ * Sets a zPCI device to a configured but offline state; the zPCI
+ * device is still accessible through its hotplug slot and the zPCI
+ * API but is removed from the common code PCI bus, making it
+ * no longer available to drivers.
+ */
+void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error)
+{
+	struct zpci_bus *zbus = zdev->zbus;
+	struct pci_dev *pdev;
+
+	/* Without a PCI bus there is nothing registered with the PCI core. */
+	if (!zdev->zbus->bus)
+		return;
+
+	pdev = pci_get_slot(zbus->bus, zdev->devfn);
+	if (!pdev)
+		return;
+
+	if (set_error)
+		pdev->error_state = pci_channel_io_perm_failure;
+
+	/* Virtual functions go through the IOV helper instead of the generic removal. */
+	if (pdev->is_virtfn)
+		zpci_iov_remove_virtfn(pdev, zdev->vfn);
+	else
+		pci_stop_and_remove_bus_device_locked(pdev);
+
+	/* balance pci_get_slot */
+	pci_dev_put(pdev);
+}
+
+/* zpci_bus_scan_bus - Scan all configured zPCI functions on the bus
+ * @zbus: the zbus to be scanned
+ *
+ * Prepares every function on the zbus that is in the configured state and
+ * then scans the bus, making the functions available to the common PCI
+ * code. If a PCI function fails to be prepared an error will be returned
+ * but attempts will still be made for all other functions on the bus, and
+ * the bus is scanned regardless. The individual error code of a failed
+ * function is not propagated; -EIO is reported instead.
+ *
+ * NOTE(review): a previous version of this comment stated that nothing is
+ * scanned without function 0 and that missing slots are created here.
+ * Neither is done by the code below - confirm whether that is intended.
+ *
+ * Return: 0 on success, -EIO if preparing at least one function failed
+ */
+int zpci_bus_scan_bus(struct zpci_bus *zbus)
+{
+ struct zpci_dev *zdev;
+ int devfn, rc, ret = 0;
+
+ for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) {
+ zdev = zbus->function[devfn];
+ if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) {
+ rc = zpci_bus_prepare_device(zdev);
+ if (rc)
+ ret = -EIO;
+ }
+ }
+
+ pci_lock_rescan_remove();
+ pci_scan_child_bus(zbus->bus);
+ pci_bus_add_devices(zbus->bus);
+ pci_unlock_rescan_remove();
+
+ return ret;
+}
+
+/* zpci_bus_scan_busses - Scan all registered busses
+ *
+ * Scan all available zbusses
+ *
+ */
+void zpci_bus_scan_busses(void)
+{
+	struct zpci_bus *cur = NULL;
+
+	/* The list lock is held across the whole walk. */
+	mutex_lock(&zbus_list_lock);
+	list_for_each_entry(cur, &zbus_list, bus_next) {
+		/* Per-bus scan errors are deliberately not collected here. */
+		zpci_bus_scan_bus(cur);
+		cond_resched();
+	}
+	mutex_unlock(&zbus_list_lock);
+}
+
+/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus
+ * @zbus: the zbus holding the zdevices
+ * @fr: PCI root function that will determine the bus's domain, and bus speed
+ * @ops: the pci operations
+ *
+ * The PCI function @fr determines the domain (its UID), multifunction property
+ * and maximum bus speed of the entire bus.
+ *
+ * Return: 0 on success, an error code otherwise
+ */
+static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops)
+{
+	struct pci_bus *root;
+	int nr;
+
+	/* The domain number is derived from the UID of the root function. */
+	nr = zpci_alloc_domain((u16)fr->uid);
+	if (nr < 0)
+		return nr;
+
+	zbus->domain_nr = nr;
+	zbus->multifunction = fr->rid_available;
+	zbus->max_bus_speed = fr->max_bus_speed;
+
+	/*
+	 * Note that the zbus->resources are taken over and zbus->resources
+	 * is empty after a successful call
+	 */
+	root = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources);
+	if (root) {
+		zbus->bus = root;
+		pci_bus_add_devices(root);
+		return 0;
+	}
+
+	/* Creation failed: give the domain number back. */
+	zpci_free_domain(zbus->domain_nr);
+	return -EFAULT;
+}
+
+/* zpci_bus_release - kref release callback for a zbus
+ * @kref: the kref embedded in the struct zpci_bus to be released
+ *
+ * If a PCI bus was created for the zbus it is stopped and removed under the
+ * PCI rescan/remove lock, and the domain number and resource list are freed.
+ * The zbus is then unlinked from zbus_list and its memory is freed.
+ */
+static void zpci_bus_release(struct kref *kref)
+{
+ struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref);
+
+ if (zbus->bus) {
+ pci_lock_rescan_remove();
+ pci_stop_root_bus(zbus->bus);
+
+ zpci_free_domain(zbus->domain_nr);
+ pci_free_resource_list(&zbus->resources);
+
+ pci_remove_root_bus(zbus->bus);
+ pci_unlock_rescan_remove();
+ }
+
+ /* Unlink under the list lock so list walkers never see a freed zbus. */
+ mutex_lock(&zbus_list_lock);
+ list_del(&zbus->bus_next);
+ mutex_unlock(&zbus_list_lock);
+ kfree(zbus);
+}
+
+/* zpci_bus_put - Drop a reference to a zbus
+ * @zbus: the zbus whose reference count is decremented
+ *
+ * zpci_bus_release() is passed as the kref release function.
+ */
+static void zpci_bus_put(struct zpci_bus *zbus)
+{
+ kref_put(&zbus->kref, zpci_bus_release);
+}
+
+/* zpci_bus_get - Look up a zbus by PCHID and take a reference on it
+ * @pchid: the PCHID to search for
+ *
+ * Return: the first matching zbus with its reference count raised, or NULL
+ * if no zbus on zbus_list has this PCHID
+ */
+static struct zpci_bus *zpci_bus_get(int pchid)
+{
+	struct zpci_bus *iter, *found = NULL;
+
+	mutex_lock(&zbus_list_lock);
+	list_for_each_entry(iter, &zbus_list, bus_next) {
+		if (iter->pchid != pchid)
+			continue;
+		/* Take the reference while still holding the list lock. */
+		kref_get(&iter->kref);
+		found = iter;
+		break;
+	}
+	mutex_unlock(&zbus_list_lock);
+
+	return found;
+}
+
+/* zpci_bus_alloc - Allocate a new zbus and add it to zbus_list
+ * @pchid: the PCHID the new zbus is associated with
+ *
+ * The zbus starts with a reference count of one and a resource list that
+ * contains only its bus number resource (0..ZPCI_BUS_NR). No PCI bus is
+ * created here; zbus->bus stays NULL.
+ *
+ * The zbus is linked into zbus_list only after its kref and resource list
+ * have been initialized, so that a concurrent zpci_bus_get() can never take
+ * a reference on a kref that is subsequently reset by kref_init().
+ *
+ * Return: the new zbus, or NULL if the allocation failed
+ */
+static struct zpci_bus *zpci_bus_alloc(int pchid)
+{
+	struct zpci_bus *zbus;
+
+	zbus = kzalloc(sizeof(*zbus), GFP_KERNEL);
+	if (!zbus)
+		return NULL;
+
+	zbus->pchid = pchid;
+	kref_init(&zbus->kref);
+	INIT_LIST_HEAD(&zbus->resources);
+
+	zbus->bus_resource.start = 0;
+	zbus->bus_resource.end = ZPCI_BUS_NR;
+	zbus->bus_resource.flags = IORESOURCE_BUS;
+	pci_add_resource(&zbus->resources, &zbus->bus_resource);
+
+	/* Publish last: from here on other threads can find the zbus. */
+	INIT_LIST_HEAD(&zbus->bus_next);
+	mutex_lock(&zbus_list_lock);
+	list_add_tail(&zbus->bus_next, &zbus_list);
+	mutex_unlock(&zbus_list_lock);
+
+	return zbus;
+}
+
+void pcibios_bus_add_device(struct pci_dev *pdev)
+{
+	struct zpci_dev *zdev = to_zpci(pdev);
+
+	/* Only functions with a non-zero VFN need the extra handling below. */
+	if (!zdev->vfn)
+		return;
+
+	/*
+	 * With pdev->no_vf_scan the common PCI probing code does not
+	 * perform PF/VF linking.
+	 */
+	zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn);
+	pdev->no_command_memory = 1;
+}
+
+/* zpci_bus_add_device - Attach a zPCI function to a zbus
+ * @zbus: the zbus the function is added to
+ * @zdev: the zPCI function to add
+ *
+ * Links @zdev into the function table of @zbus at index zdev->devfn, counts
+ * it in zpci_nb_devices and initializes its slot via zpci_init_slot(). On any
+ * failure after linking, the link and the device count are rolled back.
+ *
+ * Return: 0 on success, -EINVAL if the devfn is already in use or a
+ * multifunction zbus gets a function without rid_available, otherwise the
+ * error code of zpci_init_slot()
+ */
+static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
+{
+ int rc = -EINVAL;
+
+ if (zbus->function[zdev->devfn]) {
+ pr_err("devfn %04x is already assigned\n", zdev->devfn);
+ return rc;
+ }
+
+ /* Link first; the error path below undoes these three steps. */
+ zdev->zbus = zbus;
+ zbus->function[zdev->devfn] = zdev;
+ zpci_nb_devices++;
+
+ if (zbus->multifunction && !zdev->rid_available) {
+ WARN_ONCE(1, "rid_available not set for multifunction\n");
+ goto error;
+ }
+ rc = zpci_init_slot(zdev);
+ if (rc)
+ goto error;
+ zdev->has_hp_slot = 1;
+
+ return 0;
+
+error:
+ zbus->function[zdev->devfn] = NULL;
+ zdev->zbus = NULL;
+ zpci_nb_devices--;
+ return rc;
+}
+
+/* zpci_bus_device_register - Register a zPCI function with a zbus
+ * @zdev: the zPCI function to register
+ * @ops: the pci operations used if a new PCI bus has to be created
+ *
+ * Functions with an available RID (unless disabled via s390_pci_no_rid)
+ * share the zbus matching their PCHID; otherwise a new zbus is allocated.
+ * The PCI bus of the zbus is created on demand.
+ *
+ * Return: 0 on success, an error code otherwise
+ */
+int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
+{
+	struct zpci_bus *zbus = NULL;
+	int rc;
+
+	if (zpci_nb_devices == ZPCI_NR_DEVICES) {
+		pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n",
+			zdev->fid, ZPCI_NR_DEVICES);
+		return -ENOSPC;
+	}
+
+	if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS)
+		return -EINVAL;
+
+	/* Try to join an existing zbus first, fall back to a fresh one. */
+	if (!s390_pci_no_rid && zdev->rid_available)
+		zbus = zpci_bus_get(zdev->pchid);
+	if (!zbus)
+		zbus = zpci_bus_alloc(zdev->pchid);
+	if (!zbus)
+		return -ENOMEM;
+
+	/* The UID of the first PCI function registered with a zpci_bus
+	 * is used as the domain number for that bus. Currently there
+	 * is exactly one zpci_bus per domain.
+	 */
+	rc = zbus->bus ? 0 : zpci_bus_create_pci_bus(zbus, zdev, ops);
+	if (!rc)
+		rc = zpci_bus_add_device(zbus, zdev);
+	if (!rc)
+		return 0;
+
+	pr_err("Adding PCI function %08x failed\n", zdev->fid);
+	zpci_bus_put(zbus);
+	return rc;
+}
+
+/* zpci_bus_device_unregister - Detach a zPCI function from its zbus
+ * @zdev: the zPCI function to detach; zdev->zbus is dereferenced
+ * unconditionally, so it must be set
+ *
+ * Decrements the device count, clears the function's entry in the zbus and
+ * drops one reference on the zbus.
+ */
+void zpci_bus_device_unregister(struct zpci_dev *zdev)
+{
+ struct zpci_bus *zbus = zdev->zbus;
+
+ zpci_nb_devices--;
+ zbus->function[zdev->devfn] = NULL;
+ zpci_bus_put(zbus);
+}
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
new file mode 100644
index 000000000000..e96c9860e064
--- /dev/null
+++ b/arch/s390/pci/pci_bus.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Pierre Morel <pmorel@linux.ibm.com>
+ *
+ */
+
+int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops);
+void zpci_bus_device_unregister(struct zpci_dev *zdev);
+
+int zpci_bus_scan_bus(struct zpci_bus *zbus);
+void zpci_bus_scan_busses(void);
+
+int zpci_bus_scan_device(struct zpci_dev *zdev);
+void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);
+
+void zpci_release_device(struct kref *kref);
+/* Drop a reference on @zdev; a NULL @zdev is silently ignored. */
+static inline void zpci_zdev_put(struct zpci_dev *zdev)
+{
+	if (!zdev)
+		return;
+
+	kref_put(&zdev->kref, zpci_release_device);
+}
+
+/* Take a reference on @zdev. Unlike zpci_zdev_put() there is no NULL check. */
+static inline void zpci_zdev_get(struct zpci_dev *zdev)
+{
+ kref_get(&zdev->kref);
+}
+
+int zpci_alloc_domain(int domain);
+void zpci_free_domain(int domain);
+int zpci_setup_bus_resources(struct zpci_dev *zdev,
+ struct list_head *resources);
+
+/*
+ * Look up the zPCI function at @devfn on the zbus stored in bus->sysdata.
+ * Returns NULL for a devfn outside the zbus function table.
+ */
+static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
+					      unsigned int devfn)
+{
+	struct zpci_bus *zbus = bus->sysdata;
+
+	if (devfn >= ZPCI_FUNCTIONS_PER_BUS)
+		return NULL;
+
+	return zbus->function[devfn];
+}
+
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 0d3d8f170ea4..ee367798e388 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -17,17 +17,20 @@
#include <linux/delay.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
+#include <asm/asm-extable.h>
#include <asm/pci_debug.h>
#include <asm/pci_clp.h>
#include <asm/clp.h>
#include <uapi/asm/clp.h>
+#include "pci_bus.h"
+
bool zpci_unique_uid;
-static void update_uid_checking(bool new)
+void update_uid_checking(bool new)
{
if (zpci_unique_uid != new)
- zpci_dbg(1, "uid checking:%d\n", new);
+ zpci_dbg(3, "uid checking:%d\n", new);
zpci_unique_uid = new;
}
@@ -102,6 +105,9 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
zdev->msi_addr = response->msia;
zdev->max_msi = response->noi;
zdev->fmb_update = response->mui;
+ zdev->version = response->version;
+ zdev->maxstbl = response->maxstbl;
+ zdev->dtsm = response->dtsm;
switch (response->version) {
case 1:
@@ -155,13 +161,19 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
zdev->pfgid = response->pfgid;
zdev->pft = response->pft;
zdev->vfn = response->vfn;
+ zdev->port = response->port;
zdev->uid = response->uid;
zdev->fmb_length = sizeof(u32) * response->fmb_len;
+ zdev->rid_available = response->rid_avail;
+ zdev->is_physfn = response->is_physfn;
+ if (!s390_pci_no_rid && zdev->rid_available)
+ zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip));
if (response->util_str_avail) {
memcpy(zdev->util_str, response->util_str,
sizeof(zdev->util_str));
+ zdev->util_str_avail = 1;
}
zdev->mio_capable = response->mio_addr_avail;
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
@@ -174,7 +186,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
return 0;
}
-static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
+int clp_query_pci_fn(struct zpci_dev *zdev)
{
struct clp_req_rsp_query_pci *rrb;
int rc;
@@ -187,7 +199,7 @@ static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
rrb->request.hdr.len = sizeof(rrb->request);
rrb->request.hdr.cmd = CLP_QUERY_PCI_FN;
rrb->response.hdr.len = sizeof(rrb->response);
- rrb->request.fh = fh;
+ rrb->request.fh = zdev->fh;
rc = clp_req(rrb, CLP_LPS_PCI);
if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
@@ -205,54 +217,30 @@ out:
return rc;
}
-int clp_add_pci_device(u32 fid, u32 fh, int configured)
-{
- struct zpci_dev *zdev;
- int rc = -ENOMEM;
-
- zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, configured);
- zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
- if (!zdev)
- goto error;
-
- zdev->fh = fh;
- zdev->fid = fid;
-
- /* Query function properties and update zdev */
- rc = clp_query_pci_fn(zdev, fh);
- if (rc)
- goto error;
-
- if (configured)
- zdev->state = ZPCI_FN_STATE_CONFIGURED;
- else
- zdev->state = ZPCI_FN_STATE_STANDBY;
-
- rc = zpci_create_device(zdev);
- if (rc)
- goto error;
- return 0;
-
-error:
- zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
- kfree(zdev);
- return rc;
-}
-
-/*
- * Enable/Disable a given PCI function and update its function handle if
- * necessary
+/**
+ * clp_set_pci_fn() - Execute a command on a PCI function
+ * @zdev: Function that will be affected
+ * @fh: Out parameter for updated function handle
+ * @nr_dma_as: DMA address space number
+ * @command: The command code to execute
+ *
+ * Returns: 0 on success, < 0 for Linux errors (e.g. -ENOMEM), and
+ * > 0 for non-success platform responses
*/
-static int clp_set_pci_fn(struct zpci_dev *zdev, u8 nr_dma_as, u8 command)
+static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 command)
{
struct clp_req_rsp_set_pci *rrb;
int rc, retries = 100;
- u32 fid = zdev->fid;
+ u32 gisa = 0;
+ *fh = 0;
rrb = clp_alloc_block(GFP_KERNEL);
if (!rrb)
return -ENOMEM;
+ if (command != CLP_SET_DISABLE_PCI_FN)
+ gisa = zdev->gisa;
+
do {
memset(rrb, 0, sizeof(*rrb));
rrb->request.hdr.len = sizeof(rrb->request);
@@ -261,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u8 nr_dma_as, u8 command)
rrb->request.fh = zdev->fh;
rrb->request.oc = command;
rrb->request.ndas = nr_dma_as;
+ rrb->request.gisa = gisa;
rc = clp_req(rrb, CLP_LPS_PCI);
if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
@@ -271,121 +260,164 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u8 nr_dma_as, u8 command)
}
} while (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY);
- if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
+ if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
+ *fh = rrb->response.fh;
+ } else {
zpci_err("Set PCI FN:\n");
zpci_err_clp(rrb->response.hdr.rsp, rc);
+ if (!rc)
+ rc = rrb->response.hdr.rsp;
}
+ clp_free_block(rrb);
+ return rc;
+}
+
+int clp_setup_writeback_mio(void)
+{
+ struct clp_req_rsp_slpc_pci *rrb;
+ u8 wb_bit_pos;
+ int rc;
+
+ rrb = clp_alloc_block(GFP_KERNEL);
+ if (!rrb)
+ return -ENOMEM;
+
+ memset(rrb, 0, sizeof(*rrb));
+ rrb->request.hdr.len = sizeof(rrb->request);
+ rrb->request.hdr.cmd = CLP_SLPC;
+ rrb->response.hdr.len = sizeof(rrb->response);
+ rc = clp_req(rrb, CLP_LPS_PCI);
if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
- zdev->fh = rrb->response.fh;
- } else if (!rc && rrb->response.hdr.rsp == CLP_RC_SETPCIFN_ALRDY &&
- rrb->response.fh == 0) {
- /* Function is already in desired state - update handle */
- rc = clp_rescan_pci_devices_simple(&fid);
+ if (rrb->response.vwb) {
+ wb_bit_pos = rrb->response.mio_wb;
+ set_bit_inv(wb_bit_pos, &mio_wb_bit_mask);
+ zpci_dbg(3, "wb bit: %d\n", wb_bit_pos);
+ } else {
+ zpci_dbg(3, "wb bit: n.a.\n");
+ }
+
+ } else {
+ zpci_err("SLPC PCI:\n");
+ zpci_err_clp(rrb->response.hdr.rsp, rc);
+ rc = -EIO;
}
clp_free_block(rrb);
return rc;
}
-int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as)
+int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as)
{
int rc;
- rc = clp_set_pci_fn(zdev, nr_dma_as, CLP_SET_ENABLE_PCI_FN);
- zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc);
- if (rc)
- goto out;
-
- if (zpci_use_mio(zdev)) {
- rc = clp_set_pci_fn(zdev, nr_dma_as, CLP_SET_ENABLE_MIO);
+ rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN);
+ zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc);
+ if (!rc && zpci_use_mio(zdev)) {
+ rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_MIO);
zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n",
- zdev->fid, zdev->fh, rc);
+ zdev->fid, *fh, rc);
if (rc)
- clp_disable_fh(zdev);
+ clp_disable_fh(zdev, fh);
}
-out:
return rc;
}
-int clp_disable_fh(struct zpci_dev *zdev)
+int clp_disable_fh(struct zpci_dev *zdev, u32 *fh)
{
- u32 fh = zdev->fh;
int rc;
if (!zdev_enabled(zdev))
return 0;
- rc = clp_set_pci_fn(zdev, 0, CLP_SET_DISABLE_PCI_FN);
- zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc);
+ rc = clp_set_pci_fn(zdev, fh, 0, CLP_SET_DISABLE_PCI_FN);
+ zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc);
return rc;
}
-static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data,
- void (*cb)(struct clp_fh_list_entry *, void *))
+static int clp_list_pci_req(struct clp_req_rsp_list_pci *rrb,
+ u64 *resume_token, int *nentries)
{
- u64 resume_token = 0;
- int entries, i, rc;
+ int rc;
- do {
- memset(rrb, 0, sizeof(*rrb));
- rrb->request.hdr.len = sizeof(rrb->request);
- rrb->request.hdr.cmd = CLP_LIST_PCI;
- /* store as many entries as possible */
- rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN;
- rrb->request.resume_token = resume_token;
+ memset(rrb, 0, sizeof(*rrb));
+ rrb->request.hdr.len = sizeof(rrb->request);
+ rrb->request.hdr.cmd = CLP_LIST_PCI;
+ /* store as many entries as possible */
+ rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN;
+ rrb->request.resume_token = *resume_token;
- /* Get PCI function handle list */
- rc = clp_req(rrb, CLP_LPS_PCI);
- if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
- zpci_err("List PCI FN:\n");
- zpci_err_clp(rrb->response.hdr.rsp, rc);
- rc = -EIO;
- goto out;
- }
+ /* Get PCI function handle list */
+ rc = clp_req(rrb, CLP_LPS_PCI);
+ if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
+ zpci_err("List PCI FN:\n");
+ zpci_err_clp(rrb->response.hdr.rsp, rc);
+ return -EIO;
+ }
+
+ update_uid_checking(rrb->response.uid_checking);
+ WARN_ON_ONCE(rrb->response.entry_size !=
+ sizeof(struct clp_fh_list_entry));
- update_uid_checking(rrb->response.uid_checking);
- WARN_ON_ONCE(rrb->response.entry_size !=
- sizeof(struct clp_fh_list_entry));
+ *nentries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) /
+ rrb->response.entry_size;
+ *resume_token = rrb->response.resume_token;
- entries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) /
- rrb->response.entry_size;
+ return rc;
+}
- resume_token = rrb->response.resume_token;
- for (i = 0; i < entries; i++)
+static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data,
+ void (*cb)(struct clp_fh_list_entry *, void *))
+{
+ u64 resume_token = 0;
+ int nentries, i, rc;
+
+ do {
+ rc = clp_list_pci_req(rrb, &resume_token, &nentries);
+ if (rc)
+ return rc;
+ for (i = 0; i < nentries; i++)
cb(&rrb->response.fh_list[i], data);
} while (resume_token);
-out:
+
return rc;
}
-static void __clp_add(struct clp_fh_list_entry *entry, void *data)
+static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
+ struct clp_fh_list_entry *entry)
{
- struct zpci_dev *zdev;
+ struct clp_fh_list_entry *fh_list;
+ u64 resume_token = 0;
+ int nentries, i, rc;
- if (!entry->vendor_id)
- return;
+ do {
+ rc = clp_list_pci_req(rrb, &resume_token, &nentries);
+ if (rc)
+ return rc;
+ fh_list = rrb->response.fh_list;
+ for (i = 0; i < nentries; i++) {
+ if (fh_list[i].fid == fid) {
+ *entry = fh_list[i];
+ return 0;
+ }
+ }
+ } while (resume_token);
- zdev = get_zdev_by_fid(entry->fid);
- if (!zdev)
- clp_add_pci_device(entry->fid, entry->fh, entry->config_state);
+ return -ENODEV;
}
-static void __clp_update(struct clp_fh_list_entry *entry, void *data)
+static void __clp_add(struct clp_fh_list_entry *entry, void *data)
{
struct zpci_dev *zdev;
- u32 *fid = data;
if (!entry->vendor_id)
return;
- if (fid && *fid != entry->fid)
- return;
-
zdev = get_zdev_by_fid(entry->fid);
- if (!zdev)
+ if (zdev) {
+ zpci_zdev_put(zdev);
return;
-
- zdev->fh = entry->fh;
+ }
+ zpci_create_device(entry->fid, entry->fh, entry->config_state);
}
int clp_scan_pci_devices(void)
@@ -403,69 +435,44 @@ int clp_scan_pci_devices(void)
return rc;
}
-int clp_rescan_pci_devices(void)
-{
- struct clp_req_rsp_list_pci *rrb;
- int rc;
-
- zpci_remove_reserved_devices();
-
- rrb = clp_alloc_block(GFP_KERNEL);
- if (!rrb)
- return -ENOMEM;
-
- rc = clp_list_pci(rrb, NULL, __clp_add);
-
- clp_free_block(rrb);
- return rc;
-}
-
-/* Rescan PCI functions and refresh function handles. If fid is non-NULL only
- * refresh the handle of the function matching @fid
+/*
+ * Get the current function handle of the function matching @fid
*/
-int clp_rescan_pci_devices_simple(u32 *fid)
+int clp_refresh_fh(u32 fid, u32 *fh)
{
struct clp_req_rsp_list_pci *rrb;
+ struct clp_fh_list_entry entry;
int rc;
rrb = clp_alloc_block(GFP_NOWAIT);
if (!rrb)
return -ENOMEM;
- rc = clp_list_pci(rrb, fid, __clp_update);
+ rc = clp_find_pci(rrb, fid, &entry);
+ if (!rc)
+ *fh = entry.fh;
clp_free_block(rrb);
return rc;
}
-struct clp_state_data {
- u32 fid;
- enum zpci_state state;
-};
-
-static void __clp_get_state(struct clp_fh_list_entry *entry, void *data)
-{
- struct clp_state_data *sd = data;
-
- if (entry->fid != sd->fid)
- return;
-
- sd->state = entry->config_state;
-}
-
int clp_get_state(u32 fid, enum zpci_state *state)
{
struct clp_req_rsp_list_pci *rrb;
- struct clp_state_data sd = {fid, ZPCI_FN_STATE_RESERVED};
+ struct clp_fh_list_entry entry;
int rc;
rrb = clp_alloc_block(GFP_ATOMIC);
if (!rrb)
return -ENOMEM;
- rc = clp_list_pci(rrb, &sd, __clp_get_state);
- if (!rc)
- *state = sd.state;
+ rc = clp_find_pci(rrb, fid, &entry);
+ if (!rc) {
+ *state = entry.config_state;
+ } else if (rc == -ENODEV) {
+ *state = ZPCI_FN_STATE_RESERVED;
+ rc = 0;
+ }
clp_free_block(rrb);
return rc;
@@ -491,7 +498,7 @@ static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb)
}
}
-static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb)
+static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb)
{
unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 3408c0df3ebf..ca6bd98eec13 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -196,7 +196,7 @@ int __init zpci_debug_init(void)
if (!pci_debug_err_id)
return -EINVAL;
debug_register_view(pci_debug_err_id, &debug_hex_ascii_view);
- debug_set_level(pci_debug_err_id, 6);
+ debug_set_level(pci_debug_err_id, 3);
debugfs_root = debugfs_create_dir("pci", NULL);
return 0;
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 64b1399a73f0..227cf0a62800 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -10,7 +10,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/iommu-helper.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
#include <linux/vmalloc.h>
#include <linux/pci.h>
#include <asm/pci_dma.h>
@@ -18,6 +18,8 @@
static struct kmem_cache *dma_region_table_cache;
static struct kmem_cache *dma_page_table_cache;
static int s390_iommu_strict;
+static u64 s390_iommu_aperture;
+static u32 s390_iommu_aperture_factor = 1;
static int zpci_refresh_global(struct zpci_dev *zdev)
{
@@ -72,7 +74,7 @@ static unsigned long *dma_get_seg_table_origin(unsigned long *entry)
if (!sto)
return NULL;
- set_rt_sto(entry, sto);
+ set_rt_sto(entry, virt_to_phys(sto));
validate_rt_entry(entry);
entry_clr_protected(entry);
}
@@ -89,7 +91,7 @@ static unsigned long *dma_get_page_table_origin(unsigned long *entry)
pto = dma_alloc_page_table();
if (!pto)
return NULL;
- set_st_pto(entry, pto);
+ set_st_pto(entry, virt_to_phys(pto));
validate_st_entry(entry);
entry_clr_protected(entry);
}
@@ -115,7 +117,7 @@ unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr)
return &pto[px];
}
-void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags)
+void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags)
{
if (flags & ZPCI_PTE_INVALID) {
invalidate_pt_entry(entry);
@@ -130,11 +132,11 @@ void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags)
entry_clr_protected(entry);
}
-static int __dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
+static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa,
dma_addr_t dma_addr, size_t size, int flags)
{
unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- u8 *page_addr = (u8 *) (pa & PAGE_MASK);
+ phys_addr_t page_addr = (pa & PAGE_MASK);
unsigned long irq_flags;
unsigned long *entry;
int i, rc = 0;
@@ -215,7 +217,7 @@ out:
return ret;
}
-static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
+static int dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa,
dma_addr_t dma_addr, size_t size, int flags)
{
int rc;
@@ -261,13 +263,11 @@ static unsigned long __dma_alloc_iommu(struct device *dev,
unsigned long start, int size)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- unsigned long boundary_size;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages,
start, size, zdev->start_dma >> PAGE_SHIFT,
- boundary_size, 0);
+ dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT),
+ 0);
}
static dma_addr_t dma_alloc_address(struct device *dev, int size)
@@ -400,7 +400,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
struct page *page;
- unsigned long pa;
+ phys_addr_t pa;
dma_addr_t map;
size = PAGE_ALIGN(size);
@@ -411,18 +411,18 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
pa = page_to_phys(page);
map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0);
if (dma_mapping_error(dev, map)) {
- free_pages(pa, get_order(size));
+ __free_pages(page, get_order(size));
return NULL;
}
atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages);
if (dma_handle)
*dma_handle = map;
- return (void *) pa;
+ return phys_to_virt(pa);
}
static void s390_dma_free(struct device *dev, size_t size,
- void *pa, dma_addr_t dma_handle,
+ void *vaddr, dma_addr_t dma_handle,
unsigned long attrs)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
@@ -430,7 +430,7 @@ static void s390_dma_free(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages);
s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0);
- free_pages((unsigned long) pa, get_order(size));
+ free_pages((unsigned long)vaddr, get_order(size));
}
/* Map a segment into a contiguous dma address area */
@@ -443,7 +443,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
dma_addr_t dma_addr_base, dma_addr;
int flags = ZPCI_PTE_VALID;
struct scatterlist *s;
- unsigned long pa = 0;
+ phys_addr_t pa = 0;
int ret;
dma_addr_base = dma_alloc_address(dev, nr_pages);
@@ -489,18 +489,18 @@ static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
unsigned int max = dma_get_max_seg_size(dev);
unsigned int size = s->offset + s->length;
unsigned int offset = s->offset;
- int count = 0, i;
+ int count = 0, i, ret;
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
- s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
if (s->offset || (size & ~PAGE_MASK) ||
size + s->length > max) {
- if (__s390_dma_map_sg(dev, start, size,
- &dma->dma_address, dir))
+ ret = __s390_dma_map_sg(dev, start, size,
+ &dma->dma_address, dir);
+ if (ret)
goto unmap;
dma->dma_address += offset;
@@ -513,7 +513,8 @@ static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
}
size += s->length;
}
- if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir))
+ ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir);
+ if (ret)
goto unmap;
dma->dma_address += offset;
@@ -525,7 +526,7 @@ unmap:
s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
dir, attrs);
- return 0;
+ return ret;
}
static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
@@ -566,15 +567,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
/*
* Restrict the iommu bitmap size to the minimum of the following:
- * - main memory size
+ * - s390_iommu_aperture which defaults to high_memory
* - 3-level pagetable address limit minus start_dma offset
* - DMA address range allowed by the hardware (clp query pci fn)
*
* Also set zdev->end_dma to the actual end address of the usable
* range, instead of the theoretical maximum as reported by hardware.
+ *
+ * This limits the number of concurrently usable DMA mappings since
+ * for each DMA mapped memory address we need a DMA address including
+ * extra DMA addresses for multiple mappings of the same memory address.
*/
zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
- zdev->iommu_size = min3((u64) high_memory,
+ zdev->iommu_size = min3(s390_iommu_aperture,
ZPCI_TABLE_SIZE_RT - zdev->start_dma,
zdev->end_dma - zdev->start_dma + 1);
zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1;
@@ -592,10 +597,11 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
}
}
- rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- (u64) zdev->dma_table);
- if (rc)
+ if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table))) {
+ rc = -EIO;
goto free_bitmap;
+ }
return 0;
free_bitmap:
@@ -610,17 +616,25 @@ out:
return rc;
}
-void zpci_dma_exit_device(struct zpci_dev *zdev)
+int zpci_dma_exit_device(struct zpci_dev *zdev)
{
+ int cc = 0;
+
/*
* At this point, if the device is part of an IOMMU domain, this would
* be a strong hint towards a bug in the IOMMU API (common) code and/or
* simultaneous access via IOMMU and DMA API. So let's issue a warning.
*/
WARN_ON(zdev->s390_domain);
-
- if (zpci_unregister_ioat(zdev, 0))
- return;
+ if (zdev_enabled(zdev))
+ cc = zpci_unregister_ioat(zdev, 0);
+ /*
+ * cc == 3 indicates the function is gone already. This can happen
+ * if the function was deconfigured/disabled suddenly and we have not
+ * received a new handle yet.
+ */
+ if (cc && cc != 3)
+ return -EIO;
dma_cleanup_tables(zdev->dma_table);
zdev->dma_table = NULL;
@@ -628,8 +642,8 @@ void zpci_dma_exit_device(struct zpci_dev *zdev)
zdev->iommu_bitmap = NULL;
vfree(zdev->lazy_bitmap);
zdev->lazy_bitmap = NULL;
-
zdev->next_bit = 0;
+ return 0;
}
static int __init dma_alloc_cpu_table_caches(void)
@@ -652,6 +666,12 @@ static int __init dma_alloc_cpu_table_caches(void)
int __init zpci_dma_init(void)
{
+ s390_iommu_aperture = (u64)virt_to_phys(high_memory);
+ if (!s390_iommu_aperture_factor)
+ s390_iommu_aperture = ULONG_MAX;
+ else
+ s390_iommu_aperture *= s390_iommu_aperture_factor;
+
return dma_alloc_cpu_table_caches();
}
@@ -670,6 +690,8 @@ const struct dma_map_ops s390_pci_dma_ops = {
.unmap_page = s390_dma_unmap_pages,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
+ .alloc_pages = dma_common_alloc_pages,
+ .free_pages = dma_common_free_pages,
/* dma_supported is unconditionally true without a callback */
};
EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
@@ -682,3 +704,12 @@ static int __init s390_iommu_setup(char *str)
}
__setup("s390_iommu=", s390_iommu_setup);
+
+static int __init s390_iommu_aperture_setup(char *str)
+{
+ if (kstrtou32(str, 10, &s390_iommu_aperture_factor))
+ s390_iommu_aperture_factor = 1;
+ return 1;
+}
+
+__setup("s390_iommu_aperture=", s390_iommu_aperture_setup);
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 8d6ee4af4230..b9324ca2eb94 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -12,8 +12,11 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <asm/pci_debug.h>
+#include <asm/pci_dma.h>
#include <asm/sclp.h>
+#include "pci_bus.h"
+
/* Content Code Description for PCI Function Error */
struct zpci_ccdf_err {
u32 reserved1;
@@ -44,25 +47,247 @@ struct zpci_ccdf_avail {
u16 pec; /* PCI event code */
} __packed;
+static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res)
+{
+ switch (ers_res) {
+ case PCI_ERS_RESULT_CAN_RECOVER:
+ case PCI_ERS_RESULT_RECOVERED:
+ case PCI_ERS_RESULT_NEED_RESET:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool is_passed_through(struct zpci_dev *zdev)
+{
+ return zdev->s390_domain;
+}
+
+static bool is_driver_supported(struct pci_driver *driver)
+{
+ if (!driver || !driver->err_handler)
+ return false;
+ if (!driver->err_handler->error_detected)
+ return false;
+ if (!driver->err_handler->slot_reset)
+ return false;
+ if (!driver->err_handler->resume)
+ return false;
+ return true;
+}
+
+static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+
+ ers_res = driver->err_handler->error_detected(pdev, pdev->error_state);
+ if (ers_result_indicates_abort(ers_res))
+ pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev));
+ else if (ers_res == PCI_ERS_RESULT_NEED_RESET)
+ pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev));
+
+ return ers_res;
+}
+
+static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+ struct zpci_dev *zdev = to_zpci(pdev);
+ int rc;
+
+ pr_info("%s: Unblocking device access for examination\n", pci_name(pdev));
+ rc = zpci_reset_load_store_blocked(zdev);
+ if (rc) {
+ pr_err("%s: Unblocking device access failed\n", pci_name(pdev));
+ /* Let's try a full reset instead */
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+
+ if (driver->err_handler->mmio_enabled) {
+ ers_res = driver->err_handler->mmio_enabled(pdev);
+ if (ers_result_indicates_abort(ers_res)) {
+ pr_info("%s: Automatic recovery failed after MMIO re-enable\n",
+ pci_name(pdev));
+ return ers_res;
+ } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) {
+ pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev));
+ return ers_res;
+ }
+ }
+
+ pr_debug("%s: Unblocking DMA\n", pci_name(pdev));
+ rc = zpci_clear_error_state(zdev);
+ if (!rc) {
+ pdev->error_state = pci_channel_io_normal;
+ } else {
+ pr_err("%s: Unblocking DMA failed\n", pci_name(pdev));
+ /* Let's try a full reset instead */
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+
+ return ers_res;
+}
+
+static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+
+ pr_info("%s: Initiating reset\n", pci_name(pdev));
+ if (zpci_hot_reset_device(to_zpci(pdev))) {
+ pr_err("%s: The reset request failed\n", pci_name(pdev));
+ return ers_res;
+ }
+ pdev->error_state = pci_channel_io_normal;
+ ers_res = driver->err_handler->slot_reset(pdev);
+ if (ers_result_indicates_abort(ers_res)) {
+ pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev));
+ return ers_res;
+ }
+
+ return ers_res;
+}
+
+/* zpci_event_attempt_error_recovery - Try to recover the given PCI function
+ * @pdev: PCI function to recover currently in the error state
+ *
+ * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst.
+ * With the simplification that recovery always happens per function
+ * and the platform determines which functions are affected for
+ * multi-function devices.
+ */
+static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+ struct pci_driver *driver;
+
+ /*
+ * Ensure that the PCI function is not removed concurrently, no driver
+ * is unbound or probed and that userspace can't access its
+ * configuration space while we perform recovery.
+ */
+ pci_dev_lock(pdev);
+ if (pdev->error_state == pci_channel_io_perm_failure) {
+ ers_res = PCI_ERS_RESULT_DISCONNECT;
+ goto out_unlock;
+ }
+ pdev->error_state = pci_channel_io_frozen;
+
+ if (is_passed_through(to_zpci(pdev))) {
+ pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
+ pci_name(pdev));
+ goto out_unlock;
+ }
+
+ driver = to_pci_driver(pdev->dev.driver);
+ if (!is_driver_supported(driver)) {
+ if (!driver)
+ pr_info("%s: Cannot be recovered because no driver is bound to the device\n",
+ pci_name(pdev));
+ else
+ pr_info("%s: The %s driver bound to the device does not support error recovery\n",
+ pci_name(pdev),
+ driver->name);
+ goto out_unlock;
+ }
+
+ ers_res = zpci_event_notify_error_detected(pdev, driver);
+ if (ers_result_indicates_abort(ers_res))
+ goto out_unlock;
+
+ if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) {
+ ers_res = zpci_event_do_error_state_clear(pdev, driver);
+ if (ers_result_indicates_abort(ers_res))
+ goto out_unlock;
+ }
+
+ if (ers_res == PCI_ERS_RESULT_NEED_RESET)
+ ers_res = zpci_event_do_reset(pdev, driver);
+
+ if (ers_res != PCI_ERS_RESULT_RECOVERED) {
+ pr_err("%s: Automatic recovery failed; operator intervention is required\n",
+ pci_name(pdev));
+ goto out_unlock;
+ }
+
+ pr_info("%s: The device is ready to resume operations\n", pci_name(pdev));
+ if (driver->err_handler->resume)
+ driver->err_handler->resume(pdev);
+out_unlock:
+ pci_dev_unlock(pdev);
+
+ return ers_res;
+}
+
+/* zpci_event_io_failure - Report PCI channel failure state to driver
+ * @pdev: PCI function for which to report
+ * @es: PCI channel failure state to report
+ */
+static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
+{
+ struct pci_driver *driver;
+
+ pci_dev_lock(pdev);
+ pdev->error_state = es;
+ /*
+ * While vfio-pci's error_detected callback notifies user-space, QEMU
+ * reacts to this by freezing the guest. In an s390 environment PCI
+ * errors are rarely fatal so this is overkill. Instead, in the future
+ * we will inject the error event and let the guest recover the device
+ * itself.
+ */
+ if (is_passed_through(to_zpci(pdev)))
+ goto out;
+ driver = to_pci_driver(pdev->dev.driver);
+ if (driver && driver->err_handler && driver->err_handler->error_detected)
+ driver->err_handler->error_detected(pdev, pdev->error_state);
+out:
+ pci_dev_unlock(pdev);
+}
+
static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
{
struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
struct pci_dev *pdev = NULL;
+ pci_ers_result_t ers_res;
+ zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n",
+ ccdf->fid, ccdf->fh, ccdf->pec);
zpci_err("error CCDF:\n");
zpci_err_hex(ccdf, sizeof(*ccdf));
- if (zdev)
- pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
+ if (zdev) {
+ zpci_update_fh(zdev, ccdf->fh);
+ if (zdev->zbus->bus)
+ pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
+ }
pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
if (!pdev)
- return;
+ goto no_pdev;
- pdev->error_state = pci_channel_io_perm_failure;
+ switch (ccdf->pec) {
+ case 0x003a: /* Service Action or Error Recovery Successful */
+ ers_res = zpci_event_attempt_error_recovery(pdev);
+ if (ers_res != PCI_ERS_RESULT_RECOVERED)
+ zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+ break;
+ default:
+ /*
+ * Mark as frozen not permanently failed because the device
+ * could be subsequently recovered by the platform.
+ */
+ zpci_event_io_failure(pdev, pci_channel_io_frozen);
+ break;
+ }
pci_dev_put(pdev);
+no_pdev:
+ zpci_zdev_put(zdev);
}
void zpci_event_error(void *data)
@@ -71,90 +296,90 @@ void zpci_event_error(void *data)
__zpci_event_error(data);
}
+static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
+{
+ zpci_update_fh(zdev, fh);
+ /* Give the driver a hint that the function is
+ * already unusable.
+ */
+ zpci_bus_remove_device(zdev, true);
+ /* Even though the device is already gone we still
+ * need to free zPCI resources as part of the disable.
+ */
+ if (zdev->dma_table)
+ zpci_dma_exit_device(zdev);
+ if (zdev_enabled(zdev))
+ zpci_disable_device(zdev);
+ zdev->state = ZPCI_FN_STATE_STANDBY;
+}
+
static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
{
struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
- struct pci_dev *pdev = NULL;
+ bool existing_zdev = !!zdev;
enum zpci_state state;
- int ret;
-
- if (zdev)
- pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
-
- pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n",
- pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
- zpci_err("avail CCDF:\n");
- zpci_err_hex(ccdf, sizeof(*ccdf));
+ zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n",
+ ccdf->fid, ccdf->fh, ccdf->pec);
switch (ccdf->pec) {
case 0x0301: /* Reserved|Standby -> Configured */
if (!zdev) {
- ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0);
- if (ret)
+ zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED);
+ if (IS_ERR(zdev))
+ break;
+ } else {
+ /* the configuration request may be stale */
+ if (zdev->state != ZPCI_FN_STATE_STANDBY)
break;
- zdev = get_zdev_by_fid(ccdf->fid);
+ zdev->state = ZPCI_FN_STATE_CONFIGURED;
}
- if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY)
- break;
- zdev->state = ZPCI_FN_STATE_CONFIGURED;
- zdev->fh = ccdf->fh;
- ret = zpci_enable_device(zdev);
- if (ret)
- break;
- pci_lock_rescan_remove();
- pci_rescan_bus(zdev->bus);
- pci_unlock_rescan_remove();
+ zpci_scan_configured_device(zdev, ccdf->fh);
break;
case 0x0302: /* Reserved -> Standby */
if (!zdev)
- clp_add_pci_device(ccdf->fid, ccdf->fh, 0);
+ zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
+ else
+ zpci_update_fh(zdev, ccdf->fh);
break;
case 0x0303: /* Deconfiguration requested */
- if (!zdev)
- break;
- if (pdev)
- pci_stop_and_remove_bus_device_locked(pdev);
-
- ret = zpci_disable_device(zdev);
- if (ret)
- break;
-
- ret = sclp_pci_deconfigure(zdev->fid);
- zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret);
- if (!ret)
- zdev->state = ZPCI_FN_STATE_STANDBY;
-
+ if (zdev) {
+ /* The event may have been queued before we configured
+ * the device.
+ */
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+ break;
+ zpci_update_fh(zdev, ccdf->fh);
+ zpci_deconfigure_device(zdev);
+ }
break;
case 0x0304: /* Configured -> Standby|Reserved */
- if (!zdev)
- break;
- if (pdev) {
- /* Give the driver a hint that the function is
- * already unusable. */
- pdev->error_state = pci_channel_io_perm_failure;
- pci_stop_and_remove_bus_device_locked(pdev);
- }
-
- zdev->fh = ccdf->fh;
- zpci_disable_device(zdev);
- zdev->state = ZPCI_FN_STATE_STANDBY;
- if (!clp_get_state(ccdf->fid, &state) &&
- state == ZPCI_FN_STATE_RESERVED) {
- zpci_remove_device(zdev);
+ if (zdev) {
+ /* The event may have been queued before we configured
+ * the device.
+ */
+ if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
+ zpci_event_hard_deconfigured(zdev, ccdf->fh);
+ /* The 0x0304 event may immediately reserve the device */
+ if (!clp_get_state(zdev->fid, &state) &&
+ state == ZPCI_FN_STATE_RESERVED) {
+ zpci_device_reserved(zdev);
+ }
}
break;
case 0x0306: /* 0x308 or 0x302 for multiple devices */
- clp_rescan_pci_devices();
+ zpci_remove_reserved_devices();
+ clp_scan_pci_devices();
break;
case 0x0308: /* Standby -> Reserved */
if (!zdev)
break;
- zpci_remove_device(zdev);
+ zpci_device_reserved(zdev);
break;
default:
break;
}
- pci_dev_put(pdev);
+ if (existing_zdev)
+ zpci_zdev_put(zdev);
}
void zpci_event_availability(void *data)
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 02f9505c99a8..56480be48244 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -9,6 +9,7 @@
#include <linux/errno.h>
#include <linux/delay.h>
#include <linux/jump_label.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
#include <asm/pci_insn.h>
#include <asm/pci_debug.h>
@@ -17,16 +18,40 @@
#define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */
-static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset)
+struct zpci_err_insn_data {
+ u8 insn;
+ u8 cc;
+ u8 status;
+ union {
+ struct {
+ u64 req;
+ u64 offset;
+ };
+ struct {
+ u64 addr;
+ u64 len;
+ };
+ };
+} __packed;
+
+static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status,
+ u64 req, u64 offset)
{
- struct {
- u64 req;
- u64 offset;
- u8 cc;
- u8 status;
- } __packed data = {req, offset, cc, status};
-
- zpci_err_hex(&data, sizeof(data));
+ struct zpci_err_insn_data data = {
+ .insn = insn, .cc = cc, .status = status,
+ .req = req, .offset = offset};
+
+ zpci_err_hex_level(lvl, &data, sizeof(data));
+}
+
+static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status,
+ u64 addr, u64 len)
+{
+ struct zpci_err_insn_data data = {
+ .insn = insn, .cc = cc, .status = status,
+ .addr = addr, .len = len};
+
+ zpci_err_hex_level(lvl, &data, sizeof(data));
}
/* Modify PCI Function Controls */
@@ -46,33 +71,41 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status)
u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
{
+ bool retried = false;
u8 cc;
do {
cc = __mpcifc(req, fib, status);
- if (cc == 2)
+ if (cc == 2) {
msleep(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 'M', cc, *status, req, 0);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, *status, req, 0);
+ zpci_err_insn_req(0, 'M', cc, *status, req, 0);
+ else if (retried)
+ zpci_err_insn_req(1, 'M', cc, *status, req, 0);
return cc;
}
+EXPORT_SYMBOL_GPL(zpci_mod_fc);
/* Refresh PCI Translations */
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
{
- register u64 __addr asm("2") = addr;
- register u64 __range asm("3") = range;
+ union register_pair addr_range = {.even = addr, .odd = range};
u8 cc;
asm volatile (
- " .insn rre,0xb9d30000,%[fn],%[addr]\n"
+ " .insn rre,0xb9d30000,%[fn],%[addr_range]\n"
" ipm %[cc]\n"
" srl %[cc],28\n"
: [cc] "=d" (cc), [fn] "+d" (fn)
- : [addr] "d" (__addr), "d" (__range)
+ : [addr_range] "d" (addr_range.pair)
: "cc");
*status = fn >> 24 & 0xff;
return cc;
@@ -80,16 +113,24 @@ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
{
+ bool retried = false;
u8 cc, status;
do {
cc = __rpcit(fn, addr, range, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_addr(1, 'R', cc, status, addr, range);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, addr, range);
+ zpci_err_insn_addr(0, 'R', cc, status, addr, range);
+ else if (retried)
+ zpci_err_insn_addr(1, 'R', cc, status, addr, range);
if (cc == 1 && (status == 4 || status == 16))
return -ENOMEM;
@@ -98,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
}
/* Set Interruption Controls */
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
{
if (!test_facility(72))
return -EIO;
@@ -109,25 +150,24 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
return 0;
}
+EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
/* PCI Load */
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
{
- register u64 __req asm("2") = req;
- register u64 __offset asm("3") = offset;
+ union register_pair req_off = {.even = req, .odd = offset};
int cc = -ENXIO;
u64 __data;
asm volatile (
- " .insn rre,0xb9d20000,%[data],%[req]\n"
+ " .insn rre,0xb9d20000,%[data],%[req_off]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data), [req] "+d" (__req)
- : "d" (__offset)
- : "cc");
- *status = __req >> 24 & 0xff;
+ : [cc] "+d" (cc), [data] "=d" (__data),
+ [req_off] "+&d" (req_off.pair) :: "cc");
+ *status = req_off.even >> 24 & 0xff;
*data = __data;
return cc;
}
@@ -146,17 +186,25 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
int __zpci_load(u64 *data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcilg(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 'l', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 'l', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 'l', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -166,28 +214,26 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr,
unsigned long len)
{
struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)];
- u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len);
+ u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len);
return __zpci_load(data, req, ZPCI_OFFSET(addr));
}
static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status)
{
- register u64 addr asm("2") = ioaddr;
- register u64 r3 asm("3") = len;
+ union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
int cc = -ENXIO;
u64 __data;
asm volatile (
- " .insn rre,0xb9d60000,%[data],%[ioaddr]\n"
+ " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3)
- : [ioaddr] "d" (addr)
- : "cc");
- *status = r3 >> 24 & 0xff;
+ : [cc] "+d" (cc), [data] "=d" (__data),
+ [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc");
+ *status = ioaddr_len.odd >> 24 & 0xff;
*data = __data;
return cc;
}
@@ -202,7 +248,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len)
cc = __pcilg_mio(data, (__force u64) addr, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) addr);
+ zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len);
return (cc > 0) ? -EIO : cc;
}
@@ -211,36 +257,43 @@ EXPORT_SYMBOL_GPL(zpci_load);
/* PCI Store */
static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status)
{
- register u64 __req asm("2") = req;
- register u64 __offset asm("3") = offset;
+ union register_pair req_off = {.even = req, .odd = offset};
int cc = -ENXIO;
asm volatile (
- " .insn rre,0xb9d00000,%[data],%[req]\n"
+ " .insn rre,0xb9d00000,%[data],%[req_off]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [req] "+d" (__req)
- : "d" (__offset), [data] "d" (data)
+ : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair)
+ : [data] "d" (data)
: "cc");
- *status = __req >> 24 & 0xff;
+ *status = req_off.even >> 24 & 0xff;
return cc;
}
int __zpci_store(u64 data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcistg(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 's', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 's', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 's', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -250,27 +303,26 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data,
unsigned long len)
{
struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)];
- u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len);
+ u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len);
return __zpci_store(data, req, ZPCI_OFFSET(addr));
}
static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status)
{
- register u64 addr asm("2") = ioaddr;
- register u64 r3 asm("3") = len;
+ union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
int cc = -ENXIO;
asm volatile (
- " .insn rre,0xb9d40000,%[data],%[ioaddr]\n"
+ " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), "+d" (r3)
- : [data] "d" (data), [ioaddr] "d" (addr)
- : "cc");
- *status = r3 >> 24 & 0xff;
+ : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+ : [data] "d" (data)
+ : "cc", "memory");
+ *status = ioaddr_len.odd >> 24 & 0xff;
return cc;
}
@@ -284,7 +336,7 @@ int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len)
cc = __pcistg_mio(data, (__force u64) addr, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) addr);
+ zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len);
return (cc > 0) ? -EIO : cc;
}
@@ -310,17 +362,25 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status)
int __zpci_store_block(const u64 *data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcistb(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(0, 'b', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 'b', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 'b', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -364,7 +424,7 @@ int zpci_write_block(volatile void __iomem *dst,
cc = __pcistb_mio(src, (__force u64) dst, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) dst);
+ zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len);
return (cc > 0) ? -EIO : cc;
}
@@ -372,10 +432,7 @@ EXPORT_SYMBOL_GPL(zpci_write_block);
static inline void __pciwb_mio(void)
{
- unsigned long unused = 0;
-
- asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n"
- : [op] "+d" (unused));
+ asm volatile (".insn rre,0xb9d50000,0,0\n");
}
void zpci_barrier(void)
diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c
new file mode 100644
index 000000000000..ead062bf2b41
--- /dev/null
+++ b/arch/s390/pci/pci_iov.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Niklas Schnelle <schnelle@linux.ibm.com>
+ *
+ */
+
+#define KMSG_COMPONENT "zpci"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "pci_iov.h"
+
+static struct resource iov_res = {
+ .name = "PCI IOV res",
+ .start = 0,
+ .end = -1,
+ .flags = IORESOURCE_MEM,
+};
+
+void zpci_iov_map_resources(struct pci_dev *pdev)
+{
+ resource_size_t len;
+ int i;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ int bar = i + PCI_IOV_RESOURCES;
+
+ len = pci_resource_len(pdev, bar);
+ if (!len)
+ continue;
+ pdev->resource[bar].parent = &iov_res;
+ }
+}
+
+void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn)
+{
+ pci_lock_rescan_remove();
+ /* Linux VF ids (vfid) start at 0, whereas vfn starts at 1 */
+ pci_iov_remove_virtfn(pdev->physfn, vfn - 1);
+ pci_unlock_rescan_remove();
+}
+
+static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid)
+{
+ int rc;
+
+ rc = pci_iov_sysfs_link(pdev, virtfn, vfid);
+ if (rc)
+ return rc;
+
+ virtfn->is_virtfn = 1;
+ virtfn->multifunction = 0;
+ virtfn->physfn = pci_dev_get(pdev);
+
+ return 0;
+}
+
+int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn)
+{
+ int i, cand_devfn;
+ struct zpci_dev *zdev;
+ struct pci_dev *pdev;
+ int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/
+ int rc = 0;
+
+ if (!zbus->multifunction)
+ return 0;
+
+ /* If the parent PF for the given VF is also configured in the
+ * instance, it must be on the same zbus.
+ * We can then identify the parent PF by checking what
+ * devfn the VF would have if it belonged to that PF using the PF's
+ * stride and offset. Only if this candidate devfn matches the
+ * actual devfn will we link both functions.
+ */
+ for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) {
+ zdev = zbus->function[i];
+ if (zdev && zdev->is_physfn) {
+ pdev = pci_get_slot(zbus->bus, zdev->devfn);
+ if (!pdev)
+ continue;
+ cand_devfn = pci_iov_virtfn_devfn(pdev, vfid);
+ if (cand_devfn == virtfn->devfn) {
+ rc = zpci_iov_link_virtfn(pdev, virtfn, vfid);
+ /* balance pci_get_slot() */
+ pci_dev_put(pdev);
+ break;
+ }
+ /* balance pci_get_slot() */
+ pci_dev_put(pdev);
+ }
+ }
+ return rc;
+}
diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h
new file mode 100644
index 000000000000..b2c828003bad
--- /dev/null
+++ b/arch/s390/pci/pci_iov.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Niklas Schnelle <schnelle@linux.ibm.com>
+ *
+ */
+
+#ifndef __S390_PCI_IOV_H
+#define __S390_PCI_IOV_H
+
+#ifdef CONFIG_PCI_IOV
+void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn);
+
+void zpci_iov_map_resources(struct pci_dev *pdev);
+
+int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn);
+
+#else /* CONFIG_PCI_IOV */
+static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {}
+
+static inline void zpci_iov_map_resources(struct pci_dev *pdev) {}
+
+static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn)
+{
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+#endif /* __S390_PCI_IOV_H */
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index fbe97ab2e228..a2b42a63a53b 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -11,16 +11,10 @@
#include <asm/isc.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
static enum {FLOATING, DIRECTED} irq_delivery;
-#define SIC_IRQ_MODE_ALL 0
-#define SIC_IRQ_MODE_SINGLE 1
-#define SIC_IRQ_MODE_DIRECT 4
-#define SIC_IRQ_MODE_D_ALL 16
-#define SIC_IRQ_MODE_D_SINGLE 17
-#define SIC_IRQ_MODE_SET_CPU 18
-
/*
* summary bit vector
* FLOATING - summary bit per function
@@ -35,7 +29,7 @@ static struct airq_iv *zpci_sbv;
*/
static struct airq_iv **zpci_ibv;
-/* Modify PCI: Register adapter interruptions */
+/* Modify PCI: Register floating adapter interruptions */
static int zpci_set_airq(struct zpci_dev *zdev)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
@@ -45,21 +39,24 @@ static int zpci_set_airq(struct zpci_dev *zdev)
fib.fmt0.isc = PCI_ISC;
fib.fmt0.sum = 1; /* enable summary notifications */
fib.fmt0.noi = airq_iv_end(zdev->aibv);
- fib.fmt0.aibv = (unsigned long) zdev->aibv->vector;
+ fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */
- fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8;
+ fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8;
fib.fmt0.aisbo = zdev->aisb & 63;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
-/* Modify PCI: Unregister adapter interruptions */
+/* Modify PCI: Unregister floating adapter interruptions */
static int zpci_clear_airq(struct zpci_dev *zdev)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev)
fib.fmt = 1;
fib.fmt1.noi = zdev->msi_nr_irqs;
fib.fmt1.dibvo = zdev->msi_first_bit;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
@@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
u8 cc, status;
fib.fmt = 1;
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -98,14 +97,47 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
return cc ? -EIO : 0;
}
+/* Register adapter interruptions */
+static int zpci_set_irq(struct zpci_dev *zdev)
+{
+ int rc;
+
+ if (irq_delivery == DIRECTED)
+ rc = zpci_set_directed_irq(zdev);
+ else
+ rc = zpci_set_airq(zdev);
+
+ if (!rc)
+ zdev->irqs_registered = 1;
+
+ return rc;
+}
+
+/* Clear adapter interruptions */
+static int zpci_clear_irq(struct zpci_dev *zdev)
+{
+ int rc;
+
+ if (irq_delivery == DIRECTED)
+ rc = zpci_clear_directed_irq(zdev);
+ else
+ rc = zpci_clear_airq(zdev);
+
+ if (!rc)
+ zdev->irqs_registered = 0;
+
+ return rc;
+}
+
static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest,
bool force)
{
struct msi_desc *entry = irq_get_msi_desc(data->irq);
struct msi_msg msg = entry->msg;
+ int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest));
msg.address_lo &= 0xff0000ff;
- msg.address_lo |= (cpumask_first(dest) << 8);
+ msg.address_lo |= (cpu_addr << 8);
pci_write_msi_msg(data->irq, &msg);
return IRQ_SET_MASK_OK;
@@ -115,12 +147,12 @@ static struct irq_chip zpci_irq_chip = {
.name = "PCI-MSI",
.irq_unmask = pci_msi_unmask_irq,
.irq_mask = pci_msi_mask_irq,
- .irq_set_affinity = zpci_set_irq_affinity,
};
static void zpci_handle_cpu_local_irq(bool rescan)
{
struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
+ union zpci_sic_iib iib = {{0}};
unsigned long bit;
int irqs_on = 0;
@@ -132,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib))
break;
bit = 0;
continue;
@@ -160,6 +192,7 @@ static void zpci_handle_remote_irq(void *data)
static void zpci_handle_fallback_irq(void)
{
struct cpu_irq_data *cpu_data;
+ union zpci_sic_iib iib = {{0}};
unsigned long cpu;
int irqs_on = 0;
@@ -170,7 +203,7 @@ static void zpci_handle_fallback_irq(void)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
cpu = 0;
continue;
@@ -179,15 +212,16 @@ static void zpci_handle_fallback_irq(void)
if (atomic_inc_return(&cpu_data->scheduled) > 1)
continue;
- cpu_data->csd.func = zpci_handle_remote_irq;
- cpu_data->csd.info = &cpu_data->scheduled;
- cpu_data->csd.flags = 0;
+ INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
smp_call_function_single_async(cpu, &cpu_data->csd);
}
}
-static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_directed_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ bool floating = !tpi_info->directed_irq;
+
if (floating) {
inc_irq_stat(IRQIO_PCF);
zpci_handle_fallback_irq();
@@ -197,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
}
}
-static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_floating_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ union zpci_sic_iib iib = {{0}};
unsigned long si, ai;
struct airq_iv *aibv;
int irqs_on = 0;
@@ -212,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
si = 0;
continue;
@@ -239,6 +275,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
unsigned long bit;
struct msi_desc *msi;
struct msi_msg msg;
+ int cpu_addr;
int rc, irq;
zdev->aisb = -1UL;
@@ -260,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->aisb = bit;
/* Create adapter interrupt vector */
- zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK);
+ zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
if (!zdev->aibv)
return -ENOMEM;
@@ -272,11 +309,13 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
/* Request MSI interrupts */
hwirq = bit;
- for_each_pci_msi_entry(msi, pdev) {
+ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) {
rc = -EIO;
if (hwirq - bit >= msi_vecs)
break;
- irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity);
+ irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE,
+ (irq_delivery == DIRECTED) ?
+ msi->affinity : NULL);
if (irq < 0)
return -ENOMEM;
rc = irq_set_msi_desc(irq, msi);
@@ -286,9 +325,15 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
handle_percpu_irq);
msg.data = hwirq - bit;
if (irq_delivery == DIRECTED) {
+ if (msi->affinity)
+ cpu = cpumask_first(&msi->affinity->mask);
+ else
+ cpu = 0;
+ cpu_addr = smp_cpu_get_cpu_address(cpu);
+
msg.address_lo = zdev->msi_addr & 0xff0000ff;
- msg.address_lo |= msi->affinity ?
- (cpumask_first(&msi->affinity->mask) << 8) : 0;
+ msg.address_lo |= (cpu_addr << 8);
+
for_each_possible_cpu(cpu) {
airq_iv_set_data(zpci_ibv[cpu], hwirq, irq);
}
@@ -304,10 +349,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->msi_first_bit = bit;
zdev->msi_nr_irqs = msi_vecs;
- if (irq_delivery == DIRECTED)
- rc = zpci_set_directed_irq(zdev);
- else
- rc = zpci_set_airq(zdev);
+ rc = zpci_set_irq(zdev);
if (rc)
return rc;
@@ -321,21 +363,12 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
int rc;
/* Disable interrupts */
- if (irq_delivery == DIRECTED)
- rc = zpci_clear_directed_irq(zdev);
- else
- rc = zpci_clear_airq(zdev);
+ rc = zpci_clear_irq(zdev);
if (rc)
return;
/* Release MSI interrupts */
- for_each_pci_msi_entry(msi, pdev) {
- if (!msi->irq)
- continue;
- if (msi->msi_attrib.is_msix)
- __pci_msix_desc_mask_irq(msi, 1);
- else
- __pci_msi_desc_mask_irq(msi, 1, 1);
+ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) {
irq_set_msi_desc(msi->irq, NULL);
irq_free_desc(msi->irq);
msi->msg.address_lo = 0;
@@ -358,6 +391,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs);
}
+bool arch_restore_msi_irqs(struct pci_dev *pdev)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+
+ if (!zdev->irqs_registered)
+ zpci_set_irq(zdev);
+ return true;
+}
+
static struct airq_struct zpci_airq = {
.handler = zpci_floating_irq_handler,
.isc = PCI_ISC,
@@ -366,11 +408,12 @@ static struct airq_struct zpci_airq = {
static void __init cpu_enable_directed_irq(void *unused)
{
union zpci_sic_iib iib = {{0}};
+ union zpci_sic_iib ziib = {{0}};
iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector;
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
- zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib);
}
static int __init zpci_directed_irq_init(void)
@@ -378,14 +421,14 @@ static int __init zpci_directed_irq_init(void)
union zpci_sic_iib iib = {{0}};
unsigned int cpu;
- zpci_sbv = airq_iv_create(num_possible_cpus(), 0);
+ zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL);
if (!zpci_sbv)
return -ENOMEM;
iib.diib.isc = PCI_ISC;
iib.diib.nr_cpus = num_possible_cpus();
- iib.diib.disb_addr = (u64) zpci_sbv->vector;
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
+ iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv),
GFP_KERNEL);
@@ -400,7 +443,7 @@ static int __init zpci_directed_irq_init(void)
zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
AIRQ_IV_DATA |
AIRQ_IV_CACHELINE |
- (!cpu ? AIRQ_IV_ALLOC : 0));
+ (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
if (!zpci_ibv[cpu])
return -ENOMEM;
}
@@ -417,7 +460,7 @@ static int __init zpci_floating_irq_init(void)
if (!zpci_ibv)
return -ENOMEM;
- zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC);
+ zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
if (!zpci_sbv)
goto out_free;
@@ -430,6 +473,7 @@ out_free:
int __init zpci_irq_init(void)
{
+ union zpci_sic_iib iib = {{0}};
int rc;
irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING;
@@ -461,7 +505,7 @@ int __init zpci_irq_init(void)
* Enable floating IRQs (with suppression after one IRQ). When using
* directed IRQs this enables the fallback path.
*/
- zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib);
return 0;
out_airq:
diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c
new file mode 100644
index 000000000000..ff34baf50a3e
--- /dev/null
+++ b/arch/s390/pci/pci_kvm_hook.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2022. All rights reserved.
+ * Author(s): Pierre Morel <pmorel@linux.ibm.com>
+ */
+#include <linux/kvm_host.h>
+
+struct zpci_kvm_hook zpci_kvm_hook;
+EXPORT_SYMBOL_GPL(zpci_kvm_hook);
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 7d42a8794f10..588089332931 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -11,25 +11,108 @@
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/pci.h>
+#include <asm/asm-extable.h>
+#include <asm/pci_io.h>
+#include <asm/pci_debug.h>
-static long get_pfn(unsigned long user_addr, unsigned long access,
- unsigned long *pfn)
+static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset)
{
- struct vm_area_struct *vma;
- long ret;
+ struct {
+ u64 offset;
+ u8 cc;
+ u8 status;
+ } data = {offset, cc, status};
- down_read(&current->mm->mmap_sem);
- ret = -EINVAL;
- vma = find_vma(current->mm, user_addr);
- if (!vma)
- goto out;
- ret = -EACCES;
- if (!(vma->vm_flags & access))
- goto out;
- ret = follow_pfn(vma, user_addr, pfn);
-out:
- up_read(&current->mm->mmap_sem);
- return ret;
+ zpci_err_hex(&data, sizeof(data));
+}
+
+static inline int __pcistb_mio_inuser(
+ void __iomem *ioaddr, const void __user *src,
+ u64 len, u8 *status)
+{
+ int cc = -ENXIO;
+
+ asm volatile (
+ " sacf 256\n"
+ "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n"
+ "1: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ "2: sacf 768\n"
+ EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
+ : [cc] "+d" (cc), [len] "+d" (len)
+ : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src))
+ : "cc", "memory");
+ *status = len >> 24 & 0xff;
+ return cc;
+}
+
+static inline int __pcistg_mio_inuser(
+ void __iomem *ioaddr, const void __user *src,
+ u64 ulen, u8 *status)
+{
+ union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
+ int cc = -ENXIO;
+ u64 val = 0;
+ u64 cnt = ulen;
+ u8 tmp;
+
+ /*
+ * copy 0 < @ulen <= 8 bytes from @src into the rightmost bytes of
+ * a register, then store it to PCI at @ioaddr while in secondary
+ * address space. pcistg then uses the user mappings.
+ */
+ asm volatile (
+ " sacf 256\n"
+ "0: llgc %[tmp],0(%[src])\n"
+ "4: sllg %[val],%[val],8\n"
+ " aghi %[src],1\n"
+ " ogr %[val],%[tmp]\n"
+ " brctg %[cnt],0b\n"
+ "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n"
+ "2: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ "3: sacf 768\n"
+ EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b)
+ :
+ [src] "+a" (src), [cnt] "+d" (cnt),
+ [val] "+d" (val), [tmp] "=d" (tmp),
+ [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+ :: "cc", "memory");
+ *status = ioaddr_len.odd >> 24 & 0xff;
+
+ /* did we read everything from user memory? */
+ if (!cc && cnt != 0)
+ cc = -EFAULT;
+
+ return cc;
+}
+
+static inline int __memcpy_toio_inuser(void __iomem *dst,
+ const void __user *src, size_t n)
+{
+ int size, rc = 0;
+ u8 status = 0;
+
+ if (!src)
+ return -EINVAL;
+
+ while (n > 0) {
+ size = zpci_get_max_write_size((u64 __force) dst,
+ (u64 __force) src, n,
+ ZPCI_MAX_WRITE_SIZE);
+ if (size > 8) /* main path */
+ rc = __pcistb_mio_inuser(dst, src, size, &status);
+ else
+ rc = __pcistg_mio_inuser(dst, src, size, &status);
+ if (rc)
+ break;
+ src += size;
+ dst += size;
+ n -= size;
+ }
+ if (rc)
+ zpci_err_mmio(rc, status, (__force u64) dst);
+ return rc;
}
SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
@@ -38,7 +121,9 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
u8 local_buf[64];
void __iomem *io_addr;
void *buf;
- unsigned long pfn;
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ spinlock_t *ptl;
long ret;
if (!zpci_is_enabled())
@@ -46,6 +131,22 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length)
return -EINVAL;
+
+ /*
+ * We only support write access to MIO capable devices if we are on
+ * a MIO enabled system. Otherwise we would have to check for every
+ * address if it is a special ZPCI_ADDR and would have to do
+ * a pfn lookup which we don't need for MIO capable devices. Currently
+ * ISM devices are the only devices without MIO support and there is no
+ * known need for accessing these from userspace.
+ */
+ if (static_branch_likely(&have_mio)) {
+ ret = __memcpy_toio_inuser((void __iomem *) mmio_addr,
+ user_buffer,
+ length);
+ return ret;
+ }
+
if (length > 64) {
buf = kmalloc(length, GFP_KERNEL);
if (!buf)
@@ -53,32 +154,118 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
} else
buf = local_buf;
- ret = get_pfn(mmio_addr, VM_WRITE, &pfn);
+ ret = -EFAULT;
+ if (copy_from_user(buf, user_buffer, length))
+ goto out_free;
+
+ mmap_read_lock(current->mm);
+ ret = -EINVAL;
+ vma = vma_lookup(current->mm, mmio_addr);
+ if (!vma)
+ goto out_unlock_mmap;
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out_unlock_mmap;
+ ret = -EACCES;
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out_unlock_mmap;
+
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
- goto out;
- io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK));
+ goto out_unlock_mmap;
- ret = -EFAULT;
- if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE)
- goto out;
+ io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
+ (mmio_addr & ~PAGE_MASK));
- if (copy_from_user(buf, user_buffer, length))
- goto out;
+ if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE)
+ goto out_unlock_pt;
ret = zpci_memcpy_toio(io_addr, buf, length);
-out:
+out_unlock_pt:
+ pte_unmap_unlock(ptep, ptl);
+out_unlock_mmap:
+ mmap_read_unlock(current->mm);
+out_free:
if (buf != local_buf)
kfree(buf);
return ret;
}
+static inline int __pcilg_mio_inuser(
+ void __user *dst, const void __iomem *ioaddr,
+ u64 ulen, u8 *status)
+{
+ union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
+ u64 cnt = ulen;
+ int shift = ulen * 8;
+ int cc = -ENXIO;
+ u64 val, tmp;
+
+ /*
+ * read 0 < @ulen <= 8 bytes from the PCI memory mapped at @ioaddr (in
+ * user space) into a register using pcilg then store these bytes at
+ * user address @dst
+ */
+ asm volatile (
+ " sacf 256\n"
+ "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n"
+ "1: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ " ltr %[cc],%[cc]\n"
+ " jne 4f\n"
+ "2: ahi %[shift],-8\n"
+ " srlg %[tmp],%[val],0(%[shift])\n"
+ "3: stc %[tmp],0(%[dst])\n"
+ "5: aghi %[dst],1\n"
+ " brctg %[cnt],2b\n"
+ "4: sacf 768\n"
+ EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b)
+ :
+ [ioaddr_len] "+&d" (ioaddr_len.pair),
+ [cc] "+d" (cc), [val] "=d" (val),
+ [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp),
+ [shift] "+d" (shift)
+ :: "cc", "memory");
+
+ /* did we write everything to the user space buffer? */
+ if (!cc && cnt != 0)
+ cc = -EFAULT;
+
+ *status = ioaddr_len.odd >> 24 & 0xff;
+ return cc;
+}
+
+static inline int __memcpy_fromio_inuser(void __user *dst,
+ const void __iomem *src,
+ unsigned long n)
+{
+ int size, rc = 0;
+ u8 status;
+
+ while (n > 0) {
+ size = zpci_get_max_write_size((u64 __force) src,
+ (u64 __force) dst, n,
+ ZPCI_MAX_READ_SIZE);
+ rc = __pcilg_mio_inuser(dst, src, size, &status);
+ if (rc)
+ break;
+ src += size;
+ dst += size;
+ n -= size;
+ }
+ if (rc)
+ zpci_err_mmio(rc, status, (__force u64) dst);
+ return rc;
+}
+
SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
void __user *, user_buffer, size_t, length)
{
u8 local_buf[64];
void __iomem *io_addr;
void *buf;
- unsigned long pfn;
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ spinlock_t *ptl;
long ret;
if (!zpci_is_enabled())
@@ -86,29 +273,62 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length)
return -EINVAL;
+
+ /*
+ * We only support read access to MIO capable devices if we are on
+ * a MIO enabled system. Otherwise we would have to check for every
+ * address if it is a special ZPCI_ADDR and would have to do
+ * a pfn lookup which we don't need for MIO capable devices. Currently
+ * ISM devices are the only devices without MIO support and there is no
+ * known need for accessing these from userspace.
+ */
+ if (static_branch_likely(&have_mio)) {
+ ret = __memcpy_fromio_inuser(
+ user_buffer, (const void __iomem *)mmio_addr,
+ length);
+ return ret;
+ }
+
if (length > 64) {
buf = kmalloc(length, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- } else
+ } else {
buf = local_buf;
+ }
- ret = get_pfn(mmio_addr, VM_READ, &pfn);
+ mmap_read_lock(current->mm);
+ ret = -EINVAL;
+ vma = vma_lookup(current->mm, mmio_addr);
+ if (!vma)
+ goto out_unlock_mmap;
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out_unlock_mmap;
+ ret = -EACCES;
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out_unlock_mmap;
+
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
- goto out;
- io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK));
+ goto out_unlock_mmap;
+
+ io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
+ (mmio_addr & ~PAGE_MASK));
if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) {
ret = -EFAULT;
- goto out;
+ goto out_unlock_pt;
}
ret = zpci_memcpy_fromio(buf, io_addr, length);
- if (ret)
- goto out;
- if (copy_to_user(user_buffer, buf, length))
+
+out_unlock_pt:
+ pte_unmap_unlock(ptep, ptl);
+out_unlock_mmap:
+ mmap_read_unlock(current->mm);
+
+ if (!ret && copy_to_user(user_buffer, buf, length))
ret = -EFAULT;
-out:
if (buf != local_buf)
kfree(buf);
return ret;
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index 215f17437a4f..cae280e5c047 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -33,6 +33,7 @@ zpci_attr(pchid, "0x%04x\n", pchid);
zpci_attr(pfgid, "0x%02x\n", pfgid);
zpci_attr(vfn, "0x%04x\n", vfn);
zpci_attr(pft, "0x%02x\n", pft);
+zpci_attr(port, "%d\n", port);
zpci_attr(uid, "0x%x\n", uid);
zpci_attr(segment0, "0x%02x\n", pfip[0]);
zpci_attr(segment1, "0x%02x\n", pfip[1]);
@@ -81,14 +82,35 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
pci_lock_rescan_remove();
if (pci_dev_is_added(pdev)) {
pci_stop_and_remove_bus_device(pdev);
- ret = zpci_disable_device(zdev);
- if (ret)
- goto out;
+ if (zdev->dma_table) {
+ ret = zpci_dma_exit_device(zdev);
+ if (ret)
+ goto out;
+ }
+
+ if (zdev_enabled(zdev)) {
+ ret = zpci_disable_device(zdev);
+ /*
+ * Due to a z/VM vs LPAR inconsistency in the error
+ * state, the FH may indicate an enabled device while
+ * disable reports the device as already disabled; do
+ * not treat that as an error here.
+ */
+ if (ret == -EINVAL)
+ ret = 0;
+ if (ret)
+ goto out;
+ }
ret = zpci_enable_device(zdev);
if (ret)
goto out;
- pci_rescan_bus(zdev->bus);
+ ret = zpci_dma_init_device(zdev);
+ if (ret) {
+ zpci_disable_device(zdev);
+ goto out;
+ }
+ pci_rescan_bus(zdev->zbus->bus);
}
out:
pci_unlock_rescan_remove();
@@ -130,6 +152,45 @@ static ssize_t report_error_write(struct file *filp, struct kobject *kobj,
}
static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE);
+static ssize_t uid_is_unique_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0);
+}
+static DEVICE_ATTR_RO(uid_is_unique);
+
+#ifndef CONFIG_DMI
+/* analogous to smbios index */
+static ssize_t index_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
+ u32 index = ~0;
+
+ if (zpci_unique_uid)
+ index = zdev->uid;
+
+ return sysfs_emit(buf, "%u\n", index);
+}
+static DEVICE_ATTR_RO(index);
+
+static umode_t zpci_index_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ return zpci_unique_uid ? attr->mode : 0;
+}
+
+static struct attribute *zpci_ident_attrs[] = {
+ &dev_attr_index.attr,
+ NULL,
+};
+
+static struct attribute_group zpci_ident_attr_group = {
+ .attrs = zpci_ident_attrs,
+ .is_visible = zpci_index_is_visible,
+};
+#endif
+
static struct bin_attribute *zpci_bin_attrs[] = {
&bin_attr_util_string,
&bin_attr_report_error,
@@ -142,12 +203,15 @@ static struct attribute *zpci_dev_attrs[] = {
&dev_attr_pchid.attr,
&dev_attr_pfgid.attr,
&dev_attr_pft.attr,
+ &dev_attr_port.attr,
&dev_attr_vfn.attr,
&dev_attr_uid.attr,
&dev_attr_recover.attr,
&dev_attr_mio_enabled.attr,
+ &dev_attr_uid_is_unique.attr,
NULL,
};
+
static struct attribute_group zpci_attr_group = {
.attrs = zpci_dev_attrs,
.bin_attrs = zpci_bin_attrs,
@@ -168,5 +232,8 @@ static struct attribute_group pfip_attr_group = {
const struct attribute_group *zpci_attr_groups[] = {
&zpci_attr_group,
&pfip_attr_group,
+#ifndef CONFIG_DMI
+ &zpci_ident_attr_group,
+#endif
NULL,
};
diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore
index c82157f46b18..97ca52779457 100644
--- a/arch/s390/purgatory/.gitignore
+++ b/arch/s390/purgatory/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
purgatory
purgatory.chk
purgatory.lds
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index c57f8c40e992..d237bc6841cb 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -19,11 +19,13 @@ KCOV_INSTRUMENT := n
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common
+KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS))
@@ -46,7 +48,6 @@ OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*'
$(obj)/purgatory.ro: $(obj)/purgatory $(obj)/purgatory.chk FORCE
$(call if_changed,objcopy)
-$(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE
- $(call if_changed_rule,as_o_S)
+$(obj)/kexec-purgatory.o: $(obj)/purgatory.ro
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o
+obj-y += kexec-purgatory.o
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 5a10ce34b95d..6f835124ee82 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -44,11 +44,14 @@
.endm
.macro MEMSWAP dst,src,buf,len
-10: cghi \len,bufsz
+10: larl %r0,purgatory_end
+ larl %r1,stack
+ slgr %r0,%r1
+ cgr \len,%r0
jh 11f
lgr %r4,\len
j 12f
-11: lghi %r4,bufsz
+11: lgr %r4,%r0
12: MEMCPY \buf,\dst,%r4
MEMCPY \dst,\src,%r4
@@ -62,14 +65,15 @@
jh 10b
.endm
-.macro START_NEXT_KERNEL base
+.macro START_NEXT_KERNEL base subcode
lg %r4,kernel_entry-\base(%r13)
lg %r5,load_psw_mask-\base(%r13)
ogr %r4,%r5
stg %r4,0(%r0)
xgr %r0,%r0
- diag %r0,%r0,0x308
+ lghi %r1,\subcode
+ diag %r0,%r1,0x308
.endm
.text
@@ -123,7 +127,7 @@ ENTRY(purgatory_start)
je .start_crash_kernel
/* start normal kernel */
- START_NEXT_KERNEL .base_crash
+ START_NEXT_KERNEL .base_crash 0
.return_old_kernel:
lmg %r6,%r15,gprregs-.base_crash(%r13)
@@ -134,12 +138,18 @@ ENTRY(purgatory_start)
.start_crash_kernel:
/* Location of purgatory_start in crash memory */
+ larl %r0,.base_crash
+ larl %r1,purgatory_start
+ slgr %r0,%r1
lgr %r8,%r13
- aghi %r8,-(.base_crash-purgatory_start)
+ sgr %r8,%r0
/* Destination for this code i.e. end of memory to be swapped. */
+ larl %r0,purgatory_end
+ larl %r1,purgatory_start
+ slgr %r0,%r1
lg %r9,crash_size-.base_crash(%r13)
- aghi %r9,-(purgatory_end-purgatory_start)
+ sgr %r9,%r0
/* Destination in crash memory, i.e. same as r9 but in crash memory. */
lg %r10,crash_start-.base_crash(%r13)
@@ -148,15 +158,19 @@ ENTRY(purgatory_start)
/* Buffer location (in crash memory) and size. As the purgatory is
* behind the point of no return it can re-use the stack as buffer.
*/
- lghi %r11,bufsz
+ larl %r11,purgatory_end
larl %r12,stack
+ slgr %r11,%r12
MEMCPY %r12,%r9,%r11 /* dst -> (crash) buf */
MEMCPY %r9,%r8,%r11 /* self -> dst */
/* Jump to new location. */
lgr %r7,%r9
- aghi %r7,.jump_to_dst-purgatory_start
+ larl %r0,.jump_to_dst
+ larl %r1,purgatory_start
+ slgr %r0,%r1
+ agr %r7,%r0
br %r7
.jump_to_dst:
@@ -168,7 +182,10 @@ ENTRY(purgatory_start)
/* Load new buffer location after jump */
larl %r7,stack
- aghi %r10,stack-purgatory_start
+ lgr %r0,%r7
+ larl %r1,purgatory_start
+ slgr %r0,%r1
+ agr %r10,%r0
MEMCPY %r10,%r7,%r11 /* (new) buf -> (crash) buf */
/* Now the code is set up to run from its designated location. Start
@@ -227,7 +244,7 @@ ENTRY(purgatory_start)
MEMCPY %r9,%r10,%r11
/* start crash kernel */
- START_NEXT_KERNEL .base_dst
+ START_NEXT_KERNEL .base_dst 1
load_psw_mask:
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 0a423bcf6746..030efda05dbe 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -9,7 +9,7 @@
#include <linux/kexec.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/purgatory.h>
int verify_sha256_digest(void)
diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss
deleted file mode 100644
index f4f4c2c6dee9..000000000000
--- a/arch/s390/scripts/Makefile.chkbss
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-chkbss-target ?= built-in.a
-$(obj)/$(chkbss-target): chkbss
-
-chkbss-files := $(addsuffix .chkbss, $(chkbss))
-clean-files += $(chkbss-files)
-
-PHONY += chkbss
-chkbss: $(addprefix $(obj)/, $(chkbss-files))
-
-quiet_cmd_chkbss = CHKBSS $<
- cmd_chkbss = \
- if ! $(OBJSIZE) --common $< | $(AWK) 'END { if ($$3) exit 1 }'; then \
- echo "error: $< .bss section is not empty" >&2; exit 1; \
- fi; \
- touch $@;
-
-$(obj)/%.o.chkbss: $(obj)/%.o
- $(call cmd,chkbss)
diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore
index 71bd6f8eebaf..ea62f37b79ef 100644
--- a/arch/s390/tools/.gitignore
+++ b/arch/s390/tools/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
gen_facilities
gen_opcode_table
diff --git a/arch/s390/tools/gcc-thunk-extern.sh b/arch/s390/tools/gcc-thunk-extern.sh
new file mode 100755
index 000000000000..20bcbf6dd7ab
--- /dev/null
+++ b/arch/s390/tools/gcc-thunk-extern.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Borrowed from gcc: gcc/testsuite/gcc.target/s390/nobp-section-type-conflict.c
+# Checks that we don't get error: section type conflict with ‘put_page’.
+
+cat << "END" | $@ -x c - -fno-PIE -march=z10 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -O2 -c -o /dev/null
+int a;
+int b (void);
+void c (int);
+
+static void
+put_page (void)
+{
+ if (b ())
+ c (a);
+}
+
+__attribute__ ((__section__ (".init.text"), __cold__)) void
+d (void)
+{
+ put_page ();
+ put_page ();
+}
+END
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 61ce5b59b828..cb0aff5c0187 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -27,24 +27,16 @@ static struct facility_def facility_defs[] = {
*/
.name = "FACILITIES_ALS",
.bits = (int[]){
-#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
0, /* N3 instructions */
1, /* z/Arch mode installed */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
18, /* long displacement facility */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
21, /* extended-immediate facility */
25, /* store clock fast */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
27, /* mvcos */
32, /* compare and swap and store */
33, /* compare and swap and store 2 */
34, /* general instructions extension */
35, /* execute extensions */
-#endif
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
45, /* fast-BCR, etc. */
#endif
@@ -115,6 +107,11 @@ static struct facility_def facility_defs[] = {
12, /* AP Query Configuration Information */
15, /* AP Facilities Test */
156, /* etoken facility */
+ 165, /* nnpa facility */
+ 193, /* bear enhancement facility */
+ 194, /* rdp enhancement facility */
+ 196, /* processor activity instrumentation facility */
+ 197, /* processor activity instrumentation extension 1 */
-1 /* END */
}
},
diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt
index 46d8ed96cf06..5f008e794898 100644
--- a/arch/s390/tools/opcodes.txt
+++ b/arch/s390/tools/opcodes.txt
@@ -189,6 +189,8 @@ ad stosm SI_URD
ae sigp RS_RRRD
af mc SI_URD
b1 lra RX_RRRD
+b200 lbear S_RD
+b201 stbear S_RD
b202 stidp S_RD
b204 sck S_RD
b205 stck S_RD
@@ -274,6 +276,7 @@ b285 lpctl S_RD
b286 qsi S_RD
b287 lsctl S_RD
b28e qctri S_RD
+b28f qpaci S_RD
b299 srnm S_RD
b29c stfpc S_RD
b29d lfpc S_RD
@@ -523,6 +526,7 @@ b931 clgfr RRE_RR
b938 sortl RRE_RR
b939 dfltcc RRF_R0RR2
b93a kdsa RRE_RR
+b93b nnpa RRE_00
b93c ppno RRE_RR
b93e kimd RRE_RR
b93f klmd RRE_RR
@@ -562,6 +566,7 @@ b987 dlgr RRE_RR
b988 alcgr RRE_RR
b989 slbgr RRE_RR
b98a cspg RRE_RR
+b98b rdp RRF_RURR2
b98d epsw RRE_RR
b98e idte RRF_RURR2
b98f crdte RRF_RURR2
@@ -597,7 +602,7 @@ b9b3 cu42 RRE_RR
b9bd trtre RRF_U0RR
b9be srstu RRE_RR
b9bf trte RRF_U0RR
-b9c0 selhhhr RRF_RURR
+b9c0 selfhr RRF_RURR
b9c8 ahhhr RRF_R0RR2
b9c9 shhhr RRF_R0RR2
b9ca alhhhr RRF_R0RR2
@@ -876,19 +881,32 @@ e63d vstrl VSI_URDV
e63f vstrlr VRS_RRDV
e649 vlip VRI_V0UU2
e650 vcvb VRR_RV0UU
+e651 vclzdp VRR_VV0U2
e652 vcvbg VRR_RV0UU
+e654 vupkzh VRR_VV0U2
+e655 vcnf VRR_VV0UU2
+e656 vclfnh VRR_VV0UU2
e658 vcvd VRI_VR0UU
e659 vsrp VRI_VVUUU2
e65a vcvdg VRI_VR0UU
e65b vpsop VRI_VVUUU2
+e65c vupkzl VRR_VV0U2
+e65d vcfn VRR_VV0UU2
+e65e vclfnl VRR_VV0UU2
e65f vtp VRR_0V
+e670 vpkzr VRI_VVV0UU2
e671 vap VRI_VVV0UU2
+e672 vsrpr VRI_VVV0UU2
e673 vsp VRI_VVV0UU2
+e674 vschp VRR_VVV0U0U
+e675 vcrnf VRR_VVV0UU
e677 vcp VRR_0VV0U
e678 vmp VRI_VVV0UU2
e679 vmsp VRI_VVV0UU2
e67a vdp VRI_VVV0UU2
e67b vrp VRI_VVV0UU2
+e67c vscshp VRR_VVV
+e67d vcsph VRR_VVV0U0
e67e vsdp VRI_VVV0UU2
e700 vleb VRX_VRRDU
e701 vleh VRX_VRRDU
@@ -1081,6 +1099,7 @@ eb61 stric RSY_RDRU
eb62 mric RSY_RDRU
eb6a asi SIY_IRD
eb6e alsi SIY_IRD
+eb71 lpswey SIY_RD
eb7a agsi SIY_IRD
eb7e algsi SIY_IRD
eb80 icmh RSY_RURD