diff options
Diffstat (limited to 'arch/s390')
431 files changed, 32899 insertions, 18184 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index 76e362277179..a5d3503b353c 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -3,11 +3,11 @@ obj-y += kernel/ obj-y += mm/ obj-$(CONFIG_KVM) += kvm/ obj-y += crypto/ -obj-$(CONFIG_S390_HYPFS_FS) += hypfs/ +obj-$(CONFIG_S390_HYPFS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ obj-$(CONFIG_PCI) += pci/ -obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/ +obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/ # for cleaning subdir- += boot tools diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 318fce77601d..0c16dc443e2f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -17,9 +17,12 @@ config ARCH_HAS_ILOG2_U32 config ARCH_HAS_ILOG2_U64 def_bool n -config GENERIC_HWEIGHT +config ARCH_PROC_KCORE_TEXT def_bool y +config GENERIC_HWEIGHT + def_bool !HAVE_MARCH_Z196_FEATURES + config GENERIC_BUG def_bool y if BUG @@ -38,9 +41,6 @@ config AUDIT_ARCH config NO_IOPORT_MAP def_bool y -config PCI_QUIRKS - def_bool n - config ARCH_SUPPORTS_UPROBES def_bool y @@ -49,6 +49,19 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 +config CC_ASM_FLAG_OUTPUT_BROKEN + def_bool CC_IS_GCC && GCC_VERSION < 140200 + help + GCC versions before 14.2.0 may die with an internal + compiler error in some configurations if flag output + operands are used within inline assemblies. + +config CC_HAS_ASM_AOR_FORMAT_FLAGS + def_bool !(CC_IS_CLANG && CLANG_VERSION < 190100) + help + Clang versions before 19.1.0 do not support A, + O, and R inline assembly format flags. + config S390 def_bool y # @@ -57,30 +70,38 @@ config S390 imply IMA_SECURE_AND_OR_TRUSTED_BOOT select ALTERNATE_USER_ADDRESS_SPACE select ARCH_32BIT_USTAT_F_TINODE - select ARCH_BINFMT_ELF_STATE select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CRC32 select ARCH_HAS_CURRENT_STACK_POINTER + select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_DMA_OPS if PCI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV + select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYSCALL_WRAPPER - select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VDSO_DATA + select ARCH_HAS_UBSAN + select ARCH_HAS_VDSO_TIME_DATA select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH @@ -110,61 +131,78 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF - select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_USE_SYM_ANNOTATIONS select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM + select ARCH_WANT_KERNEL_PMD_MKWRITE + select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 - select DMA_OPS if PCI + select DCACHE_WORD_ACCESS if !KMSAN select DYNAMIC_FTRACE if FUNCTION_TRACER - select GCC12_NO_ARRAY_BOUNDS + select FUNCTION_ALIGNMENT_8B if CC_IS_GCC + select FUNCTION_ALIGNMENT_16B if !CC_IS_GCC select GENERIC_ALLOCATOR + select GENERIC_CPU_DEVICES select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY select GENERIC_GETTIMEOFDAY - select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select GENERIC_IOREMAP if PCI + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC select HAVE_ARCH_KCSAN + select HAVE_ARCH_KMSAN select HAVE_ARCH_KFENCE select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_VMAP_STACK select HAVE_ASM_MODVERSIONS + select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_ARGS + select HAVE_FTRACE_REGS_HAVING_PT_REGS select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FENTRY + select HAVE_FTRACE_GRAPH_FUNC select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS @@ -181,45 +219,57 @@ config S390 select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES - select HAVE_KVM select HAVE_LIVEPATCH select HAVE_MEMBLOCK_PHYS_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_NOP_MCOUNT + select HAVE_PAGE_SIZE_4KB select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE + select HAVE_RETHOOK select HAVE_RSEQ select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SAMPLE_FTRACE_DIRECT_MULTI + select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_CPU_ACCOUNTING_IDLE + select HOTPLUG_SMT select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select KASAN_VMALLOC if KASAN + select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_MERGE_VMAS select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE - select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PROC_VMCORE_DEVICE_RAM if PROC_VMCORE select NEED_SG_DMA_LENGTH if PCI select OLD_SIGACTION select OLD_SIGSUSPEND3 select PCI_DOMAINS if PCI select PCI_MSI if PCI select PCI_MSI_ARCH_FALLBACKS if PCI_MSI + select PCI_QUIRKS if PCI select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select TTY + select USER_STACKTRACE_SUPPORT + select VDSO_GETRANDOM select VIRT_CPU_ACCOUNTING + select VMAP_STACK select ZONE_DMA # Note: keep the above list sorted alphabetically @@ -232,6 +282,28 @@ config PGTABLE_LEVELS source "kernel/livepatch/Kconfig" +config ARCH_SUPPORTS_KEXEC + def_bool y + +config ARCH_SUPPORTS_KEXEC_FILE + def_bool y + +config ARCH_SUPPORTS_KEXEC_SIG + def_bool MODULE_SIG_FORMAT + +config ARCH_SUPPORTS_KEXEC_PURGATORY + def_bool y + +config ARCH_SUPPORTS_CRASH_DUMP + def_bool y + help + Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this. + This option also enables s390 zfcpdump. + See also <file:Documentation/arch/s390/zfcpdump.rst> + +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + menu "Processor type and features" config HAVE_MARCH_Z10_FEATURES @@ -261,6 +333,10 @@ config HAVE_MARCH_Z16_FEATURES def_bool n select HAVE_MARCH_Z15_FEATURES +config HAVE_MARCH_Z17_FEATURES + def_bool n + select HAVE_MARCH_Z16_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -326,6 +402,14 @@ config MARCH_Z16 Select this to enable optimizations for IBM z16 (3931 and 3932 series). +config MARCH_Z17 + bool "IBM z17" + select HAVE_MARCH_Z17_FEATURES + depends on $(cc-option,-march=z17) + help + Select this to enable optimizations for IBM z17 (9175 and + 9176 series). + endchoice config MARCH_Z10_TUNE @@ -349,6 +433,9 @@ config MARCH_Z15_TUNE config MARCH_Z16_TUNE def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT +config MARCH_Z17_TUNE + def_bool TUNE_Z17 || MARCH_Z17 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -393,6 +480,10 @@ config TUNE_Z16 bool "IBM z16" depends on $(cc-option,-mtune=z16) +config TUNE_Z17 + bool "IBM z17" + depends on $(cc-option,-mtune=z17) + endchoice config 64BIT @@ -407,18 +498,20 @@ config COMMAND_LINE_SIZE line. config COMPAT - def_bool y + def_bool n prompt "Kernel support for 31 bit emulation" select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION select HAVE_UID16 depends on MULTIUSER - depends on !CC_IS_CLANG + depends on !CC_IS_CLANG && !LD_IS_LLD help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. This option (and some other stuff like libraries and such) is needed for - executing 31 bit applications. It is safe to say "Y". + executing 31 bit applications. + + If unsure say N. config SMP def_bool y @@ -458,55 +551,48 @@ config SCHED_SMT config SCHED_MC def_bool n -config SCHED_BOOK - def_bool n - -config SCHED_DRAWER - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" select SCHED_SMT select SCHED_MC - select SCHED_BOOK - select SCHED_DRAWER help Topology scheduler support improves the CPU scheduler's decision making when dealing with machines that have multi-threading, multiple cores or multiple books. -source "kernel/Kconfig.hz" - -config KEXEC +config SCHED_TOPOLOGY_VERTICAL def_bool y - select KEXEC_CORE - -config KEXEC_FILE - bool "kexec file based system call" - select KEXEC_CORE - depends on CRYPTO - depends on CRYPTO_SHA256 - depends on CRYPTO_SHA256_S390 + bool "Use vertical CPU polarization by default" + depends on SCHED_TOPOLOGY help - Enable the kexec file based system call. In contrast to the normal - kexec system call this system call takes file descriptors for the - kernel and initramfs as arguments. + Use vertical CPU polarization by default if available. + The default CPU polarization is horizontal. -config ARCH_HAS_KEXEC_PURGATORY +config HIPERDISPATCH_ON def_bool y - depends on KEXEC_FILE + bool "Use hiperdispatch on vertical polarization by default" + depends on SCHED_TOPOLOGY + depends on PROC_SYSCTL + help + Hiperdispatch aims to improve the CPU scheduler's decision + making when using vertical polarization by adjusting CPU + capacities dynamically. Set this option to use hiperdispatch + on vertical polarization by default. This can be overwritten + by sysctl's s390.hiperdispatch attribute later on. + +source "kernel/Kconfig.hz" -config KEXEC_SIG - bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC_FILE && MODULE_SIG_FORMAT +config CERT_STORE + bool "Get user certificates via DIAG320" + depends on KEYS + select CRYPTO_LIB_SHA256 help - This option makes kernel signature verification mandatory for - the kexec_file_load() syscall. + Enable this option if you want to access user-provided secure boot + certificates via DIAG 0x320. - In addition to that option, you need to enable signature - verification for the corresponding kernel image type being - loaded in order for this to work. + These certificates will be made available via the keyring named + 'cert_store'. config KERNEL_NOBP def_bool n @@ -539,17 +625,13 @@ config EXPOLINE If unsure, say N. config EXPOLINE_EXTERN - def_bool n - depends on EXPOLINE - depends on CC_IS_GCC && GCC_VERSION >= 110200 - depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC)) - prompt "Generate expolines as extern functions." + def_bool EXPOLINE && CC_IS_GCC && GCC_VERSION >= 110200 && \ + $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC)) help - This option is required for some tooling like kpatch. The kernel is - compiled with -mindirect-branch=thunk-extern and requires a newer - compiler. - - If unsure, say N. + Generate expolines as external functions if the compiler supports it. + This option is required for some tooling like kpatch, if expolines + are enabled. The kernel is compiled with + -mindirect-branch=thunk-extern, which requires a newer compiler. choice prompt "Expoline default" @@ -568,20 +650,18 @@ config EXPOLINE_FULL endchoice config RELOCATABLE - bool "Build a relocatable kernel" - default y + def_bool y + select ARCH_VMLINUX_NEEDS_RELOCS help This builds a kernel image that retains relocation information so it can be loaded at an arbitrary address. - The kernel is linked as a position-independent executable (PIE) - and contains dynamic relocations which are processed early in the - bootup process. The relocations make the kernel image about 15% larger (compressed 10%), but are discarded at runtime. + Note: this option exists only for documentation purposes, please do + not remove it. config RANDOMIZE_BASE bool "Randomize the address of the kernel image (KASLR)" - depends on RELOCATABLE default y help In support of Kernel Address Space Layout Randomization (KASLR), @@ -589,6 +669,38 @@ config RANDOMIZE_BASE as a security feature that deters exploit attempts relying on knowledge of the location of kernel internals. +config RANDOMIZE_IDENTITY_BASE + bool "Randomize the address of the identity mapping base" + depends on RANDOMIZE_BASE + default DEBUG_VM + help + The identity mapping base address is pinned to zero by default. + Allow randomization of that base to expose otherwise missed + notion of physical and virtual addresses of data structures. + That does not have any impact on the base address at which the + kernel image is loaded. + + If unsure, say N + +config KERNEL_IMAGE_BASE + hex "Kernel image base address" + range 0x100000 0x1FFFFFE0000000 if !KASAN + range 0x100000 0x1BFFFFE0000000 if KASAN + default 0x3FFE0000000 if !KASAN + default 0x7FFFE0000000 if KASAN + help + This is the address at which the kernel image is loaded in case + Kernel Address Space Layout Randomization (KASLR) is disabled. + + In case the Protected virtualization guest support is enabled the + Ultravisor imposes a virtual address limit. If the value of this + option leads to the kernel image exceeding the Ultravisor limit, + this option is ignored and the image is loaded below the limit. + + If the value of this option leads to the kernel image overlapping + the virtual memory where other data structures are located, this + option is ignored and the image is loaded above the structures. + endmenu menu "Memory setup" @@ -611,32 +723,6 @@ config MAX_PHYSMEM_BITS Increasing the number of bits also increases the kernel image size. By default 46 bits (64TB) are supported. -config CHECK_STACK - def_bool y - depends on !VMAP_STACK - prompt "Detect kernel stack overflow" - help - This option enables the compiler option -mstack-guard and - -mstack-size if they are available. If the compiler supports them - it will emit additional code to each function prolog to trigger - an illegal operation if the kernel stack is about to overflow. - - Say N if you are unsure. - -config STACK_GUARD - int "Size of the guard area (128-1024)" - range 128 1024 - depends on CHECK_STACK - default "256" - help - This allows you to specify the size of the guard area at the lower - end of the kernel stack. If the kernel stack points into the guard - area on function entry an illegal operation is triggered. The size - needs to be a power of 2. Please keep in mind that the size of an - interrupt frame is 184 bytes for 31 bit and 328 bytes on 64 bit. - The minimum size for the stack guard should be 256 for 31 bit and - 512 for 64 bit. - endmenu menu "I/O subsystem" @@ -702,10 +788,38 @@ config EADM_SCH To compile this driver as a module, choose M here: the module will be called eadm_sch. +config AP + def_tristate y + prompt "Support for Adjunct Processors (ap)" + help + This driver allows usage to Adjunct Processor (AP) devices via + the ap bus, cards and queues. Supported Adjunct Processors are + the CryptoExpress Cards (CEX). + + To compile this driver as a module, choose M here: the + module will be called ap. + + If unsure, say Y (default). + +config AP_DEBUG + def_bool n + prompt "Enable debug features for Adjunct Processor (ap) devices" + depends on AP + help + Say 'Y' here to enable some additional debug features for Adjunct + Processor (ap) devices. + + There will be some more sysfs attributes displayed for ap queues. + + Do not enable on production level kernel build. + + If unsure, say N. + config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" - depends on S390_CCW_IOMMU && VFIO_MDEV + depends on VFIO + select VFIO_MDEV help This driver allows usage of I/O subchannels via VFIO-CCW. @@ -715,8 +829,10 @@ config VFIO_CCW config VFIO_AP def_tristate n prompt "VFIO support for AP devices" - depends on S390_AP_IOMMU && VFIO_MDEV && KVM - depends on ZCRYPT + depends on KVM + depends on VFIO + depends on AP + select VFIO_MDEV help This driver grants access to Adjunct Processor (AP) devices via the VFIO mediated device interface. @@ -726,22 +842,6 @@ config VFIO_AP endmenu -menu "Dump support" - -config CRASH_DUMP - bool "kernel crash dumps" - select KEXEC - help - Generate crash dump after being started by kexec. - Crash dump kernels are loaded in the main kernel with kexec-tools - into a specially reserved region and then later executed after - a crash by kdump/kexec. - Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this. - This option also enables s390 zfcpdump. - See also <file:Documentation/s390/zfcpdump.rst> - -endmenu - config CCW def_bool y @@ -751,17 +851,6 @@ config HAVE_PNETID menu "Virtualization" -config PROTECTED_VIRTUALIZATION_GUEST - def_bool n - prompt "Protected virtualization guest support" - help - Select this option, if you want to be able to run this - kernel as a protected virtualization KVM guest. - Protected virtualization capable machines have a mini hypervisor - located at machine level (an ultravisor). With help of the - Ultravisor, KVM will be able to run "protected" VMs, special - VMs whose memory and management data are unavailable to KVM. - config PFAULT def_bool y prompt "Pseudo page fault support" @@ -860,13 +949,24 @@ config APPLDATA_NET_SUM This can also be compiled as a module, which will be called appldata_net_sum.o. -config S390_HYPFS_FS +config S390_HYPFS def_bool y + prompt "s390 hypervisor information" + help + This provides several binary files at (debugfs)/s390_hypfs/ to + provide accounting information in an s390 hypervisor environment. + +config S390_HYPFS_FS + def_bool n prompt "s390 hypervisor file system support" select SYS_HYPERVISOR + depends on S390_HYPFS help This is a virtual file system intended to provide accounting - information in an s390 hypervisor environment. + information in an s390 hypervisor environment. This file system + is deprecated and should not be used. + + Say N if you are unsure. source "arch/s390/kvm/Kconfig" diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index c4300ea4abf8..7955d7eee7d8 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -13,6 +13,16 @@ config DEBUG_ENTRY If unsure, say N. +config STRICT_MM_TYPECHECKS + bool "Strict Memory Management Type Checks" + depends on DEBUG_KERNEL + help + Enable strict type checking for memory management types like pte_t + and pmd_t. This generates slightly worse code and should be used + for debug builds. + + If unsure, say N. + config CIO_INJECT bool "CIO Inject interfaces" depends on DEBUG_KERNEL && DEBUG_FS diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 4cb5d17e7ead..7679bc16b692 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -14,28 +14,29 @@ KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 -ifeq ($(CONFIG_RELOCATABLE),y) -KBUILD_CFLAGS += -fPIE -LDFLAGS_vmlinux := -pie -endif +KBUILD_CFLAGS += -fPIC +LDFLAGS_vmlinux := $(call ld-option,-no-pie) +extra_tools := relocs aflags_dwarf := -Wa,-gdwarf-2 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) endif -KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY +KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector +KBUILD_CFLAGS_DECOMPRESSOR += -fPIE KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) UTS_MACHINE := s390x -STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) +STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384)) CHECKFLAGS += -D__s390__ -D__s390x__ export LD_BFD @@ -47,6 +48,7 @@ mflags-$(CONFIG_MARCH_Z13) := -march=z13 mflags-$(CONFIG_MARCH_Z14) := -march=z14 mflags-$(CONFIG_MARCH_Z15) := -march=z15 mflags-$(CONFIG_MARCH_Z16) := -march=z16 +mflags-$(CONFIG_MARCH_Z17) := -march=z17 export CC_FLAGS_MARCH := $(mflags-y) @@ -60,6 +62,7 @@ cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-$(CONFIG_MARCH_Z15_TUNE) += -mtune=z15 cflags-$(CONFIG_MARCH_Z16_TUNE) += -mtune=z16 +cflags-$(CONFIG_MARCH_Z17_TUNE) += -mtune=z17 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include @@ -71,18 +74,8 @@ cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y) KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y) -ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),) - CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE) - ifeq ($(call cc-option,-mstack-size=8192),) - CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD) - endif - export CC_FLAGS_CHECK_STACK - cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK) -endif - ifdef CONFIG_EXPOLINE ifdef CONFIG_EXPOLINE_EXTERN - KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern else @@ -119,10 +112,7 @@ export KBUILD_CFLAGS_DECOMPRESSOR OBJCOPYFLAGS := -O binary -head-y := arch/s390/kernel/head64.o - libs-y += arch/s390/lib/ -drivers-y += drivers/s390/ boot := arch/s390/boot syscalls := arch/s390/kernel/syscalls @@ -142,15 +132,12 @@ bzImage: vmlinux zfcpdump: $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ -vdso_install: - $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@ - archheaders: $(Q)$(MAKE) $(build)=$(syscalls) uapi archprepare: $(Q)$(MAKE) $(build)=$(syscalls) kapi - $(Q)$(MAKE) $(build)=$(tools) kapi + $(Q)$(MAKE) $(build)=$(tools) kapi $(extra_tools) ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. # In order to do that, we should use the archprepare target, but we can't since @@ -164,11 +151,9 @@ vdso_prepare: prepare0 $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) -ifdef CONFIG_EXPOLINE_EXTERN -modules_prepare: expoline_prepare -expoline_prepare: - $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o -endif +vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg +vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg + endif # Don't use tabs in echo arguments diff --git a/arch/s390/Makefile.postlink b/arch/s390/Makefile.postlink new file mode 100644 index 000000000000..c2b737500a91 --- /dev/null +++ b/arch/s390/Makefile.postlink @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link s390 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into relocs.S. +# 2. Strip relocations from vmlinux. + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS=arch/s390/tools/relocs +OUT_RELOCS = arch/s390/boot +quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/relocs.S + cmd_relocs = \ + mkdir -p $(OUT_RELOCS); \ + $(CMD_RELOCS) $@ > $(OUT_RELOCS)/relocs.S + +vmlinux.unstripped: FORCE + $(call cmd,relocs) + +clean: + @rm -f $(OUT_RELOCS)/relocs.S + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index d74a4c7d5df6..dd7ba7587dd5 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -26,12 +26,10 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/workqueue.h> -#include <linux/suspend.h> -#include <linux/platform_device.h> +#include <linux/uaccess.h> +#include <linux/io.h> #include <asm/appldata.h> #include <asm/vtimer.h> -#include <linux/uaccess.h> -#include <asm/io.h> #include <asm/smp.h> #include "appldata.h" @@ -44,19 +42,17 @@ #define TOD_MICRO 0x01000 /* nr. of TOD clock units for 1 microsecond */ -static struct platform_device *appldata_pdev; - /* * /proc entries (sysctl) */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; -static int appldata_timer_handler(struct ctl_table *ctl, int write, +static int appldata_timer_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -static int appldata_interval_handler(struct ctl_table *ctl, int write, +static int appldata_interval_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; -static struct ctl_table appldata_table[] = { +static const struct ctl_table appldata_table[] = { { .procname = "timer", .mode = S_IRUGO | S_IWUSR, @@ -67,17 +63,6 @@ static struct ctl_table appldata_table[] = { .mode = S_IRUGO | S_IWUSR, .proc_handler = appldata_interval_handler, }, - { }, -}; - -static struct ctl_table appldata_dir_table[] = { - { - .procname = appldata_proc_name, - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = appldata_table, - }, - { }, }; /* @@ -88,7 +73,6 @@ static struct vtimer_list appldata_timer; static DEFINE_SPINLOCK(appldata_timer_lock); static int appldata_interval = APPLDATA_CPU_INTERVAL; static int appldata_timer_active; -static int appldata_timer_suspended = 0; /* * Work queue @@ -215,7 +199,7 @@ static void __appldata_vtimer_setup(int cmd) * Start/Stop timer, show status of timer (0 = not active, 1 = active) */ static int -appldata_timer_handler(struct ctl_table *ctl, int write, +appldata_timer_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; @@ -248,7 +232,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, * current timer interval. */ static int -appldata_interval_handler(struct ctl_table *ctl, int write, +appldata_interval_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; @@ -278,7 +262,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, * monitoring (0 = not in process, 1 = in process) */ static int -appldata_generic_handler(struct ctl_table *ctl, int write, +appldata_generic_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; @@ -296,7 +280,7 @@ appldata_generic_handler(struct ctl_table *ctl, int write, mutex_lock(&appldata_ops_mutex); list_for_each(lh, &appldata_ops_list) { tmp_ops = list_entry(lh, struct appldata_ops, list); - if (&tmp_ops->ctl_table[2] == ctl) { + if (&tmp_ops->ctl_table[0] == ctl) { found = 1; } } @@ -366,7 +350,7 @@ int appldata_register_ops(struct appldata_ops *ops) if (ops->size > APPLDATA_MAX_REC_SIZE) return -EINVAL; - ops->ctl_table = kcalloc(4, sizeof(struct ctl_table), GFP_KERNEL); + ops->ctl_table = kcalloc(1, sizeof(struct ctl_table), GFP_KERNEL); if (!ops->ctl_table) return -ENOMEM; @@ -374,17 +358,12 @@ int appldata_register_ops(struct appldata_ops *ops) list_add(&ops->list, &appldata_ops_list); mutex_unlock(&appldata_ops_mutex); - ops->ctl_table[0].procname = appldata_proc_name; - ops->ctl_table[0].maxlen = 0; - ops->ctl_table[0].mode = S_IRUGO | S_IXUGO; - ops->ctl_table[0].child = &ops->ctl_table[2]; - - ops->ctl_table[2].procname = ops->name; - ops->ctl_table[2].mode = S_IRUGO | S_IWUSR; - ops->ctl_table[2].proc_handler = appldata_generic_handler; - ops->ctl_table[2].data = ops; + ops->ctl_table[0].procname = ops->name; + ops->ctl_table[0].mode = S_IRUGO | S_IWUSR; + ops->ctl_table[0].proc_handler = appldata_generic_handler; + ops->ctl_table[0].data = ops; - ops->sysctl_header = register_sysctl_table(ops->ctl_table); + ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1); if (!ops->sysctl_header) goto out; return 0; @@ -412,88 +391,6 @@ void appldata_unregister_ops(struct appldata_ops *ops) /********************** module-ops management <END> **************************/ -/**************************** suspend / resume *******************************/ -static int appldata_freeze(struct device *dev) -{ - struct appldata_ops *ops; - int rc; - struct list_head *lh; - - spin_lock(&appldata_timer_lock); - if (appldata_timer_active) { - __appldata_vtimer_setup(APPLDATA_DEL_TIMER); - appldata_timer_suspended = 1; - } - spin_unlock(&appldata_timer_lock); - - mutex_lock(&appldata_ops_mutex); - list_for_each(lh, &appldata_ops_list) { - ops = list_entry(lh, struct appldata_ops, list); - if (ops->active == 1) { - rc = appldata_diag(ops->record_nr, APPLDATA_STOP_REC, - (unsigned long) ops->data, ops->size, - ops->mod_lvl); - if (rc != 0) - pr_err("Stopping the data collection for %s " - "failed with rc=%d\n", ops->name, rc); - } - } - mutex_unlock(&appldata_ops_mutex); - return 0; -} - -static int appldata_restore(struct device *dev) -{ - struct appldata_ops *ops; - int rc; - struct list_head *lh; - - spin_lock(&appldata_timer_lock); - if (appldata_timer_suspended) { - __appldata_vtimer_setup(APPLDATA_ADD_TIMER); - appldata_timer_suspended = 0; - } - spin_unlock(&appldata_timer_lock); - - mutex_lock(&appldata_ops_mutex); - list_for_each(lh, &appldata_ops_list) { - ops = list_entry(lh, struct appldata_ops, list); - if (ops->active == 1) { - ops->callback(ops->data); // init record - rc = appldata_diag(ops->record_nr, - APPLDATA_START_INTERVAL_REC, - (unsigned long) ops->data, ops->size, - ops->mod_lvl); - if (rc != 0) { - pr_err("Starting the data collection for %s " - "failed with rc=%d\n", ops->name, rc); - } - } - } - mutex_unlock(&appldata_ops_mutex); - return 0; -} - -static int appldata_thaw(struct device *dev) -{ - return appldata_restore(dev); -} - -static const struct dev_pm_ops appldata_pm_ops = { - .freeze = appldata_freeze, - .thaw = appldata_thaw, - .restore = appldata_restore, -}; - -static struct platform_driver appldata_pdrv = { - .driver = { - .name = "appldata", - .pm = &appldata_pm_ops, - }, -}; -/************************* suspend / resume <END> ****************************/ - - /******************************* init / exit *********************************/ /* @@ -503,36 +400,14 @@ static struct platform_driver appldata_pdrv = { */ static int __init appldata_init(void) { - int rc; - init_virt_timer(&appldata_timer); appldata_timer.function = appldata_timer_function; appldata_timer.data = (unsigned long) &appldata_work; - - rc = platform_driver_register(&appldata_pdrv); - if (rc) - return rc; - - appldata_pdev = platform_device_register_simple("appldata", -1, NULL, - 0); - if (IS_ERR(appldata_pdev)) { - rc = PTR_ERR(appldata_pdev); - goto out_driver; - } appldata_wq = alloc_ordered_workqueue("appldata", 0); - if (!appldata_wq) { - rc = -ENOMEM; - goto out_device; - } - - appldata_sysctl_header = register_sysctl_table(appldata_dir_table); + if (!appldata_wq) + return -ENOMEM; + appldata_sysctl_header = register_sysctl(appldata_proc_name, appldata_table); return 0; - -out_device: - platform_device_unregister(appldata_pdev); -out_driver: - platform_driver_unregister(&appldata_pdrv); - return rc; } __initcall(appldata_init); diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index 21c3147bd92a..fc608f9b79ab 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -15,7 +15,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/slab.h> -#include <asm/io.h> +#include <linux/io.h> #include "appldata.h" diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore index f56591bc0897..af2a6a7bc028 100644 --- a/arch/s390/boot/.gitignore +++ b/arch/s390/boot/.gitignore @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only image bzImage +relocs.S section_cmp.* vmlinux vmlinux.lds +vmlinux.map vmlinux.syms diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 883357a211a3..bee49626be4b 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -3,61 +3,53 @@ # Makefile for the linux s390-specific parts of the memory manager. # +# Tooling runtimes are unavailable and cannot be linked for early boot code KCOV_INSTRUMENT := n GCOV_PROFILE := n UBSAN_SANITIZE := n KASAN_SANITIZE := n KCSAN_SANITIZE := n - -KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) -KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) +KMSAN_SANITIZE := n # -# Use minimum architecture for als.c to be able to print an error +# Use minimum architecture level so it is possible to print an error # message if the kernel is started on a machine which is too old # -ifndef CONFIG_CC_IS_CLANG -CC_FLAGS_MARCH_MINIMUM := -march=z900 -else CC_FLAGS_MARCH_MINIMUM := -march=z10 -endif - -ifneq ($(CC_FLAGS_MARCH),$(CC_FLAGS_MARCH_MINIMUM)) -AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += $(CC_FLAGS_MARCH_MINIMUM) -AFLAGS_REMOVE_mem.o += $(CC_FLAGS_MARCH) -AFLAGS_mem.o += $(CC_FLAGS_MARCH_MINIMUM) -CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += $(CC_FLAGS_MARCH_MINIMUM) -CFLAGS_REMOVE_sclp_early_core.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp_early_core.o += $(CC_FLAGS_MARCH_MINIMUM) -endif + +KBUILD_AFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_AFLAGS_DECOMPRESSOR)) +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_CFLAGS_DECOMPRESSOR)) +KBUILD_AFLAGS += $(CC_FLAGS_MARCH_MINIMUM) +KBUILD_CFLAGS += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o ipl_data.o -obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o -obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o +obj-y += version.o pgm_check.o ctype.o ipl_data.o relocs.o alternative.o +obj-y += uv.o printk.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o +obj-$(CONFIG_KMSAN) += kmsan.o obj-all := $(obj-y) piggy.o syms.o targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all) +targets += relocs.S OBJECTS := $(addprefix $(obj)/,$(obj-y)) OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all)) +clean-files += vmlinux.map + quiet_cmd_section_cmp = SECTCMP $* define cmd_section_cmp - s1=`$(OBJDUMP) -t -j "$*" "$<" | sort | \ + s1=`$(OBJDUMP) -t "$<" | grep "\s$*\s\+" | sort | \ sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \ - s2=`$(OBJDUMP) -t -j "$*" "$(word 2,$^)" | sort | \ + s2=`$(OBJDUMP) -t "$(word 2,$^)" | grep "\s$*\s\+" | sort | \ sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \ if [ "$$s1" != "$$s2" ]; then \ echo "error: section $* differs between $< and $(word 2,$^)" >&2; \ @@ -72,11 +64,12 @@ $(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.b $(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE $(call if_changed,section_cmp) -LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup --build-id=sha1 -T +LDFLAGS_vmlinux-$(CONFIG_LD_ORPHAN_WARN) := --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) +LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE $(call if_changed,ld) -LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T +LDFLAGS_vmlinux.syms := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup -T $(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE $(call if_changed,ld) @@ -92,7 +85,7 @@ OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section . $(obj)/syms.o: $(obj)/syms.bin FORCE $(call if_changed,objcopy) -OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load +OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=alloc,load $(obj)/info.bin: vmlinux FORCE $(call if_changed,objcopy) @@ -104,6 +97,10 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +# relocs.S is created by the vmlinux postlink step. +$(obj)/relocs.S: vmlinux + @true + suffix-$(CONFIG_KERNEL_GZIP) := .gz suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 suffix-$(CONFIG_KERNEL_LZ4) := .lz4 diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index 47c48fbfb563..79afb5fa7f1f 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -9,42 +9,8 @@ #include <asm/sclp.h> #include "boot.h" -/* - * The code within this file will be called very early. It may _not_ - * access anything within the bss section, since that is not cleared - * yet and may contain data (e.g. initrd) that must be saved by other - * code. - * For temporary objects the stack (16k) should be used. - */ - static unsigned long als[] = { FACILITIES_ALS }; -static void u16_to_hex(char *str, u16 val) -{ - int i, num; - - for (i = 1; i <= 4; i++) { - num = (val >> (16 - 4 * i)) & 0xf; - if (num >= 10) - num += 7; - *str++ = '0' + num; - } - *str = '\0'; -} - -static void print_machine_type(void) -{ - static char mach_str[80] = "Detected machine-type number: "; - char type_str[5]; - struct cpuid id; - - get_cpu_id(&id); - u16_to_hex(type_str, id.machine); - strcat(mach_str, type_str); - strcat(mach_str, "\n"); - sclp_early_printk(mach_str); -} - static void u16_to_decimal(char *str, u16 val) { int div = 1; @@ -80,8 +46,7 @@ void print_missing_facilities(void) * z/VM adds a four character prefix. */ if (strlen(als_str) > 70) { - strcat(als_str, "\n"); - sclp_early_printk(als_str); + boot_emerg("%s\n", als_str); *als_str = '\0'; } u16_to_decimal(val_str, i * BITS_PER_LONG + j); @@ -89,16 +54,18 @@ void print_missing_facilities(void) first = 0; } } - strcat(als_str, "\n"); - sclp_early_printk(als_str); + boot_emerg("%s\n", als_str); } static void facility_mismatch(void) { - sclp_early_printk("The Linux kernel requires more recent processor hardware\n"); - print_machine_type(); + struct cpuid id; + + get_cpu_id(&id); + boot_emerg("The Linux kernel requires more recent processor hardware\n"); + boot_emerg("Detected machine-type number: %4x\n", id.machine); print_missing_facilities(); - sclp_early_printk("See Principles of Operations for facility bits\n"); + boot_emerg("See Principles of Operations for facility bits\n"); disabled_wait(); } diff --git a/arch/s390/boot/alternative.c b/arch/s390/boot/alternative.c new file mode 100644 index 000000000000..19ea7934b918 --- /dev/null +++ b/arch/s390/boot/alternative.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "alt: " fmt +#include "boot.h" + +#define a_debug boot_debug + +#include "../kernel/alternative.c" + +static void alt_debug_all(int type) +{ + int i; + + switch (type) { + case ALT_TYPE_FACILITY: + for (i = 0; i < ARRAY_SIZE(alt_debug.facilities); i++) + alt_debug.facilities[i] = -1UL; + break; + case ALT_TYPE_FEATURE: + for (i = 0; i < ARRAY_SIZE(alt_debug.mfeatures); i++) + alt_debug.mfeatures[i] = -1UL; + break; + case ALT_TYPE_SPEC: + alt_debug.spec = 1; + break; + } +} + +static void alt_debug_modify(int type, unsigned int nr, bool clear) +{ + switch (type) { + case ALT_TYPE_FACILITY: + if (clear) + __clear_facility(nr, alt_debug.facilities); + else + __set_facility(nr, alt_debug.facilities); + break; + case ALT_TYPE_FEATURE: + if (clear) + __clear_machine_feature(nr, alt_debug.mfeatures); + else + __set_machine_feature(nr, alt_debug.mfeatures); + break; + } +} + +static char *alt_debug_parse(int type, char *str) +{ + unsigned long val, endval; + char *endp; + bool clear; + int i; + + if (*str == ':') { + str++; + } else { + alt_debug_all(type); + return str; + } + clear = false; + if (*str == '!') { + alt_debug_all(type); + clear = true; + str++; + } + while (*str) { + val = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + if (*str == '-') { + str++; + endval = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + while (val <= endval) { + alt_debug_modify(type, val, clear); + val++; + } + } else { + alt_debug_modify(type, val, clear); + } + if (*str != ',') + break; + str++; + } + return str; +} + +/* + * Use debug-alternative command line parameter for debugging: + * "debug-alternative" + * -> print debug message for every single alternative + * + * "debug-alternative=0;2" + * -> print debug message for all alternatives with type 0 and 2 + * + * "debug-alternative=0:0-7" + * -> print debug message for all alternatives with type 0 and with + * facility numbers within the range of 0-7 + * (if type 0 is ALT_TYPE_FACILITY) + * + * "debug-alternative=0:!8;1" + * -> print debug message for all alternatives with type 0, for all + * facility number, except facility 8, and in addition print all + * alternatives with type 1 + */ +void alt_debug_setup(char *str) +{ + unsigned long type; + char *endp; + int i; + + if (!str) { + alt_debug_all(ALT_TYPE_FACILITY); + alt_debug_all(ALT_TYPE_FEATURE); + alt_debug_all(ALT_TYPE_SPEC); + return; + } + while (*str) { + type = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + switch (type) { + case ALT_TYPE_FACILITY: + case ALT_TYPE_FEATURE: + str = alt_debug_parse(type, str); + break; + case ALT_TYPE_SPEC: + alt_debug_all(ALT_TYPE_SPEC); + break; + } + if (*str != ';') + break; + str++; + } +} diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 70418389414d..e045cae6e80a 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -8,31 +8,118 @@ #ifndef __ASSEMBLY__ +#include <linux/printk.h> +#include <asm/physmem_info.h> + +struct vmlinux_info { + unsigned long entry; + unsigned long image_size; /* does not include .bss */ + unsigned long bss_size; /* uncompressed image .bss size */ + unsigned long bootdata_off; + unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long bootdata_preserved_size; + unsigned long got_start; + unsigned long got_end; + unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; + unsigned long alt_instructions; + unsigned long alt_instructions_end; +#ifdef CONFIG_KASAN + unsigned long kasan_early_shadow_page_off; + unsigned long kasan_early_shadow_pte_off; + unsigned long kasan_early_shadow_pmd_off; + unsigned long kasan_early_shadow_pud_off; + unsigned long kasan_early_shadow_p4d_off; +#endif +}; + void startup_kernel(void); -unsigned long detect_memory(void); +unsigned long detect_max_physmem_end(void); +void detect_physmem_online_ranges(unsigned long max_physmem_end); +void physmem_set_usable_limit(unsigned long limit); +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size); +void physmem_free(enum reserved_range_type type); +/* for continuous/multiple allocations per type */ +unsigned long physmem_alloc_or_die(enum reserved_range_type type, unsigned long size, + unsigned long align); +unsigned long physmem_alloc(enum reserved_range_type type, unsigned long size, + unsigned long align, bool die_on_oom); +/* for single allocations, 1 per type */ +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom); +unsigned long get_physmem_alloc_pos(void); +void dump_physmem_reserved(void); +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start); bool is_ipl_block_dump(void); void store_ipl_parmblock(void); +int read_ipl_report(void); +void save_ipl_cert_comp_list(void); void setup_boot_command_line(void); void parse_boot_command_line(void); void verify_facilities(void); void print_missing_facilities(void); void sclp_early_setup_buffer(void); -void print_pgm_check_info(void); -unsigned long get_random_base(unsigned long safe_addr); -void __printf(1, 2) decompressor_printk(const char *fmt, ...); +void alt_debug_setup(char *str); +void do_pgm_check(struct pt_regs *regs); +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max); +void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit); +int __printf(1, 2) boot_printk(const char *fmt, ...); +void print_stacktrace(unsigned long sp); +void error(char *m); +int get_random(unsigned long limit, unsigned long *value); +void boot_rb_dump(void); + +#ifndef boot_fmt +#define boot_fmt(fmt) fmt +#endif + +#define boot_emerg(fmt, ...) boot_printk(KERN_EMERG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_alert(fmt, ...) boot_printk(KERN_ALERT boot_fmt(fmt), ##__VA_ARGS__) +#define boot_crit(fmt, ...) boot_printk(KERN_CRIT boot_fmt(fmt), ##__VA_ARGS__) +#define boot_err(fmt, ...) boot_printk(KERN_ERR boot_fmt(fmt), ##__VA_ARGS__) +#define boot_warn(fmt, ...) boot_printk(KERN_WARNING boot_fmt(fmt), ##__VA_ARGS__) +#define boot_notice(fmt, ...) boot_printk(KERN_NOTICE boot_fmt(fmt), ##__VA_ARGS__) +#define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) +#define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) + +extern struct machine_info machine; +extern int boot_console_loglevel; +extern bool boot_ignore_loglevel; /* Symbols defined by linker scripts */ extern const char kernel_version[]; extern unsigned long memory_limit; extern unsigned long vmalloc_size; extern int vmalloc_size_set; -extern int kaslr_enabled; extern char __boot_data_start[], __boot_data_end[]; extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +extern char __vmlinux_relocs_64_start[], __vmlinux_relocs_64_end[]; extern char _decompressor_syms_start[], _decompressor_syms_end[]; extern char _stack_start[], _stack_end[]; +extern char _end[], _decompressor_end[]; +extern unsigned char _compressed_start[]; +extern unsigned char _compressed_end[]; +extern struct vmlinux_info _vmlinux_info; + +#define vmlinux _vmlinux_info -unsigned long read_ipl_report(unsigned long safe_offset); +#define __lowcore_pa(x) ((unsigned long)(x) % sizeof(struct lowcore)) +#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore)) +#define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset)) +#define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys) +#define __identity_va(x) ((void *)((unsigned long)(x) + __identity_base)) +#define __identity_pa(x) ((unsigned long)(x) - __identity_base) +static inline bool intersects(unsigned long addr0, unsigned long size0, + unsigned long addr1, unsigned long size1) +{ + return addr0 + size0 > addr1 && addr1 + size1 > addr0; +} #endif /* __ASSEMBLY__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index e27c2140d620..03500b9d9fb9 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -9,8 +9,10 @@ #include <linux/kernel.h> #include <linux/string.h> +#include <asm/boot_data.h> #include <asm/page.h> #include "decompressor.h" +#include "boot.h" /* * gzip declarations @@ -23,9 +25,9 @@ #define memmove memmove #define memzero(s, n) memset((s), 0, (n)) -#ifdef CONFIG_KERNEL_BZIP2 +#if defined(CONFIG_KERNEL_BZIP2) #define BOOT_HEAP_SIZE 0x400000 -#elif CONFIG_KERNEL_ZSTD +#elif defined(CONFIG_KERNEL_ZSTD) #define BOOT_HEAP_SIZE 0x30000 #else #define BOOT_HEAP_SIZE 0x10000 @@ -62,24 +64,22 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; #include "../../../../lib/decompress_unzstd.c" #endif -#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE) +static void decompress_error(char *m) +{ + if (bootdebug) + boot_rb_dump(); + boot_emerg("Decompression error: %s\n", m); + boot_emerg(" -- System halted\n"); + disabled_wait(); +} unsigned long mem_safe_offset(void) { - /* - * due to 4MB HEAD_SIZE for bzip2 - * 'decompress_offset + vmlinux.image_size' could be larger than - * kernel at final position + its .bss, so take the larger of two - */ - return max(decompress_offset + vmlinux.image_size, - vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size); + return ALIGN(free_mem_end_ptr, PAGE_SIZE); } -void *decompress_kernel(void) +void deploy_kernel(void *output) { - void *output = (void *)decompress_offset; - __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, 0, NULL, error); - return output; + NULL, NULL, output, vmlinux.image_size, NULL, decompress_error); } diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h index f75cc31a77dd..4f966f06bd65 100644 --- a/arch/s390/boot/decompressor.h +++ b/arch/s390/boot/decompressor.h @@ -2,37 +2,9 @@ #ifndef BOOT_COMPRESSED_DECOMPRESSOR_H #define BOOT_COMPRESSED_DECOMPRESSOR_H -#include <linux/stddef.h> - -#ifdef CONFIG_KERNEL_UNCOMPRESSED -static inline void *decompress_kernel(void) { return NULL; } -#else -void *decompress_kernel(void); -#endif +#ifndef CONFIG_KERNEL_UNCOMPRESSED unsigned long mem_safe_offset(void); -void error(char *m); - -struct vmlinux_info { - unsigned long default_lma; - void (*entry)(void); - unsigned long image_size; /* does not include .bss */ - unsigned long bss_size; /* uncompressed image .bss size */ - unsigned long bootdata_off; - unsigned long bootdata_size; - unsigned long bootdata_preserved_off; - unsigned long bootdata_preserved_size; - unsigned long dynsym_start; - unsigned long rela_dyn_start; - unsigned long rela_dyn_end; - unsigned long amode31_size; -}; - -/* Symbols defined by linker scripts */ -extern char _end[]; -extern unsigned char _compressed_start[]; -extern unsigned char _compressed_end[]; -extern char _vmlinux_info[]; - -#define vmlinux (*(struct vmlinux_info *)_vmlinux_info) +void deploy_kernel(void *output); +#endif #endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 3f79b9efb803..0b511d5c030b 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -67,7 +67,7 @@ ipl_start: jz .Lagain1 # skip dataset header larl %r13,.L_eof clc 0(3,%r4),0(%r13) # if it is EOFx - jz .Lagain1 # skip dateset trailer + jz .Lagain1 # skip data set trailer lgr %r5,%r2 la %r6,COMMAND_LINE-PARMAREA(%r12) lgr %r7,%r2 @@ -185,19 +185,19 @@ ipl_start: larl %r13,.Lcrash lpsw 0(%r13) - .align 8 + .balign 8 .Lwaitpsw: .quad 0x0202000180000000,.Lioint .Lnewpswmask: .quad 0x0000000180000000 - .align 8 + .balign 8 .Lorb: .long 0x00000000,0x0080ff00,.Lccws .Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 - .align 8 + .balign 8 .Lcr6: .quad 0x00000000ff000000 - .align 8 + .balign 8 .Lcrash:.long 0x000a0000,0x00000000 - .align 8 + .balign 8 .Lccws: .rept 19 .long 0x02600050,0x00000000 .endr @@ -207,7 +207,7 @@ ipl_start: .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" .L_eof: .long 0xc5d6c600 /* C'EOF' */ .L_hdr: .long 0xc8c4d900 /* C'HDR' */ - .align 8 + .balign 8 .Lcpuid:.fill 8,1,0 # @@ -254,8 +254,9 @@ SYM_CODE_START_LOCAL(startup_normal) xc 0xf00(256),0xf00 larl %r13,.Lctl lctlg %c0,%c15,0(%r13) # load control registers - stcke __LC_BOOT_CLOCK - mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 + larl %r13,tod_clock_base + stcke 0(%r13) + mvc __LC_LAST_UPDATE_CLOCK(8),1(%r13) larl %r13,6f spt 0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) @@ -265,7 +266,7 @@ SYM_CODE_START_LOCAL(startup_normal) brasl %r14,startup_kernel SYM_CODE_END(startup_normal) - .align 8 + .balign 8 6: .long 0x7fffffff,0xffffffff .Lext_new_psw: .quad 0x0002000180000000,0x1b0 # disabled wait @@ -292,18 +293,12 @@ SYM_CODE_END(startup_normal) #include "head_kdump.S" -# -# This program check is active immediately after kernel start -# and until early_pgm_check_handler is set in kernel/early.c -# It simply saves general/control registers and psw in -# the save area and does disabled wait with a faulty address. -# SYM_CODE_START_LOCAL(startup_pgm_check_handler) - stmg %r8,%r15,__LC_SAVE_AREA_SYNC + stmg %r8,%r15,__LC_SAVE_AREA la %r8,4095 stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8) stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8) - mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC + mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW ni __LC_RETURN_PSW,0xfc # remove IO and EX bits @@ -311,8 +306,18 @@ SYM_CODE_START_LOCAL(startup_pgm_check_handler) oi __LC_RETURN_PSW+1,0x2 # set wait state bit larl %r9,.Lold_psw_disabled_wait stg %r9,__LC_PGM_NEW_PSW+8 - larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD - brasl %r14,print_pgm_check_info + larl %r15,_dump_info_stack_end-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r2,STACK_FRAME_OVERHEAD(%r15) + mvc __PT_PSW(16,%r2),__LC_PSW_SAVE_AREA-4095(%r8) + mvc __PT_R0(128,%r2),__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __PT_LAST_BREAK(8,%r2),__LC_PGM_LAST_BREAK + mvc __PT_INT_CODE(4,%r2),__LC_PGM_INT_CODE + brasl %r14,do_pgm_check + larl %r9,startup_pgm_check_handler + stg %r9,__LC_PGM_NEW_PSW+8 + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW .Lold_psw_disabled_wait: la %r8,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S index f015469e7db9..f7107c76258c 100644 --- a/arch/s390/boot/head_kdump.S +++ b/arch/s390/boot/head_kdump.S @@ -82,12 +82,12 @@ SYM_CODE_START_LOCAL(startup_kdump) # # Startup of kdump (relocated new kernel) # -.align 2 + .balign 2 startup_kdump_relocated: basr %r13,0 0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel... SYM_CODE_END(startup_kdump) -.align 8 + .balign 8 .Lrestart_psw: .quad 0x0000000080000000,0x0000000000000000 + startup #else @@ -95,7 +95,7 @@ SYM_CODE_START_LOCAL(startup_kdump) larl %r13,startup_kdump_crash lpswe 0(%r13) SYM_CODE_END(startup_kdump) -.align 8 + .balign 8 startup_kdump_crash: .quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index 616ba1660f08..fa41486258ee 100755 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -15,10 +15,12 @@ # $3 - kernel map file # $4 - default install path (blank if root directory) +set -e + echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ "bootloader config required" >&2 -if [ -f $4/vmlinuz-$1 ]; then mv $4/vmlinuz-$1 $4/vmlinuz-$1.old; fi -if [ -f $4/System.map-$1 ]; then mv $4/System.map-$1 $4/System.map-$1.old; fi +if [ -f "$4/vmlinuz-$1" ]; then mv -- "$4/vmlinuz-$1" "$4/vmlinuz-$1.old"; fi +if [ -f "$4/System.map-$1" ]; then mv -- "$4/System.map-$1" "$4/System.map-$1.old"; fi -cat $2 > $4/vmlinuz-$1 -cp $3 $4/System.map-$1 +cat -- "$2" > "$4/vmlinuz-$1" +cp -- "$3" "$4/System.map-$1" diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index ca78d6162245..f584d7da29cb 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -3,6 +3,9 @@ #include <linux/init.h> #include <linux/ctype.h> #include <linux/pgtable.h> +#include <asm/abs_lowcore.h> +#include <asm/page-states.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/sclp.h> #include <asm/sections.h> @@ -19,42 +22,27 @@ struct parmarea parmarea __section(".parmarea") = { }; char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; -int __bootdata(noexec_disabled); unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL; struct ipl_parameter_block __bootdata_preserved(ipl_block); int __bootdata_preserved(ipl_block_valid); +int __bootdata_preserved(__kaslr_enabled); +int __bootdata_preserved(cmma_flag) = 1; unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE; unsigned long memory_limit; int vmalloc_size_set; -int kaslr_enabled; static inline int __diag308(unsigned long subcode, void *addr) { - unsigned long reg1, reg2; - union register_pair r1; - psw_t old; - - r1.even = (unsigned long) addr; - r1.odd = 0; - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" + union register_pair r1 = { .even = (unsigned long)addr, .odd = 0 }; + + asm_inline volatile( " diag %[r1],%[subcode],0x308\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - : [r1] "+&d" (r1.pair), - [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - "+Q" (S390_lowcore.program_new_psw), - "=Q" (old) - : [subcode] "d" (subcode), - [psw_old] "a" (&old), - [psw_pgm] "a" (&S390_lowcore.program_new_psw) + "0:\n" + EX_TABLE(0b, 0b) + : [r1] "+d" (r1.pair) + : [subcode] "d" (subcode) : "cc", "memory"); return r1.odd; } @@ -77,6 +65,9 @@ bool is_ipl_block_dump(void) if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME && ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_ECKD && + ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return true; return false; } @@ -108,6 +99,11 @@ static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, scp_data_len = ipb->nvme.scp_data_len; scp_data = ipb->nvme.scp_data; break; + case IPL_PBT_ECKD: + scp_data_len = ipb->eckd.scp_data_len; + scp_data = ipb->eckd.scp_data; + break; + default: goto out; } @@ -153,6 +149,7 @@ static void append_ipl_block_parm(void) break; case IPL_PBT_FCP: case IPL_PBT_NVME: + case IPL_PBT_ECKD: rc = ipl_block_get_ascii_scpdata( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; @@ -182,7 +179,7 @@ void setup_boot_command_line(void) if (has_ebcdic_char(parmarea.command_line)) EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ - strcpy(early_command_line, strim(parmarea.command_line)); + strscpy(early_command_line, strim(parmarea.command_line)); /* append IPL PARM data to the boot command line */ if (!is_prot_virt_guest() && ipl_block_valid) @@ -204,7 +201,7 @@ static void check_cleared_facilities(void) for (i = 0; i < ARRAY_SIZE(als); i++) { if ((stfle_fac_list[i] & als[i]) != als[i]) { - sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n"); + boot_emerg("The Linux kernel requires facilities cleared via command line option\n"); print_missing_facilities(); break; } @@ -255,8 +252,9 @@ void parse_boot_command_line(void) char *args; int rc; - kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); - args = strcpy(command_line_buf, early_command_line); + __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); + strscpy(command_line_buf, early_command_line); + args = command_line_buf; while (*args) { args = next_arg(args, ¶m, &val); @@ -264,7 +262,7 @@ void parse_boot_command_line(void) memory_limit = round_down(memparse(val, NULL), PAGE_SIZE); if (!strcmp(param, "vmalloc") && val) { - vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE); + vmalloc_size = round_up(memparse(val, NULL), _SEGMENT_SIZE); vmalloc_size_set = 1; } @@ -281,17 +279,20 @@ void parse_boot_command_line(void) zlib_dfltcc_support = ZLIB_DFLTCC_FULL_DEBUG; } - if (!strcmp(param, "noexec")) { - rc = kstrtobool(val, &enabled); - if (!rc && !enabled) - noexec_disabled = 1; - } - if (!strcmp(param, "facilities") && val) modify_fac_list(val); + if (!strcmp(param, "debug-alternative")) + alt_debug_setup(val); + if (!strcmp(param, "nokaslr")) - kaslr_enabled = 0; + __kaslr_enabled = 0; + + if (!strcmp(param, "cmma")) { + rc = kstrtobool(val, &enabled); + if (!rc && !enabled) + cmma_flag = 0; + } #if IS_ENABLED(CONFIG_KVM) if (!strcmp(param, "prot_virt")) { @@ -300,5 +301,25 @@ void parse_boot_command_line(void) prot_virt_host = 1; } #endif + if (!strcmp(param, "relocate_lowcore") && test_facility(193)) + set_machine_feature(MFEATURE_LOWCORE); + if (!strcmp(param, "earlyprintk")) + boot_earlyprintk = true; + if (!strcmp(param, "debug")) + boot_console_loglevel = CONSOLE_LOGLEVEL_DEBUG; + if (!strcmp(param, "bootdebug")) { + bootdebug = true; + if (val) + strscpy(bootdebug_filter, val); + } + if (!strcmp(param, "quiet")) + boot_console_loglevel = CONSOLE_LOGLEVEL_QUIET; + if (!strcmp(param, "ignore_loglevel")) + boot_ignore_loglevel = true; + if (!strcmp(param, "loglevel")) { + boot_console_loglevel = simple_strtoull(val, NULL, 10); + if (boot_console_loglevel < CONSOLE_LOGLEVEL_MIN) + boot_console_loglevel = CONSOLE_LOGLEVEL_MIN; + } } } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c index 9b14045065b6..f73cd757a5f7 100644 --- a/arch/s390/boot/ipl_report.c +++ b/arch/s390/boot/ipl_report.c @@ -5,6 +5,7 @@ #include <asm/sclp.h> #include <asm/sections.h> #include <asm/boot_data.h> +#include <asm/physmem_info.h> #include <uapi/asm/ipl.h> #include "boot.h" @@ -16,24 +17,19 @@ unsigned long __bootdata_preserved(ipl_cert_list_size); unsigned long __bootdata(early_ipl_comp_list_addr); unsigned long __bootdata(early_ipl_comp_list_size); +static struct ipl_rb_certificates *certs; +static struct ipl_rb_components *comps; +static bool ipl_report_needs_saving; + #define for_each_rb_entry(entry, rb) \ for (entry = rb->entries; \ (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ entry++) -static inline bool intersects(unsigned long addr0, unsigned long size0, - unsigned long addr1, unsigned long size1) -{ - return addr0 + size0 > addr1 && addr1 + size1 > addr0; -} - -static unsigned long find_bootdata_space(struct ipl_rb_components *comps, - struct ipl_rb_certificates *certs, - unsigned long safe_addr) +static unsigned long get_cert_comp_list_size(void) { struct ipl_rb_certificate_entry *cert; struct ipl_rb_component_entry *comp; - size_t size; /* * Find the length for the IPL report boot data @@ -44,36 +40,27 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps, ipl_cert_list_size = 0; for_each_rb_entry(cert, certs) ipl_cert_list_size += sizeof(unsigned int) + cert->len; - size = ipl_cert_list_size + early_ipl_comp_list_size; + return ipl_cert_list_size + early_ipl_comp_list_size; +} - /* - * Start from safe_addr to find a free memory area large - * enough for the IPL report boot data. This area is used - * for ipl_cert_list_addr/ipl_cert_list_size and - * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must - * not overlap with any component or any certificate. - */ -repeat: - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size && - intersects(initrd_data.start, initrd_data.size, safe_addr, size)) - safe_addr = initrd_data.start + initrd_data.size; - for_each_rb_entry(comp, comps) - if (intersects(safe_addr, size, comp->addr, comp->len)) { - safe_addr = comp->addr + comp->len; - goto repeat; - } - for_each_rb_entry(cert, certs) - if (intersects(safe_addr, size, cert->addr, cert->len)) { - safe_addr = cert->addr + cert->len; - goto repeat; - } - early_ipl_comp_list_addr = safe_addr; - ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + struct ipl_rb_certificate_entry *cert; - return safe_addr + size; + if (!ipl_report_needs_saving) + return false; + + for_each_rb_entry(cert, certs) { + if (intersects(addr, size, cert->addr, cert->len)) { + *intersection_start = cert->addr; + return true; + } + } + return false; } -static void copy_components_bootdata(struct ipl_rb_components *comps) +static void copy_components_bootdata(void) { struct ipl_rb_component_entry *comp, *ptr; @@ -82,7 +69,7 @@ static void copy_components_bootdata(struct ipl_rb_components *comps) memcpy(ptr++, comp, sizeof(*ptr)); } -static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +static void copy_certificates_bootdata(void) { struct ipl_rb_certificate_entry *cert; void *ptr; @@ -96,10 +83,8 @@ static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) } } -unsigned long read_ipl_report(unsigned long safe_addr) +int read_ipl_report(void) { - struct ipl_rb_certificates *certs; - struct ipl_rb_components *comps; struct ipl_pl_hdr *pl_hdr; struct ipl_rl_hdr *rl_hdr; struct ipl_rb_hdr *rb_hdr; @@ -112,7 +97,7 @@ unsigned long read_ipl_report(unsigned long safe_addr) */ if (!ipl_block_valid || !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) - return safe_addr; + return -1; ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); /* * There is an IPL report, to find it load the pointer to the @@ -120,7 +105,7 @@ unsigned long read_ipl_report(unsigned long safe_addr) * the IPL parameter list, then align the address to a double * word boundary. */ - tmp = (unsigned long) S390_lowcore.ipl_parmblock_ptr; + tmp = (unsigned long)get_lowcore()->ipl_parmblock_ptr; pl_hdr = (struct ipl_pl_hdr *) tmp; tmp = (tmp + pl_hdr->len + 7) & -8UL; rl_hdr = (struct ipl_rl_hdr *) tmp; @@ -150,16 +135,30 @@ unsigned long read_ipl_report(unsigned long safe_addr) * With either the component list or the certificate list * missing the kernel will stay ignorant of secure IPL. */ - if (!comps || !certs) - return safe_addr; + if (!comps || !certs) { + certs = NULL; + return -1; + } - /* - * Copy component and certificate list to a safe area - * where the decompressed kernel can find them. - */ - safe_addr = find_bootdata_space(comps, certs, safe_addr); - copy_components_bootdata(comps); - copy_certificates_bootdata(certs); + ipl_report_needs_saving = true; + physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr, + (unsigned long)rl_end - (unsigned long)pl_hdr); + return 0; +} + +void save_ipl_cert_comp_list(void) +{ + unsigned long size; + + if (!ipl_report_needs_saving) + return; + + size = get_cert_comp_list_size(); + early_ipl_comp_list_addr = physmem_alloc_or_die(RR_CERT_COMP_LIST, size, sizeof(int)); + ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size; - return safe_addr; + copy_components_bootdata(); + copy_certificates_bootdata(); + physmem_free(RR_IPLREPORT); + ipl_report_needs_saving = false; } diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index e8d74d4f62aa..941f4c9e27cc 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -3,7 +3,7 @@ * Copyright IBM Corp. 2019 */ #include <linux/pgtable.h> -#include <asm/mem_detect.h> +#include <asm/physmem_info.h> #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/sclp.h> @@ -32,7 +32,7 @@ struct prng_parm { static int check_prng(void) { if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { - sclp_early_printk("KASLR disabled: CPU has no PRNG\n"); + boot_warn("KASLR disabled: CPU has no PRNG\n"); return 0; } if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) @@ -43,7 +43,7 @@ static int check_prng(void) return PRNG_MODE_TDES; } -static int get_random(unsigned long limit, unsigned long *value) +int get_random(unsigned long limit, unsigned long *value) { struct prng_parm prng = { /* initial parameter block for tdes mode, copied from libica */ @@ -91,119 +91,108 @@ static int get_random(unsigned long limit, unsigned long *value) return 0; } -/* - * To randomize kernel base address we have to consider several facts: - * 1. physical online memory might not be continuous and have holes. mem_detect - * info contains list of online memory ranges we should consider. - * 2. we have several memory regions which are occupied and we should not - * overlap and destroy them. Currently safe_addr tells us the border below - * which all those occupied regions are. We are safe to use anything above - * safe_addr. - * 3. the upper limit might apply as well, even if memory above that limit is - * online. Currently those limitations are: - * 3.1. Limit set by "mem=" kernel command line option - * 3.2. memory reserved at the end for kasan initialization. - * 4. kernel base address must be aligned to THREAD_SIZE (kernel stack size). - * Which is required for CONFIG_CHECK_STACK. Currently THREAD_SIZE is 4 pages - * (16 pages when the kernel is built with kasan enabled) - * Assumptions: - * 1. kernel size (including .bss size) and upper memory limit are page aligned. - * 2. mem_detect memory region start is THREAD_SIZE aligned / end is PAGE_SIZE - * aligned (in practice memory configurations granularity on z/VM and LPAR - * is 1mb). - * - * To guarantee uniform distribution of kernel base address among all suitable - * addresses we generate random value just once. For that we need to build a - * continuous range in which every value would be suitable. We can build this - * range by simply counting all suitable addresses (let's call them positions) - * which would be valid as kernel base address. To count positions we iterate - * over online memory ranges. For each range which is big enough for the - * kernel image we count all suitable addresses we can put the kernel image at - * that is - * (end - start - kernel_size) / THREAD_SIZE + 1 - * Two functions count_valid_kernel_positions and position_to_address help - * to count positions in memory range given and then convert position back - * to address. - */ -static unsigned long count_valid_kernel_positions(unsigned long kernel_size, - unsigned long _min, - unsigned long _max) +static void sort_reserved_ranges(struct reserved_range *res, unsigned long size) { - unsigned long start, end, pos = 0; - int i; - - for_each_mem_detect_block(i, &start, &end) { - if (_min >= end) - continue; - if (start >= _max) - break; - start = max(_min, start); - end = min(_max, end); - if (end - start < kernel_size) - continue; - pos += (end - start - kernel_size) / THREAD_SIZE + 1; + struct reserved_range tmp; + int i, j; + + for (i = 1; i < size; i++) { + tmp = res[i]; + for (j = i - 1; j >= 0 && res[j].start > tmp.start; j--) + res[j + 1] = res[j]; + res[j + 1] = tmp; } - - return pos; } -static unsigned long position_to_address(unsigned long pos, unsigned long kernel_size, - unsigned long _min, unsigned long _max) +static unsigned long iterate_valid_positions(unsigned long size, unsigned long align, + unsigned long _min, unsigned long _max, + struct reserved_range *res, size_t res_count, + bool pos_count, unsigned long find_pos) { - unsigned long start, end; + unsigned long start, end, tmp_end, range_pos, pos = 0; + struct reserved_range *res_end = res + res_count; + struct reserved_range *skip_res; int i; - for_each_mem_detect_block(i, &start, &end) { + align = max(align, 8UL); + _min = round_up(_min, align); + for_each_physmem_usable_range(i, &start, &end) { if (_min >= end) continue; + start = round_up(start, align); if (start >= _max) break; start = max(_min, start); end = min(_max, end); - if (end - start < kernel_size) - continue; - if ((end - start - kernel_size) / THREAD_SIZE + 1 >= pos) - return start + (pos - 1) * THREAD_SIZE; - pos -= (end - start - kernel_size) / THREAD_SIZE + 1; + + while (start + size <= end) { + /* skip reserved ranges below the start */ + while (res && res->end <= start) { + res++; + if (res >= res_end) + res = NULL; + } + skip_res = NULL; + tmp_end = end; + /* has intersecting reserved range */ + if (res && res->start < end) { + skip_res = res; + tmp_end = res->start; + } + if (start + size <= tmp_end) { + range_pos = (tmp_end - start - size) / align + 1; + if (pos_count) { + pos += range_pos; + } else { + if (range_pos >= find_pos) + return start + (find_pos - 1) * align; + find_pos -= range_pos; + } + } + if (!skip_res) + break; + start = round_up(skip_res->end, align); + } } - return 0; + return pos_count ? pos : 0; } -unsigned long get_random_base(unsigned long safe_addr) +/* + * Two types of decompressor memory allocations/reserves are considered + * differently. + * + * "Static" or "single" allocations are done via physmem_alloc_range() and + * physmem_reserve(), and they are listed in physmem_info.reserved[]. Each + * type of "static" allocation can only have one allocation per type and + * cannot have chains. + * + * On the other hand, "dynamic" or "repetitive" allocations are done via + * physmem_alloc_or_die(). These allocations are tightly packed together + * top down from the end of online memory. physmem_alloc_pos represents + * current position where those allocations start. + * + * Functions randomize_within_range() and iterate_valid_positions() + * only consider "dynamic" allocations by never looking above + * physmem_alloc_pos. "Static" allocations, however, are explicitly + * considered by checking the "res" (reserves) array. The first + * reserved_range of a "dynamic" allocation may also be checked along the + * way, but it will always be above the maximum value anyway. + */ +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max) { - unsigned long memory_limit = get_mem_detect_end(); - unsigned long base_pos, max_pos, kernel_size; - unsigned long kasan_needs; - int i; + struct reserved_range res[RR_MAX]; + unsigned long max_pos, pos; - memory_limit = min(memory_limit, ident_map_size); + memcpy(res, physmem_info.reserved, sizeof(res)); + sort_reserved_ranges(res, ARRAY_SIZE(res)); + max = min(max, get_physmem_alloc_pos()); - /* - * Avoid putting kernel in the end of physical memory - * which kasan will use for shadow memory and early pgtable - * mapping allocations. - */ - memory_limit -= kasan_estimate_memory_needs(memory_limit); - - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size) { - if (safe_addr < initrd_data.start + initrd_data.size) - safe_addr = initrd_data.start + initrd_data.size; - } - safe_addr = ALIGN(safe_addr, THREAD_SIZE); - - kernel_size = vmlinux.image_size + vmlinux.bss_size; - if (safe_addr + kernel_size > memory_limit) + max_pos = iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), true, 0); + if (!max_pos) return 0; - - max_pos = count_valid_kernel_positions(kernel_size, safe_addr, memory_limit); - if (!max_pos) { - sclp_early_printk("KASLR disabled: not enough memory\n"); - return 0; - } - - /* we need a value in the range [1, base_pos] inclusive */ - if (get_random(max_pos, &base_pos)) + if (get_random(max_pos, &pos)) return 0; - return position_to_address(base_pos + 1, kernel_size, safe_addr, memory_limit); + return iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), false, pos + 1); } diff --git a/arch/s390/boot/kmsan.c b/arch/s390/boot/kmsan.c new file mode 100644 index 000000000000..e7b3ac48143e --- /dev/null +++ b/arch/s390/boot/kmsan.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmsan-checks.h> + +void kmsan_unpoison_memory(const void *address, size_t size) +{ +} diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c deleted file mode 100644 index 7fa1a32ea0f3..000000000000 --- a/arch/s390/boot/mem_detect.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/errno.h> -#include <linux/init.h> -#include <asm/setup.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/sections.h> -#include <asm/mem_detect.h> -#include <asm/sparsemem.h> -#include "decompressor.h" -#include "boot.h" - -struct mem_detect_info __bootdata(mem_detect); - -/* up to 256 storage elements, 1020 subincrements each */ -#define ENTRIES_EXTENDED_MAX \ - (256 * (1020 / 2) * sizeof(struct mem_detect_block)) - -/* - * To avoid corrupting old kernel memory during dump, find lowest memory - * chunk possible either right after the kernel end (decompressed kernel) or - * after initrd (if it is present and there is no hole between the kernel end - * and initrd) - */ -static void *mem_detect_alloc_extended(void) -{ - unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size && - initrd_data.start < offset + ENTRIES_EXTENDED_MAX) - offset = ALIGN(initrd_data.start + initrd_data.size, sizeof(u64)); - - return (void *)offset; -} - -static struct mem_detect_block *__get_mem_detect_block_ptr(u32 n) -{ - if (n < MEM_INLINED_ENTRIES) - return &mem_detect.entries[n]; - if (unlikely(!mem_detect.entries_extended)) - mem_detect.entries_extended = mem_detect_alloc_extended(); - return &mem_detect.entries_extended[n - MEM_INLINED_ENTRIES]; -} - -/* - * sequential calls to add_mem_detect_block with adjacent memory areas - * are merged together into single memory block. - */ -void add_mem_detect_block(u64 start, u64 end) -{ - struct mem_detect_block *block; - - if (mem_detect.count) { - block = __get_mem_detect_block_ptr(mem_detect.count - 1); - if (block->end == start) { - block->end = end; - return; - } - } - - block = __get_mem_detect_block_ptr(mem_detect.count); - block->start = start; - block->end = end; - mem_detect.count++; -} - -static int __diag260(unsigned long rx1, unsigned long rx2) -{ - unsigned long reg1, reg2, ry; - union register_pair rx; - psw_t old; - int rc; - - rx.even = rx1; - rx.odd = rx2; - ry = 0x10; /* storage configuration */ - rc = -1; /* fail */ - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" - " diag %[rx],%[ry],0x260\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - : [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - [rc] "+&d" (rc), - [ry] "+&d" (ry), - "+Q" (S390_lowcore.program_new_psw), - "=Q" (old) - : [rx] "d" (rx.pair), - [psw_old] "a" (&old), - [psw_pgm] "a" (&S390_lowcore.program_new_psw) - : "cc", "memory"); - return rc == 0 ? ry : -1; -} - -static int diag260(void) -{ - int rc, i; - - struct { - unsigned long start; - unsigned long end; - } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */ - - memset(storage_extents, 0, sizeof(storage_extents)); - rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); - if (rc == -1) - return -1; - - for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) - add_mem_detect_block(storage_extents[i].start, storage_extents[i].end + 1); - return 0; -} - -static int tprot(unsigned long addr) -{ - unsigned long reg1, reg2; - int rc = -EFAULT; - psw_t old; - - asm volatile( - " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" - " epsw %[reg1],%[reg2]\n" - " st %[reg1],0(%[psw_pgm])\n" - " st %[reg2],4(%[psw_pgm])\n" - " larl %[reg1],1f\n" - " stg %[reg1],8(%[psw_pgm])\n" - " tprot 0(%[addr]),0\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" - : [reg1] "=&d" (reg1), - [reg2] "=&a" (reg2), - [rc] "+&d" (rc), - "=Q" (S390_lowcore.program_new_psw.addr), - "=Q" (old) - : [psw_old] "a" (&old), - [psw_pgm] "a" (&S390_lowcore.program_new_psw), - [addr] "a" (addr) - : "cc", "memory"); - return rc; -} - -static void search_mem_end(void) -{ - unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ - unsigned long offset = 0; - unsigned long pivot; - - while (range > 1) { - range >>= 1; - pivot = offset + range; - if (!tprot(pivot << 20)) - offset = pivot; - } - - add_mem_detect_block(0, (offset + 1) << 20); -} - -unsigned long detect_memory(void) -{ - unsigned long max_physmem_end; - - sclp_early_get_memsize(&max_physmem_end); - - if (!sclp_early_read_storage_info()) { - mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO; - return max_physmem_end; - } - - if (!diag260()) { - mem_detect.info_source = MEM_DETECT_DIAG260; - return max_physmem_end; - } - - if (max_physmem_end) { - add_mem_detect_block(0, max_physmem_end); - mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO; - return max_physmem_end; - } - - search_mem_end(); - mem_detect.info_source = MEM_DETECT_BIN_SEARCH; - return get_mem_detect_end(); -} diff --git a/arch/s390/boot/pgm_check.c b/arch/s390/boot/pgm_check.c new file mode 100644 index 000000000000..fa621fa5bc02 --- /dev/null +++ b/arch/s390/boot/pgm_check.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/stdarg.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> +#include <asm/lowcore.h> +#include <asm/setup.h> +#include <asm/sclp.h> +#include <asm/uv.h> +#include "boot.h" + +void print_stacktrace(unsigned long sp) +{ + struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start, + (unsigned long)_stack_end }; + bool first = true; + + boot_emerg("Call Trace:\n"); + while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { + struct stack_frame *sf = (struct stack_frame *)sp; + + if (first) + boot_emerg("(sp:%016lx [<%016lx>] %pS)\n", sp, sf->gprs[8], (void *)sf->gprs[8]); + else + boot_emerg(" sp:%016lx [<%016lx>] %pS\n", sp, sf->gprs[8], (void *)sf->gprs[8]); + if (sf->back_chain <= sp) + break; + sp = sf->back_chain; + first = false; + } +} + +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +static inline unsigned long extable_insn(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} + +static bool ex_handler(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + for (ex = __start___ex_table; ex < __stop___ex_table; ex++) { + if (extable_insn(ex) != regs->psw.addr) + continue; + if (ex->type != EX_TYPE_FIXUP) + return false; + regs->psw.addr = extable_fixup(ex); + return true; + } + return false; +} + +void do_pgm_check(struct pt_regs *regs) +{ + struct psw_bits *psw = &psw_bits(regs->psw); + unsigned long *gpregs = regs->gprs; + + if (ex_handler(regs)) + return; + if (bootdebug) + boot_rb_dump(); + boot_emerg("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + boot_emerg("Kernel command line: %s\n", early_command_line); + boot_emerg("Kernel fault: interruption code %04x ilc:%d\n", + regs->int_code & 0xffff, regs->int_code >> 17); + if (kaslr_enabled()) { + boot_emerg("Kernel random base: %lx\n", __kaslr_offset); + boot_emerg("Kernel random base phys: %lx\n", __kaslr_offset_phys); + } + boot_emerg("PSW : %016lx %016lx (%pS)\n", + regs->psw.mask, regs->psw.addr, (void *)regs->psw.addr); + boot_emerg(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", + psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, + psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, psw->eaba); + boot_emerg("GPRS: %016lx %016lx %016lx %016lx\n", gpregs[0], gpregs[1], gpregs[2], gpregs[3]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[4], gpregs[5], gpregs[6], gpregs[7]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[8], gpregs[9], gpregs[10], gpregs[11]); + boot_emerg(" %016lx %016lx %016lx %016lx\n", gpregs[12], gpregs[13], gpregs[14], gpregs[15]); + print_stacktrace(gpregs[15]); + boot_emerg("Last Breaking-Event-Address:\n"); + boot_emerg(" [<%016lx>] %pS\n", regs->last_break, (void *)regs->last_break); + /* Convert to disabled wait PSW */ + psw->io = 0; + psw->ext = 0; + psw->wait = 1; +} diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c deleted file mode 100644 index c2a1defc79da..000000000000 --- a/arch/s390/boot/pgm_check_info.c +++ /dev/null @@ -1,180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/stdarg.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <asm/stacktrace.h> -#include <asm/boot_data.h> -#include <asm/lowcore.h> -#include <asm/setup.h> -#include <asm/sclp.h> -#include <asm/uv.h> -#include "boot.h" - -const char hex_asc[] = "0123456789abcdef"; - -static char *as_hex(char *dst, unsigned long val, int pad) -{ - char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); - - for (*p-- = 0; p >= dst; val >>= 4) - *p-- = hex_asc[val & 0x0f]; - return end; -} - -static char *symstart(char *p) -{ - while (*p) - p--; - return p + 1; -} - -static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len) -{ - /* symbol entries are in a form "10000 c4 startup\0" */ - char *a = _decompressor_syms_start; - char *b = _decompressor_syms_end; - unsigned long start; - unsigned long size; - char *pivot; - char *endp; - - while (a < b) { - pivot = symstart(a + (b - a) / 2); - start = simple_strtoull(pivot, &endp, 16); - size = simple_strtoull(endp + 1, &endp, 16); - if (ip < start) { - b = pivot; - continue; - } - if (ip > start + size) { - a = pivot + strlen(pivot) + 1; - continue; - } - *off = ip - start; - *len = size; - return endp + 1; - } - return NULL; -} - -static noinline char *strsym(void *ip) -{ - static char buf[64]; - unsigned short off; - unsigned short len; - char *p; - - p = findsym((unsigned long)ip, &off, &len); - if (p) { - strncpy(buf, p, sizeof(buf)); - /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ - p = buf + strnlen(buf, sizeof(buf) - 15); - strcpy(p, "+0x"); - p = as_hex(p + 3, off, 0); - strcpy(p, "/0x"); - as_hex(p + 3, len, 0); - } else { - as_hex(buf, (unsigned long)ip, 16); - } - return buf; -} - -void decompressor_printk(const char *fmt, ...) -{ - char buf[1024] = { 0 }; - char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ - unsigned long pad; - char *p = buf; - va_list args; - - va_start(args, fmt); - for (; p < end && *fmt; fmt++) { - if (*fmt != '%') { - *p++ = *fmt; - continue; - } - pad = isdigit(*++fmt) ? simple_strtol(fmt, (char **)&fmt, 10) : 0; - switch (*fmt) { - case 's': - p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf)); - break; - case 'p': - if (*++fmt != 'S') - goto out; - p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf)); - break; - case 'l': - if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad)) - goto out; - p = as_hex(p, va_arg(args, unsigned long), pad); - break; - case 'x': - if (end - p <= max(sizeof(int) * 2, pad)) - goto out; - p = as_hex(p, va_arg(args, unsigned int), pad); - break; - default: - goto out; - } - } -out: - va_end(args); - sclp_early_printk(buf); -} - -static noinline void print_stacktrace(void) -{ - struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start, - (unsigned long)_stack_end }; - unsigned long sp = S390_lowcore.gpregs_save_area[15]; - bool first = true; - - decompressor_printk("Call Trace:\n"); - while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { - struct stack_frame *sf = (struct stack_frame *)sp; - - decompressor_printk(first ? "(sp:%016lx [<%016lx>] %pS)\n" : - " sp:%016lx [<%016lx>] %pS\n", - sp, sf->gprs[8], (void *)sf->gprs[8]); - if (sf->back_chain <= sp) - break; - sp = sf->back_chain; - first = false; - } -} - -void print_pgm_check_info(void) -{ - unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area; - struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); - - decompressor_printk("Linux version %s\n", kernel_version); - if (!is_prot_virt_guest() && early_command_line[0]) - decompressor_printk("Kernel command line: %s\n", early_command_line); - decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n", - S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1); - if (kaslr_enabled) - decompressor_printk("Kernel random base: %lx\n", __kaslr_offset); - decompressor_printk("PSW : %016lx %016lx (%pS)\n", - S390_lowcore.psw_save_area.mask, - S390_lowcore.psw_save_area.addr, - (void *)S390_lowcore.psw_save_area.addr); - decompressor_printk( - " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", - psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, - psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, - psw->eaba); - decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n", - gpregs[0], gpregs[1], gpregs[2], gpregs[3]); - decompressor_printk(" %016lx %016lx %016lx %016lx\n", - gpregs[4], gpregs[5], gpregs[6], gpregs[7]); - decompressor_printk(" %016lx %016lx %016lx %016lx\n", - gpregs[8], gpregs[9], gpregs[10], gpregs[11]); - decompressor_printk(" %016lx %016lx %016lx %016lx\n", - gpregs[12], gpregs[13], gpregs[14], gpregs[15]); - print_stacktrace(); - decompressor_printk("Last Breaking-Event-Address:\n"); - decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, - (void *)S390_lowcore.pgm_last_break); -} diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c new file mode 100644 index 000000000000..45e3d057cfaa --- /dev/null +++ b/arch/s390/boot/physmem_info.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "physmem: " fmt +#include <linux/processor.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <asm/physmem_info.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> +#include <asm/sparsemem.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sclp.h> +#include <asm/asm.h> +#include <asm/uv.h> +#include "decompressor.h" +#include "boot.h" + +struct physmem_info __bootdata(physmem_info); +static unsigned int physmem_alloc_ranges; +static unsigned long physmem_alloc_pos; + +/* up to 256 storage elements, 1020 subincrements each */ +#define ENTRIES_EXTENDED_MAX \ + (256 * (1020 / 2) * sizeof(struct physmem_range)) + +static struct physmem_range *__get_physmem_range_ptr(u32 n) +{ + if (n < MEM_INLINED_ENTRIES) + return &physmem_info.online[n]; + if (unlikely(!physmem_info.online_extended)) { + physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range( + RR_MEM_DETECT_EXT, ENTRIES_EXTENDED_MAX, sizeof(long), 0, + physmem_alloc_pos, true); + } + return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES]; +} + +/* + * sequential calls to add_physmem_online_range with adjacent memory ranges + * are merged together into single memory range. + */ +void add_physmem_online_range(u64 start, u64 end) +{ + struct physmem_range *range; + + if (physmem_info.range_count) { + range = __get_physmem_range_ptr(physmem_info.range_count - 1); + if (range->end == start) { + range->end = end; + return; + } + } + + range = __get_physmem_range_ptr(physmem_info.range_count); + range->start = start; + range->end = end; + physmem_info.range_count++; +} + +static int __diag260(unsigned long rx1, unsigned long rx2) +{ + union register_pair rx; + int cc, exception; + unsigned long ry; + + rx.even = rx1; + rx.odd = rx2; + ry = 0x10; /* storage configuration */ + exception = 1; + asm_inline volatile( + " diag %[rx],%[ry],0x260\n" + "0: lhi %[exc],0\n" + "1:\n" + CC_IPM(cc) + EX_TABLE(0b, 1b) + : CC_OUT(cc, cc), [exc] "+d" (exception), [ry] "+d" (ry) + : [rx] "d" (rx.pair) + : CC_CLOBBER_LIST("memory")); + cc = exception ? -1 : CC_TRANSFORM(cc); + return cc == 0 ? ry : -1; +} + +static int diag260(void) +{ + int rc, i; + + struct { + unsigned long start; + unsigned long end; + } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */ + + memset(storage_extents, 0, sizeof(storage_extents)); + rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); + if (rc == -1) + return -1; + + for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) + add_physmem_online_range(storage_extents[i].start, storage_extents[i].end + 1); + return 0; +} + +#define DIAG500_SC_STOR_LIMIT 4 + +static int diag500_storage_limit(unsigned long *max_physmem_end) +{ + unsigned long storage_limit; + + asm_inline volatile( + " lghi %%r1,%[subcode]\n" + " lghi %%r2,0\n" + " diag %%r2,%%r4,0x500\n" + "0: lgr %[slimit],%%r2\n" + EX_TABLE(0b, 0b) + : [slimit] "=d" (storage_limit) + : [subcode] "i" (DIAG500_SC_STOR_LIMIT) + : "memory", "1", "2"); + if (!storage_limit) + return -EINVAL; + /* Convert inclusive end to exclusive end */ + *max_physmem_end = storage_limit + 1; + return 0; +} + +static int tprot(unsigned long addr) +{ + int cc, exception; + + exception = 1; + asm_inline volatile( + " tprot 0(%[addr]),0\n" + "0: lhi %[exc],0\n" + "1:\n" + CC_IPM(cc) + EX_TABLE(0b, 1b) + : CC_OUT(cc, cc), [exc] "+d" (exception) + : [addr] "a" (addr) + : CC_CLOBBER_LIST("memory")); + cc = exception ? -EFAULT : CC_TRANSFORM(cc); + return cc; +} + +static unsigned long search_mem_end(void) +{ + unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ + unsigned long offset = 0; + unsigned long pivot; + + while (range > 1) { + range >>= 1; + pivot = offset + range; + if (!tprot(pivot << 20)) + offset = pivot; + } + return (offset + 1) << 20; +} + +unsigned long detect_max_physmem_end(void) +{ + unsigned long max_physmem_end = 0; + + if (!diag500_storage_limit(&max_physmem_end)) { + physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT; + } else if (!sclp_early_get_memsize(&max_physmem_end)) { + physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO; + } else { + max_physmem_end = search_mem_end(); + physmem_info.info_source = MEM_DETECT_BIN_SEARCH; + } + boot_debug("Max physical memory: 0x%016lx (info source: %s)\n", max_physmem_end, + get_physmem_info_source()); + return max_physmem_end; +} + +void detect_physmem_online_ranges(unsigned long max_physmem_end) +{ + unsigned long start, end; + int i; + + if (!sclp_early_read_storage_info()) { + physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO; + } else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) { + unsigned long online_end; + + if (!sclp_early_get_memsize(&online_end)) { + physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO; + add_physmem_online_range(0, online_end); + } + } else if (!diag260()) { + physmem_info.info_source = MEM_DETECT_DIAG260; + } else if (max_physmem_end) { + add_physmem_online_range(0, max_physmem_end); + } + boot_debug("Online memory ranges (info source: %s):\n", get_physmem_info_source()); + for_each_physmem_online_range(i, &start, &end) + boot_debug(" online [%d]: 0x%016lx-0x%016lx\n", i, start, end); +} + +void physmem_set_usable_limit(unsigned long limit) +{ + physmem_info.usable = limit; + physmem_alloc_pos = limit; + boot_debug("Usable memory limit: 0x%016lx\n", limit); +} + +static void die_oom(unsigned long size, unsigned long align, unsigned long min, unsigned long max) +{ + unsigned long start, end, total_mem = 0, total_reserved_mem = 0; + struct reserved_range *range; + enum reserved_range_type t; + int i; + + boot_emerg("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + boot_emerg("Kernel command line: %s\n", early_command_line); + boot_emerg("Out of memory allocating %lu bytes 0x%lx aligned in range %lx:%lx\n", + size, align, min, max); + boot_emerg("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + boot_emerg("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); + total_reserved_mem += end - start; + } + boot_emerg("Usable online memory ranges (info source: %s [%d]):\n", + get_physmem_info_source(), physmem_info.info_source); + for_each_physmem_usable_range(i, &start, &end) { + boot_emerg("%016lx %016lx\n", start, end); + total_mem += end - start; + } + boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", + total_mem, total_reserved_mem, + total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); + print_stacktrace(current_frame_address()); + boot_emerg(" -- System halted\n"); + disabled_wait(); +} + +static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + physmem_info.reserved[type].start = addr; + physmem_info.reserved[type].end = addr + size; +} + +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + _physmem_reserve(type, addr, size); + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Reserve:", addr, addr + size, + get_rr_type_name(type)); +} + +void physmem_free(enum reserved_range_type type) +{ + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Free:", physmem_info.reserved[type].start, + physmem_info.reserved[type].end, get_rr_type_name(type)); + physmem_info.reserved[type].start = 0; + physmem_info.reserved[type].end = 0; +} + +static bool __physmem_alloc_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + unsigned long res_addr, res_size; + int t; + + for (t = 0; t < RR_MAX; t++) { + if (!get_physmem_reserved(t, &res_addr, &res_size)) + continue; + if (intersects(addr, size, res_addr, res_size)) { + *intersection_start = res_addr; + return true; + } + } + return ipl_report_certs_intersects(addr, size, intersection_start); +} + +static unsigned long __physmem_alloc_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max, + unsigned int from_ranges, unsigned int *ranges_left, + bool die_on_oom) +{ + unsigned int nranges = from_ranges ?: physmem_info.range_count; + unsigned long range_start, range_end; + unsigned long intersection_start; + unsigned long addr, pos = max; + + align = max(align, 8UL); + while (nranges) { + __get_physmem_range(nranges - 1, &range_start, &range_end, false); + pos = min(range_end, pos); + + if (round_up(min, align) + size > pos) + break; + addr = round_down(pos - size, align); + if (range_start > addr) { + nranges--; + continue; + } + if (__physmem_alloc_intersects(addr, size, &intersection_start)) { + pos = intersection_start; + continue; + } + + if (ranges_left) + *ranges_left = nranges; + return addr; + } + if (die_on_oom) + die_oom(size, align, min, max); + return 0; +} + +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom) +{ + unsigned long addr; + + max = min(max, physmem_alloc_pos); + addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom); + if (addr) + _physmem_reserve(type, addr, size); + boot_debug("%-14s 0x%016lx-0x%016lx %s\n", "Alloc range:", addr, addr + size, + get_rr_type_name(type)); + return addr; +} + +unsigned long physmem_alloc(enum reserved_range_type type, unsigned long size, + unsigned long align, bool die_on_oom) +{ + struct reserved_range *range = &physmem_info.reserved[type]; + struct reserved_range *new_range = NULL; + unsigned int ranges_left; + unsigned long addr; + + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, die_on_oom); + if (!addr) + return 0; + /* if not a consecutive allocation of the same type or first allocation */ + if (range->start != addr + size) { + if (range->end) { + addr = __physmem_alloc_range(sizeof(struct reserved_range), 0, 0, + physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, true); + new_range = (struct reserved_range *)addr; + addr = __physmem_alloc_range(size, align, 0, addr, ranges_left, + &ranges_left, die_on_oom); + if (!addr) + return 0; + *new_range = *range; + range->chain = new_range; + } + range->end = addr + size; + } + if (type != RR_VMEM) { + boot_debug("%-14s 0x%016lx-0x%016lx %-20s align 0x%lx split %d\n", "Alloc topdown:", + addr, addr + size, get_rr_type_name(type), align, !!new_range); + } + range->start = addr; + physmem_alloc_pos = addr; + physmem_alloc_ranges = ranges_left; + return addr; +} + +unsigned long physmem_alloc_or_die(enum reserved_range_type type, unsigned long size, + unsigned long align) +{ + return physmem_alloc(type, size, align, true); +} + +unsigned long get_physmem_alloc_pos(void) +{ + return physmem_alloc_pos; +} + +void dump_physmem_reserved(void) +{ + struct reserved_range *range; + enum reserved_range_type t; + unsigned long start, end; + + boot_debug("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + if (end) { + boot_debug("%-14s 0x%016lx-0x%016lx @%012lx chain %012lx\n", + get_rr_type_name(t), start, end, (unsigned long)range, + (unsigned long)range->chain); + } + } +} diff --git a/arch/s390/boot/printk.c b/arch/s390/boot/printk.c new file mode 100644 index 000000000000..4bb6bc95704e --- /dev/null +++ b/arch/s390/boot/printk.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/stdarg.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> +#include <asm/sections.h> +#include <asm/lowcore.h> +#include <asm/setup.h> +#include <asm/timex.h> +#include <asm/sclp.h> +#include <asm/uv.h> +#include "boot.h" + +int boot_console_loglevel = CONFIG_CONSOLE_LOGLEVEL_DEFAULT; +bool boot_ignore_loglevel; +char __bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); + +static void boot_rb_add(const char *str, size_t len) +{ + /* leave double '\0' in the end */ + size_t avail = sizeof(boot_rb) - boot_rb_off - 1; + + /* store strings separated by '\0' */ + if (len + 1 > avail) + boot_rb_off = 0; + avail = sizeof(boot_rb) - boot_rb_off - 1; + strscpy(boot_rb + boot_rb_off, str, avail); + boot_rb_off += len + 1; +} + +static void print_rb_entry(const char *str) +{ + sclp_early_printk(printk_skip_level(str)); +} + +static bool debug_messages_printed(void) +{ + return boot_earlyprintk && (boot_ignore_loglevel || boot_console_loglevel > LOGLEVEL_DEBUG); +} + +void boot_rb_dump(void) +{ + if (debug_messages_printed()) + return; + sclp_early_printk("Boot messages ring buffer:\n"); + boot_rb_foreach(print_rb_entry); +} + +const char hex_asc[] = "0123456789abcdef"; + +static char *as_hex(char *dst, unsigned long val, int pad) +{ + char *p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); + + for (*p-- = '\0'; p >= dst; val >>= 4) + *p-- = hex_asc[val & 0x0f]; + return dst; +} + +#define MAX_NUMLEN 21 +static char *as_dec(char *buf, unsigned long val, bool is_signed) +{ + bool negative = false; + char *p = buf + MAX_NUMLEN; + + if (is_signed && (long)val < 0) { + val = (val == LONG_MIN ? LONG_MIN : -(long)val); + negative = true; + } + + *--p = '\0'; + do { + *--p = '0' + (val % 10); + val /= 10; + } while (val); + + if (negative) + *--p = '-'; + return p; +} + +static ssize_t strpad(char *dst, size_t dst_size, const char *src, + int _pad, bool zero_pad, bool decimal) +{ + ssize_t len = strlen(src), pad = _pad; + char *p = dst; + + if (max(len, abs(pad)) >= dst_size) + return -E2BIG; + + if (pad > len) { + if (decimal && zero_pad && *src == '-') { + *p++ = '-'; + src++; + len--; + pad--; + } + memset(p, zero_pad ? '0' : ' ', pad - len); + p += pad - len; + } + memcpy(p, src, len); + p += len; + if (pad < 0 && -pad > len) { + memset(p, ' ', -pad - len); + p += -pad - len; + } + *p = '\0'; + return p - dst; +} + +static char *symstart(char *p) +{ + while (*p) + p--; + return p + 1; +} + +static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len) +{ + /* symbol entries are in a form "10000 c4 startup\0" */ + char *a = _decompressor_syms_start; + char *b = _decompressor_syms_end; + unsigned long start; + unsigned long size; + char *pivot; + char *endp; + + while (a < b) { + pivot = symstart(a + (b - a) / 2); + start = simple_strtoull(pivot, &endp, 16); + size = simple_strtoull(endp + 1, &endp, 16); + if (ip < start) { + b = pivot; + continue; + } + if (ip > start + size) { + a = pivot + strlen(pivot) + 1; + continue; + } + *off = ip - start; + *len = size; + return endp + 1; + } + return NULL; +} + +#define MAX_SYMLEN 64 +static noinline char *strsym(char *buf, void *ip) +{ + unsigned short off; + unsigned short len; + char *p; + + p = findsym((unsigned long)ip, &off, &len); + if (p) { + strscpy(buf, p, MAX_SYMLEN); + /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ + p = buf + strnlen(buf, MAX_SYMLEN - 15); + strscpy(p, "+0x", MAX_SYMLEN - (p - buf)); + as_hex(p + 3, off, 0); + strcat(p, "/0x"); + as_hex(p + strlen(p), len, 0); + } else { + as_hex(buf, (unsigned long)ip, 16); + } + return buf; +} + +static inline int printk_loglevel(const char *buf) +{ + if (buf[0] == KERN_SOH_ASCII && buf[1]) { + switch (buf[1]) { + case '0' ... '7': + return buf[1] - '0'; + } + } + return MESSAGE_LOGLEVEL_DEFAULT; +} + +static void boot_console_earlyprintk(const char *buf) +{ + int level = printk_loglevel(buf); + + /* always print emergency messages */ + if (level > LOGLEVEL_EMERG && !boot_earlyprintk) + return; + buf = printk_skip_level(buf); + /* print debug messages only when bootdebug is enabled */ + if (level == LOGLEVEL_DEBUG && (!bootdebug || !bootdebug_filter_match(skip_timestamp(buf)))) + return; + if (boot_ignore_loglevel || level < boot_console_loglevel) + sclp_early_printk(buf); +} + +static char *add_timestamp(char *buf) +{ +#ifdef CONFIG_PRINTK_TIME + unsigned long ns = tod_to_ns(__get_tod_clock_monotonic()); + char ts[MAX_NUMLEN]; + + *buf++ = '['; + buf += strpad(buf, MAX_NUMLEN, as_dec(ts, ns / NSEC_PER_SEC, 0), 5, 0, 0); + *buf++ = '.'; + buf += strpad(buf, MAX_NUMLEN, as_dec(ts, (ns % NSEC_PER_SEC) / NSEC_PER_USEC, 0), 6, 1, 0); + *buf++ = ']'; + *buf++ = ' '; +#endif + return buf; +} + +#define va_arg_len_type(args, lenmod, typemod) \ + ((lenmod == 'l') ? va_arg(args, typemod long) : \ + (lenmod == 'h') ? (typemod short)va_arg(args, typemod int) : \ + (lenmod == 'H') ? (typemod char)va_arg(args, typemod int) : \ + (lenmod == 'z') ? va_arg(args, typemod long) : \ + va_arg(args, typemod int)) + +int boot_printk(const char *fmt, ...) +{ + char buf[1024] = { 0 }; + char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ + bool zero_pad, decimal; + char *strval, *p = buf; + char valbuf[MAX(MAX_SYMLEN, MAX_NUMLEN)]; + va_list args; + char lenmod; + ssize_t len; + int pad; + + *p++ = KERN_SOH_ASCII; + *p++ = printk_get_level(fmt) ?: '0' + MESSAGE_LOGLEVEL_DEFAULT; + p = add_timestamp(p); + fmt = printk_skip_level(fmt); + + va_start(args, fmt); + for (; p < end && *fmt; fmt++) { + if (*fmt != '%') { + *p++ = *fmt; + continue; + } + if (*++fmt == '%') { + *p++ = '%'; + continue; + } + zero_pad = (*fmt == '0'); + pad = simple_strtol(fmt, (char **)&fmt, 10); + lenmod = (*fmt == 'h' || *fmt == 'l' || *fmt == 'z') ? *fmt++ : 0; + if (lenmod == 'h' && *fmt == 'h') { + lenmod = 'H'; + fmt++; + } + decimal = false; + switch (*fmt) { + case 's': + if (lenmod) + goto out; + strval = va_arg(args, char *); + zero_pad = false; + break; + case 'p': + if (*++fmt != 'S' || lenmod) + goto out; + strval = strsym(valbuf, va_arg(args, void *)); + zero_pad = false; + break; + case 'd': + case 'i': + strval = as_dec(valbuf, va_arg_len_type(args, lenmod, signed), 1); + decimal = true; + break; + case 'u': + strval = as_dec(valbuf, va_arg_len_type(args, lenmod, unsigned), 0); + break; + case 'x': + strval = as_hex(valbuf, va_arg_len_type(args, lenmod, unsigned), 0); + break; + default: + goto out; + } + len = strpad(p, end - p, strval, pad, zero_pad, decimal); + if (len == -E2BIG) + break; + p += len; + } +out: + va_end(args); + len = strlen(buf); + if (len) { + boot_rb_add(buf, len); + boot_console_earlyprintk(buf); + } + return len; +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 863e6bcaa5a1..da8337e63a3e 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,69 +1,220 @@ // SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "startup: " fmt #include <linux/string.h> #include <linux/elf.h> +#include <asm/page-states.h> #include <asm/boot_data.h> +#include <asm/extmem.h> #include <asm/sections.h> +#include <asm/diag288.h> +#include <asm/maccess.h> +#include <asm/machine.h> +#include <asm/sysinfo.h> #include <asm/cpu_mf.h> #include <asm/setup.h> +#include <asm/timex.h> #include <asm/kasan.h> #include <asm/kexec.h> #include <asm/sclp.h> #include <asm/diag.h> #include <asm/uv.h> +#include <asm/abs_lowcore.h> +#include <asm/physmem_info.h> #include "decompressor.h" #include "boot.h" #include "uv.h" -unsigned long __bootdata_preserved(__kaslr_offset); -unsigned long __bootdata(__amode31_base); +struct vm_layout __bootdata_preserved(vm_layout); +unsigned long __bootdata_preserved(__abs_lowcore); +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); unsigned long __bootdata_preserved(VMALLOC_START); unsigned long __bootdata_preserved(VMALLOC_END); struct page *__bootdata_preserved(vmemmap); unsigned long __bootdata_preserved(vmemmap_size); unsigned long __bootdata_preserved(MODULES_VADDR); unsigned long __bootdata_preserved(MODULES_END); -unsigned long __bootdata(ident_map_size); -int __bootdata(is_full_image) = 1; -struct initrd_data __bootdata(initrd_data); +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata_preserved(page_noexec_mask); +unsigned long __bootdata_preserved(segment_noexec_mask); +unsigned long __bootdata_preserved(region_noexec_mask); +union tod_clock __bootdata_preserved(tod_clock_base); +u64 __bootdata_preserved(clock_comparator_max) = -1UL; u64 __bootdata_preserved(stfle_fac_list[16]); -u64 __bootdata_preserved(alt_stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); void error(char *x) { - sclp_early_printk("\n\n"); - sclp_early_printk(x); - sclp_early_printk("\n\n -- System halted"); - + boot_emerg("%s\n", x); + boot_emerg(" -- System halted\n"); disabled_wait(); } +static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static void detect_machine_type(void) +{ + struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; + + /* Check current-configuration-level */ + if (stsi(NULL, 0, 0, 0) <= 2) { + set_machine_feature(MFEATURE_LPAR); + return; + } + /* Get virtual-machine cpu information. */ + if (stsi(vmms, 3, 2, 2) || !vmms->count) + return; + /* Detect known hypervisors */ + if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) + set_machine_feature(MFEATURE_KVM); + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) + set_machine_feature(MFEATURE_VM); +} + +static void detect_diag288(void) +{ + /* "BEGIN" in EBCDIC character set */ + static const char cmd[] = "\xc2\xc5\xc7\xc9\xd5"; + unsigned long action, len; + + action = machine_is_vm() ? (unsigned long)cmd : LPARWDT_RESTART; + len = machine_is_vm() ? sizeof(cmd) : 0; + if (__diag288(WDT_FUNC_INIT, MIN_INTERVAL, action, len)) + return; + __diag288(WDT_FUNC_CANCEL, 0, 0, 0); + set_machine_feature(MFEATURE_DIAG288); +} + +static void detect_diag9c(void) +{ + unsigned int cpu; + int rc = 1; + + cpu = stap(); + asm_inline volatile( + " diag %[cpu],%%r0,0x9c\n" + "0: lhi %[rc],0\n" + "1:\n" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc) + : [cpu] "d" (cpu) + : "cc", "memory"); + if (!rc) + set_machine_feature(MFEATURE_DIAG9C); +} + +static void reset_tod_clock(void) +{ + union tod_clock clk; + + if (store_tod_clock_ext_cc(&clk) == 0) + return; + /* TOD clock not running. Set the clock to Unix Epoch. */ + if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) + disabled_wait(); + memset(&tod_clock_base, 0, sizeof(tod_clock_base)); + tod_clock_base.tod = TOD_UNIX_EPOCH; + get_lowcore()->last_update_clock = TOD_UNIX_EPOCH; +} + +static void detect_facilities(void) +{ + if (cpu_has_edat1()) + local_ctl_set_bit(0, CR0_EDAT_BIT); + page_noexec_mask = -1UL; + segment_noexec_mask = -1UL; + region_noexec_mask = -1UL; + if (!cpu_has_nx()) { + page_noexec_mask &= ~_PAGE_NOEXEC; + segment_noexec_mask &= ~_SEGMENT_ENTRY_NOEXEC; + region_noexec_mask &= ~_REGION_ENTRY_NOEXEC; + } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) + set_machine_feature(MFEATURE_PCI_MIO); + reset_tod_clock(); + if (test_facility(139) && (tod_clock_base.tod >> 63)) { + /* Enable signed clock comparator comparisons */ + set_machine_feature(MFEATURE_SCC); + clock_comparator_max = -1UL >> 1; + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); + } + if (test_facility(50) && test_facility(73)) { + set_machine_feature(MFEATURE_TX); + local_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); + } + if (cpu_has_vx()) + local_ctl_set_bit(0, CR0_VECTOR_BIT); +} + +static int cmma_test_essa(void) +{ + unsigned long tmp = 0; + int rc = 1; + + /* Test ESSA_GET_STATE */ + asm_inline volatile( + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + "0: lhi %[rc],0\n" + "1:\n" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc), [tmp] "+d" (tmp) + : [cmd] "i" (ESSA_GET_STATE) + : "cc", "memory"); + return rc; +} + +static void cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + static void setup_lpp(void) { - S390_lowcore.current_pid = 0; - S390_lowcore.lpp = LPP_MAGIC; + get_lowcore()->current_pid = 0; + get_lowcore()->lpp = LPP_MAGIC; if (test_facility(40)) - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); } #ifdef CONFIG_KERNEL_UNCOMPRESSED -unsigned long mem_safe_offset(void) +static unsigned long mem_safe_offset(void) { - return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + return (unsigned long)_compressed_start; +} + +static void deploy_kernel(void *output) +{ + void *uncompressed_start = (void *)_compressed_start; + + if (output == uncompressed_start) + return; + memmove(output, uncompressed_start, vmlinux.image_size); + memset(uncompressed_start, 0, vmlinux.image_size); } #endif -static void rescue_initrd(unsigned long addr) +static void rescue_initrd(unsigned long min, unsigned long max) { + unsigned long old_addr, addr, size; + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; - if (!initrd_data.start || !initrd_data.size) + if (!get_physmem_reserved(RR_INITRD, &addr, &size)) return; - if (addr <= initrd_data.start) + if (addr >= min && addr + size <= max) return; - memmove((void *)addr, (void *)initrd_data.start, initrd_data.size); - initrd_data.start = addr; + old_addr = addr; + physmem_free(RR_INITRD); + addr = physmem_alloc_or_die(RR_INITRD, size, 0); + memmove((void *)addr, (void *)old_addr, size); } static void copy_bootdata(void) @@ -76,34 +227,33 @@ static void copy_bootdata(void) memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } -static void handle_relocs(unsigned long offset) +static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, + unsigned long offset, unsigned long phys_offset) { - Elf64_Rela *rela_start, *rela_end, *rela; - int r_type, r_sym, rc; - Elf64_Addr loc, val; - Elf64_Sym *dynsym; - - rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; - rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; - dynsym = (Elf64_Sym *) vmlinux.dynsym_start; - for (rela = rela_start; rela < rela_end; rela++) { - loc = rela->r_offset + offset; - val = rela->r_addend; - r_sym = ELF64_R_SYM(rela->r_info); - if (r_sym) { - if (dynsym[r_sym].st_shndx != SHN_UNDEF) - val += dynsym[r_sym].st_value + offset; - } else { - /* - * 0 == undefined symbol table index (STN_UNDEF), - * used for R_390_RELATIVE, only add KASLR offset - */ - val += offset; - } - r_type = ELF64_R_TYPE(rela->r_info); - rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); - if (rc) - error("Unknown relocation type"); + int *reloc; + long loc; + + /* Adjust R_390_64 relocations */ + for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { + loc = (long)*reloc + phys_offset; + if (loc < min_addr || loc > max_addr) + error("64-bit relocation outside of kernel!\n"); + *(u64 *)loc += offset; + } +} + +static void kaslr_adjust_got(unsigned long offset) +{ + u64 *entry; + + /* + * Adjust GOT entries, except for ones for undefined weak symbols + * that resolved to zero. This also skips the first three reserved + * entries on s390x that are zero. + */ + for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) { + if (*entry) + *entry += offset; } } @@ -111,13 +261,16 @@ static void handle_relocs(unsigned long offset) * Merge information from several sources into a single ident_map_size value. * "ident_map_size" represents the upper limit of physical memory we may ever * reach. It might not be all online memory, but also include standby (offline) - * memory. "ident_map_size" could be lower then actual standby or even online + * memory or memory areas reserved for other means (e.g., memory devices such as + * virtio-mem). + * + * "ident_map_size" could be lower then actual standby/reserved or even online * memory present, due to limiting factors. We should never go above this limit. * It is the size of our identity mapping. * * Consider the following factors: - * 1. max_physmem_end - end of physical memory online or standby. - * Always <= end of the last online memory block (get_mem_detect_end()). + * 1. max_physmem_end - end of physical memory online, standby or reserved. + * Always >= end of the last online memory range (get_physmem_online_end()). * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the * kernel is able to support. * 3. "mem=" kernel command line option which limits physical memory usage. @@ -137,76 +290,173 @@ static void setup_ident_map_size(unsigned long max_physmem_end) #ifdef CONFIG_CRASH_DUMP if (oldmem_data.start) { - kaslr_enabled = 0; + __kaslr_enabled = 0; ident_map_size = min(ident_map_size, oldmem_data.size); + boot_debug("kdump memory limit: 0x%016lx\n", oldmem_data.size); } else if (ipl_block_valid && is_ipl_block_dump()) { - kaslr_enabled = 0; - if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) + __kaslr_enabled = 0; + if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) { ident_map_size = min(ident_map_size, hsa_size); + boot_debug("Stand-alone dump limit: 0x%016lx\n", hsa_size); + } } #endif + boot_debug("Identity map size: 0x%016lx\n", ident_map_size); } -static void setup_kernel_memory_layout(void) +#define FIXMAP_SIZE round_up(MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)) + +static unsigned long get_vmem_size(unsigned long identity_size, + unsigned long vmemmap_size, + unsigned long vmalloc_size, + unsigned long rte_size) +{ + unsigned long max_mappable, vsize; + + max_mappable = max(identity_size, MAX_DCSS_ADDR); + vsize = round_up(SZ_2G + max_mappable, rte_size) + + round_up(vmemmap_size, rte_size) + + FIXMAP_SIZE + MODULES_LEN + KASLR_LEN; + if (IS_ENABLED(CONFIG_KMSAN)) + vsize += MODULES_LEN * 2; + return size_add(vsize, vmalloc_size); +} + +static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) { unsigned long vmemmap_start; + unsigned long kernel_start; + unsigned long asce_limit; unsigned long rte_size; unsigned long pages; + unsigned long vsize; + unsigned long vmax; pages = ident_map_size / PAGE_SIZE; /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - vmemmap_start = round_up(ident_map_size, _REGION3_SIZE); - if (IS_ENABLED(CONFIG_KASAN) || - vmalloc_size > _REGION2_SIZE || - vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > - _REGION2_SIZE) { - MODULES_END = _REGION1_SIZE; - rte_size = _REGION2_SIZE; + BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); + BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); + vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); + boot_debug("vmem size estimated: 0x%016lx\n", vsize); + if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE || + (vsize > _REGION2_SIZE && kaslr_enabled())) { + asce_limit = _REGION1_SIZE; + if (__NO_KASLR_END_KERNEL > _REGION2_SIZE) { + rte_size = _REGION2_SIZE; + vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE); + } else { + rte_size = _REGION3_SIZE; + } } else { - MODULES_END = _REGION2_SIZE; + asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } + /* - * forcing modules and vmalloc area under the ultravisor + * Forcing modules and vmalloc area under the ultravisor * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. + * + * Assume the secure storage limit always exceeds _REGION2_SIZE, + * otherwise asce_limit and rte_size would have been adjusted. */ - adjust_to_uv_max(&MODULES_END); + vmax = adjust_to_uv_max(asce_limit); + boot_debug("%d level paging 0x%016lx vmax\n", vmax == _REGION1_SIZE ? 4 : 3, vmax); #ifdef CONFIG_KASAN + BUILD_BUG_ON(__NO_KASLR_END_KERNEL > KASAN_SHADOW_START); + boot_debug("KASAN shadow area: 0x%016lx-0x%016lx\n", KASAN_SHADOW_START, KASAN_SHADOW_END); /* force vmalloc and modules below kasan shadow */ - MODULES_END = min(MODULES_END, KASAN_SHADOW_START); + vmax = min(vmax, KASAN_SHADOW_START); #endif + vsize = min(vsize, vmax); + if (kaslr_enabled()) { + unsigned long kernel_end, kaslr_len, slots, pos; + + kaslr_len = max(KASLR_LEN, vmax - vsize); + slots = DIV_ROUND_UP(kaslr_len - kernel_size, THREAD_SIZE); + if (get_random(slots, &pos)) + pos = 0; + kernel_end = vmax - pos * THREAD_SIZE; + kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); + boot_debug("Randomization range: 0x%016lx-0x%016lx\n", vmax - kaslr_len, vmax); + boot_debug("kernel image: 0x%016lx-0x%016lx (kaslr)\n", kernel_start, + kernel_size + kernel_size); + } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) { + kernel_start = round_down(vmax - kernel_size, THREAD_SIZE); + boot_debug("kernel image: 0x%016lx-0x%016lx (constrained)\n", kernel_start, + kernel_start + kernel_size); + } else { + kernel_start = __NO_KASLR_START_KERNEL; + boot_debug("kernel image: 0x%016lx-0x%016lx (nokaslr)\n", kernel_start, + kernel_start + kernel_size); + } + __kaslr_offset = kernel_start; + boot_debug("__kaslr_offset: 0x%016lx\n", __kaslr_offset); + + MODULES_END = round_down(kernel_start, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; + if (IS_ENABLED(CONFIG_KMSAN)) + VMALLOC_END -= MODULES_LEN * 2; + boot_debug("modules area: 0x%016lx-0x%016lx\n", MODULES_VADDR, MODULES_END); /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ - vmalloc_size = min(vmalloc_size, round_down(VMALLOC_END / 2, _REGION3_SIZE)); + vsize = (VMALLOC_END - FIXMAP_SIZE) / 2; + vsize = round_down(vsize, _SEGMENT_SIZE); + vmalloc_size = min(vmalloc_size, vsize); + if (IS_ENABLED(CONFIG_KMSAN)) { + /* take 2/3 of vmalloc area for KMSAN shadow and origins */ + vmalloc_size = round_down(vmalloc_size / 3, _SEGMENT_SIZE); + VMALLOC_END -= vmalloc_size * 2; + } VMALLOC_START = VMALLOC_END - vmalloc_size; + boot_debug("vmalloc area: 0x%016lx-0x%016lx\n", VMALLOC_START, VMALLOC_END); + + __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE); + boot_debug("memcpy real area: 0x%016lx-0x%016lx\n", __memcpy_real_area, + __memcpy_real_area + MEMCPY_REAL_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); + boot_debug("abs lowcore: 0x%016lx-0x%016lx\n", __abs_lowcore, + __abs_lowcore + ABS_LOWCORE_MAP_SIZE); /* split remaining virtual space between 1:1 mapping & vmemmap array */ - pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = __abs_lowcore / (PAGE_SIZE + sizeof(struct page)); pages = SECTION_ALIGN_UP(pages); /* keep vmemmap_start aligned to a top level region table entry */ - vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); - /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */ - vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + vmemmap_start = round_down(__abs_lowcore - pages * sizeof(struct page), rte_size); /* make sure identity map doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); - /* make sure vmemmap doesn't overlay with vmalloc area */ - VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + /* make sure vmemmap doesn't overlay with absolute lowcore area */ + if (vmemmap_start + vmemmap_size > __abs_lowcore) { + vmemmap_size = SECTION_ALIGN_DOWN(ident_map_size / PAGE_SIZE) * sizeof(struct page); + ident_map_size = vmemmap_size / sizeof(struct page) * PAGE_SIZE; + } vmemmap = (struct page *)vmemmap_start; + /* maximum address for which linear mapping could be created (DCSS, memory) */ + BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); + max_mappable = max(ident_map_size, MAX_DCSS_ADDR); + max_mappable = min(max_mappable, vmemmap_start); +#ifdef CONFIG_RANDOMIZE_IDENTITY_BASE + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); +#endif + boot_debug("identity map: 0x%016lx-0x%016lx\n", __identity_base, + __identity_base + ident_map_size); + + return asce_limit; } /* * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. */ -static void clear_bss_section(void) +static void clear_bss_section(unsigned long kernel_start) { - memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size); + memset((void *)kernel_start + vmlinux.image_size, 0, vmlinux.bss_size); } /* @@ -223,78 +473,174 @@ static void setup_vmalloc_size(void) vmalloc_size = max(size, vmalloc_size); } -static void offset_vmlinux_info(unsigned long offset) +static void kaslr_adjust_vmlinux_info(long offset) { - vmlinux.default_lma += offset; - *(unsigned long *)(&vmlinux.entry) += offset; vmlinux.bootdata_off += offset; vmlinux.bootdata_preserved_off += offset; - vmlinux.rela_dyn_start += offset; - vmlinux.rela_dyn_end += offset; - vmlinux.dynsym_start += offset; -} - -static unsigned long reserve_amode31(unsigned long safe_addr) -{ - __amode31_base = PAGE_ALIGN(safe_addr); - return safe_addr + vmlinux.amode31_size; + vmlinux.got_start += offset; + vmlinux.got_end += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; + vmlinux.alt_instructions += offset; + vmlinux.alt_instructions_end += offset; +#ifdef CONFIG_KASAN + vmlinux.kasan_early_shadow_page_off += offset; + vmlinux.kasan_early_shadow_pte_off += offset; + vmlinux.kasan_early_shadow_pmd_off += offset; + vmlinux.kasan_early_shadow_pud_off += offset; + vmlinux.kasan_early_shadow_p4d_off += offset; +#endif } void startup_kernel(void) { - unsigned long random_lma; + unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size; + unsigned long nokaslr_text_lma, text_lma = 0, amode31_lma = 0; + unsigned long kernel_size = TEXT_OFFSET + vmlinux_size; + unsigned long kaslr_large_page_offset; + unsigned long max_physmem_end; + unsigned long asce_limit; unsigned long safe_addr; - void *img; - - initrd_data.start = parmarea.initrd_start; - initrd_data.size = parmarea.initrd_size; - oldmem_data.start = parmarea.oldmem_base; - oldmem_data.size = parmarea.oldmem_size; + psw_t psw; setup_lpp(); store_ipl_parmblock(); - safe_addr = mem_safe_offset(); - safe_addr = reserve_amode31(safe_addr); - safe_addr = read_ipl_report(safe_addr); uv_query_info(); - rescue_initrd(safe_addr); - sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); + + /* + * Non-randomized kernel physical start address must be _SEGMENT_SIZE + * aligned (see blow). + */ + nokaslr_text_lma = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); + safe_addr = PAGE_ALIGN(nokaslr_text_lma + vmlinux_size); + + /* + * Reserve decompressor memory together with decompression heap, + * buffer and memory which might be occupied by uncompressed kernel + * (if KASLR is off or failed). + */ + physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size) + physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size); + oldmem_data.start = parmarea.oldmem_base; + oldmem_data.size = parmarea.oldmem_size; + + read_ipl_report(); + sclp_early_read_info(); + sclp_early_detect_machine_features(); + detect_facilities(); + detect_diag9c(); + detect_machine_type(); + /* detect_diag288() needs machine type */ + detect_diag288(); + cmma_init(); sanitize_prot_virt_host(); - setup_ident_map_size(detect_memory()); + max_physmem_end = detect_max_physmem_end(); + setup_ident_map_size(max_physmem_end); setup_vmalloc_size(); - setup_kernel_memory_layout(); - - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { - random_lma = get_random_base(safe_addr); - if (random_lma) { - __kaslr_offset = random_lma - vmlinux.default_lma; - img = (void *)vmlinux.default_lma; - offset_vmlinux_info(__kaslr_offset); - } + asce_limit = setup_kernel_memory_layout(kernel_size); + /* got final ident_map_size, physmem allocations could be performed now */ + physmem_set_usable_limit(ident_map_size); + detect_physmem_online_ranges(max_physmem_end); + save_ipl_cert_comp_list(); + rescue_initrd(safe_addr, ident_map_size); + + /* + * __kaslr_offset_phys must be _SEGMENT_SIZE aligned, so the lower + * 20 bits (the offset within a large page) are zero. Copy the last + * 20 bits of __kaslr_offset, which is THREAD_SIZE aligned, to + * __kaslr_offset_phys. + * + * With this the last 20 bits of __kaslr_offset_phys and __kaslr_offset + * are identical, which is required to allow for large mappings of the + * kernel image. + */ + kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; + if (kaslr_enabled()) { + unsigned long size = vmlinux_size + kaslr_large_page_offset; + + text_lma = randomize_within_range(size, _SEGMENT_SIZE, TEXT_OFFSET, ident_map_size); } + if (!text_lma) + text_lma = nokaslr_text_lma; + text_lma |= kaslr_large_page_offset; - if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { - img = decompress_kernel(); - memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } else if (__kaslr_offset) - memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + /* + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region is + * never accessed via the kernel image mapping as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, this region could be used for something else and does + * not need to be reserved. See how it is skipped in setup_vmem(). + */ + __kaslr_offset_phys = text_lma - TEXT_OFFSET; + kaslr_adjust_vmlinux_info(__kaslr_offset_phys); + physmem_reserve(RR_VMLINUX, text_lma, vmlinux_size); + deploy_kernel((void *)text_lma); - clear_bss_section(); - copy_bootdata(); - if (IS_ENABLED(CONFIG_RELOCATABLE)) - handle_relocs(__kaslr_offset); - - if (__kaslr_offset) { - /* - * Save KASLR offset for early dumps, before vmcore_info is set. - * Mark as uneven to distinguish from real vmcore_info pointer. - */ - S390_lowcore.vmcore_info = __kaslr_offset | 0x1UL; - /* Clear non-relocated kernel */ - if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) - memset(img, 0, vmlinux.image_size); + /* vmlinux decompression is done, shrink reserved low memory */ + physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); + + /* + * In case KASLR is enabled the randomized location of .amode31 + * section might overlap with .vmlinux.relocs section. To avoid that + * the below randomize_within_range() could have been called with + * __vmlinux_relocs_64_end as the lower range address. However, + * .amode31 section is written to by the decompressed kernel - at + * that time the contents of .vmlinux.relocs is not needed anymore. + * Conversely, .vmlinux.relocs is read only by the decompressor, even + * before the kernel started. Therefore, in case the two sections + * overlap there is no risk of corrupting any data. + */ + if (kaslr_enabled()) { + unsigned long amode31_min; + + amode31_min = (unsigned long)_decompressor_end; + amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G); } - vmlinux.entry(); + if (!amode31_lma) + amode31_lma = text_lma - vmlinux.amode31_size; + physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); + + /* + * The order of the following operations is important: + * + * - kaslr_adjust_relocs() must follow clear_bss_section() to establish + * static memory references to data in .bss to be used by setup_vmem() + * (i.e init_mm.pgd) + * + * - setup_vmem() must follow kaslr_adjust_relocs() to be able using + * static memory references to data in .bss (i.e init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes + * to bootdata made by setup_vmem() + */ + clear_bss_section(text_lma); + kaslr_adjust_relocs(text_lma, text_lma + vmlinux.image_size, + __kaslr_offset, __kaslr_offset_phys); + kaslr_adjust_got(__kaslr_offset); + setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); + dump_physmem_reserved(); + copy_bootdata(); + __apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions, + (struct alt_instr *)_vmlinux_info.alt_instructions_end, + ALT_CTX_EARLY); + + /* + * Save KASLR offset for early dumps, before vmcore_info is set. + * Mark as uneven to distinguish from real vmcore_info pointer. + */ + get_lowcore()->vmcore_info = __kaslr_offset_phys ? __kaslr_offset_phys | 0x1UL : 0; + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = __kaslr_offset + vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + boot_debug("Starting kernel at: 0x%016lx\n", psw.addr); + __load_psw(psw); } diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c index faccb33b462c..bd68161434a6 100644 --- a/arch/s390/boot/string.c +++ b/arch/s390/boot/string.c @@ -1,11 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#define IN_BOOT_STRING_C 1 #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/errno.h> #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC +#undef CONFIG_KMSAN #include "../lib/string.c" +/* + * Duplicate some functions from the common lib/string.c + * instead of fully including it. + */ + int strncmp(const char *cs, const char *ct, size_t count) { unsigned char c1, c2; @@ -22,6 +29,27 @@ int strncmp(const char *cs, const char *ct, size_t count) return 0; } +ssize_t sized_strscpy(char *dst, const char *src, size_t count) +{ + size_t len; + + if (count == 0) + return -E2BIG; + len = strnlen(src, count - 1); + memcpy(dst, src, len); + dst[len] = '\0'; + return src[len] ? -E2BIG : len; +} + +void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + uint64_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} + char *skip_spaces(const char *str) { while (isspace(*str)) diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index e6be155ab2e5..4568e8f81dac 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -8,12 +8,8 @@ #include "uv.h" /* will be used in arch/s390/kernel/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); -#endif -#if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); -#endif struct uv_info __bootdata_preserved(uv_info); void uv_query_info(void) @@ -26,8 +22,8 @@ void uv_query_info(void) if (!test_facility(158)) return; - /* rc==0x100 means that there is additional data we do not process */ - if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) + /* Ignore that there might be more data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != UVC_RC_MORE_DATA) return; if (IS_ENABLED(CONFIG_KVM)) { @@ -41,20 +37,29 @@ void uv_query_info(void) uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; uv_info.uv_feature_indications = uvcb.uv_feature_indications; + uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions; + uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; + uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; + uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; + uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver; + uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf; + uv_info.supp_secret_types = uvcb.supp_secret_types; + uv_info.max_assoc_secrets = uvcb.max_assoc_secrets; + uv_info.max_retr_secrets = uvcb.max_retr_secrets; } -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; -#endif } -#if IS_ENABLED(CONFIG_KVM) -void adjust_to_uv_max(unsigned long *vmax) +unsigned long adjust_to_uv_max(unsigned long limit) { if (is_prot_virt_host() && uv_info.max_sec_stor_addr) - *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr); + limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr); + return limit; } static int is_prot_virt_host_capable(void) @@ -81,4 +86,3 @@ void sanitize_prot_virt_host(void) { prot_virt_host = is_prot_virt_host_capable(); } -#endif diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h index 690ce019af5a..da4a4a8d48e0 100644 --- a/arch/s390/boot/uv.h +++ b/arch/s390/boot/uv.h @@ -2,18 +2,8 @@ #ifndef BOOT_UV_H #define BOOT_UV_H -#if IS_ENABLED(CONFIG_KVM) -void adjust_to_uv_max(unsigned long *vmax); +unsigned long adjust_to_uv_max(unsigned long limit); void sanitize_prot_virt_host(void); -#else -static inline void adjust_to_uv_max(unsigned long *vmax) {} -static inline void sanitize_prot_virt_host(void) {} -#endif - -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) void uv_query_info(void); -#else -static inline void uv_query_info(void) {} -#endif #endif /* BOOT_UV_H */ diff --git a/arch/s390/boot/version.c b/arch/s390/boot/version.c index d32e58bdda6a..fd32f038777f 100644 --- a/arch/s390/boot/version.c +++ b/arch/s390/boot/version.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <generated/utsversion.h> #include <generated/utsrelease.h> #include <generated/compile.h> #include "boot.h" diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..1d073acd05a7 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +#define boot_fmt(fmt) "vmem: " fmt +#include <linux/cpufeature.h> +#include <linux/sched/task.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/ctlreg.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> +#include <asm/machine.h> +#include <asm/abs_lowcore.h> +#include "decompressor.h" +#include "boot.h" + +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); +#endif + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +enum populate_mode { + POPULATE_NONE, + POPULATE_DIRECT, + POPULATE_LOWCORE, + POPULATE_ABS_LOWCORE, + POPULATE_IDENTITY, + POPULATE_KERNEL, +#ifdef CONFIG_KASAN + /* KASAN modes should be last and grouped together, see is_kasan_populate_mode() */ + POPULATE_KASAN_MAP_SHADOW, + POPULATE_KASAN_ZERO_SHADOW, + POPULATE_KASAN_SHALLOW +#endif +}; + +#define POPULATE_MODE_NAME(t) case POPULATE_ ## t: return #t +static inline const char *get_populate_mode_name(enum populate_mode t) +{ + switch (t) { + POPULATE_MODE_NAME(NONE); + POPULATE_MODE_NAME(DIRECT); + POPULATE_MODE_NAME(LOWCORE); + POPULATE_MODE_NAME(ABS_LOWCORE); + POPULATE_MODE_NAME(IDENTITY); + POPULATE_MODE_NAME(KERNEL); +#ifdef CONFIG_KASAN + POPULATE_MODE_NAME(KASAN_MAP_SHADOW); + POPULATE_MODE_NAME(KASAN_ZERO_SHADOW); + POPULATE_MODE_NAME(KASAN_SHALLOW); +#endif + default: + return "UNKNOWN"; + } +} + +static bool is_kasan_populate_mode(enum populate_mode mode) +{ +#ifdef CONFIG_KASAN + return mode >= POPULATE_KASAN_MAP_SHADOW; +#else + return false; +#endif +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode); + +#ifdef CONFIG_KASAN + +#define kasan_early_shadow_page vmlinux.kasan_early_shadow_page_off +#define kasan_early_shadow_pte ((pte_t *)vmlinux.kasan_early_shadow_pte_off) +#define kasan_early_shadow_pmd ((pmd_t *)vmlinux.kasan_early_shadow_pmd_off) +#define kasan_early_shadow_pud ((pud_t *)vmlinux.kasan_early_shadow_pud_off) +#define kasan_early_shadow_p4d ((p4d_t *)vmlinux.kasan_early_shadow_p4d_off) +#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static pte_t pte_z; + +static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) +{ + unsigned long sha_start = PAGE_ALIGN_DOWN(__sha(start)); + unsigned long sha_end = PAGE_ALIGN(__sha(end)); + + boot_debug("%-17s 0x%016lx-0x%016lx >> 0x%016lx-0x%016lx\n", get_populate_mode_name(mode), + start, end, sha_start, sha_end); + pgtable_populate(sha_start, sha_end, mode); +} + +static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end) +{ + pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); + pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); + p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long memgap_start = 0; + unsigned long start, end; + int i; + + pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); + memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); + __arch_set_page_dat(kasan_early_shadow_p4d, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pud, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pte, 1); + + for_each_physmem_usable_range(i, &start, &end) { + kasan_populate((unsigned long)__identity_va(start), + (unsigned long)__identity_va(end), + POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) { + kasan_populate((unsigned long)__identity_va(memgap_start), + (unsigned long)__identity_va(start), + POPULATE_KASAN_ZERO_SHADOW); + } + memgap_start = end; + } + kasan_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KASAN_MAP_SHADOW); + kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW); + /* shallowly populate kasan shadow for vmalloc and modules */ + kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW); + /* populate kasan shadow for untracked memory */ + kasan_populate((unsigned long)__identity_va(ident_map_size), VMALLOC_START, + POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(kernel_end, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); +} + +static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pgd_populate(&init_mm, pgd, kasan_early_shadow_p4d); + return true; + } + return false; +} + +static bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + p4d_populate(&init_mm, p4d, kasan_early_shadow_pud); + return true; + } + return false; +} + +static bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pud_populate(&init_mm, pud, kasan_early_shadow_pmd); + return true; + } + return false; +} + +static bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate(&init_mm, pmd, kasan_early_shadow_pte); + return true; + } + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW) { + set_pte(pte, pte_z); + return true; + } + return false; +} +#else + +static inline void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end) +{ +} + +static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + return false; +} + +#endif + +/* + * Mimic virt_to_kpte() in lack of init_mm symbol. Skip pmd NULL check though. + */ +static inline pte_t *__virt_to_kpte(unsigned long va) +{ + return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va); +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER; + unsigned long *table; + + table = (unsigned long *)physmem_alloc_or_die(RR_VMEM, size, size); + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + /* + * handling pte_leftovers this way helps to avoid memory fragmentation + * during POPULATE_KASAN_MAP_SHADOW when EDAT is off + */ + if (!pte_leftover) { + pte_leftover = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + pte = pte_leftover + _PAGE_TABLE_SIZE; + __arch_set_page_dat(pte, 1); + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static unsigned long resolve_pa_may_alloc(unsigned long addr, unsigned long size, + enum populate_mode mode) +{ + switch (mode) { + case POPULATE_NONE: + return INVALID_PHYS_ADDR; + case POPULATE_DIRECT: + return addr; + case POPULATE_LOWCORE: + return __lowcore_pa(addr); + case POPULATE_ABS_LOWCORE: + return __abs_lowcore_pa(addr); + case POPULATE_KERNEL: + return __kernel_pa(addr); + case POPULATE_IDENTITY: + return __identity_pa(addr); +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: + /* Allow to fail large page allocations, this will fall back to 1mb/4k pages */ + addr = physmem_alloc(RR_VMEM, size, size, size == PAGE_SIZE); + if (addr) { + memset((void *)addr, 0, size); + return addr; + } + return INVALID_PHYS_ADDR; +#endif + default: + return INVALID_PHYS_ADDR; + } +} + +static bool large_page_mapping_allowed(enum populate_mode mode) +{ + switch (mode) { + case POPULATE_DIRECT: + case POPULATE_IDENTITY: + case POPULATE_KERNEL: +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: +#endif + return true; + default: + return false; + } +} + +static unsigned long try_get_large_pud_pa(pud_t *pu_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pa, size = end - addr; + + if (!cpu_has_edat2() || !large_page_mapping_allowed(mode) || + !IS_ALIGNED(addr, PUD_SIZE) || (size < PUD_SIZE)) + return INVALID_PHYS_ADDR; + + pa = resolve_pa_may_alloc(addr, size, mode); + if (!IS_ALIGNED(pa, PUD_SIZE)) + return INVALID_PHYS_ADDR; + + return pa; +} + +static unsigned long try_get_large_pmd_pa(pmd_t *pm_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pa, size = end - addr; + + if (!cpu_has_edat1() || !large_page_mapping_allowed(mode) || + !IS_ALIGNED(addr, PMD_SIZE) || (size < PMD_SIZE)) + return INVALID_PHYS_ADDR; + + pa = resolve_pa_may_alloc(addr, size, mode); + if (!IS_ALIGNED(pa, PMD_SIZE)) + return INVALID_PHYS_ADDR; + + return pa; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pages = 0; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + if (kasan_pte_populate_zero_shadow(pte, mode)) + continue; + entry = __pte(resolve_pa_may_alloc(addr, PAGE_SIZE, mode)); + entry = set_pte_bit(entry, PAGE_KERNEL); + set_pte(pte, entry); + pages++; + } + } + if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_4K, pages); +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pa, next, pages = 0; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) + continue; + pa = try_get_large_pmd_pa(pmd, addr, next, mode); + if (pa != INVALID_PHYS_ADDR) { + entry = __pmd(pa); + entry = set_pmd_bit(entry, SEGMENT_KERNEL); + set_pmd(pmd, entry); + pages++; + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_leaf(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next, mode); + } + if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_1M, pages); +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pa, next, pages = 0; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) + continue; + pa = try_get_large_pud_pa(pud, addr, next, mode); + if (pa != INVALID_PHYS_ADDR) { + entry = __pud(pa); + entry = set_pud_bit(entry, REGION3_KERNEL); + set_pud(pud, entry); + pages++; + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_leaf(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next, mode); + } + if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_2G, pages); +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + if (kasan_p4d_populate_zero_shadow(p4d, addr, next, mode)) + continue; + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next, mode); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + if (!is_kasan_populate_mode(mode)) { + boot_debug("%-17s 0x%016lx-0x%016lx -> 0x%016lx-0x%016lx\n", + get_populate_mode_name(mode), addr, end, + resolve_pa_may_alloc(addr, 0, mode), + resolve_pa_may_alloc(end - 1, 0, mode) + 1); + } + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + if (kasan_pgd_populate_zero_shadow(pgd, addr, next, mode)) + continue; + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } +#ifdef CONFIG_KASAN + if (mode == POPULATE_KASAN_SHALLOW) + continue; +#endif + pgtable_p4d_populate(pgd, addr, next, mode); + } +} + +void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit) +{ + unsigned long lowcore_address = 0; + unsigned long start, end; + unsigned long asce_type; + unsigned long asce_bits; + pgd_t *init_mm_pgd; + int i; + + /* + * Mark whole memory as no-dat. This must be done before any + * page tables are allocated, or kernel image builtin pages + * are marked as dat tables. + */ + for_each_physmem_online_range(i, &start, &end) + __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT); + + /* + * init_mm->pgd contains virtual address of swapper_pg_dir. + * It is unusable at this stage since DAT is yet off. Swap + * it for physical address of swapper_pg_dir and restore + * the virtual address after all page tables are created. + */ + init_mm_pgd = init_mm.pgd; + init_mm.pgd = (pgd_t *)swapper_pg_dir; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce.val = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); + + if (machine_has_relocated_lowcore()) + lowcore_address = LOWCORE_ALT_ADDRESS; + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. + */ + pgtable_populate(lowcore_address, + lowcore_address + sizeof(struct lowcore), + POPULATE_LOWCORE); + for_each_physmem_usable_range(i, &start, &end) { + pgtable_populate((unsigned long)__identity_va(start), + (unsigned long)__identity_va(end), + POPULATE_IDENTITY); + } + + /* + * [kernel_start..kernel_start + TEXT_OFFSET] region is never + * accessed as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, skip mapping TEXT_OFFSET bytes to prevent access to + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region. + */ + pgtable_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KERNEL); + pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); + pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), + POPULATE_ABS_LOWCORE); + pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, + POPULATE_NONE); + memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area)); + + kasan_populate_shadow(kernel_start, kernel_end); + + get_lowcore()->kernel_asce.val = swapper_pg_dir | asce_bits; + get_lowcore()->user_asce = s390_invalid_asce; + + local_ctl_load(1, &get_lowcore()->kernel_asce); + local_ctl_load(7, &get_lowcore()->user_asce); + local_ctl_load(13, &get_lowcore()->kernel_asce); + + init_mm.context.asce = get_lowcore()->kernel_asce.val; + init_mm.pgd = init_mm_pgd; +} diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index af5c6860e0a1..50988022f9ea 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS _text = .; /* Text */ *(.text) *(.text.*) + INIT_TEXT _etext = . ; } .rodata : { @@ -39,6 +40,10 @@ SECTIONS *(.rodata.*) _erodata = . ; } + EXCEPTION_TABLE(16) + .got : { + *(.got) + } NOTES .data : { _data = . ; @@ -93,8 +98,24 @@ SECTIONS _decompressor_syms_end = .; } + _decompressor_end = .; + + . = ALIGN(4); + .vmlinux.relocs : { + __vmlinux_relocs_64_start = .; + *(.vmlinux.relocs_64) + __vmlinux_relocs_64_end = .; + } + #ifdef CONFIG_KERNEL_UNCOMPRESSED - . = 0x100000; + . = ALIGN(PAGE_SIZE); + . += AMODE31_SIZE; /* .amode31 section */ + + /* + * Make sure the location counter is not less than TEXT_OFFSET. + * _SEGMENT_SIZE is not available, use ALIGN(1 << 20) instead. + */ + . = MAX(TEXT_OFFSET, ALIGN(1 << 20)); #else . = ALIGN(8); #endif @@ -102,15 +123,49 @@ SECTIONS _compressed_start = .; *(.vmlinux.bin.compressed) _compressed_end = .; - FILL(0xff); - . = ALIGN(4096); + } + +#define SB_TRAILER_SIZE 32 + /* Trailer needed for Secure Boot */ + . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ + . = ALIGN(4096) - SB_TRAILER_SIZE; + .sb.trailer : { + QUAD(0) + QUAD(0) + QUAD(0) + QUAD(0x000000207a49504c) } _end = .; + DWARF_DEBUG + ELF_DETAILS + + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the three reserved double words. + */ + .got.plt : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") + /* Sections to be discarded */ /DISCARD/ : { + COMMON_DISCARDS *(.eh_frame) - *(__ex_table) *(*__ksymtab*) *(___kcrctab*) } diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config new file mode 100644 index 000000000000..eb7f84f5925c --- /dev/null +++ b/arch/s390/configs/btf.config @@ -0,0 +1,2 @@ +# Help: Enable BTF debug info +CONFIG_DEBUG_INFO_BTF=y diff --git a/arch/s390/configs/compat.config b/arch/s390/configs/compat.config new file mode 100644 index 000000000000..6fd051453ae8 --- /dev/null +++ b/arch/s390/configs/compat.config @@ -0,0 +1,3 @@ +# Help: Enable compat support +CONFIG_COMPAT=y +CONFIG_COMPAT_32BIT_TIME=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index f6dfde577ce8..8ecad727497e 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -23,7 +23,6 @@ CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -39,28 +38,26 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y CONFIG_LIVEPATCH=y -CONFIG_MARCH_ZEC12=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m +CONFIG_KVM_S390_UCONTROL=y CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_S390_MODULES_SANITY_TEST=m @@ -74,6 +71,7 @@ CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_SIG_SHA256=y @@ -91,25 +89,26 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSMALLOC_STAT=y +CONFIG_SLAB_BUCKETS=y +CONFIG_SLUB_STATS=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y -CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y CONFIG_GUP_TEST=y CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -117,8 +116,8 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_SMC=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -133,7 +132,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -167,6 +165,8 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -182,17 +182,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m -CONFIG_NFT_OBJREF=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -205,8 +227,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -215,6 +239,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -229,6 +254,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -246,6 +272,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -297,12 +324,10 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -339,7 +364,6 @@ CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -350,7 +374,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -362,29 +385,32 @@ CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m @@ -401,8 +427,16 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y CONFIG_ZRAM=y +CONFIG_ZRAM_BACKEND_LZ4=y +CONFIG_ZRAM_BACKEND_LZ4HC=y +CONFIG_ZRAM_BACKEND_ZSTD=y +CONFIG_ZRAM_BACKEND_DEFLATE=y +CONFIG_ZRAM_BACKEND_842=y +CONFIG_ZRAM_BACKEND_LZO=y +CONFIG_ZRAM_DEF_COMP_DEFLATE=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m @@ -438,9 +472,8 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -465,8 +498,10 @@ CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -493,7 +528,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set @@ -509,25 +543,27 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_GOOGLE is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set -# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -535,9 +571,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -547,6 +583,7 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_TI is not set # CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m @@ -566,6 +603,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 +# CONFIG_LEGACY_TIOCSTI is not set CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_HANGCHECK_TIMER=m @@ -577,28 +615,33 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m -# CONFIG_DRM_DEBUG_MODESET_LOCK is not set +CONFIG_DRM=m +CONFIG_DRM_VIRTIO_GPU=m CONFIG_FB=y -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_FB_DEVICE is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m -CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -620,7 +663,9 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m -CONFIG_FS_DAX=y +CONFIG_BCACHEFS_FS=y +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -631,13 +676,13 @@ CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTA_DEBUG=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m CONFIG_NETFS_STATS=y -CONFIG_FSCACHE=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -647,16 +692,16 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -664,6 +709,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y @@ -671,6 +717,7 @@ CONFIG_NFSD=m CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y +# CONFIG_NFSD_LEGACY_CLIENT_TRACKING is not set CONFIG_CIFS=m CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y @@ -688,61 +735,39 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_SELFTESTS=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -750,9 +775,26 @@ CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -762,28 +804,26 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m +CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_HMAC_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_PKEY_CCA=m +CONFIG_PKEY_EP11=m +CONFIG_PKEY_PCKMO=m +CONFIG_PKEY_UV=m +CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRC32_SELFTEST=y -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_CRC8=m CONFIG_RANDOM32_SELFTEST=y CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y @@ -791,12 +831,12 @@ CONFIG_CMA_SIZE_MBYTES=0 CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_GDB_SCRIPTS=y CONFIG_HEADERS_INSTALL=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y CONFIG_PAGE_OWNER=y CONFIG_DEBUG_RODATA_TEST=y CONFIG_DEBUG_WX=y @@ -808,11 +848,10 @@ CONFIG_DEBUG_OBJECTS_TIMERS=y CONFIG_DEBUG_OBJECTS_WORK=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y -CONFIG_SLUB_DEBUG_ON=y -CONFIG_SLUB_STATS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VIRTUAL=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y @@ -824,21 +863,26 @@ CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y CONFIG_TEST_LOCKUP=m +CONFIG_DEBUG_PREEMPT=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y +CONFIG_LOCKDEP_BITS=16 +CONFIG_LOCKDEP_CHAINS_BITS=17 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y CONFIG_DEBUG_IRQFLAGS=y +CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 # CONFIG_RCU_TRACE is not set CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y +CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y CONFIG_IRQSOFF_TRACER=y @@ -846,15 +890,18 @@ CONFIG_PREEMPT_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_USER_EVENTS=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_FTRACE_SORT_STARTUP_TEST=y CONFIG_SAMPLES=y CONFIG_SAMPLE_TRACE_PRINTK=m CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m CONFIG_DEBUG_ENTRY=y +CONFIG_STRICT_MM_TYPECHECKS=y CONFIG_CIO_INJECT=y CONFIG_KUNIT=m CONFIG_KUNIT_DEBUGFS=y @@ -868,7 +915,7 @@ CONFIG_FAIL_MAKE_REQUEST=y CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y -CONFIG_FAIL_FUNCTION=y +CONFIG_FAULT_INJECTION_CONFIGFS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LKDTM=m CONFIG_TEST_MIN_HEAP=y @@ -877,7 +924,5 @@ CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y -CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 706df3a4a867..c13a77765162 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -21,7 +21,6 @@ CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -37,27 +36,24 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y CONFIG_LIVEPATCH=y -CONFIG_MARCH_ZEC12=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m @@ -69,6 +65,7 @@ CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_SIG_SHA256=y @@ -86,8 +83,11 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSMALLOC_STAT=y +CONFIG_SLAB_BUCKETS=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y @@ -95,13 +95,11 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y -CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -109,8 +107,8 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_SMC=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -125,7 +123,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -159,6 +156,8 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -174,17 +173,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m -CONFIG_NFT_OBJREF=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -197,8 +218,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -207,6 +230,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -221,6 +245,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -238,6 +263,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -289,12 +315,10 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -330,7 +354,6 @@ CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -341,7 +364,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -353,29 +375,32 @@ CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m @@ -392,8 +417,16 @@ CONFIG_HOTPLUG_PCI_S390=y CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y CONFIG_ZRAM=y +CONFIG_ZRAM_BACKEND_LZ4=y +CONFIG_ZRAM_BACKEND_LZ4HC=y +CONFIG_ZRAM_BACKEND_ZSTD=y +CONFIG_ZRAM_BACKEND_DEFLATE=y +CONFIG_ZRAM_BACKEND_842=y +CONFIG_ZRAM_BACKEND_LZO=y +CONFIG_ZRAM_DEF_COMP_DEFLATE=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m @@ -429,9 +462,8 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -456,8 +488,10 @@ CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -484,7 +518,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set @@ -500,25 +533,27 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_GOOGLE is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set -# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -526,9 +561,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -538,6 +573,7 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_TI is not set # CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m @@ -557,6 +593,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 +# CONFIG_LEGACY_TIOCSTI is not set CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_HANGCHECK_TIMER=m @@ -568,27 +605,33 @@ CONFIG_WATCHDOG_CORE=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m +CONFIG_DRM=m +CONFIG_DRM_VIRTIO_GPU=m CONFIG_FB=y -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_FB_DEVICE is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m -CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -607,7 +650,9 @@ CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m -CONFIG_FS_DAX=y +CONFIG_BCACHEFS_FS=m +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -617,13 +662,13 @@ CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m CONFIG_NETFS_STATS=y -CONFIG_FSCACHE=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -633,16 +678,17 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -650,6 +696,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y @@ -657,6 +704,7 @@ CONFIG_NFSD=m CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y +# CONFIG_NFSD_LEGACY_CLIENT_TRACKING is not set CONFIG_CIFS=m CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y @@ -674,61 +722,38 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_SELFTESTS=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -736,47 +761,63 @@ CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ZSTD=m CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_JITTERENTROPY_OSR=1 CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m +CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_HMAC_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_PKEY_CCA=m +CONFIG_PKEY_EP11=m +CONFIG_PKEY_PCKMO=m +CONFIG_PKEY_UV=m +CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_CRC8=m CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_GDB_SCRIPTS=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y @@ -785,23 +826,26 @@ CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y CONFIG_TEST_LOCKUP=m -CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y +CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_USER_EVENTS=y CONFIG_HIST_TRIGGERS=y CONFIG_SAMPLES=y CONFIG_SAMPLE_TRACE_PRINTK=m CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m CONFIG_KUNIT=m CONFIG_KUNIT_DEBUGFS=y CONFIG_LKDTM=m @@ -809,4 +853,3 @@ CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config new file mode 100644 index 000000000000..cefbe2ba1228 --- /dev/null +++ b/arch/s390/configs/kasan.config @@ -0,0 +1,4 @@ +# Help: Enable KASan for debugging +CONFIG_KASAN=y +CONFIG_KASAN_INLINE=y +CONFIG_KERNEL_IMAGE_BASE=0x7FFFE0000000 diff --git a/arch/s390/configs/mmtypes.config b/arch/s390/configs/mmtypes.config new file mode 100644 index 000000000000..fe32b442d789 --- /dev/null +++ b/arch/s390/configs/mmtypes.config @@ -0,0 +1,2 @@ +# Help: Enable strict memory management typechecks +CONFIG_STRICT_MM_TYPECHECKS=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index f4976f611b94..8163c1702720 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,4 +1,3 @@ -# CONFIG_SWAP is not set CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BPF_SYSCALL=y @@ -9,27 +8,23 @@ CONFIG_BPF_SYSCALL=y # CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -# CONFIG_COMPAT_BRK is not set -CONFIG_MARCH_ZEC12=y -CONFIG_TUNE_ZEC12=y -# CONFIG_COMPAT is not set +CONFIG_KEXEC=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 CONFIG_HZ_100=y -# CONFIG_RELOCATABLE is not set # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set -CONFIG_CRASH_DUMP=y +# CONFIG_AP is not set # CONFIG_PFAULT is not set -# CONFIG_S390_HYPFS_FS is not set +# CONFIG_S390_HYPFS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set # CONFIG_SECCOMP is not set -# CONFIG_GCC_PLUGINS is not set # CONFIG_BLOCK_LEGACY_AUTOLOAD is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_COMPACTION is not set -# CONFIG_MIGRATION is not set +# CONFIG_SWAP is not set +# CONFIG_COMPAT_BRK is not set CONFIG_NET=y # CONFIG_IUCV is not set # CONFIG_PCPU_DEV_REFCNT is not set @@ -50,14 +45,16 @@ CONFIG_ZFCP=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set +# CONFIG_LEGACY_TIOCSTI is not set # CONFIG_HVC_IUCV is not set # CONFIG_HW_RANDOM_S390 is not set # CONFIG_HMC_DRV is not set +# CONFIG_S390_UV_UAPI is not set # CONFIG_S390_TAPE is not set # CONFIG_VMCP is not set # CONFIG_MONWRITER is not set # CONFIG_S390_VMUR is not set -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set @@ -65,17 +62,14 @@ CONFIG_ZFCP=y # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y # CONFIG_SYMBOLIC_ERRNAME is not set CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y -# CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig new file mode 100644 index 000000000000..e2c27588b21a --- /dev/null +++ b/arch/s390/crypto/Kconfig @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (s390)" + +config CRYPTO_SHA512_S390 + tristate "Hash functions: SHA-384 and SHA-512" + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z10. + +config CRYPTO_SHA1_S390 + tristate "Hash functions: SHA-1" + select CRYPTO_HASH + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: s390 + + It is available as of z990. + +config CRYPTO_SHA3_256_S390 + tristate "Hash functions: SHA3-224 and SHA3-256" + select CRYPTO_HASH + help + SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_SHA3_512_S390 + tristate "Hash functions: SHA3-384 and SHA3-512" + select CRYPTO_HASH + help + SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_GHASH_S390 + tristate "Hash functions: GHASH" + select CRYPTO_HASH + help + GCM GHASH hash function (NIST SP800-38D) + + Architecture: s390 + + It is available as of z196. + +config CRYPTO_AES_S390 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM" + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + help + Block cipher: AES cipher algorithms (FIPS 197) + AEAD cipher: AES with GCM + Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes + + Architecture: s390 + + As of z9 the ECB and CBC modes are hardware accelerated + for 128 bit keys. + + As of z10 the ECB and CBC modes are hardware accelerated + for all AES key sizes. + + As of z196 the CTR mode is hardware accelerated for all AES + key sizes and XTS mode is hardware accelerated for 256 and + 512 bit keys. + +config CRYPTO_DES_S390 + tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR" + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_LIB_DES + help + Block ciphers: DES (FIPS 46-2) cipher algorithm + Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm + Length-preserving ciphers: DES with ECB, CBC, and CTR modes + Length-preserving ciphers: Triple DES EDED with ECB, CBC, and CTR modes + + Architecture: s390 + + As of z990 the ECB and CBC mode are hardware accelerated. + As of z196 the CTR mode is hardware accelerated. + +config CRYPTO_HMAC_S390 + tristate "Keyed-hash message authentication code: HMAC" + select CRYPTO_HASH + help + s390 specific HMAC hardware support for SHA224, SHA256, SHA384 and + SHA512. + + Architecture: s390 + +endmenu diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 1b1cc478fa94..21757d86cd49 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -4,18 +4,13 @@ # obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o -obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o +obj-$(CONFIG_CRYPTO_HMAC_S390) += hmac_s390.o obj-y += arch_random.o - -crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o -chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 1023e9d43d44..5d36f4020dfa 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -51,8 +51,13 @@ struct s390_aes_ctx { }; struct s390_xts_ctx { - u8 key[32]; - u8 pcc_key[32]; + union { + u8 keys[64]; + struct { + u8 key[32]; + u8 pcc_key[32]; + }; + }; int key_len; unsigned long fc; struct crypto_skcipher *fallback; @@ -61,7 +66,6 @@ struct s390_xts_ctx { struct gcm_sg_walk { struct scatter_walk walk; unsigned int walk_bytes; - u8 *walk_ptr; unsigned int walk_bytes_remain; u8 buf[AES_BLOCK_SIZE]; unsigned int buf_bytes; @@ -398,10 +402,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (err) return err; - /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) - return -EINVAL; - /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? CPACF_KM_XTS_128 : (key_len == 64) ? CPACF_KM_XTS_256 : 0; @@ -530,6 +530,108 @@ static struct skcipher_alg xts_aes_alg = { .decrypt = xts_aes_decrypt, }; +static int fullxts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + unsigned long fc; + int err; + + err = xts_fallback_setkey(tfm, in_key, key_len); + if (err) + return err; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 32) ? CPACF_KM_XTS_128_FULL : + (key_len == 64) ? CPACF_KM_XTS_256_FULL : 0; + + /* Check if the function code is available */ + xts_ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!xts_ctx->fc) + return 0; + + /* Store double-key */ + memcpy(xts_ctx->keys, in_key, key_len); + xts_ctx->key_len = key_len; + return 0; +} + +static int fullxts_aes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + unsigned int offset, nbytes, n; + struct skcipher_walk walk; + int ret; + struct { + __u8 key[64]; + __u8 tweak[16]; + __u8 nap[16]; + } fxts_param = { + .nap = {0}, + }; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + if (unlikely(!xts_ctx->fc || (req->cryptlen % AES_BLOCK_SIZE) != 0)) { + struct skcipher_request *subreq = skcipher_request_ctx(req); + + *subreq = *req; + skcipher_request_set_tfm(subreq, xts_ctx->fallback); + return (modifier & CPACF_DECRYPT) ? + crypto_skcipher_decrypt(subreq) : + crypto_skcipher_encrypt(subreq); + } + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + offset = xts_ctx->key_len & 0x20; + memcpy(fxts_param.key + offset, xts_ctx->keys, xts_ctx->key_len); + memcpy(fxts_param.tweak, req->iv, AES_BLOCK_SIZE); + fxts_param.nap[0] = 0x01; /* initial alpha power (1, little-endian) */ + + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_km(xts_ctx->fc | modifier, fxts_param.key + offset, + walk.dst.virt.addr, walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); + } + memzero_explicit(&fxts_param, sizeof(fxts_param)); + return ret; +} + +static int fullxts_aes_encrypt(struct skcipher_request *req) +{ + return fullxts_aes_crypt(req, 0); +} + +static int fullxts_aes_decrypt(struct skcipher_request *req) +{ + return fullxts_aes_crypt(req, CPACF_DECRYPT); +} + +static struct skcipher_alg fullxts_aes_alg = { + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "full-xts-aes-s390", + .base.cra_priority = 403, /* aes-xts-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_xts_ctx), + .base.cra_module = THIS_MODULE, + .init = xts_fallback_init, + .exit = xts_fallback_exit, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = fullxts_aes_set_key, + .encrypt = fullxts_aes_encrypt, + .decrypt = fullxts_aes_decrypt, +}; + static int ctr_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -601,7 +703,9 @@ static int ctr_aes_crypt(struct skcipher_request *req) * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - cpacf_kmctr(sctx->fc, sctx->key, buf, walk.src.virt.addr, + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); + cpacf_kmctr(sctx->fc, sctx->key, buf, buf, AES_BLOCK_SIZE, walk.iv); memcpy(walk.dst.virt.addr, buf, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); @@ -682,29 +786,20 @@ static void gcm_walk_start(struct gcm_sg_walk *gw, struct scatterlist *sg, static inline unsigned int _gcm_sg_clamp_and_map(struct gcm_sg_walk *gw) { - struct scatterlist *nextsg; - - gw->walk_bytes = scatterwalk_clamp(&gw->walk, gw->walk_bytes_remain); - while (!gw->walk_bytes) { - nextsg = sg_next(gw->walk.sg); - if (!nextsg) - return 0; - scatterwalk_start(&gw->walk, nextsg); - gw->walk_bytes = scatterwalk_clamp(&gw->walk, - gw->walk_bytes_remain); - } - gw->walk_ptr = scatterwalk_map(&gw->walk); + if (gw->walk_bytes_remain == 0) + return 0; + gw->walk_bytes = scatterwalk_next(&gw->walk, gw->walk_bytes_remain); return gw->walk_bytes; } static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw, - unsigned int nbytes) + unsigned int nbytes, bool out) { gw->walk_bytes_remain -= nbytes; - scatterwalk_unmap(gw->walk_ptr); - scatterwalk_advance(&gw->walk, nbytes); - scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); - gw->walk_ptr = NULL; + if (out) + scatterwalk_done_dst(&gw->walk, nbytes); + else + scatterwalk_done_src(&gw->walk, nbytes); } static int gcm_in_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) @@ -730,16 +825,16 @@ static int gcm_in_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) } if (!gw->buf_bytes && gw->walk_bytes >= minbytesneeded) { - gw->ptr = gw->walk_ptr; + gw->ptr = gw->walk.addr; gw->nbytes = gw->walk_bytes; goto out; } while (1) { n = min(gw->walk_bytes, AES_BLOCK_SIZE - gw->buf_bytes); - memcpy(gw->buf + gw->buf_bytes, gw->walk_ptr, n); + memcpy(gw->buf + gw->buf_bytes, gw->walk.addr, n); gw->buf_bytes += n; - _gcm_sg_unmap_and_advance(gw, n); + _gcm_sg_unmap_and_advance(gw, n, false); if (gw->buf_bytes >= minbytesneeded) { gw->ptr = gw->buf; gw->nbytes = gw->buf_bytes; @@ -771,13 +866,12 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) } if (gw->walk_bytes >= minbytesneeded) { - gw->ptr = gw->walk_ptr; + gw->ptr = gw->walk.addr; gw->nbytes = gw->walk_bytes; goto out; } - scatterwalk_unmap(gw->walk_ptr); - gw->walk_ptr = NULL; + scatterwalk_unmap(&gw->walk); gw->ptr = gw->buf; gw->nbytes = sizeof(gw->buf); @@ -799,7 +893,7 @@ static int gcm_in_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) } else gw->buf_bytes = 0; } else - _gcm_sg_unmap_and_advance(gw, bytesdone); + _gcm_sg_unmap_and_advance(gw, bytesdone, false); return bytesdone; } @@ -816,11 +910,11 @@ static int gcm_out_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) if (!_gcm_sg_clamp_and_map(gw)) return i; n = min(gw->walk_bytes, bytesdone - i); - memcpy(gw->walk_ptr, gw->buf + i, n); - _gcm_sg_unmap_and_advance(gw, n); + memcpy(gw->walk.addr, gw->buf + i, n); + _gcm_sg_unmap_and_advance(gw, n, true); } } else - _gcm_sg_unmap_and_advance(gw, bytesdone); + _gcm_sg_unmap_and_advance(gw, bytesdone, true); return bytesdone; } @@ -957,7 +1051,7 @@ static struct aead_alg gcm_aes_aead = { }; static struct crypto_alg *aes_s390_alg; -static struct skcipher_alg *aes_s390_skcipher_algs[4]; +static struct skcipher_alg *aes_s390_skcipher_algs[5]; static int aes_s390_skciphers_num; static struct aead_alg *aes_s390_aead_alg; @@ -1014,6 +1108,13 @@ static int __init aes_s390_init(void) goto out_err; } + if (cpacf_test_func(&km_functions, CPACF_KM_XTS_128_FULL) || + cpacf_test_func(&km_functions, CPACF_KM_XTS_256_FULL)) { + ret = aes_s390_register_skcipher(&fullxts_aes_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&km_functions, CPACF_KM_XTS_128) || cpacf_test_func(&km_functions, CPACF_KM_XTS_256)) { ret = aes_s390_register_skcipher(&xts_aes_alg); @@ -1049,11 +1150,11 @@ out_err: return ret; } -module_cpu_feature_match(MSA, aes_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init); module_exit(aes_s390_fini); MODULE_ALIAS_CRYPTO("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); -MODULE_IMPORT_NS(CRYPTO_INTERNAL); +MODULE_IMPORT_NS("CRYPTO_INTERNAL"); diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index 1f2d40993c4d..a8a2407381af 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/random.h> #include <linux/static_key.h> +#include <asm/archrandom.h> #include <asm/cpacf.h> DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c deleted file mode 100644 index 2ec51f339cec..000000000000 --- a/arch/s390/crypto/chacha-glue.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * s390 ChaCha stream cipher. - * - * Copyright IBM Corp. 2021 - */ - -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <crypto/internal/chacha.h> -#include <crypto/internal/skcipher.h> -#include <crypto/algapi.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/fpu/api.h> -#include "chacha-s390.h" - -static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src, - unsigned int nbytes, const u32 *key, - u32 *counter) -{ - struct kernel_fpu vxstate; - - kernel_fpu_begin(&vxstate, KERNEL_VXR); - chacha20_vx(dst, src, nbytes, key, counter); - kernel_fpu_end(&vxstate, KERNEL_VXR); - - *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static int chacha20_s390(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[CHACHA_STATE_WORDS] __aligned(16); - struct skcipher_walk walk; - unsigned int nbytes; - int rc; - - rc = skcipher_walk_virt(&walk, req, false); - chacha_init_generic(state, ctx->key, req->iv); - - while (walk.nbytes > 0) { - nbytes = walk.nbytes; - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (nbytes <= CHACHA_BLOCK_SIZE) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - chacha20_crypt_s390(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - &state[4], &state[12]); - } - rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - return rc; -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, stream, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - /* s390 chacha20 implementation has 20 rounds hard-coded, - * it cannot handle a block of data or less, but otherwise - * it can handle data of arbitrary size - */ - if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20) - chacha_crypt_generic(state, dst, src, bytes, nrounds); - else - chacha20_crypt_s390(state, dst, src, bytes, - &state[4], &state[12]); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static struct skcipher_alg chacha_algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-s390", - .base.cra_priority = 900, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha20_s390, - .decrypt = chacha20_s390, - } -}; - -static int __init chacha_mod_init(void) -{ - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? - crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; -} - -static void __exit chacha_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) - crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)); -} - -module_cpu_feature_match(VXRS, chacha_mod_init); -module_exit(chacha_mod_fini); - -MODULE_DESCRIPTION("ChaCha20 stream cipher"); -MODULE_LICENSE("GPL v2"); - -MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c deleted file mode 100644 index fafecad20752..000000000000 --- a/arch/s390/crypto/crc32-vx.c +++ /dev/null @@ -1,310 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Crypto-API module for CRC-32 algorithms implemented with the - * z/Architecture Vector Extension Facility. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ -#define KMSG_COMPONENT "crc32-vx" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <linux/crc32.h> -#include <crypto/internal/hash.h> -#include <asm/fpu/api.h> - - -#define CRC32_BLOCK_SIZE 1 -#define CRC32_DIGEST_SIZE 4 - -#define VX_MIN_LEN 64 -#define VX_ALIGNMENT 16L -#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) - -struct crc_ctx { - u32 key; -}; - -struct crc_desc_ctx { - u32 crc; -}; - -/* Prototypes for functions in assembly files */ -u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); -u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); -u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); - -/* - * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension - * - * Creates a function to perform a particular CRC-32 computation. Depending - * on the message buffer, the hardware-accelerated or software implementation - * is used. Note that the message buffer is aligned to improve fetch - * operations of VECTOR LOAD MULTIPLE instructions. - * - */ -#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ - static u32 __pure ___fname(u32 crc, \ - unsigned char const *data, size_t datalen) \ - { \ - struct kernel_fpu vxstate; \ - unsigned long prealign, aligned, remaining; \ - \ - if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \ - return ___crc32_sw(crc, data, datalen); \ - \ - if ((unsigned long)data & VX_ALIGN_MASK) { \ - prealign = VX_ALIGNMENT - \ - ((unsigned long)data & VX_ALIGN_MASK); \ - datalen -= prealign; \ - crc = ___crc32_sw(crc, data, prealign); \ - data = (void *)((unsigned long)data + prealign); \ - } \ - \ - aligned = datalen & ~VX_ALIGN_MASK; \ - remaining = datalen & VX_ALIGN_MASK; \ - \ - kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ - crc = ___crc32_vx(crc, data, aligned); \ - kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ - \ - if (remaining) \ - crc = ___crc32_sw(crc, data + aligned, remaining); \ - \ - return crc; \ - } - -DEFINE_CRC32_VX(crc32_le_vx, crc32_le_vgfm_16, crc32_le) -DEFINE_CRC32_VX(crc32_be_vx, crc32_be_vgfm_16, crc32_be) -DEFINE_CRC32_VX(crc32c_le_vx, crc32c_le_vgfm_16, __crc32c_le) - - -static int crc32_vx_cra_init_zero(struct crypto_tfm *tfm) -{ - struct crc_ctx *mctx = crypto_tfm_ctx(tfm); - - mctx->key = 0; - return 0; -} - -static int crc32_vx_cra_init_invert(struct crypto_tfm *tfm) -{ - struct crc_ctx *mctx = crypto_tfm_ctx(tfm); - - mctx->key = ~0; - return 0; -} - -static int crc32_vx_init(struct shash_desc *desc) -{ - struct crc_ctx *mctx = crypto_shash_ctx(desc->tfm); - struct crc_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = mctx->key; - return 0; -} - -static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, - unsigned int newkeylen) -{ - struct crc_ctx *mctx = crypto_shash_ctx(tfm); - - if (newkeylen != sizeof(mctx->key)) - return -EINVAL; - mctx->key = le32_to_cpu(*(__le32 *)newkey); - return 0; -} - -static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, - unsigned int newkeylen) -{ - struct crc_ctx *mctx = crypto_shash_ctx(tfm); - - if (newkeylen != sizeof(mctx->key)) - return -EINVAL; - mctx->key = be32_to_cpu(*(__be32 *)newkey); - return 0; -} - -static int crc32le_vx_final(struct shash_desc *desc, u8 *out) -{ - struct crc_desc_ctx *ctx = shash_desc_ctx(desc); - - *(__le32 *)out = cpu_to_le32p(&ctx->crc); - return 0; -} - -static int crc32be_vx_final(struct shash_desc *desc, u8 *out) -{ - struct crc_desc_ctx *ctx = shash_desc_ctx(desc); - - *(__be32 *)out = cpu_to_be32p(&ctx->crc); - return 0; -} - -static int crc32c_vx_final(struct shash_desc *desc, u8 *out) -{ - struct crc_desc_ctx *ctx = shash_desc_ctx(desc); - - /* - * Perform a final XOR with 0xFFFFFFFF to be in sync - * with the generic crc32c shash implementation. - */ - *(__le32 *)out = ~cpu_to_le32p(&ctx->crc); - return 0; -} - -static int __crc32le_vx_finup(u32 *crc, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = cpu_to_le32(crc32_le_vx(*crc, data, len)); - return 0; -} - -static int __crc32be_vx_finup(u32 *crc, const u8 *data, unsigned int len, - u8 *out) -{ - *(__be32 *)out = cpu_to_be32(crc32_be_vx(*crc, data, len)); - return 0; -} - -static int __crc32c_vx_finup(u32 *crc, const u8 *data, unsigned int len, - u8 *out) -{ - /* - * Perform a final XOR with 0xFFFFFFFF to be in sync - * with the generic crc32c shash implementation. - */ - *(__le32 *)out = ~cpu_to_le32(crc32c_le_vx(*crc, data, len)); - return 0; -} - - -#define CRC32_VX_FINUP(alg, func) \ - static int alg ## _vx_finup(struct shash_desc *desc, const u8 *data, \ - unsigned int datalen, u8 *out) \ - { \ - return __ ## alg ## _vx_finup(shash_desc_ctx(desc), \ - data, datalen, out); \ - } - -CRC32_VX_FINUP(crc32le, crc32_le_vx) -CRC32_VX_FINUP(crc32be, crc32_be_vx) -CRC32_VX_FINUP(crc32c, crc32c_le_vx) - -#define CRC32_VX_DIGEST(alg, func) \ - static int alg ## _vx_digest(struct shash_desc *desc, const u8 *data, \ - unsigned int len, u8 *out) \ - { \ - return __ ## alg ## _vx_finup(crypto_shash_ctx(desc->tfm), \ - data, len, out); \ - } - -CRC32_VX_DIGEST(crc32le, crc32_le_vx) -CRC32_VX_DIGEST(crc32be, crc32_be_vx) -CRC32_VX_DIGEST(crc32c, crc32c_le_vx) - -#define CRC32_VX_UPDATE(alg, func) \ - static int alg ## _vx_update(struct shash_desc *desc, const u8 *data, \ - unsigned int datalen) \ - { \ - struct crc_desc_ctx *ctx = shash_desc_ctx(desc); \ - ctx->crc = func(ctx->crc, data, datalen); \ - return 0; \ - } - -CRC32_VX_UPDATE(crc32le, crc32_le_vx) -CRC32_VX_UPDATE(crc32be, crc32_be_vx) -CRC32_VX_UPDATE(crc32c, crc32c_le_vx) - - -static struct shash_alg crc32_vx_algs[] = { - /* CRC-32 LE */ - { - .init = crc32_vx_init, - .setkey = crc32_vx_setkey, - .update = crc32le_vx_update, - .final = crc32le_vx_final, - .finup = crc32le_vx_finup, - .digest = crc32le_vx_digest, - .descsize = sizeof(struct crc_desc_ctx), - .digestsize = CRC32_DIGEST_SIZE, - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-vx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CRC32_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crc_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32_vx_cra_init_zero, - }, - }, - /* CRC-32 BE */ - { - .init = crc32_vx_init, - .setkey = crc32be_vx_setkey, - .update = crc32be_vx_update, - .final = crc32be_vx_final, - .finup = crc32be_vx_finup, - .digest = crc32be_vx_digest, - .descsize = sizeof(struct crc_desc_ctx), - .digestsize = CRC32_DIGEST_SIZE, - .base = { - .cra_name = "crc32be", - .cra_driver_name = "crc32be-vx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CRC32_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crc_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32_vx_cra_init_zero, - }, - }, - /* CRC-32C LE */ - { - .init = crc32_vx_init, - .setkey = crc32_vx_setkey, - .update = crc32c_vx_update, - .final = crc32c_vx_final, - .finup = crc32c_vx_finup, - .digest = crc32c_vx_digest, - .descsize = sizeof(struct crc_desc_ctx), - .digestsize = CRC32_DIGEST_SIZE, - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-vx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CRC32_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crc_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32_vx_cra_init_invert, - }, - }, -}; - - -static int __init crc_vx_mod_init(void) -{ - return crypto_register_shashes(crc32_vx_algs, - ARRAY_SIZE(crc32_vx_algs)); -} - -static void __exit crc_vx_mod_exit(void) -{ - crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs)); -} - -module_cpu_feature_match(VXRS, crc_vx_mod_init); -module_exit(crc_vx_mod_exit); - -MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crc32"); -MODULE_ALIAS_CRYPTO("crc32-vx"); -MODULE_ALIAS_CRYPTO("crc32c"); -MODULE_ALIAS_CRYPTO("crc32c-vx"); diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index e013088b5115..8e75b83a5ddc 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -492,7 +492,7 @@ out_err: return ret; } -module_cpu_feature_match(MSA, des_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init); module_exit(des_s390_exit); MODULE_ALIAS_CRYPTO("des"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index 6b07a2f1ce8a..dcbcee37cb63 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -8,29 +8,28 @@ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ +#include <asm/cpacf.h> +#include <crypto/ghash.h> #include <crypto/internal/hash.h> -#include <linux/module.h> #include <linux/cpufeature.h> -#include <asm/cpacf.h> - -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> -struct ghash_ctx { +struct s390_ghash_ctx { u8 key[GHASH_BLOCK_SIZE]; }; -struct ghash_desc_ctx { +struct s390_ghash_desc_ctx { u8 icv[GHASH_BLOCK_SIZE]; u8 key[GHASH_BLOCK_SIZE]; - u8 buffer[GHASH_BLOCK_SIZE]; - u32 bytes; }; static int ghash_init(struct shash_desc *desc) { - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + struct s390_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc); memset(dctx, 0, sizeof(*dctx)); memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE); @@ -41,7 +40,7 @@ static int ghash_init(struct shash_desc *desc) static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + struct s390_ghash_ctx *ctx = crypto_shash_ctx(tfm); if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; @@ -54,80 +53,71 @@ static int ghash_setkey(struct crypto_shash *tfm, static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc); unsigned int n; - u8 *buf = dctx->buffer; - - if (dctx->bytes) { - u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); - n = min(srclen, dctx->bytes); - dctx->bytes -= n; - srclen -= n; - - memcpy(pos, src, n); - src += n; + n = srclen & ~(GHASH_BLOCK_SIZE - 1); + cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n); + return srclen - n; +} - if (!dctx->bytes) { - cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, - GHASH_BLOCK_SIZE); - } - } +static void ghash_flush(struct s390_ghash_desc_ctx *dctx, const u8 *src, + unsigned int len) +{ + if (len) { + u8 buf[GHASH_BLOCK_SIZE] = {}; - n = srclen & ~(GHASH_BLOCK_SIZE - 1); - if (n) { - cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n); - src += n; - srclen -= n; + memcpy(buf, src, len); + cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); + memzero_explicit(buf, sizeof(buf)); } +} - if (srclen) { - dctx->bytes = GHASH_BLOCK_SIZE - srclen; - memcpy(buf, src, srclen); - } +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) +{ + struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc); + ghash_flush(dctx, src, len); + memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE); return 0; } -static int ghash_flush(struct ghash_desc_ctx *dctx) +static int ghash_export(struct shash_desc *desc, void *out) { - u8 *buf = dctx->buffer; - - if (dctx->bytes) { - u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); - - memset(pos, 0, dctx->bytes); - cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); - dctx->bytes = 0; - } + struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc); + memcpy(out, dctx->icv, GHASH_DIGEST_SIZE); return 0; } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_import(struct shash_desc *desc, const void *in) { - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - int ret; + struct s390_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - ret = ghash_flush(dctx); - if (!ret) - memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE); - return ret; + memcpy(dctx->icv, in, GHASH_DIGEST_SIZE); + memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE); + return 0; } static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), + .export = ghash_export, + .import = ghash_import, + .statesize = sizeof(struct ghash_desc_ctx), + .descsize = sizeof(struct s390_ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_ctxsize = sizeof(struct s390_ghash_ctx), .cra_module = THIS_MODULE, }, }; @@ -145,7 +135,7 @@ static void __exit ghash_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(MSA, ghash_mod_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init); module_exit(ghash_mod_exit); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/s390/crypto/hmac_s390.c b/arch/s390/crypto/hmac_s390.c new file mode 100644 index 000000000000..93a1098d9f8d --- /dev/null +++ b/arch/s390/crypto/hmac_s390.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright IBM Corp. 2024 + * + * s390 specific HMAC support. + */ + +#define KMSG_COMPONENT "hmac_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <asm/cpacf.h> +#include <crypto/internal/hash.h> +#include <crypto/hmac.h> +#include <crypto/sha2.h> +#include <linux/cpufeature.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +/* + * KMAC param block layout for sha2 function codes: + * The layout of the param block for the KMAC instruction depends on the + * blocksize of the used hashing sha2-algorithm function codes. The param block + * contains the hash chaining value (cv), the input message bit-length (imbl) + * and the hmac-secret (key). To prevent code duplication, the sizes of all + * these are calculated based on the blocksize. + * + * param-block: + * +-------+ + * | cv | + * +-------+ + * | imbl | + * +-------+ + * | key | + * +-------+ + * + * sizes: + * part | sh2-alg | calculation | size | type + * -----+---------+-------------+------+-------- + * cv | 224/256 | blocksize/2 | 32 | u64[8] + * | 384/512 | | 64 | u128[8] + * imbl | 224/256 | blocksize/8 | 8 | u64 + * | 384/512 | | 16 | u128 + * key | 224/256 | blocksize | 64 | u8[64] + * | 384/512 | | 128 | u8[128] + */ + +#define MAX_DIGEST_SIZE SHA512_DIGEST_SIZE +#define MAX_IMBL_SIZE sizeof(u128) +#define MAX_BLOCK_SIZE SHA512_BLOCK_SIZE + +#define SHA2_CV_SIZE(bs) ((bs) >> 1) +#define SHA2_IMBL_SIZE(bs) ((bs) >> 3) + +#define SHA2_IMBL_OFFSET(bs) (SHA2_CV_SIZE(bs)) +#define SHA2_KEY_OFFSET(bs) (SHA2_CV_SIZE(bs) + SHA2_IMBL_SIZE(bs)) + +struct s390_hmac_ctx { + u8 key[MAX_BLOCK_SIZE]; +}; + +union s390_kmac_gr0 { + unsigned long reg; + struct { + unsigned long : 48; + unsigned long ikp : 1; + unsigned long iimp : 1; + unsigned long ccup : 1; + unsigned long : 6; + unsigned long fc : 7; + }; +}; + +struct s390_kmac_sha2_ctx { + u8 param[MAX_DIGEST_SIZE + MAX_IMBL_SIZE + MAX_BLOCK_SIZE]; + union s390_kmac_gr0 gr0; + u64 buflen[2]; +}; + +/* + * kmac_sha2_set_imbl - sets the input message bit-length based on the blocksize + */ +static inline void kmac_sha2_set_imbl(u8 *param, u64 buflen_lo, + u64 buflen_hi, unsigned int blocksize) +{ + u8 *imbl = param + SHA2_IMBL_OFFSET(blocksize); + + switch (blocksize) { + case SHA256_BLOCK_SIZE: + *(u64 *)imbl = buflen_lo * BITS_PER_BYTE; + break; + case SHA512_BLOCK_SIZE: + *(u128 *)imbl = (((u128)buflen_hi << 64) + buflen_lo) << 3; + break; + default: + break; + } +} + +static int hash_data(const u8 *in, unsigned int inlen, + u8 *digest, unsigned int digestsize, bool final) +{ + unsigned long func; + union { + struct sha256_paramblock { + u32 h[8]; + u64 mbl; + } sha256; + struct sha512_paramblock { + u64 h[8]; + u128 mbl; + } sha512; + } __packed param; + +#define PARAM_INIT(x, y, z) \ + param.sha##x.h[0] = SHA##y ## _H0; \ + param.sha##x.h[1] = SHA##y ## _H1; \ + param.sha##x.h[2] = SHA##y ## _H2; \ + param.sha##x.h[3] = SHA##y ## _H3; \ + param.sha##x.h[4] = SHA##y ## _H4; \ + param.sha##x.h[5] = SHA##y ## _H5; \ + param.sha##x.h[6] = SHA##y ## _H6; \ + param.sha##x.h[7] = SHA##y ## _H7; \ + param.sha##x.mbl = (z) + + switch (digestsize) { + case SHA224_DIGEST_SIZE: + func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256; + PARAM_INIT(256, 224, inlen * 8); + if (!final) + digestsize = SHA256_DIGEST_SIZE; + break; + case SHA256_DIGEST_SIZE: + func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256; + PARAM_INIT(256, 256, inlen * 8); + break; + case SHA384_DIGEST_SIZE: + func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512; + PARAM_INIT(512, 384, inlen * 8); + if (!final) + digestsize = SHA512_DIGEST_SIZE; + break; + case SHA512_DIGEST_SIZE: + func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512; + PARAM_INIT(512, 512, inlen * 8); + break; + default: + return -EINVAL; + } + +#undef PARAM_INIT + + cpacf_klmd(func, ¶m, in, inlen); + + memcpy(digest, ¶m, digestsize); + + return 0; +} + +static int hash_key(const u8 *in, unsigned int inlen, + u8 *digest, unsigned int digestsize) +{ + return hash_data(in, inlen, digest, digestsize, true); +} + +static int s390_hmac_sha2_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct s390_hmac_ctx *tfm_ctx = crypto_shash_ctx(tfm); + unsigned int ds = crypto_shash_digestsize(tfm); + unsigned int bs = crypto_shash_blocksize(tfm); + + memset(tfm_ctx, 0, sizeof(*tfm_ctx)); + + if (keylen > bs) + return hash_key(key, keylen, tfm_ctx->key, ds); + + memcpy(tfm_ctx->key, key, keylen); + return 0; +} + +static int s390_hmac_sha2_init(struct shash_desc *desc) +{ + struct s390_hmac_ctx *tfm_ctx = crypto_shash_ctx(desc->tfm); + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + + memcpy(ctx->param + SHA2_KEY_OFFSET(bs), + tfm_ctx->key, bs); + + ctx->buflen[0] = 0; + ctx->buflen[1] = 0; + ctx->gr0.reg = 0; + switch (crypto_shash_digestsize(desc->tfm)) { + case SHA224_DIGEST_SIZE: + ctx->gr0.fc = CPACF_KMAC_HMAC_SHA_224; + break; + case SHA256_DIGEST_SIZE: + ctx->gr0.fc = CPACF_KMAC_HMAC_SHA_256; + break; + case SHA384_DIGEST_SIZE: + ctx->gr0.fc = CPACF_KMAC_HMAC_SHA_384; + break; + case SHA512_DIGEST_SIZE: + ctx->gr0.fc = CPACF_KMAC_HMAC_SHA_512; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int s390_hmac_sha2_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + unsigned int n = round_down(len, bs); + + ctx->buflen[0] += n; + if (ctx->buflen[0] < n) + ctx->buflen[1]++; + + /* process as many blocks as possible */ + ctx->gr0.iimp = 1; + _cpacf_kmac(&ctx->gr0.reg, ctx->param, data, n); + return len - n; +} + +static int s390_hmac_sha2_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + + ctx->buflen[0] += len; + if (ctx->buflen[0] < len) + ctx->buflen[1]++; + + ctx->gr0.iimp = 0; + kmac_sha2_set_imbl(ctx->param, ctx->buflen[0], ctx->buflen[1], bs); + _cpacf_kmac(&ctx->gr0.reg, ctx->param, src, len); + memcpy(out, ctx->param, crypto_shash_digestsize(desc->tfm)); + + return 0; +} + +static int s390_hmac_sha2_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int ds = crypto_shash_digestsize(desc->tfm); + int rc; + + rc = s390_hmac_sha2_init(desc); + if (rc) + return rc; + + ctx->gr0.iimp = 0; + kmac_sha2_set_imbl(ctx->param, len, 0, + crypto_shash_blocksize(desc->tfm)); + _cpacf_kmac(&ctx->gr0.reg, ctx->param, data, len); + memcpy(out, ctx->param, ds); + + return 0; +} + +static int s390_hmac_export_zero(struct shash_desc *desc, void *out) +{ + struct crypto_shash *tfm = desc->tfm; + u8 ipad[SHA512_BLOCK_SIZE]; + struct s390_hmac_ctx *ctx; + unsigned int bs; + int err, i; + + ctx = crypto_shash_ctx(tfm); + bs = crypto_shash_blocksize(tfm); + for (i = 0; i < bs; i++) + ipad[i] = ctx->key[i] ^ HMAC_IPAD_VALUE; + + err = hash_data(ipad, bs, out, crypto_shash_digestsize(tfm), false); + memzero_explicit(ipad, sizeof(ipad)); + return err; +} + +static int s390_hmac_export(struct shash_desc *desc, void *out) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + unsigned int ds = bs / 2; + union { + u8 *u8; + u64 *u64; + } p = { .u8 = out }; + int err = 0; + + if (!ctx->gr0.ikp) + err = s390_hmac_export_zero(desc, out); + else + memcpy(p.u8, ctx->param, ds); + p.u8 += ds; + put_unaligned(ctx->buflen[0], p.u64++); + if (ds == SHA512_DIGEST_SIZE) + put_unaligned(ctx->buflen[1], p.u64); + return err; +} + +static int s390_hmac_import(struct shash_desc *desc, const void *in) +{ + struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc); + unsigned int bs = crypto_shash_blocksize(desc->tfm); + unsigned int ds = bs / 2; + union { + const u8 *u8; + const u64 *u64; + } p = { .u8 = in }; + int err; + + err = s390_hmac_sha2_init(desc); + memcpy(ctx->param, p.u8, ds); + p.u8 += ds; + ctx->buflen[0] = get_unaligned(p.u64++); + if (ds == SHA512_DIGEST_SIZE) + ctx->buflen[1] = get_unaligned(p.u64); + if (ctx->buflen[0] | ctx->buflen[1]) + ctx->gr0.ikp = 1; + return err; +} + +#define S390_HMAC_SHA2_ALG(x, ss) { \ + .fc = CPACF_KMAC_HMAC_SHA_##x, \ + .alg = { \ + .init = s390_hmac_sha2_init, \ + .update = s390_hmac_sha2_update, \ + .finup = s390_hmac_sha2_finup, \ + .digest = s390_hmac_sha2_digest, \ + .setkey = s390_hmac_sha2_setkey, \ + .export = s390_hmac_export, \ + .import = s390_hmac_import, \ + .descsize = sizeof(struct s390_kmac_sha2_ctx), \ + .halg = { \ + .statesize = ss, \ + .digestsize = SHA##x##_DIGEST_SIZE, \ + .base = { \ + .cra_name = "hmac(sha" #x ")", \ + .cra_driver_name = "hmac_s390_sha" #x, \ + .cra_blocksize = SHA##x##_BLOCK_SIZE, \ + .cra_priority = 400, \ + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | \ + CRYPTO_AHASH_ALG_FINUP_MAX, \ + .cra_ctxsize = sizeof(struct s390_hmac_ctx), \ + .cra_module = THIS_MODULE, \ + }, \ + }, \ + }, \ +} + +static struct s390_hmac_alg { + bool registered; + unsigned int fc; + struct shash_alg alg; +} s390_hmac_algs[] = { + S390_HMAC_SHA2_ALG(224, sizeof(struct crypto_sha256_state)), + S390_HMAC_SHA2_ALG(256, sizeof(struct crypto_sha256_state)), + S390_HMAC_SHA2_ALG(384, SHA512_STATE_SIZE), + S390_HMAC_SHA2_ALG(512, SHA512_STATE_SIZE), +}; + +static __always_inline void _s390_hmac_algs_unregister(void) +{ + struct s390_hmac_alg *hmac; + int i; + + for (i = ARRAY_SIZE(s390_hmac_algs) - 1; i >= 0; i--) { + hmac = &s390_hmac_algs[i]; + if (!hmac->registered) + continue; + crypto_unregister_shash(&hmac->alg); + } +} + +static int __init hmac_s390_init(void) +{ + struct s390_hmac_alg *hmac; + int i, rc = -ENODEV; + + if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_256)) + return -ENODEV; + if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_512)) + return -ENODEV; + + for (i = 0; i < ARRAY_SIZE(s390_hmac_algs); i++) { + hmac = &s390_hmac_algs[i]; + if (!cpacf_query_func(CPACF_KMAC, hmac->fc)) + continue; + + rc = crypto_register_shash(&hmac->alg); + if (rc) { + pr_err("unable to register %s\n", + hmac->alg.halg.base.cra_name); + goto out; + } + hmac->registered = true; + pr_debug("registered %s\n", hmac->alg.halg.base.cra_name); + } + return rc; +out: + _s390_hmac_algs_unregister(); + return rc; +} + +static void __exit hmac_s390_exit(void) +{ + _s390_hmac_algs_unregister(); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, hmac_s390_init); +module_exit(hmac_s390_exit); + +MODULE_DESCRIPTION("S390 HMAC driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index a279b7d23a5e..8a340c16acb4 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -5,7 +5,7 @@ * s390 implementation of the AES Cipher Algorithm with protected keys. * * s390 Version: - * Copyright IBM Corp. 2017,2020 + * Copyright IBM Corp. 2017, 2025 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * Harald Freudenberger <freude@de.ibm.com> */ @@ -13,16 +13,18 @@ #define KMSG_COMPONENT "paes_s390" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include <crypto/aes.h> -#include <crypto/algapi.h> -#include <linux/bug.h> -#include <linux/err.h> -#include <linux/module.h> +#include <linux/atomic.h> #include <linux/cpufeature.h> +#include <linux/delay.h> +#include <linux/err.h> #include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> -#include <linux/delay.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/engine.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <asm/cpacf.h> @@ -34,214 +36,464 @@ * is called. As paes can handle different kinds of key blobs * and padding is also possible, the limits need to be generous. */ -#define PAES_MIN_KEYSIZE 16 -#define PAES_MAX_KEYSIZE 320 +#define PAES_MIN_KEYSIZE 16 +#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE +#define PAES_256_PROTKEY_SIZE (32 + 32) /* key + verification pattern */ +#define PXTS_256_PROTKEY_SIZE (32 + 32 + 32) /* k1 + k2 + verification pattern */ static u8 *ctrblk; static DEFINE_MUTEX(ctrblk_lock); static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; -struct key_blob { - /* - * Small keys will be stored in the keybuf. Larger keys are - * stored in extra allocated memory. In both cases does - * key point to the memory where the key is stored. - * The code distinguishes by checking keylen against - * sizeof(keybuf). See the two following helper functions. - */ - u8 *key; - u8 keybuf[128]; +static struct crypto_engine *paes_crypto_engine; +#define MAX_QLEN 10 + +/* + * protected key specific stuff + */ + +struct paes_protkey { + u32 type; + u32 len; + u8 protkey[PXTS_256_PROTKEY_SIZE]; +}; + +#define PK_STATE_NO_KEY 0 +#define PK_STATE_CONVERT_IN_PROGRESS 1 +#define PK_STATE_VALID 2 + +struct s390_paes_ctx { + /* source key material used to derive a protected key from */ + u8 keybuf[PAES_MAX_KEYSIZE]; + unsigned int keylen; + + /* cpacf function code to use with this protected key type */ + long fc; + + /* nr of requests enqueued via crypto engine which use this tfm ctx */ + atomic_t via_engine_ctr; + + /* spinlock to atomic read/update all the following fields */ + spinlock_t pk_lock; + + /* see PK_STATE* defines above, < 0 holds convert failure rc */ + int pk_state; + /* if state is valid, pk holds the protected key */ + struct paes_protkey pk; +}; + +struct s390_pxts_ctx { + /* source key material used to derive a protected key from */ + u8 keybuf[2 * PAES_MAX_KEYSIZE]; unsigned int keylen; + + /* cpacf function code to use with this protected key type */ + long fc; + + /* nr of requests enqueued via crypto engine which use this tfm ctx */ + atomic_t via_engine_ctr; + + /* spinlock to atomic read/update all the following fields */ + spinlock_t pk_lock; + + /* see PK_STATE* defines above, < 0 holds convert failure rc */ + int pk_state; + /* if state is valid, pk[] hold(s) the protected key(s) */ + struct paes_protkey pk[2]; }; -static inline int _key_to_kb(struct key_blob *kb, - const u8 *key, - unsigned int keylen) +/* + * make_clrkey_token() - wrap the raw key ck with pkey clearkey token + * information. + * @returns the size of the clearkey token + */ +static inline u32 make_clrkey_token(const u8 *ck, size_t cklen, u8 *dest) { - struct clearkey_header { + struct clrkey_token { u8 type; u8 res0[3]; u8 version; u8 res1[3]; u32 keytype; u32 len; - } __packed * h; + u8 key[]; + } __packed *token = (struct clrkey_token *)dest; + + token->type = 0x00; + token->version = 0x02; + token->keytype = (cklen - 8) >> 3; + token->len = cklen; + memcpy(token->key, ck, cklen); + + return sizeof(*token) + cklen; +} + +/* + * paes_ctx_setkey() - Set key value into context, maybe construct + * a clear key token digestible by pkey from a clear key value. + */ +static inline int paes_ctx_setkey(struct s390_paes_ctx *ctx, + const u8 *key, unsigned int keylen) +{ + if (keylen > sizeof(ctx->keybuf)) + return -EINVAL; switch (keylen) { case 16: case 24: case 32: /* clear key value, prepare pkey clear key token in keybuf */ - memset(kb->keybuf, 0, sizeof(kb->keybuf)); - h = (struct clearkey_header *) kb->keybuf; - h->version = 0x02; /* TOKVER_CLEAR_KEY */ - h->keytype = (keylen - 8) >> 3; - h->len = keylen; - memcpy(kb->keybuf + sizeof(*h), key, keylen); - kb->keylen = sizeof(*h) + keylen; - kb->key = kb->keybuf; + memset(ctx->keybuf, 0, sizeof(ctx->keybuf)); + ctx->keylen = make_clrkey_token(key, keylen, ctx->keybuf); break; default: /* other key material, let pkey handle this */ - if (keylen <= sizeof(kb->keybuf)) - kb->key = kb->keybuf; - else { - kb->key = kmalloc(keylen, GFP_KERNEL); - if (!kb->key) - return -ENOMEM; - } - memcpy(kb->key, key, keylen); - kb->keylen = keylen; + memcpy(ctx->keybuf, key, keylen); + ctx->keylen = keylen; break; } return 0; } -static inline void _free_kb_keybuf(struct key_blob *kb) +/* + * pxts_ctx_setkey() - Set key value into context, maybe construct + * a clear key token digestible by pkey from a clear key value. + */ +static inline int pxts_ctx_setkey(struct s390_pxts_ctx *ctx, + const u8 *key, unsigned int keylen) { - if (kb->key && kb->key != kb->keybuf - && kb->keylen > sizeof(kb->keybuf)) { - kfree(kb->key); - kb->key = NULL; - } -} + size_t cklen = keylen / 2; -struct s390_paes_ctx { - struct key_blob kb; - struct pkey_protkey pk; - spinlock_t pk_lock; - unsigned long fc; -}; + if (keylen > sizeof(ctx->keybuf)) + return -EINVAL; -struct s390_pxts_ctx { - struct key_blob kb[2]; - struct pkey_protkey pk[2]; - spinlock_t pk_lock; - unsigned long fc; -}; + switch (keylen) { + case 32: + case 64: + /* clear key value, prepare pkey clear key tokens in keybuf */ + memset(ctx->keybuf, 0, sizeof(ctx->keybuf)); + ctx->keylen = make_clrkey_token(key, cklen, ctx->keybuf); + ctx->keylen += make_clrkey_token(key + cklen, cklen, + ctx->keybuf + ctx->keylen); + break; + default: + /* other key material, let pkey handle this */ + memcpy(ctx->keybuf, key, keylen); + ctx->keylen = keylen; + break; + } + + return 0; +} -static inline int __paes_keyblob2pkey(struct key_blob *kb, - struct pkey_protkey *pk) +/* + * Convert the raw key material into a protected key via PKEY api. + * This function may sleep - don't call in non-sleeping context. + */ +static inline int convert_key(const u8 *key, unsigned int keylen, + struct paes_protkey *pk) { - int i, ret; + int rc, i; - /* try three times in case of failure */ - for (i = 0; i < 3; i++) { - if (i > 0 && ret == -EAGAIN && in_task()) - if (msleep_interruptible(1000)) - return -EINTR; - ret = pkey_keyblob2pkey(kb->key, kb->keylen, pk); - if (ret == 0) - break; + pk->len = sizeof(pk->protkey); + + /* + * In case of a busy card retry with increasing delay + * of 200, 400, 800 and 1600 ms - in total 3 s. + */ + for (rc = -EIO, i = 0; rc && i < 5; i++) { + if (rc == -EBUSY && msleep_interruptible((1 << i) * 100)) { + rc = -EINTR; + goto out; + } + rc = pkey_key2protkey(key, keylen, + pk->protkey, &pk->len, &pk->type, + PKEY_XFLAG_NOMEMALLOC); } - return ret; +out: + pr_debug("rc=%d\n", rc); + return rc; } -static inline int __paes_convert_key(struct s390_paes_ctx *ctx) +/* + * (Re-)Convert the raw key material from the ctx into a protected key + * via convert_key() function. Update the pk_state, pk_type, pk_len + * and the protected key in the tfm context. + * Please note this function may be invoked concurrently with the very + * same tfm context. The pk_lock spinlock in the context ensures an + * atomic update of the pk and the pk state but does not guarantee any + * order of update. So a fresh converted valid protected key may get + * updated with an 'old' expired key value. As the cpacf instructions + * detect this, refuse to operate with an invalid key and the calling + * code triggers a (re-)conversion this does no harm. This may lead to + * unnecessary additional conversion but never to invalid data on en- + * or decrypt operations. + */ +static int paes_convert_key(struct s390_paes_ctx *ctx) { - int ret; - struct pkey_protkey pkey; + struct paes_protkey pk; + int rc; + + spin_lock_bh(&ctx->pk_lock); + ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS; + spin_unlock_bh(&ctx->pk_lock); - ret = __paes_keyblob2pkey(&ctx->kb, &pkey); - if (ret) - return ret; + rc = convert_key(ctx->keybuf, ctx->keylen, &pk); + /* update context */ spin_lock_bh(&ctx->pk_lock); - memcpy(&ctx->pk, &pkey, sizeof(pkey)); + if (rc) { + ctx->pk_state = rc; + } else { + ctx->pk_state = PK_STATE_VALID; + ctx->pk = pk; + } spin_unlock_bh(&ctx->pk_lock); - return 0; + memzero_explicit(&pk, sizeof(pk)); + pr_debug("rc=%d\n", rc); + return rc; } -static int ecb_paes_init(struct crypto_skcipher *tfm) +/* + * (Re-)Convert the raw xts key material from the ctx into a + * protected key via convert_key() function. Update the pk_state, + * pk_type, pk_len and the protected key in the tfm context. + * See also comments on function paes_convert_key. + */ +static int pxts_convert_key(struct s390_pxts_ctx *ctx) { - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct paes_protkey pk0, pk1; + size_t split_keylen; + int rc; - ctx->kb.key = NULL; - spin_lock_init(&ctx->pk_lock); + spin_lock_bh(&ctx->pk_lock); + ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS; + spin_unlock_bh(&ctx->pk_lock); - return 0; -} + rc = convert_key(ctx->keybuf, ctx->keylen, &pk0); + if (rc) + goto out; + + switch (pk0.type) { + case PKEY_KEYTYPE_AES_128: + case PKEY_KEYTYPE_AES_256: + /* second keytoken required */ + if (ctx->keylen % 2) { + rc = -EINVAL; + goto out; + } + split_keylen = ctx->keylen / 2; + rc = convert_key(ctx->keybuf + split_keylen, + split_keylen, &pk1); + if (rc) + goto out; + if (pk0.type != pk1.type) { + rc = -EINVAL; + goto out; + } + break; + case PKEY_KEYTYPE_AES_XTS_128: + case PKEY_KEYTYPE_AES_XTS_256: + /* single key */ + pk1.type = 0; + break; + default: + /* unsupported protected keytype */ + rc = -EINVAL; + goto out; + } -static void ecb_paes_exit(struct crypto_skcipher *tfm) -{ - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); +out: + /* update context */ + spin_lock_bh(&ctx->pk_lock); + if (rc) { + ctx->pk_state = rc; + } else { + ctx->pk_state = PK_STATE_VALID; + ctx->pk[0] = pk0; + ctx->pk[1] = pk1; + } + spin_unlock_bh(&ctx->pk_lock); - _free_kb_keybuf(&ctx->kb); + memzero_explicit(&pk0, sizeof(pk0)); + memzero_explicit(&pk1, sizeof(pk1)); + pr_debug("rc=%d\n", rc); + return rc; } -static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx) +/* + * PAES ECB implementation + */ + +struct ecb_param { + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; + +struct s390_pecb_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct ecb_param param; +}; + +static int ecb_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + long fc; int rc; - unsigned long fc; - rc = __paes_convert_key(ctx); + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); if (rc) - return rc; + goto out; - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + /* convert key into protected key */ + rc = paes_convert_key(ctx); + if (rc) + goto out; - /* Check if the function code is available */ + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk.type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KM_PAES_128; + break; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KM_PAES_192; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KM_PAES_256; + break; + default: + fc = 0; + break; + } ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; - return ctx->fc ? 0 : -EINVAL; + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +static int ecb_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pecb_req_ctx *req_ctx, + bool maysleep) { - int rc; - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); + struct ecb_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; + unsigned int nbytes, n, k; + int pk_state, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } if (rc) - return rc; + goto out; + + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_km(ctx->fc | req_ctx->modifier, param, + walk->dst.virt.addr, walk->src.virt.addr, n); + if (k) + rc = skcipher_walk_done(walk, nbytes - k); + if (k < n) { + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx); + if (rc) + goto out; + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + } - return __ecb_paes_set_key(ctx); +out: + pr_debug("rc=%d\n", rc); + return rc; } static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) { + struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes, n, k; - int ret; - struct { - u8 key[MAXPROTKEYSIZE]; - } param; + struct skcipher_walk *walk = &req_ctx->walk; + int rc; - ret = skcipher_walk_virt(&walk, req, false); - if (ret) - return ret; + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; - while ((nbytes = walk.nbytes) != 0) { - /* only use complete blocks */ - n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, ¶m, - walk.dst.virt.addr, walk.src.virt.addr, n); - if (k) - ret = skcipher_walk_done(&walk, nbytes - k); - if (k < n) { - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); - } + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = ecb_paes_do_crypt(ctx, req_ctx, false); + if (rc == 0) + goto out; } - return ret; + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; } static int ecb_paes_encrypt(struct skcipher_request *req) @@ -254,113 +506,257 @@ static int ecb_paes_decrypt(struct skcipher_request *req) return ecb_paes_crypt(req, CPACF_DECRYPT); } -static struct skcipher_alg ecb_paes_alg = { - .base.cra_name = "ecb(paes)", - .base.cra_driver_name = "ecb-paes-s390", - .base.cra_priority = 401, /* combo: aes + ecb + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.cra_list), - .init = ecb_paes_init, - .exit = ecb_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .setkey = ecb_paes_set_key, - .encrypt = ecb_paes_encrypt, - .decrypt = ecb_paes_decrypt, -}; - -static int cbc_paes_init(struct crypto_skcipher *tfm) +static int ecb_paes_init(struct crypto_skcipher *tfm) { struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - ctx->kb.key = NULL; + memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->pk_lock); + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pecb_req_ctx)); + return 0; } -static void cbc_paes_exit(struct crypto_skcipher *tfm) +static void ecb_paes_exit(struct crypto_skcipher *tfm) { struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - _free_kb_keybuf(&ctx->kb); + memzero_explicit(ctx, sizeof(*ctx)); } -static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx) +static int ecb_paes_do_one_request(struct crypto_engine *engine, void *areq) { + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; int rc; - unsigned long fc; - rc = __paes_convert_key(ctx); - if (rc) - return rc; + /* walk has already been prepared */ + + rc = ecb_paes_do_crypt(ctx, req_ctx, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0; + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; +} - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; +static struct skcipher_engine_alg ecb_paes_alg = { + .base = { + .base.cra_name = "ecb(paes)", + .base.cra_driver_name = "ecb-paes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.base.cra_list), + .init = ecb_paes_init, + .exit = ecb_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .setkey = ecb_paes_setkey, + .encrypt = ecb_paes_encrypt, + .decrypt = ecb_paes_decrypt, + }, + .op = { + .do_one_request = ecb_paes_do_one_request, + }, +}; - return ctx->fc ? 0 : -EINVAL; -} +/* + * PAES CBC implementation + */ + +struct cbc_param { + u8 iv[AES_BLOCK_SIZE]; + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; + +struct s390_pcbc_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct cbc_param param; +}; -static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +static int cbc_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { - int rc; struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + long fc; + int rc; - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); if (rc) - return rc; + goto out; + + /* convert raw key into protected key */ + rc = paes_convert_key(ctx); + if (rc) + goto out; + + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk.type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KMC_PAES_128; + break; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KMC_PAES_192; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KMC_PAES_256; + break; + default: + fc = 0; + break; + } + ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; - return __cbc_paes_set_key(ctx); + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) +static int cbc_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pcbc_req_ctx *req_ctx, + bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; + struct cbc_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; unsigned int nbytes, n, k; - int ret; - struct { - u8 iv[AES_BLOCK_SIZE]; - u8 key[MAXPROTKEYSIZE]; - } param; - - ret = skcipher_walk_virt(&walk, req, false); - if (ret) - return ret; + int pk_state, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } + if (rc) + goto out; - memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); + memcpy(param->iv, walk->iv, AES_BLOCK_SIZE); - while ((nbytes = walk.nbytes) != 0) { + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_kmc(ctx->fc | modifier, ¶m, - walk.dst.virt.addr, walk.src.virt.addr, n); + k = cpacf_kmc(ctx->fc | req_ctx->modifier, param, + walk->dst.virt.addr, walk->src.virt.addr, n); if (k) { - memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - k); + memcpy(walk->iv, param->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, nbytes - k); } if (k < n) { - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx); + if (rc) + goto out; spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); spin_unlock_bh(&ctx->pk_lock); } } - return ret; + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + int rc; + + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; + + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = cbc_paes_do_crypt(ctx, req_ctx, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; } static int cbc_paes_encrypt(struct skcipher_request *req) @@ -373,378 +769,881 @@ static int cbc_paes_decrypt(struct skcipher_request *req) return cbc_paes_crypt(req, CPACF_DECRYPT); } -static struct skcipher_alg cbc_paes_alg = { - .base.cra_name = "cbc(paes)", - .base.cra_driver_name = "cbc-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.cra_list), - .init = cbc_paes_init, - .exit = cbc_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = cbc_paes_set_key, - .encrypt = cbc_paes_encrypt, - .decrypt = cbc_paes_decrypt, -}; - -static int xts_paes_init(struct crypto_skcipher *tfm) +static int cbc_paes_init(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - ctx->kb[0].key = NULL; - ctx->kb[1].key = NULL; + memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->pk_lock); + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pcbc_req_ctx)); + return 0; } -static void xts_paes_exit(struct crypto_skcipher *tfm) +static void cbc_paes_exit(struct crypto_skcipher *tfm) { - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - _free_kb_keybuf(&ctx->kb[0]); - _free_kb_keybuf(&ctx->kb[1]); + memzero_explicit(ctx, sizeof(*ctx)); } -static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx) +static int cbc_paes_do_one_request(struct crypto_engine *engine, void *areq) { - struct pkey_protkey pkey0, pkey1; - - if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) || - __paes_keyblob2pkey(&ctx->kb[1], &pkey1)) - return -EINVAL; + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + int rc; - spin_lock_bh(&ctx->pk_lock); - memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0)); - memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1)); - spin_unlock_bh(&ctx->pk_lock); + /* walk has already been prepared */ + + rc = cbc_paes_do_crypt(ctx, req_ctx, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } - return 0; + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; } -static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx) +static struct skcipher_engine_alg cbc_paes_alg = { + .base = { + .base.cra_name = "cbc(paes)", + .base.cra_driver_name = "cbc-paes-s390", + .base.cra_priority = 402, /* cbc-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.base.cra_list), + .init = cbc_paes_init, + .exit = cbc_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_paes_setkey, + .encrypt = cbc_paes_encrypt, + .decrypt = cbc_paes_decrypt, + }, + .op = { + .do_one_request = cbc_paes_do_one_request, + }, +}; + +/* + * PAES CTR implementation + */ + +struct ctr_param { + u8 key[PAES_256_PROTKEY_SIZE]; +} __packed; + +struct s390_pctr_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + struct ctr_param param; +}; + +static int ctr_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) { - unsigned long fc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + long fc; + int rc; - if (__xts_paes_convert_key(ctx)) - return -EINVAL; + /* set raw key into context */ + rc = paes_ctx_setkey(ctx, in_key, key_len); + if (rc) + goto out; - if (ctx->pk[0].type != ctx->pk[1].type) - return -EINVAL; + /* convert raw key into protected key */ + rc = paes_convert_key(ctx); + if (rc) + goto out; /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 : - (ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ? - CPACF_KM_PXTS_256 : 0; + switch (ctx->pk.type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KMCTR_PAES_128; + break; + case PKEY_KEYTYPE_AES_192: + fc = CPACF_KMCTR_PAES_192; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KMCTR_PAES_256; + break; + default: + fc = 0; + break; + } + ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + rc = fc ? 0 : -EINVAL; - return ctx->fc ? 0 : -EINVAL; +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int xts_key_len) +static inline unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) { + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + memcpy(ctrptr, iv, AES_BLOCK_SIZE); + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); + crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ctrptr += AES_BLOCK_SIZE; + } + return n; +} + +static int ctr_paes_do_crypt(struct s390_paes_ctx *ctx, + struct s390_pctr_req_ctx *req_ctx, + bool maysleep) +{ + struct ctr_param *param = &req_ctx->param; + struct skcipher_walk *walk = &req_ctx->walk; + u8 buf[AES_BLOCK_SIZE], *ctrptr; + unsigned int nbytes, n, k; + int pk_state, locked, rc = 0; + + if (!req_ctx->param_init_done) { + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + req_ctx->param_init_done = true; + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + } + if (rc) + goto out; + + locked = mutex_trylock(&ctrblk_lock); + + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + n = AES_BLOCK_SIZE; + if (nbytes >= 2 * AES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk->iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv; + k = cpacf_kmctr(ctx->fc, param, walk->dst.virt.addr, + walk->src.virt.addr, n, ctrptr); + if (k) { + if (ctrptr == ctrblk) + memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(walk->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, nbytes - k); + } + if (k < n) { + if (!maysleep) { + if (locked) + mutex_unlock(&ctrblk_lock); + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx); + if (rc) { + if (locked) + mutex_unlock(&ctrblk_lock); + goto out; + } + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + } + if (locked) + mutex_unlock(&ctrblk_lock); + + /* final block may be < AES_BLOCK_SIZE, copy only nbytes */ + if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk->src.virt.addr, nbytes); + while (1) { + if (cpacf_kmctr(ctx->fc, param, buf, + buf, AES_BLOCK_SIZE, + walk->iv) == AES_BLOCK_SIZE) + break; + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = paes_convert_key(ctx); + if (rc) + goto out; + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key, ctx->pk.protkey, sizeof(param->key)); + spin_unlock_bh(&ctx->pk_lock); + } + memcpy(walk->dst.virt.addr, buf, nbytes); + crypto_inc(walk->iv, AES_BLOCK_SIZE); + rc = skcipher_walk_done(walk, 0); + } + +out: + pr_debug("rc=%d\n", rc); + return rc; +} + +static int ctr_paes_crypt(struct skcipher_request *req) +{ + struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; int rc; + + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); + if (rc) + goto out; + + req_ctx->param_init_done = false; + + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = ctr_paes_do_crypt(ctx, req_ctx, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } + + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; +} + +static int ctr_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->pk_lock); + + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pctr_req_ctx)); + + return 0; +} + +static void ctr_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +static int ctr_paes_do_one_request(struct crypto_engine *engine, void *areq) +{ + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + int rc; + + /* walk has already been prepared */ + + rc = ctr_paes_do_crypt(ctx, req_ctx, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } + + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; +} + +static struct skcipher_engine_alg ctr_paes_alg = { + .base = { + .base.cra_name = "ctr(paes)", + .base.cra_driver_name = "ctr-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.base.cra_list), + .init = ctr_paes_init, + .exit = ctr_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_paes_setkey, + .encrypt = ctr_paes_crypt, + .decrypt = ctr_paes_crypt, + .chunksize = AES_BLOCK_SIZE, + }, + .op = { + .do_one_request = ctr_paes_do_one_request, + }, +}; + +/* + * PAES XTS implementation + */ + +struct xts_full_km_param { + u8 key[64]; + u8 tweak[16]; + u8 nap[16]; + u8 wkvp[32]; +} __packed; + +struct xts_km_param { + u8 key[PAES_256_PROTKEY_SIZE]; + u8 init[16]; +} __packed; + +struct xts_pcc_param { + u8 key[PAES_256_PROTKEY_SIZE]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; +} __packed; + +struct s390_pxts_req_ctx { + unsigned long modifier; + struct skcipher_walk walk; + bool param_init_done; + union { + struct xts_full_km_param full_km_param; + struct xts_km_param km_param; + } param; +}; + +static int xts_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int in_keylen) +{ struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); u8 ckey[2 * AES_MAX_KEY_SIZE]; - unsigned int ckey_len, key_len; + unsigned int ckey_len; + long fc; + int rc; - if (xts_key_len % 2) + if ((in_keylen == 32 || in_keylen == 64) && + xts_verify_key(tfm, in_key, in_keylen)) return -EINVAL; - key_len = xts_key_len / 2; - - _free_kb_keybuf(&ctx->kb[0]); - _free_kb_keybuf(&ctx->kb[1]); - rc = _key_to_kb(&ctx->kb[0], in_key, key_len); + /* set raw key into context */ + rc = pxts_ctx_setkey(ctx, in_key, in_keylen); if (rc) - return rc; - rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len); - if (rc) - return rc; + goto out; - rc = __xts_paes_set_key(ctx); + /* convert raw key(s) into protected key(s) */ + rc = pxts_convert_key(ctx); if (rc) - return rc; + goto out; /* - * xts_check_key verifies the key length is not odd and makes + * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. This can be done - * on the two protected keys as well + * on the two protected keys as well - but not for full xts keys. */ - ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? - AES_KEYSIZE_128 : AES_KEYSIZE_256; - memcpy(ckey, ctx->pk[0].protkey, ckey_len); - memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); - return xts_verify_key(tfm, ckey, 2*ckey_len); + if (ctx->pk[0].type == PKEY_KEYTYPE_AES_128 || + ctx->pk[0].type == PKEY_KEYTYPE_AES_256) { + ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? + AES_KEYSIZE_128 : AES_KEYSIZE_256; + memcpy(ckey, ctx->pk[0].protkey, ckey_len); + memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); + rc = xts_verify_key(tfm, ckey, 2 * ckey_len); + memzero_explicit(ckey, sizeof(ckey)); + if (rc) + goto out; + } + + /* Pick the correct function code based on the protected key type */ + switch (ctx->pk[0].type) { + case PKEY_KEYTYPE_AES_128: + fc = CPACF_KM_PXTS_128; + break; + case PKEY_KEYTYPE_AES_256: + fc = CPACF_KM_PXTS_256; + break; + case PKEY_KEYTYPE_AES_XTS_128: + fc = CPACF_KM_PXTS_128_FULL; + break; + case PKEY_KEYTYPE_AES_XTS_256: + fc = CPACF_KM_PXTS_256_FULL; + break; + default: + fc = 0; + break; + } + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + rc = fc ? 0 : -EINVAL; + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) +static int xts_paes_do_crypt_fullkey(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool maysleep) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; + struct xts_full_km_param *param = &req_ctx->param.full_km_param; + struct skcipher_walk *walk = &req_ctx->walk; unsigned int keylen, offset, nbytes, n, k; - int ret; - struct { - u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */ - u8 tweak[16]; - u8 block[16]; - u8 bit[16]; - u8 xts[16]; - } pcc_param; - struct { - u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */ - u8 init[16]; - } xts_param; - - ret = skcipher_walk_virt(&walk, req, false); - if (ret) - return ret; + int rc = 0; - keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64; - offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; + /* + * The calling function xts_paes_do_crypt() ensures the + * protected key state is always PK_STATE_VALID when this + * function is invoked. + */ - memset(&pcc_param, 0, sizeof(pcc_param)); - memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); - spin_lock_bh(&ctx->pk_lock); - memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); - memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); - spin_unlock_bh(&ctx->pk_lock); - cpacf_pcc(ctx->fc, pcc_param.key + offset); - memcpy(xts_param.init, pcc_param.xts, 16); + keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 64; + offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 0; + + if (!req_ctx->param_init_done) { + memset(param, 0, sizeof(*param)); + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp)); + spin_unlock_bh(&ctx->pk_lock); + memcpy(param->tweak, walk->iv, sizeof(param->tweak)); + param->nap[0] = 0x01; /* initial alpha power (1, little-endian) */ + req_ctx->param_init_done = true; + } - while ((nbytes = walk.nbytes) != 0) { + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, xts_param.key + offset, - walk.dst.virt.addr, walk.src.virt.addr, n); + k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset, + walk->dst.virt.addr, walk->src.virt.addr, n); if (k) - ret = skcipher_walk_done(&walk, nbytes - k); + rc = skcipher_walk_done(walk, nbytes - k); if (k < n) { - if (__xts_paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = pxts_convert_key(ctx); + if (rc) + goto out; spin_lock_bh(&ctx->pk_lock); - memcpy(xts_param.key + offset, - ctx->pk[0].protkey, keylen); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp)); spin_unlock_bh(&ctx->pk_lock); } } - return ret; +out: + pr_debug("rc=%d\n", rc); + return rc; } -static int xts_paes_encrypt(struct skcipher_request *req) +static inline int __xts_2keys_prep_param(struct s390_pxts_ctx *ctx, + struct xts_km_param *param, + struct skcipher_walk *walk, + unsigned int keylen, + unsigned int offset, bool maysleep) { - return xts_paes_crypt(req, 0); + struct xts_pcc_param pcc_param; + unsigned long cc = 1; + int rc = 0; + + while (cc) { + memset(&pcc_param, 0, sizeof(pcc_param)); + memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + spin_lock_bh(&ctx->pk_lock); + memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + cc = cpacf_pcc(ctx->fc, pcc_param.key + offset); + if (cc) { + if (!maysleep) { + rc = -EKEYEXPIRED; + break; + } + rc = pxts_convert_key(ctx); + if (rc) + break; + continue; + } + memcpy(param->init, pcc_param.xts, 16); + } + + memzero_explicit(pcc_param.key, sizeof(pcc_param.key)); + return rc; } -static int xts_paes_decrypt(struct skcipher_request *req) +static int xts_paes_do_crypt_2keys(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool maysleep) { - return xts_paes_crypt(req, CPACF_DECRYPT); -} + struct xts_km_param *param = &req_ctx->param.km_param; + struct skcipher_walk *walk = &req_ctx->walk; + unsigned int keylen, offset, nbytes, n, k; + int rc = 0; -static struct skcipher_alg xts_paes_alg = { - .base.cra_name = "xts(paes)", - .base.cra_driver_name = "xts-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.cra_list), - .init = xts_paes_init, - .exit = xts_paes_exit, - .min_keysize = 2 * PAES_MIN_KEYSIZE, - .max_keysize = 2 * PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_paes_set_key, - .encrypt = xts_paes_encrypt, - .decrypt = xts_paes_decrypt, -}; + /* + * The calling function xts_paes_do_crypt() ensures the + * protected key state is always PK_STATE_VALID when this + * function is invoked. + */ -static int ctr_paes_init(struct crypto_skcipher *tfm) -{ - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64; + offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; - ctx->kb.key = NULL; - spin_lock_init(&ctx->pk_lock); + if (!req_ctx->param_init_done) { + rc = __xts_2keys_prep_param(ctx, param, walk, + keylen, offset, maysleep); + if (rc) + goto out; + req_ctx->param_init_done = true; + } - return 0; + /* + * Note that in case of partial processing or failure the walk + * is NOT unmapped here. So a follow up task may reuse the walk + * or in case of unrecoverable failure needs to unmap it. + */ + while ((nbytes = walk->nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset, + walk->dst.virt.addr, walk->src.virt.addr, n); + if (k) + rc = skcipher_walk_done(walk, nbytes - k); + if (k < n) { + if (!maysleep) { + rc = -EKEYEXPIRED; + goto out; + } + rc = pxts_convert_key(ctx); + if (rc) + goto out; + spin_lock_bh(&ctx->pk_lock); + memcpy(param->key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + } + } + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static void ctr_paes_exit(struct crypto_skcipher *tfm) +static int xts_paes_do_crypt(struct s390_pxts_ctx *ctx, + struct s390_pxts_req_ctx *req_ctx, + bool maysleep) { - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + int pk_state, rc = 0; - _free_kb_keybuf(&ctx->kb); + /* fetch and check protected key state */ + spin_lock_bh(&ctx->pk_lock); + pk_state = ctx->pk_state; + switch (pk_state) { + case PK_STATE_NO_KEY: + rc = -ENOKEY; + break; + case PK_STATE_CONVERT_IN_PROGRESS: + rc = -EKEYEXPIRED; + break; + case PK_STATE_VALID: + break; + default: + rc = pk_state < 0 ? pk_state : -EIO; + break; + } + spin_unlock_bh(&ctx->pk_lock); + if (rc) + goto out; + + /* Call the 'real' crypt function based on the xts prot key type. */ + switch (ctx->fc) { + case CPACF_KM_PXTS_128: + case CPACF_KM_PXTS_256: + rc = xts_paes_do_crypt_2keys(ctx, req_ctx, maysleep); + break; + case CPACF_KM_PXTS_128_FULL: + case CPACF_KM_PXTS_256_FULL: + rc = xts_paes_do_crypt_fullkey(ctx, req_ctx, maysleep); + break; + default: + rc = -EINVAL; + } + +out: + pr_debug("rc=%d\n", rc); + return rc; } -static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx) +static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) { + struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; int rc; - unsigned long fc; - rc = __paes_convert_key(ctx); + /* + * Attempt synchronous encryption first. If it fails, schedule the request + * asynchronously via the crypto engine. To preserve execution order, + * once a request is queued to the engine, further requests using the same + * tfm will also be routed through the engine. + */ + + rc = skcipher_walk_virt(walk, req, false); if (rc) - return rc; + goto out; - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? - CPACF_KMCTR_PAES_256 : 0; + req_ctx->modifier = modifier; + req_ctx->param_init_done = false; - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; + /* Try synchronous operation if no active engine usage */ + if (!atomic_read(&ctx->via_engine_ctr)) { + rc = xts_paes_do_crypt(ctx, req_ctx, false); + if (rc == 0) + goto out; + } + + /* + * If sync operation failed or key expired or there are already + * requests enqueued via engine, fallback to async. Mark tfm as + * using engine to serialize requests. + */ + if (rc == 0 || rc == -EKEYEXPIRED) { + atomic_inc(&ctx->via_engine_ctr); + rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req); + if (rc != -EINPROGRESS) + atomic_dec(&ctx->via_engine_ctr); + } - return ctx->fc ? 0 : -EINVAL; + if (rc != -EINPROGRESS) + skcipher_walk_done(walk, rc); + +out: + if (rc != -EINPROGRESS) + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("rc=%d\n", rc); + return rc; } -static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) +static int xts_paes_encrypt(struct skcipher_request *req) { - int rc; - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + return xts_paes_crypt(req, 0); +} - _free_kb_keybuf(&ctx->kb); - rc = _key_to_kb(&ctx->kb, in_key, key_len); - if (rc) - return rc; +static int xts_paes_decrypt(struct skcipher_request *req) +{ + return xts_paes_crypt(req, CPACF_DECRYPT); +} + +static int xts_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->pk_lock); - return __ctr_paes_set_key(ctx); + crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pxts_req_ctx)); + + return 0; } -static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +static void xts_paes_exit(struct crypto_skcipher *tfm) { - unsigned int i, n; + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); - /* only use complete blocks, max. PAGE_SIZE */ - memcpy(ctrptr, iv, AES_BLOCK_SIZE); - n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); - for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { - memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); - crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); - ctrptr += AES_BLOCK_SIZE; - } - return n; + memzero_explicit(ctx, sizeof(*ctx)); } -static int ctr_paes_crypt(struct skcipher_request *req) +static int xts_paes_do_one_request(struct crypto_engine *engine, void *areq) { + struct skcipher_request *req = skcipher_request_cast(areq); + struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); - u8 buf[AES_BLOCK_SIZE], *ctrptr; - struct skcipher_walk walk; - unsigned int nbytes, n, k; - int ret, locked; - struct { - u8 key[MAXPROTKEYSIZE]; - } param; + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk *walk = &req_ctx->walk; + int rc; - ret = skcipher_walk_virt(&walk, req, false); - if (ret) - return ret; + /* walk has already been prepared */ + + rc = xts_paes_do_crypt(ctx, req_ctx, true); + if (rc == -EKEYEXPIRED) { + /* + * Protected key expired, conversion is in process. + * Trigger a re-schedule of this request by returning + * -ENOSPC ("hardware queue is full") to the crypto engine. + * To avoid immediately re-invocation of this callback, + * tell the scheduler to voluntarily give up the CPU here. + */ + cond_resched(); + pr_debug("rescheduling request\n"); + return -ENOSPC; + } else if (rc) { + skcipher_walk_done(walk, rc); + } - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); + memzero_explicit(&req_ctx->param, sizeof(req_ctx->param)); + pr_debug("request complete with rc=%d\n", rc); + local_bh_disable(); + atomic_dec(&ctx->via_engine_ctr); + crypto_finalize_skcipher_request(engine, req, rc); + local_bh_enable(); + return rc; +} - locked = mutex_trylock(&ctrblk_lock); +static struct skcipher_engine_alg xts_paes_alg = { + .base = { + .base.cra_name = "xts(paes)", + .base.cra_driver_name = "xts-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.base.cra_list), + .init = xts_paes_init, + .exit = xts_paes_exit, + .min_keysize = 2 * PAES_MIN_KEYSIZE, + .max_keysize = 2 * PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_paes_setkey, + .encrypt = xts_paes_encrypt, + .decrypt = xts_paes_decrypt, + }, + .op = { + .do_one_request = xts_paes_do_one_request, + }, +}; - while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - n = AES_BLOCK_SIZE; - if (nbytes >= 2*AES_BLOCK_SIZE && locked) - n = __ctrblk_init(ctrblk, walk.iv, nbytes); - ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; - k = cpacf_kmctr(ctx->fc, ¶m, walk.dst.virt.addr, - walk.src.virt.addr, n, ctrptr); - if (k) { - if (ctrptr == ctrblk) - memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, - AES_BLOCK_SIZE); - crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - k); - } - if (k < n) { - if (__paes_convert_key(ctx)) { - if (locked) - mutex_unlock(&ctrblk_lock); - return skcipher_walk_done(&walk, -EIO); - } - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); - } - } - if (locked) - mutex_unlock(&ctrblk_lock); - /* - * final block may be < AES_BLOCK_SIZE, copy only nbytes - */ - if (nbytes) { - while (1) { - if (cpacf_kmctr(ctx->fc, ¶m, buf, - walk.src.virt.addr, AES_BLOCK_SIZE, - walk.iv) == AES_BLOCK_SIZE) - break; - if (__paes_convert_key(ctx)) - return skcipher_walk_done(&walk, -EIO); - spin_lock_bh(&ctx->pk_lock); - memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); - spin_unlock_bh(&ctx->pk_lock); - } - memcpy(walk.dst.virt.addr, buf, nbytes); - crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes); - } - - return ret; -} - -static struct skcipher_alg ctr_paes_alg = { - .base.cra_name = "ctr(paes)", - .base.cra_driver_name = "ctr-paes-s390", - .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct s390_paes_ctx), - .base.cra_module = THIS_MODULE, - .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.cra_list), - .init = ctr_paes_init, - .exit = ctr_paes_exit, - .min_keysize = PAES_MIN_KEYSIZE, - .max_keysize = PAES_MAX_KEYSIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ctr_paes_set_key, - .encrypt = ctr_paes_crypt, - .decrypt = ctr_paes_crypt, - .chunksize = AES_BLOCK_SIZE, +/* + * alg register, unregister, module init, exit + */ + +static struct miscdevice paes_dev = { + .name = "paes", + .minor = MISC_DYNAMIC_MINOR, }; -static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) +static inline void __crypto_unregister_skcipher(struct skcipher_engine_alg *alg) { - if (!list_empty(&alg->base.cra_list)) - crypto_unregister_skcipher(alg); + if (!list_empty(&alg->base.base.cra_list)) + crypto_engine_unregister_skcipher(alg); } static void paes_s390_fini(void) { + if (paes_crypto_engine) { + crypto_engine_stop(paes_crypto_engine); + crypto_engine_exit(paes_crypto_engine); + } __crypto_unregister_skcipher(&ctr_paes_alg); __crypto_unregister_skcipher(&xts_paes_alg); __crypto_unregister_skcipher(&cbc_paes_alg); __crypto_unregister_skcipher(&ecb_paes_alg); if (ctrblk) - free_page((unsigned long) ctrblk); + free_page((unsigned long)ctrblk); + misc_deregister(&paes_dev); } static int __init paes_s390_init(void) { - int ret; + int rc; + + /* register a simple paes pseudo misc device */ + rc = misc_register(&paes_dev); + if (rc) + return rc; + + /* with this pseudo devie alloc and start a crypto engine */ + paes_crypto_engine = + crypto_engine_alloc_init_and_set(paes_dev.this_device, + true, NULL, false, MAX_QLEN); + if (!paes_crypto_engine) { + rc = -ENOMEM; + goto out_err; + } + rc = crypto_engine_start(paes_crypto_engine); + if (rc) { + crypto_engine_exit(paes_crypto_engine); + paes_crypto_engine = NULL; + goto out_err; + } /* Query available functions for KM, KMC and KMCTR */ cpacf_query(CPACF_KM, &km_functions); @@ -754,49 +1653,57 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) || cpacf_test_func(&km_functions, CPACF_KM_PAES_192) || cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) { - ret = crypto_register_skcipher(&ecb_paes_alg); - if (ret) + rc = crypto_engine_register_skcipher(&ecb_paes_alg); + if (rc) goto out_err; + pr_debug("%s registered\n", ecb_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) || cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) { - ret = crypto_register_skcipher(&cbc_paes_alg); - if (ret) + rc = crypto_engine_register_skcipher(&cbc_paes_alg); + if (rc) goto out_err; + pr_debug("%s registered\n", cbc_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) || cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) { - ret = crypto_register_skcipher(&xts_paes_alg); - if (ret) + rc = crypto_engine_register_skcipher(&xts_paes_alg); + if (rc) goto out_err; + pr_debug("%s registered\n", xts_paes_alg.base.base.cra_driver_name); } if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { - ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + ctrblk = (u8 *)__get_free_page(GFP_KERNEL); if (!ctrblk) { - ret = -ENOMEM; + rc = -ENOMEM; goto out_err; } - ret = crypto_register_skcipher(&ctr_paes_alg); - if (ret) + rc = crypto_engine_register_skcipher(&ctr_paes_alg); + if (rc) goto out_err; + pr_debug("%s registered\n", ctr_paes_alg.base.base.cra_driver_name); } return 0; + out_err: paes_s390_fini(); - return ret; + return rc; } module_init(paes_s390_init); module_exit(paes_s390_fini); -MODULE_ALIAS_CRYPTO("paes"); +MODULE_ALIAS_CRYPTO("ecb(paes)"); +MODULE_ALIAS_CRYPTO("cbc(paes)"); +MODULE_ALIAS_CRYPTO("ctr(paes)"); +MODULE_ALIAS_CRYPTO("xts(paes)"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index ae382bafc772..2becd77df741 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -679,7 +679,7 @@ static ssize_t prng_chunksize_show(struct device *dev, struct device_attribute *attr, char *buf) { - return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); + return sysfs_emit(buf, "%u\n", prng_chunk_size); } static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL); @@ -698,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev, counter = prng_data->prngws.byte_counter; mutex_unlock(&prng_data->mutex); - return scnprintf(buf, PAGE_SIZE, "%llu\n", counter); + return sysfs_emit(buf, "%llu\n", counter); } static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL); @@ -707,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev, struct device_attribute *attr, char *buf) { - return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); + return sysfs_emit(buf, "%d\n", prng_errorflag); } static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL); @@ -717,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev, char *buf) { if (prng_mode == PRNG_MODE_TDES) - return scnprintf(buf, PAGE_SIZE, "TDES\n"); + return sysfs_emit(buf, "TDES\n"); else - return scnprintf(buf, PAGE_SIZE, "SHA512\n"); + return sysfs_emit(buf, "SHA512\n"); } static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL); @@ -742,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { - return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); + return sysfs_emit(buf, "%u\n", prng_reseed_limit); } static ssize_t prng_reseed_limit_store(struct device *dev, struct device_attribute *attr, @@ -773,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { - return scnprintf(buf, PAGE_SIZE, "256\n"); + return sysfs_emit(buf, "256\n"); } static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL); @@ -907,5 +907,5 @@ static void __exit prng_exit(void) } } -module_cpu_feature_match(MSA, prng_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init); module_exit(prng_exit); diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index 65ea12fc87a1..d757ccbce2b4 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -10,26 +10,33 @@ #ifndef _CRYPTO_ARCH_S390_SHA_H #define _CRYPTO_ARCH_S390_SHA_H -#include <linux/crypto.h> -#include <crypto/sha1.h> #include <crypto/sha2.h> #include <crypto/sha3.h> +#include <linux/types.h> /* must be big enough for the largest SHA variant */ -#define SHA3_STATE_SIZE 200 #define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE #define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE +#define S390_SHA_CTX_SIZE sizeof(struct s390_sha_ctx) struct s390_sha_ctx { u64 count; /* message length in bytes */ - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - u8 buf[SHA_MAX_BLOCK_SIZE]; + union { + u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; + struct { + u64 state[SHA512_DIGEST_SIZE / sizeof(u64)]; + u64 count_hi; + } sha512; + }; int func; /* KIMD function to use */ + bool first_message_part; }; struct shash_desc; -int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len); -int s390_sha_final(struct shash_desc *desc, u8 *out); +int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, + unsigned int len); +int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out); #endif diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index a3fabf310a38..d229cbd2ba22 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -18,12 +18,12 @@ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> */ +#include <asm/cpacf.h> #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> #include <crypto/sha1.h> -#include <asm/cpacf.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> #include "sha.h" @@ -49,7 +49,6 @@ static int s390_sha1_export(struct shash_desc *desc, void *out) octx->count = sctx->count; memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buffer, sctx->buf, sizeof(octx->buffer)); return 0; } @@ -60,7 +59,6 @@ static int s390_sha1_import(struct shash_desc *desc, const void *in) sctx->count = ictx->count; memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buffer, sizeof(ictx->buffer)); sctx->func = CPACF_KIMD_SHA_1; return 0; } @@ -68,16 +66,18 @@ static int s390_sha1_import(struct shash_desc *desc, const void *in) static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = s390_sha1_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = s390_sha1_export, .import = s390_sha1_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha1_state), + .descsize = S390_SHA_CTX_SIZE, + .statesize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name= "sha1-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void) crypto_unregister_shash(&alg); } -module_cpu_feature_match(MSA, sha1_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); module_exit(sha1_s390_fini); MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c deleted file mode 100644 index 24983f175676..000000000000 --- a/arch/s390/crypto/sha256_s390.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 2005, 2011 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <crypto/sha2.h> -#include <asm/cpacf.h> - -#include "sha.h" - -static int s390_sha256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_256; - - return 0; -} - -static int sha256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - struct sha256_state *octx = out; - - octx->count = sctx->count; - memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - return 0; -} - -static int sha256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha256_state *ictx = in; - - sctx->count = ictx->count; - memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->func = CPACF_KIMD_SHA_256; - return 0; -} - -static struct shash_alg sha256_alg = { - .digestsize = SHA256_DIGEST_SIZE, - .init = s390_sha256_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha256_export, - .import = sha256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name= "sha256-s390", - .cra_priority = 300, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int s390_sha224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA_256; - - return 0; -} - -static struct shash_alg sha224_alg = { - .digestsize = SHA224_DIGEST_SIZE, - .init = s390_sha224_init, - .update = s390_sha_update, - .final = s390_sha_final, - .export = sha256_export, - .import = sha256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name= "sha224-s390", - .cra_priority = 300, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - return -ENODEV; - ret = crypto_register_shash(&sha256_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha224_alg); - if (ret < 0) - crypto_unregister_shash(&sha256_alg); -out: - return ret; -} - -static void __exit sha256_s390_fini(void) -{ - crypto_unregister_shash(&sha224_alg); - crypto_unregister_shash(&sha256_alg); -} - -module_cpu_feature_match(MSA, sha256_s390_init); -module_exit(sha256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 and SHA224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 30ac49b635bf..4a7731ac6bcd 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -8,12 +8,14 @@ * Copyright IBM Corp. 2019 * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) */ +#include <asm/cpacf.h> #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> #include <crypto/sha3.h> -#include <asm/cpacf.h> +#include <linux/cpufeature.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include "sha.h" @@ -21,7 +23,9 @@ static int sha3_256_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - memset(sctx->state, 0, sizeof(sctx->state)); + sctx->first_message_part = test_facility(86); + if (!sctx->first_message_part) + memset(sctx->state, 0, sizeof(sctx->state)); sctx->count = 0; sctx->func = CPACF_KIMD_SHA3_256; @@ -33,10 +37,11 @@ static int sha3_256_export(struct shash_desc *desc, void *out) struct s390_sha_ctx *sctx = shash_desc_ctx(desc); struct sha3_state *octx = out; - octx->rsiz = sctx->count; + if (sctx->first_message_part) { + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->first_message_part = 0; + } memcpy(octx->st, sctx->state, sizeof(octx->st)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - return 0; } @@ -45,9 +50,9 @@ static int sha3_256_import(struct shash_desc *desc, const void *in) struct s390_sha_ctx *sctx = shash_desc_ctx(desc); const struct sha3_state *ictx = in; - sctx->count = ictx->rsiz; + sctx->count = 0; memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->first_message_part = 0; sctx->func = CPACF_KIMD_SHA3_256; return 0; @@ -56,29 +61,26 @@ static int sha3_256_import(struct shash_desc *desc, const void *in) static int sha3_224_import(struct shash_desc *desc, const void *in) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - sctx->count = ictx->rsiz; - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sha3_256_import(desc, in); sctx->func = CPACF_KIMD_SHA3_224; - return 0; } static struct shash_alg sha3_256_alg = { .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ .init = sha3_256_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha3_256_export, .import = sha3_256_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), + .descsize = S390_SHA_CTX_SIZE, + .statesize = SHA3_STATE_SIZE, .base = { .cra_name = "sha3-256", .cra_driver_name = "sha3-256-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA3_256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -88,26 +90,25 @@ static int sha3_224_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; + sha3_256_init(desc); sctx->func = CPACF_KIMD_SHA3_224; - return 0; } static struct shash_alg sha3_224_alg = { .digestsize = SHA3_224_DIGEST_SIZE, .init = sha3_224_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha3_256_export, /* same as for 256 */ .import = sha3_224_import, /* function code different! */ - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), + .descsize = S390_SHA_CTX_SIZE, + .statesize = SHA3_STATE_SIZE, .base = { .cra_name = "sha3-224", .cra_driver_name = "sha3-224-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA3_224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -137,7 +138,7 @@ static void __exit sha3_256_s390_fini(void) crypto_unregister_shash(&sha3_256_alg); } -module_cpu_feature_match(MSA, sha3_256_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); module_exit(sha3_256_s390_fini); MODULE_ALIAS_CRYPTO("sha3-256"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index e70d50f7620f..018f02fff444 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -7,12 +7,14 @@ * Copyright IBM Corp. 2019 * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) */ +#include <asm/cpacf.h> #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/cpufeature.h> #include <crypto/sha3.h> -#include <asm/cpacf.h> +#include <linux/cpufeature.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include "sha.h" @@ -20,7 +22,9 @@ static int sha3_512_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - memset(sctx->state, 0, sizeof(sctx->state)); + sctx->first_message_part = test_facility(86); + if (!sctx->first_message_part) + memset(sctx->state, 0, sizeof(sctx->state)); sctx->count = 0; sctx->func = CPACF_KIMD_SHA3_512; @@ -32,12 +36,12 @@ static int sha3_512_export(struct shash_desc *desc, void *out) struct s390_sha_ctx *sctx = shash_desc_ctx(desc); struct sha3_state *octx = out; - octx->rsiz = sctx->count; - octx->rsizw = sctx->count >> 32; + if (sctx->first_message_part) { + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->first_message_part = 0; + } memcpy(octx->st, sctx->state, sizeof(octx->st)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); - return 0; } @@ -46,12 +50,9 @@ static int sha3_512_import(struct shash_desc *desc, const void *in) struct s390_sha_ctx *sctx = shash_desc_ctx(desc); const struct sha3_state *ictx = in; - if (unlikely(ictx->rsizw)) - return -ERANGE; - sctx->count = ictx->rsiz; - + sctx->count = 0; memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->first_message_part = 0; sctx->func = CPACF_KIMD_SHA3_512; return 0; @@ -60,32 +61,26 @@ static int sha3_512_import(struct shash_desc *desc, const void *in) static int sha3_384_import(struct shash_desc *desc, const void *in) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - const struct sha3_state *ictx = in; - - if (unlikely(ictx->rsizw)) - return -ERANGE; - sctx->count = ictx->rsiz; - memcpy(sctx->state, ictx->st, sizeof(ictx->st)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sha3_512_import(desc, in); sctx->func = CPACF_KIMD_SHA3_384; - return 0; } static struct shash_alg sha3_512_alg = { .digestsize = SHA3_512_DIGEST_SIZE, .init = sha3_512_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha3_512_export, .import = sha3_512_import, - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), + .descsize = S390_SHA_CTX_SIZE, + .statesize = SHA3_STATE_SIZE, .base = { .cra_name = "sha3-512", .cra_driver_name = "sha3-512-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA3_512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -97,26 +92,25 @@ static int sha3_384_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; + sha3_512_init(desc); sctx->func = CPACF_KIMD_SHA3_384; - return 0; } static struct shash_alg sha3_384_alg = { .digestsize = SHA3_384_DIGEST_SIZE, .init = sha3_384_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha3_512_export, /* same as for 512 */ .import = sha3_384_import, /* function code different! */ - .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha3_state), + .descsize = S390_SHA_CTX_SIZE, + .statesize = SHA3_STATE_SIZE, .base = { .cra_name = "sha3-384", .cra_driver_name = "sha3-384-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA3_384_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_sha_ctx), .cra_module = THIS_MODULE, @@ -147,7 +141,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha3_384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index 43ce4956df73..33711a29618c 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -7,14 +7,13 @@ * Copyright IBM Corp. 2007 * Author(s): Jan Glauber (jang@de.ibm.com) */ +#include <asm/cpacf.h> #include <crypto/internal/hash.h> #include <crypto/sha2.h> +#include <linux/cpufeature.h> #include <linux/errno.h> -#include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/cpufeature.h> -#include <asm/cpacf.h> #include "sha.h" @@ -22,15 +21,16 @@ static int sha512_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = SHA512_H0; - *(__u64 *)&ctx->state[2] = SHA512_H1; - *(__u64 *)&ctx->state[4] = SHA512_H2; - *(__u64 *)&ctx->state[6] = SHA512_H3; - *(__u64 *)&ctx->state[8] = SHA512_H4; - *(__u64 *)&ctx->state[10] = SHA512_H5; - *(__u64 *)&ctx->state[12] = SHA512_H6; - *(__u64 *)&ctx->state[14] = SHA512_H7; + ctx->sha512.state[0] = SHA512_H0; + ctx->sha512.state[1] = SHA512_H1; + ctx->sha512.state[2] = SHA512_H2; + ctx->sha512.state[3] = SHA512_H3; + ctx->sha512.state[4] = SHA512_H4; + ctx->sha512.state[5] = SHA512_H5; + ctx->sha512.state[6] = SHA512_H6; + ctx->sha512.state[7] = SHA512_H7; ctx->count = 0; + ctx->sha512.count_hi = 0; ctx->func = CPACF_KIMD_SHA_512; return 0; @@ -42,9 +42,8 @@ static int sha512_export(struct shash_desc *desc, void *out) struct sha512_state *octx = out; octx->count[0] = sctx->count; - octx->count[1] = 0; + octx->count[1] = sctx->sha512.count_hi; memcpy(octx->state, sctx->state, sizeof(octx->state)); - memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); return 0; } @@ -53,12 +52,10 @@ static int sha512_import(struct shash_desc *desc, const void *in) struct s390_sha_ctx *sctx = shash_desc_ctx(desc); const struct sha512_state *ictx = in; - if (unlikely(ictx->count[1])) - return -ERANGE; sctx->count = ictx->count[0]; + sctx->sha512.count_hi = ictx->count[1]; memcpy(sctx->state, ictx->state, sizeof(ictx->state)); - memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); sctx->func = CPACF_KIMD_SHA_512; return 0; } @@ -66,16 +63,18 @@ static int sha512_import(struct shash_desc *desc, const void *in) static struct shash_alg sha512_alg = { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha512_export, .import = sha512_import, .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha512_state), + .statesize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name= "sha512-s390", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -87,15 +86,16 @@ static int sha384_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = SHA384_H0; - *(__u64 *)&ctx->state[2] = SHA384_H1; - *(__u64 *)&ctx->state[4] = SHA384_H2; - *(__u64 *)&ctx->state[6] = SHA384_H3; - *(__u64 *)&ctx->state[8] = SHA384_H4; - *(__u64 *)&ctx->state[10] = SHA384_H5; - *(__u64 *)&ctx->state[12] = SHA384_H6; - *(__u64 *)&ctx->state[14] = SHA384_H7; + ctx->sha512.state[0] = SHA384_H0; + ctx->sha512.state[1] = SHA384_H1; + ctx->sha512.state[2] = SHA384_H2; + ctx->sha512.state[3] = SHA384_H3; + ctx->sha512.state[4] = SHA384_H4; + ctx->sha512.state[5] = SHA384_H5; + ctx->sha512.state[6] = SHA384_H6; + ctx->sha512.state[7] = SHA384_H7; ctx->count = 0; + ctx->sha512.count_hi = 0; ctx->func = CPACF_KIMD_SHA_512; return 0; @@ -104,17 +104,19 @@ static int sha384_init(struct shash_desc *desc) static struct shash_alg sha384_alg = { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_init, - .update = s390_sha_update, - .final = s390_sha_final, + .update = s390_sha_update_blocks, + .finup = s390_sha_finup, .export = sha512_export, .import = sha512_import, .descsize = sizeof(struct s390_sha_ctx), - .statesize = sizeof(struct sha512_state), + .statesize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name= "sha384-s390", .cra_priority = 300, .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_ctxsize = sizeof(struct s390_sha_ctx), .cra_module = THIS_MODULE, } @@ -142,7 +144,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c index 686fe7aa192f..b5e2c365ea05 100644 --- a/arch/s390/crypto/sha_common.c +++ b/arch/s390/crypto/sha_common.c @@ -13,42 +13,33 @@ #include <asm/cpacf.h> #include "sha.h" -int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) +int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); unsigned int bsize = crypto_shash_blocksize(desc->tfm); - unsigned int index, n; - - /* how much is already in the buffer? */ - index = ctx->count % bsize; - ctx->count += len; - - if ((index + len) < bsize) - goto store; + struct s390_sha_ctx *ctx = shash_desc_ctx(desc); + unsigned int n; + int fc; - /* process one stored block */ - if (index) { - memcpy(ctx->buf + index, data, bsize - index); - cpacf_kimd(ctx->func, ctx->state, ctx->buf, bsize); - data += bsize - index; - len -= bsize - index; - index = 0; - } + fc = ctx->func; + if (ctx->first_message_part) + fc |= CPACF_KIMD_NIP; /* process as many blocks as possible */ - if (len >= bsize) { - n = (len / bsize) * bsize; - cpacf_kimd(ctx->func, ctx->state, data, n); - data += n; - len -= n; + n = (len / bsize) * bsize; + ctx->count += n; + switch (ctx->func) { + case CPACF_KLMD_SHA_512: + case CPACF_KLMD_SHA3_384: + if (ctx->count < n) + ctx->sha512.count_hi++; + break; } -store: - if (len) - memcpy(ctx->buf + index , data, len); - - return 0; + cpacf_kimd(fc, ctx->state, data, n); + ctx->first_message_part = 0; + return len - n; } -EXPORT_SYMBOL_GPL(s390_sha_update); +EXPORT_SYMBOL_GPL(s390_sha_update_blocks); static int s390_crypto_shash_parmsize(int func) { @@ -69,15 +60,15 @@ static int s390_crypto_shash_parmsize(int func) } } -int s390_sha_final(struct shash_desc *desc, u8 *out) +int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - unsigned int bsize = crypto_shash_blocksize(desc->tfm); + int mbl_offset, fc; u64 bits; - unsigned int n; - int mbl_offset; - n = ctx->count % bsize; + ctx->count += len; + bits = ctx->count * 8; mbl_offset = s390_crypto_shash_parmsize(ctx->func); if (mbl_offset < 0) @@ -87,17 +78,16 @@ int s390_sha_final(struct shash_desc *desc, u8 *out) /* set total msg bit length (mbl) in CPACF parmblock */ switch (ctx->func) { - case CPACF_KLMD_SHA_1: - case CPACF_KLMD_SHA_256: - memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); - break; case CPACF_KLMD_SHA_512: - /* - * the SHA512 parmblock has a 128-bit mbl field, clear - * high-order u64 field, copy bits to low-order u64 field - */ - memset(ctx->state + mbl_offset, 0x00, sizeof(bits)); + /* The SHA512 parmblock has a 128-bit mbl field. */ + if (ctx->count < len) + ctx->sha512.count_hi++; + ctx->sha512.count_hi <<= 3; + ctx->sha512.count_hi |= ctx->count >> 61; mbl_offset += sizeof(u64) / sizeof(u32); + fallthrough; + case CPACF_KLMD_SHA_1: + case CPACF_KLMD_SHA_256: memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); break; case CPACF_KLMD_SHA3_224: @@ -109,16 +99,18 @@ int s390_sha_final(struct shash_desc *desc, u8 *out) return -EINVAL; } - cpacf_klmd(ctx->func, ctx->state, ctx->buf, n); + fc = ctx->func; + fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0; + if (ctx->first_message_part) + fc |= CPACF_KLMD_NIP; + cpacf_klmd(fc, ctx->state, src, len); /* copy digest to out */ memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); - /* wipe context */ - memset(ctx, 0, sizeof *ctx); return 0; } -EXPORT_SYMBOL_GPL(s390_sha_final); +EXPORT_SYMBOL_GPL(s390_sha_finup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("s390 SHA cipher common functions"); diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile index 06f601509ce9..c34854d298f8 100644 --- a/arch/s390/hypfs/Makefile +++ b/arch/s390/hypfs/Makefile @@ -3,7 +3,12 @@ # Makefile for the linux hypfs filesystem routines. # -obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o +obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o -s390_hypfs-objs += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += inode.o diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 05f3f9aee5fc..83ebf54cca6b 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void); void hypfs_sprp_init(void); void hypfs_sprp_exit(void); +int __hypfs_fs_init(void); + +static inline int hypfs_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_fs_init(); + return 0; +} + /* debugfs interface */ struct hypfs_dbfs_file; @@ -69,8 +78,6 @@ struct hypfs_dbfs_file { struct dentry *dentry; }; -extern void hypfs_dbfs_init(void); -extern void hypfs_dbfs_exit(void); extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c index f4c7dbfaf8ee..5d9effb0867c 100644 --- a/arch/s390/hypfs/hypfs_dbfs.c +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -39,7 +39,9 @@ static ssize_t dbfs_read(struct file *file, char __user *buf, return 0; df = file_inode(file)->i_private; - mutex_lock(&df->lock); + if (mutex_lock_interruptible(&df->lock)) + return -ERESTARTSYS; + data = hypfs_dbfs_data_alloc(df); if (!data) { mutex_unlock(&df->lock); @@ -74,7 +76,6 @@ static long dbfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static const struct file_operations dbfs_ops = { .read = dbfs_read, - .llseek = no_llseek, .unlocked_ioctl = dbfs_ioctl, }; @@ -90,12 +91,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) debugfs_remove(df->dentry); } -void hypfs_dbfs_init(void) +static int __init hypfs_dbfs_init(void) { - dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); -} + int rc = -ENODATA; -void hypfs_dbfs_exit(void) -{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (hypfs_diag_init()) + goto fail_dbfs_exit; + if (hypfs_vm_init()) + goto fail_hypfs_diag_exit; + hypfs_sprp_init(); + if (hypfs_diag0c_init()) + goto fail_hypfs_sprp_exit; + rc = hypfs_fs_init(); + if (rc) + goto fail_hypfs_diag0c_exit; + return 0; + +fail_hypfs_diag0c_exit: + hypfs_diag0c_exit(); +fail_hypfs_sprp_exit: + hypfs_sprp_exit(); + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); + pr_err("Initialization of hypfs failed with rc=%i\n", rc); +fail_dbfs_exit: debugfs_remove(dbfs_dir); + return rc; } +device_initcall(hypfs_dbfs_init) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index f0bc4dc3e9bf..c8af67d20994 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -18,196 +18,25 @@ #include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> +#include "hypfs_diag.h" #include "hypfs.h" -#define TMP_SIZE 64 /* size of temporary buffers */ - #define DBFS_D204_HDR_VERSION 0 -static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ static void *diag204_buf; /* 4K aligned buffer for diag204 data */ -static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ -static struct dentry *dbfs_d204_file; - -/* - * DIAG 204 member access functions. - * - * Since we have two different diag 204 data formats for old and new s390 - * machines, we do not access the structs directly, but use getter functions for - * each struct member instead. This should make the code more readable. - */ - -/* Time information block */ - -static inline int info_blk_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_info_blk_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_info_blk_hdr); -} - -static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +enum diag204_format diag204_get_info_type(void) { - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->npar; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->npar; + return diag204_info_type; } -static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +static void diag204_set_info_type(enum diag204_format type) { - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->flags; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->flags; -} - -static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; -} - -/* Partition header */ - -static inline int part_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_part_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_part_hdr); -} - -static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_part_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_part_hdr *)hdr)->rcpus; -} - -static inline void part_hdr__part_name(enum diag204_format type, void *hdr, - char *name) -{ - if (type == DIAG204_INFO_SIMPLE) - memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - else /* DIAG204_INFO_EXT */ - memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - EBCASC(name, DIAG204_LPAR_NAME_LEN); - name[DIAG204_LPAR_NAME_LEN] = 0; - strim(name); -} - -/* CPU info block */ - -static inline int cpu_info__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_cpu_info); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_cpu_info); -} - -static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->ctidx; -} - -static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; -} - -static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->acc_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->acc_time; -} - -static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->lp_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->lp_time; -} - -static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return 0; /* online_time not available in simple info */ - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->online_time; -} - -/* Physical header */ - -static inline int phys_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_hdr); -} - -static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_hdr *)hdr)->cpus; -} - -/* Physical CPU info block */ - -static inline int phys_cpu__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_cpu); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_cpu); -} - -static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; -} - -static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->mgm_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; -} - -static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->ctidx; + diag204_info_type = type; } /* Diagnose 204 functions */ @@ -220,43 +49,11 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) static void diag204_free_buffer(void) { - if (!diag204_buf) - return; - if (diag204_buf_vmalloc) { - vfree(diag204_buf_vmalloc); - diag204_buf_vmalloc = NULL; - } else { - free_pages((unsigned long) diag204_buf, 0); - } + vfree(diag204_buf); diag204_buf = NULL; } -static void *page_align_ptr(void *ptr) -{ - return (void *) PAGE_ALIGN((unsigned long) ptr); -} - -static void *diag204_alloc_vbuf(int pages) -{ - /* The buffer has to be page aligned! */ - diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1))); - if (!diag204_buf_vmalloc) - return ERR_PTR(-ENOMEM); - diag204_buf = page_align_ptr(diag204_buf_vmalloc); - diag204_buf_pages = pages; - return diag204_buf; -} - -static void *diag204_alloc_rbuf(void) -{ - diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0); - if (!diag204_buf) - return ERR_PTR(-ENOMEM); - diag204_buf_pages = 1; - return diag204_buf; -} - -static void *diag204_get_buffer(enum diag204_format fmt, int *pages) +void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { *pages = diag204_buf_pages; @@ -264,15 +61,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) } if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; - return diag204_alloc_rbuf(); } else {/* DIAG204_INFO_EXT */ *pages = diag204((unsigned long)DIAG204_SUBC_RSI | (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) - return ERR_PTR(-ENOSYS); - else - return diag204_alloc_vbuf(*pages); + return ERR_PTR(-EOPNOTSUPP); } + diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + diag204_buf_pages = *pages; + return diag204_buf; } /* @@ -299,13 +100,13 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB7 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB7; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } if (diag204((unsigned long)DIAG204_SUBC_STIB6 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB6; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } diag204_free_buffer(); @@ -321,10 +122,10 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB4 | (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB4; - diag204_info_type = DIAG204_INFO_SIMPLE; + diag204_set_info_type(DIAG204_INFO_SIMPLE); goto out; } else { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto fail_store; } out: @@ -335,58 +136,24 @@ fail_alloc: return rc; } -static int diag204_do_store(void *buf, int pages) +int diag204_store(void *buf, int pages) { + unsigned long subcode; int rc; - rc = diag204((unsigned long) diag204_store_sc | - (unsigned long) diag204_info_type, pages, buf); - return rc < 0 ? -ENOSYS : 0; -} - -static void *diag204_store(void) -{ - void *buf; - int pages, rc; - - buf = diag204_get_buffer(diag204_info_type, &pages); - if (IS_ERR(buf)) - goto out; - rc = diag204_do_store(buf, pages); - if (rc) - return ERR_PTR(rc); -out: - return buf; -} - -/* Diagnose 224 functions */ - -static int diag224_get_name_table(void) -{ - /* memory must be below 2GB */ - diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (!diag224_cpu_names) - return -ENOMEM; - if (diag224(diag224_cpu_names)) { - free_page((unsigned long) diag224_cpu_names); - return -EOPNOTSUPP; + subcode = diag204_get_info_type(); + subcode |= diag204_store_sc; + if (diag204_has_bif()) + subcode |= DIAG204_BIF_BIT; + while (1) { + rc = diag204(subcode, pages, buf); + if (rc != -EBUSY) + break; + if (signal_pending(current)) + return -ERESTARTSYS; + schedule_timeout_interruptible(DIAG204_BUSY_WAIT); } - EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); - return 0; -} - -static void diag224_delete_name_table(void) -{ - free_page((unsigned long) diag224_cpu_names); -} - -static int diag224_idx2name(int index, char *name) -{ - memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), - DIAG204_CPU_NAME_LEN); - name[DIAG204_CPU_NAME_LEN] = 0; - strim(name); - return 0; + return rc < 0 ? rc : 0; } struct dbfs_d204_hdr { @@ -411,8 +178,8 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) base = vzalloc(buf_size); if (!base) return -ENOMEM; - d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); - rc = diag204_do_store(d204->buf, diag204_buf_pages); + d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); + rc = diag204_store(d204->buf, diag204_buf_pages); if (rc) { vfree(base); return rc; @@ -437,180 +204,22 @@ __init int hypfs_diag_init(void) int rc; if (diag204_probe()) { - pr_err("The hardware system does not support hypfs\n"); + pr_info("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == DIAG204_INFO_EXT) + if (diag204_get_info_type() == DIAG204_INFO_EXT) hypfs_dbfs_create_file(&dbfs_file_d204); - if (MACHINE_IS_LPAR) { - rc = diag224_get_name_table(); - if (rc) { - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); - debugfs_remove(dbfs_d204_file); - return rc; - } - } - return 0; + rc = hypfs_diag_fs_init(); + if (rc) + pr_err("The hardware system does not provide all functions required by hypfs\n"); + return rc; } void hypfs_diag_exit(void) { - debugfs_remove(dbfs_d204_file); - diag224_delete_name_table(); + hypfs_diag_fs_exit(); diag204_free_buffer(); hypfs_dbfs_remove_file(&dbfs_file_d204); } - -/* - * Functions to create the directory structure - * ******************************************* - */ - -static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - cpu_info__acc_time(diag204_info_type, cpu_info) - - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - rc = hypfs_create_u64(cpu_dir, "cputime", - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - if (diag204_info_type == DIAG204_INFO_EXT) { - rc = hypfs_create_u64(cpu_dir, "onlinetime", - cpu_info__online_time(diag204_info_type, - cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - } - diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) -{ - struct dentry *cpus_dir; - struct dentry *lpar_dir; - char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; - void *cpu_info; - int i; - - part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[DIAG204_LPAR_NAME_LEN] = 0; - lpar_dir = hypfs_mkdir(systems_dir, lpar_name); - if (IS_ERR(lpar_dir)) - return lpar_dir; - cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = part_hdr + part_hdr__size(diag204_info_type); - for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) { - int rc; - rc = hypfs_create_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += cpu_info__size(diag204_info_type); - } - return cpu_info; -} - -static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - if (IS_ERR(cpu_dir)) - return PTR_ERR(cpu_dir); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - phys_cpu__mgm_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) -{ - int i; - void *cpu_info; - struct dentry *cpus_dir; - - cpus_dir = hypfs_mkdir(parent_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = phys_hdr + phys_hdr__size(diag204_info_type); - for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) { - int rc; - rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += phys_cpu__size(diag204_info_type); - } - return cpu_info; -} - -int hypfs_diag_create_files(struct dentry *root) -{ - struct dentry *systems_dir, *hyp_dir; - void *time_hdr, *part_hdr; - int i, rc; - void *buffer, *ptr; - - buffer = diag204_store(); - if (IS_ERR(buffer)) - return PTR_ERR(buffer); - - systems_dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(systems_dir)) { - rc = PTR_ERR(systems_dir); - goto err_out; - } - time_hdr = (struct x_info_blk_hdr *)buffer; - part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type); - for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) { - part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); - if (IS_ERR(part_hdr)) { - rc = PTR_ERR(part_hdr); - goto err_out; - } - } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & - DIAG204_LPAR_PHYS_FLG) { - ptr = hypfs_create_phys_files(root, part_hdr); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - } - hyp_dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(hyp_dir)) { - rc = PTR_ERR(hyp_dir); - goto err_out; - } - ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - rc = 0; - -err_out: - return rc; -} diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h new file mode 100644 index 000000000000..7090eff27fef --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_DIAG_H_ +#define _S390_HYPFS_DIAG_H_ + +#include <asm/diag.h> + +enum diag204_format diag204_get_info_type(void); +void *diag204_get_buffer(enum diag204_format fmt, int *pages); +int diag204_store(void *buf, int pages); + +int __hypfs_diag_fs_init(void); +void __hypfs_diag_fs_exit(void); + +static inline int hypfs_diag_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_diag_fs_init(); + return 0; +} + +static inline void hypfs_diag_fs_exit(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + __hypfs_diag_fs_exit(); +} + +#endif /* _S390_HYPFS_DIAG_H_ */ diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 9a2786079e3a..61220e717af0 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -9,6 +9,7 @@ #include <linux/slab.h> #include <linux/cpu.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/hypfs.h> #include "hypfs.h" @@ -20,8 +21,7 @@ */ static void diag0c_fn(void *data) { - diag_stat_inc(DIAG_STAT_X00C); - diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]); + diag0c(((void **)data)[smp_processor_id()]); } /* @@ -108,7 +108,7 @@ static struct hypfs_dbfs_file dbfs_file_0c = { */ int __init hypfs_diag0c_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; hypfs_dbfs_create_file(&dbfs_file_0c); return 0; @@ -119,7 +119,7 @@ int __init hypfs_diag0c_init(void) */ void hypfs_diag0c_exit(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; hypfs_dbfs_remove_file(&dbfs_file_0c); } diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c new file mode 100644 index 000000000000..ede951dc0085 --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <asm/machine.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include "hypfs_diag.h" +#include "hypfs.h" + +#define TMP_SIZE 64 /* size of temporary buffers */ + +static char *diag224_cpu_names; /* diag 224 name table */ +static int diag224_idx2name(int index, char *name); + +/* + * DIAG 204 member access functions. + * + * Since we have two different diag 204 data formats for old and new s390 + * machines, we do not access the structs directly, but use getter functions for + * each struct member instead. This should make the code more readable. + */ + +/* Time information block */ + +static inline int info_blk_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); +} + +static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; +} + +static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; +} + +/* Partition header */ + +static inline int part_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); +} + +static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; +} + +static inline void part_hdr__part_name(enum diag204_format type, void *hdr, + char *name) +{ + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; + strim(name); +} + +/* CPU info block */ + +static inline int cpu_info__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); +} + +static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; +} + +static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; +} + +static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; +} + +static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; +} + +static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return 0; /* online_time not available in simple info */ + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; +} + +/* Physical header */ + +static inline int phys_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); +} + +static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; +} + +/* Physical CPU info block */ + +static inline int phys_cpu__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); +} + +static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; +} + +static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; +} + +static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; +} + +/* + * Functions to create the directory structure + * ******************************************* + */ + +static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + cpu_info__acc_time(diag204_get_info_type(), cpu_info) - + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + rc = hypfs_create_u64(cpu_dir, "cputime", + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + if (diag204_get_info_type() == DIAG204_INFO_EXT) { + rc = hypfs_create_u64(cpu_dir, "onlinetime", + cpu_info__online_time(diag204_get_info_type(), + cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + } + diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) +{ + struct dentry *cpus_dir; + struct dentry *lpar_dir; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; + void *cpu_info; + int i; + + part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name); + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; + lpar_dir = hypfs_mkdir(systems_dir, lpar_name); + if (IS_ERR(lpar_dir)) + return lpar_dir; + cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = part_hdr + part_hdr__size(diag204_get_info_type()); + for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) { + int rc; + + rc = hypfs_create_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += cpu_info__size(diag204_get_info_type()); + } + return cpu_info; +} + +static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) +{ + int i; + void *cpu_info; + struct dentry *cpus_dir; + + cpus_dir = hypfs_mkdir(parent_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type()); + for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) { + int rc; + + rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += phys_cpu__size(diag204_get_info_type()); + } + return cpu_info; +} + +int hypfs_diag_create_files(struct dentry *root) +{ + struct dentry *systems_dir, *hyp_dir; + void *time_hdr, *part_hdr; + void *buffer, *ptr; + int i, rc, pages; + + buffer = diag204_get_buffer(diag204_get_info_type(), &pages); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + rc = diag204_store(buffer, pages); + if (rc) + return rc; + + systems_dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(systems_dir)) { + rc = PTR_ERR(systems_dir); + goto err_out; + } + time_hdr = (struct x_info_blk_hdr *)buffer; + part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); + for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { + part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); + if (IS_ERR(part_hdr)) { + rc = PTR_ERR(part_hdr); + goto err_out; + } + } + if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & + DIAG204_LPAR_PHYS_FLG) { + ptr = hypfs_create_phys_files(root, part_hdr); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + } + hyp_dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(hyp_dir)) { + rc = PTR_ERR(hyp_dir); + goto err_out; + } + ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + rc = 0; + +err_out: + return rc; +} + +/* Diagnose 224 functions */ + +static int diag224_idx2name(int index, char *name) +{ + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; + strim(name); + return 0; +} + +static int diag224_get_name_table(void) +{ + /* memory must be below 2GB */ + diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_cpu_names) + return -ENOMEM; + if (diag224(diag224_cpu_names)) { + free_page((unsigned long)diag224_cpu_names); + return -EOPNOTSUPP; + } + EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); + return 0; +} + +static void diag224_delete_name_table(void) +{ + free_page((unsigned long)diag224_cpu_names); +} + +int __init __hypfs_diag_fs_init(void) +{ + if (machine_is_lpar()) + return diag224_get_name_table(); + return 0; +} + +void __hypfs_diag_fs_exit(void) +{ + diag224_delete_name_table(); +} diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index f5f7e78ddc0c..9fc3f0dae8f0 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -25,7 +25,7 @@ static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd) { - union register_pair r1 = { .even = (unsigned long)data, }; + union register_pair r1 = { .even = virt_to_phys(data), }; asm volatile("diag %[r1],%[r3],0x304\n" : [r1] "+&d" (r1.pair) @@ -74,7 +74,7 @@ static int __hypfs_sprp_ioctl(void __user *user_area) int rc; rc = -ENOMEM; - data = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + data = (void *)get_zeroed_page(GFP_KERNEL); diag304 = kzalloc(sizeof(*diag304), GFP_KERNEL); if (!data || !diag304) goto out; diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index a3d881ca0a98..4db2895e4da3 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -11,50 +11,19 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <asm/extable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/timex.h> +#include "hypfs_vm.h" #include "hypfs.h" -#define NAME_LEN 8 #define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; static char *all_groups = all_guests; -static char *guest_query; - -struct diag2fc_data { - __u32 version; - __u32 flags; - __u64 used_cpu; - __u64 el_time; - __u64 mem_min_kb; - __u64 mem_max_kb; - __u64 mem_share_kb; - __u64 mem_used_kb; - __u32 pcpus; - __u32 lcpus; - __u32 vcpus; - __u32 ocpus; - __u32 cpu_max; - __u32 cpu_shares; - __u32 cpu_use_samp; - __u32 cpu_delay_samp; - __u32 page_wait_samp; - __u32 idle_samp; - __u32 other_samp; - __u32 total_samp; - char guest_name[NAME_LEN]; -}; - -struct diag2fc_parm_list { - char userid[NAME_LEN]; - char aci_grp[NAME_LEN]; - __u64 addr; - __u32 size; - __u32 fmt; -}; +char *diag2fc_guest_query; static int diag2fc(int size, char* query, void *addr) { @@ -62,10 +31,10 @@ static int diag2fc(int size, char* query, void *addr) unsigned long rc; struct diag2fc_parm_list parm_list; - memcpy(parm_list.userid, query, NAME_LEN); - ASCEBC(parm_list.userid, NAME_LEN); - memcpy(parm_list.aci_grp, all_groups, NAME_LEN); - ASCEBC(parm_list.aci_grp, NAME_LEN); + memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN); + memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN); parm_list.addr = (unsigned long)addr; parm_list.size = size; parm_list.fmt = 0x02; @@ -87,7 +56,7 @@ static int diag2fc(int size, char* query, void *addr) /* * Allocate buffer for "query" and store diag 2fc at "offset" */ -static void *diag2fc_store(char *query, unsigned int *count, int offset) +void *diag2fc_store(char *query, unsigned int *count, int offset) { void *data; int size; @@ -108,132 +77,11 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset) return data; } -static void diag2fc_free(const void *data) +void diag2fc_free(const void *data) { vfree(data); } -#define ATTRIBUTE(dir, name, member) \ -do { \ - void *rc; \ - rc = hypfs_create_u64(dir, name, member); \ - if (IS_ERR(rc)) \ - return PTR_ERR(rc); \ -} while(0) - -static int hypfs_vm_create_guest(struct dentry *systems_dir, - struct diag2fc_data *data) -{ - char guest_name[NAME_LEN + 1] = {}; - struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; - int dedicated_flag, capped_value; - - capped_value = (data->flags & 0x00000006) >> 1; - dedicated_flag = (data->flags & 0x00000008) >> 3; - - /* guest dir */ - memcpy(guest_name, data->guest_name, NAME_LEN); - EBCASC(guest_name, NAME_LEN); - strim(guest_name); - guest_dir = hypfs_mkdir(systems_dir, guest_name); - if (IS_ERR(guest_dir)) - return PTR_ERR(guest_dir); - ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); - - /* logical cpu information */ - cpus_dir = hypfs_mkdir(guest_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return PTR_ERR(cpus_dir); - ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); - ATTRIBUTE(cpus_dir, "capped", capped_value); - ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); - ATTRIBUTE(cpus_dir, "count", data->vcpus); - /* - * Note: The "weight_min" attribute got the wrong name. - * The value represents the number of non-stopped (operating) - * CPUS. - */ - ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); - ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); - ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); - - /* memory information */ - mem_dir = hypfs_mkdir(guest_dir, "mem"); - if (IS_ERR(mem_dir)) - return PTR_ERR(mem_dir); - ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); - ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); - ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); - ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); - - /* samples */ - samples_dir = hypfs_mkdir(guest_dir, "samples"); - if (IS_ERR(samples_dir)) - return PTR_ERR(samples_dir); - ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); - ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); - ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); - ATTRIBUTE(samples_dir, "idle", data->idle_samp); - ATTRIBUTE(samples_dir, "other", data->other_samp); - ATTRIBUTE(samples_dir, "total", data->total_samp); - return 0; -} - -int hypfs_vm_create_files(struct dentry *root) -{ - struct dentry *dir, *file; - struct diag2fc_data *data; - unsigned int count = 0; - int rc, i; - - data = diag2fc_store(guest_query, &count, 0); - if (IS_ERR(data)) - return PTR_ERR(data); - - /* Hypervisor Info */ - dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* physical cpus */ - dir = hypfs_mkdir(root, "cpus"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_u64(dir, "count", data->lcpus); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* guests */ - dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - - for (i = 0; i < count; i++) { - rc = hypfs_vm_create_guest(dir, &(data[i])); - if (rc) - goto failed; - } - diag2fc_free(data); - return 0; - -failed: - diag2fc_free(data); - return rc; -} - struct dbfs_d2fc_hdr { u64 len; /* Length of d2fc buffer without header */ u16 version; /* Version of header */ @@ -252,7 +100,7 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) struct dbfs_d2fc *d2fc; unsigned int count; - d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr)); if (IS_ERR(d2fc)) return PTR_ERR(d2fc); store_tod_clock_ext(&d2fc->hdr.tod_ext); @@ -274,12 +122,12 @@ static struct hypfs_dbfs_file dbfs_file_2fc = { int hypfs_vm_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; if (diag2fc(0, all_guests, NULL) > 0) - guest_query = all_guests; + diag2fc_guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) - guest_query = local_guest; + diag2fc_guest_query = local_guest; else return -EACCES; hypfs_dbfs_create_file(&dbfs_file_2fc); @@ -288,7 +136,7 @@ int hypfs_vm_init(void) void hypfs_vm_exit(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; hypfs_dbfs_remove_file(&dbfs_file_2fc); } diff --git a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h new file mode 100644 index 000000000000..fe2e5851addd --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_VM_H_ +#define _S390_HYPFS_VM_H_ + +#define DIAG2FC_NAME_LEN 8 + +struct diag2fc_data { + __u32 version; + __u32 flags; + __u64 used_cpu; + __u64 el_time; + __u64 mem_min_kb; + __u64 mem_max_kb; + __u64 mem_share_kb; + __u64 mem_used_kb; + __u32 pcpus; + __u32 lcpus; + __u32 vcpus; + __u32 ocpus; + __u32 cpu_max; + __u32 cpu_shares; + __u32 cpu_use_samp; + __u32 cpu_delay_samp; + __u32 page_wait_samp; + __u32 idle_samp; + __u32 other_samp; + __u32 total_samp; + char guest_name[DIAG2FC_NAME_LEN]; +}; + +struct diag2fc_parm_list { + char userid[DIAG2FC_NAME_LEN]; + char aci_grp[DIAG2FC_NAME_LEN]; + __u64 addr; + __u32 size; + __u32 fmt; +}; + +void *diag2fc_store(char *query, unsigned int *count, int offset); +void diag2fc_free(const void *data); +extern char *diag2fc_guest_query; + +#endif /* _S390_HYPFS_VM_H_ */ diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c new file mode 100644 index 000000000000..6011289afa8c --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <asm/extable.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/timex.h> +#include "hypfs_vm.h" +#include "hypfs.h" + +#define ATTRIBUTE(dir, name, member) \ +do { \ + void *rc; \ + rc = hypfs_create_u64(dir, name, member); \ + if (IS_ERR(rc)) \ + return PTR_ERR(rc); \ +} while (0) + +static int hypfs_vm_create_guest(struct dentry *systems_dir, + struct diag2fc_data *data) +{ + char guest_name[DIAG2FC_NAME_LEN + 1] = {}; + struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; + int dedicated_flag, capped_value; + + capped_value = (data->flags & 0x00000006) >> 1; + dedicated_flag = (data->flags & 0x00000008) >> 3; + + /* guest dir */ + memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN); + EBCASC(guest_name, DIAG2FC_NAME_LEN); + strim(guest_name); + guest_dir = hypfs_mkdir(systems_dir, guest_name); + if (IS_ERR(guest_dir)) + return PTR_ERR(guest_dir); + ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); + + /* logical cpu information */ + cpus_dir = hypfs_mkdir(guest_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return PTR_ERR(cpus_dir); + ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); + ATTRIBUTE(cpus_dir, "capped", capped_value); + ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); + ATTRIBUTE(cpus_dir, "count", data->vcpus); + /* + * Note: The "weight_min" attribute got the wrong name. + * The value represents the number of non-stopped (operating) + * CPUS. + */ + ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); + ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); + ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); + + /* memory information */ + mem_dir = hypfs_mkdir(guest_dir, "mem"); + if (IS_ERR(mem_dir)) + return PTR_ERR(mem_dir); + ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); + ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); + ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); + ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); + + /* samples */ + samples_dir = hypfs_mkdir(guest_dir, "samples"); + if (IS_ERR(samples_dir)) + return PTR_ERR(samples_dir); + ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); + ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); + ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); + ATTRIBUTE(samples_dir, "idle", data->idle_samp); + ATTRIBUTE(samples_dir, "other", data->other_samp); + ATTRIBUTE(samples_dir, "total", data->total_samp); + return 0; +} + +int hypfs_vm_create_files(struct dentry *root) +{ + struct dentry *dir, *file; + struct diag2fc_data *data; + unsigned int count = 0; + int rc, i; + + data = diag2fc_store(diag2fc_guest_query, &count, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + /* Hypervisor Info */ + dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* physical cpus */ + dir = hypfs_mkdir(root, "cpus"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_u64(dir, "count", data->lcpus); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* guests */ + dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + + for (i = 0; i < count; i++) { + rc = hypfs_vm_create_guest(dir, &data[i]); + if (rc) + goto failed; + } + diag2fc_free(data); + return 0; + +failed: + diag2fc_free(data); + return rc; +} diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5c97f48cea91..96409573c75d 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -24,6 +24,7 @@ #include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/uio.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include "hypfs.h" @@ -53,7 +54,7 @@ static void hypfs_update_update(struct super_block *sb) struct inode *inode = d_inode(sb_info->update_file); sb_info->last_update = ktime_get_seconds(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } /* directory tree removal functions */ @@ -101,7 +102,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode) ret->i_mode = mode; ret->i_uid = hypfs_info->uid; ret->i_gid = hypfs_info->gid; - ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret); + simple_inode_init_ts(ret); if (S_ISDIR(mode)) set_nlink(ret, 2); } @@ -184,7 +185,7 @@ static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } hypfs_delete_tree(sb->s_root); - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = hypfs_vm_create_files(sb->s_root); else rc = hypfs_diag_create_files(sb->s_root); @@ -273,7 +274,7 @@ static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = root_dentry = d_make_root(root_inode); if (!root_dentry) return -ENOMEM; - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = hypfs_vm_create_files(root_dentry); else rc = hypfs_diag_create_files(root_dentry); @@ -341,7 +342,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct inode *inode; inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); goto fail; @@ -443,7 +444,6 @@ static const struct file_operations hypfs_file_ops = { .release = hypfs_release, .read_iter = hypfs_read_iter, .write_iter = hypfs_write_iter, - .llseek = no_llseek, }; static struct file_system_type hypfs_type = { @@ -460,45 +460,18 @@ static const struct super_operations hypfs_s_ops = { .show_options = hypfs_show_options, }; -static int __init hypfs_init(void) +int __init __hypfs_fs_init(void) { int rc; - hypfs_dbfs_init(); - - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_dbfs_exit; - } - if (hypfs_vm_init()) { - rc = -ENODATA; - goto fail_hypfs_diag_exit; - } - hypfs_sprp_init(); - if (hypfs_diag0c_init()) { - rc = -ENODATA; - goto fail_hypfs_sprp_exit; - } rc = sysfs_create_mount_point(hypervisor_kobj, "s390"); if (rc) - goto fail_hypfs_diag0c_exit; + return rc; rc = register_filesystem(&hypfs_type); if (rc) - goto fail_filesystem; + goto fail; return 0; - -fail_filesystem: +fail: sysfs_remove_mount_point(hypervisor_kobj, "s390"); -fail_hypfs_diag0c_exit: - hypfs_diag0c_exit(); -fail_hypfs_sprp_exit: - hypfs_sprp_exit(); - hypfs_vm_exit(); -fail_hypfs_diag_exit: - hypfs_diag_exit(); -fail_dbfs_exit: - hypfs_dbfs_exit(); - pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } -device_initcall(hypfs_init) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 1a18d7b82f86..297bf7157968 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,6 +5,6 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += export.h generic-y += kvm_types.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h new file mode 100644 index 000000000000..317c07c09ae4 --- /dev/null +++ b/arch/s390/include/asm/abs_lowcore.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ABS_LOWCORE_H +#define _ASM_S390_ABS_LOWCORE_H + +#include <linux/smp.h> +#include <asm/lowcore.h> + +#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) + +extern unsigned long __abs_lowcore; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); +void abs_lowcore_unmap(int cpu); + +static inline struct lowcore *get_abs_lowcore(void) +{ + int cpu; + + cpu = get_cpu(); + return ((struct lowcore *)__abs_lowcore) + cpu; +} + +static inline void put_abs_lowcore(struct lowcore *lc) +{ + put_cpu(); +} + +#endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/access-regs.h b/arch/s390/include/asm/access-regs.h new file mode 100644 index 000000000000..1a6412d9f5ad --- /dev/null +++ b/arch/s390/include/asm/access-regs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2024 + */ + +#ifndef __ASM_S390_ACCESS_REGS_H +#define __ASM_S390_ACCESS_REGS_H + +#include <linux/instrumented.h> +#include <asm/sigcontext.h> + +struct access_regs { + unsigned int regs[NUM_ACRS]; +}; + +static inline void save_access_regs(unsigned int *acrs) +{ + struct access_regs *regs = (struct access_regs *)acrs; + + instrument_write(regs, sizeof(*regs)); + asm volatile("stamy 0,15,%[regs]" + : [regs] "=QS" (*regs) + : + : "memory"); +} + +static inline void restore_access_regs(unsigned int *acrs) +{ + struct access_regs *regs = (struct access_regs *)acrs; + + instrument_read(regs, sizeof(*regs)); + asm volatile("lamy 0,15,%[regs]" + : + : [regs] "QS" (*regs) + : "memory"); +} + +#endif /* __ASM_S390_ACCESS_REGS_H */ diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 01936fdfaddb..c4c28c2609a5 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -12,12 +12,12 @@ #include <linux/bit_spinlock.h> #include <linux/dma-mapping.h> +#include <asm/tpi.h> struct airq_struct { struct hlist_node list; /* Handler queueing. */ - void (*handler)(struct airq_struct *airq, bool floating); + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ - u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ u8 flags; }; @@ -46,8 +46,10 @@ struct airq_iv { #define AIRQ_IV_PTR 4 /* Allocate the ptr array */ #define AIRQ_IV_DATA 8 /* Allocate the data array */ #define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); void airq_iv_release(struct airq_iv *iv); unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h deleted file mode 100644 index 7db046596b93..000000000000 --- a/arch/s390/include/asm/alternative-asm.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_ALTERNATIVE_ASM_H -#define _ASM_S390_ALTERNATIVE_ASM_H - -#ifdef __ASSEMBLY__ - -/* - * Issue one struct alt_instr descriptor entry (need to put it into - * the section .altinstructions, see below). This entry contains - * enough information for the alternatives patching code to patch an - * instruction. See apply_alternatives(). - */ -.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature - .long \orig_start - . - .long \alt_start - . - .word \feature - .byte \orig_end - \orig_start - .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) - .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. - */ -.macro ALTERNATIVE oldinstr, newinstr, feature - .pushsection .altinstr_replacement,"ax" -770: \newinstr -771: .popsection -772: \oldinstr -773: .pushsection .altinstructions,"a" - alt_entry 772b, 773b, 770b, 771b, \feature - .popsection -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. - */ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 - .pushsection .altinstr_replacement,"ax" -770: \newinstr1 -771: \newinstr2 -772: .popsection -773: \oldinstr -774: .pushsection .altinstructions,"a" - alt_entry 773b, 774b, 770b, 771b,\feature1 - alt_entry 773b, 774b, 771b, 772b,\feature2 - .popsection -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_S390_ALTERNATIVE_ASM_H */ diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 904dd049f954..c7bf60a541e9 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -2,6 +2,55 @@ #ifndef _ASM_S390_ALTERNATIVE_H #define _ASM_S390_ALTERNATIVE_H +/* + * Each alternative comes with a 32 bit feature field: + * union { + * u32 feature; + * struct { + * u32 ctx : 4; + * u32 type : 8; + * u32 data : 20; + * }; + * } + * + * @ctx is a bitfield, where only one bit must be set. Each bit defines + * in which context an alternative is supposed to be applied to the + * kernel image: + * + * - from the decompressor before the kernel itself is executed + * - from early kernel code from within the kernel + * + * @type is a number which defines the type and with that the type + * specific alternative patching. + * + * @data is additional type specific information which defines if an + * alternative should be applied. + */ + +#define ALT_CTX_EARLY 1 +#define ALT_CTX_LATE 2 +#define ALT_CTX_ALL (ALT_CTX_EARLY | ALT_CTX_LATE) + +#define ALT_TYPE_FACILITY 0 +#define ALT_TYPE_FEATURE 1 +#define ALT_TYPE_SPEC 2 + +#define ALT_DATA_SHIFT 0 +#define ALT_TYPE_SHIFT 20 +#define ALT_CTX_SHIFT 28 + +#define ALT_FACILITY(facility) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ + (facility) << ALT_DATA_SHIFT) + +#define ALT_FEATURE(feature) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_FEATURE << ALT_TYPE_SHIFT | \ + (feature) << ALT_DATA_SHIFT) + +#define ALT_SPEC(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ + ALT_TYPE_SPEC << ALT_TYPE_SHIFT | \ + (facility) << ALT_DATA_SHIFT) + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -11,12 +60,30 @@ struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ - u16 facility; /* facility bit set for replacement */ + union { + u32 feature; /* feature required for replacement */ + struct { + u32 ctx : 4; /* context */ + u32 type : 8; /* type of alternative */ + u32 data : 20; /* patching information */ + }; + }; u8 instrlen; /* length of original instruction */ } __packed; -void apply_alternative_instructions(void); -void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + +void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx); + +static inline void apply_alternative_instructions(void) +{ + __apply_alternatives(__alt_instructions, __alt_instructions_end, ALT_CTX_LATE); +} + +static inline void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ + __apply_alternatives(start, end, ALT_CTX_ALL); +} /* * +---------------------------------+ @@ -48,11 +115,12 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); #define OLDINSTR(oldinstr) \ "661:\n\t" oldinstr "\n662:\n" -#define ALTINSTR_ENTRY(facility, num) \ +#define ALTINSTR_ENTRY(feature, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ - "\t.word " __stringify(facility) "\n" /* facility bit */ \ + "\t.long " __stringify(feature) "\n" /* feature */ \ "\t.byte " oldinstr_len "\n" /* instruction len */ \ + "\t.org . - (" oldinstr_len ") & 1\n" \ "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n" @@ -60,24 +128,24 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" /* alternative assembly primitive: */ -#define ALTERNATIVE(oldinstr, altinstr, facility) \ +#define ALTERNATIVE(oldinstr, altinstr, feature) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr, 1) \ ".popsection\n" \ OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(facility, 1) \ + ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" -#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\ +#define ALTERNATIVE_2(oldinstr, altinstr1, feature1, altinstr2, feature2)\ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr1, 1) \ ALTINSTR_REPLACEMENT(altinstr2, 2) \ ".popsection\n" \ OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(facility1, 1) \ - ALTINSTR_ENTRY(facility2, 2) \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" /* @@ -92,12 +160,12 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); * For non barrier like inlines please define new variants * without volatile and memory clobber. */ -#define alternative(oldinstr, altinstr, facility) \ - asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory") +#define alternative(oldinstr, altinstr, feature) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature) : : : "memory") -#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ - altinstr2, facility2) ::: "memory") +#define alternative_2(oldinstr, altinstr1, feature1, altinstr2, feature2) \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, feature1, \ + altinstr2, feature2) ::: "memory") /* Alternative inline assembly with input. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ @@ -105,8 +173,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); : : input) /* Like alternative_input, but with a single output argument */ -#define alternative_io(oldinstr, altinstr, facility, output, input...) \ - asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \ +#define alternative_io(oldinstr, altinstr, feature, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature) \ : output : input) /* Use this macro if more than one output parameter is needed. */ @@ -115,6 +183,56 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); /* Use this macro if clobbers are needed without inputs. */ #define ASM_NO_INPUT_CLOBBER(clobber...) : clobber +#else /* __ASSEMBLY__ */ + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature + .long \orig_start - . + .long \alt_start - . + .long \feature + .byte \orig_end - \orig_start + .org . - ( \orig_end - \orig_start ) & 1 + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature + .pushsection .altinstr_replacement,"ax" +770: \newinstr +771: .popsection +772: \oldinstr +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature + .popsection +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 + .pushsection .altinstr_replacement,"ax" +770: \newinstr1 +771: \newinstr2 +772: .popsection +773: \oldinstr +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 + .popsection +.endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index b515cfa62bd9..395b02d6a133 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -43,10 +43,24 @@ struct ap_queue_status { unsigned int queue_empty : 1; unsigned int replies_waiting : 1; unsigned int queue_full : 1; - unsigned int _pad1 : 4; + unsigned int : 3; + unsigned int async : 1; unsigned int irq_enabled : 1; unsigned int response_code : 8; - unsigned int _pad2 : 16; + unsigned int : 16; +}; + +/* + * AP queue status reg union to access the reg1 + * register with the lower 32 bits comprising the + * ap queue status. + */ +union ap_queue_status_reg { + unsigned long value; + struct { + u32 _pad; + struct ap_queue_status status; + }; }; /** @@ -73,16 +87,55 @@ static inline bool ap_instructions_available(void) return reg1 != 0; } +/* TAPQ register GR2 response struct */ +struct ap_tapq_hwinfo { + union { + unsigned long value; + struct { + unsigned int fac : 32; /* facility bits */ + unsigned int apinfo : 32; /* ap type, ... */ + }; + struct { + unsigned int apsc : 1; /* APSC */ + unsigned int mex4k : 1; /* AP4KM */ + unsigned int crt4k : 1; /* AP4KC */ + unsigned int cca : 1; /* D */ + unsigned int accel : 1; /* A */ + unsigned int ep11 : 1; /* X */ + unsigned int apxa : 1; /* APXA */ + unsigned int : 1; + unsigned int class : 8; + unsigned int bs : 2; /* SE bind/assoc */ + unsigned int : 14; + unsigned int at : 8; /* ap type */ + unsigned int nd : 8; /* nr of domains */ + unsigned int : 4; + unsigned int ml : 4; /* apxl ml */ + unsigned int : 4; + unsigned int qd : 4; /* queue depth */ + }; + }; +}; + +/* + * Convenience defines to be used with the bs field from struct ap_tapq_gr2 + */ +#define AP_BS_Q_USABLE 0 +#define AP_BS_Q_USABLE_NO_SECURE_KEY 1 +#define AP_BS_Q_AVAIL_FOR_BINDING 2 +#define AP_BS_Q_UNUSABLE 3 + /** * ap_tapq(): Test adjunct processor queue. * @qid: The AP queue number - * @info: Pointer to queue descriptor + * @info: Pointer to tapq hwinfo struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info) +static inline struct ap_queue_status ap_tapq(ap_qid_t qid, + struct ap_tapq_hwinfo *info) { - struct ap_queue_status reg1; + union ap_queue_status_reg reg1; unsigned long reg2; asm volatile( @@ -91,25 +144,24 @@ static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info) " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ " lgr %[reg2],2\n" /* gr2 into reg2 */ - : [reg1] "=&d" (reg1), [reg2] "=&d" (reg2) + : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2) : [qid] "d" (qid) : "cc", "0", "1", "2"); if (info) - *info = reg2; - return reg1; + info->value = reg2; + return reg1.status; } /** * ap_test_queue(): Test adjunct processor queue. * @qid: The AP queue number * @tbit: Test facilities bit - * @info: Pointer to queue descriptor + * @info: Ptr to tapq gr2 struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, - int tbit, - unsigned long *info) +static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit, + struct ap_tapq_hwinfo *info) { if (tbit) qid |= 1UL << 23; /* set T bit*/ @@ -119,43 +171,51 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, /** * ap_pqap_rapq(): Reset adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_rapq(ap_qid_t qid) +static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit) { unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */ - struct ap_queue_status reg1; + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - : [reg1] "=&d" (reg1) + : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0) : "cc", "0", "1"); - return reg1; + return reg1.status; } /** * ap_pqap_zapq(): Reset and zeroize adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_zapq(ap_qid_t qid) +static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) { unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */ - struct ap_queue_status reg1; + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( " lgr 0,%[reg0]\n" /* qid arg into gr0 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - : [reg1] "=&d" (reg1) + : [reg1] "=&d" (reg1.value) : [reg0] "d" (reg0) : "cc", "0", "1"); - return reg1; + return reg1.status; } /** @@ -163,19 +223,25 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid) * config info as returned by the ap_qci() function. */ struct ap_config_info { - unsigned int apsc : 1; /* S bit */ - unsigned int apxa : 1; /* N bit */ - unsigned int qact : 1; /* C bit */ - unsigned int rc8a : 1; /* R bit */ - unsigned char _reserved1 : 4; - unsigned char _reserved2[3]; - unsigned char Na; /* max # of APs - 1 */ - unsigned char Nd; /* max # of Domains - 1 */ - unsigned char _reserved3[10]; + union { + unsigned int flags; + struct { + unsigned int apsc : 1; /* S bit */ + unsigned int apxa : 1; /* N bit */ + unsigned int qact : 1; /* C bit */ + unsigned int rc8a : 1; /* R bit */ + unsigned int : 4; + unsigned int apsb : 1; /* B bit */ + unsigned int : 23; + }; + }; + unsigned char na; /* max # of APs - 1 */ + unsigned char nd; /* max # of Domains - 1 */ + unsigned char _reserved0[10]; unsigned int apm[8]; /* AP ID mask */ unsigned int aqm[8]; /* AP (usage) queue mask */ unsigned int adm[8]; /* AP (control) domain mask */ - unsigned char _reserved4[16]; + unsigned char _reserved1[16]; } __aligned(8); /** @@ -209,41 +275,40 @@ static inline int ap_qci(struct ap_config_info *config) * parameter to the PQAP(AQIC) instruction. For details please * see the AR documentation. */ -struct ap_qirq_ctrl { - unsigned int _res1 : 8; - unsigned int zone : 8; /* zone info */ - unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ - unsigned int _res2 : 4; - unsigned int gisc : 3; /* guest isc field */ - unsigned int _res3 : 6; - unsigned int gf : 2; /* gisa format */ - unsigned int _res4 : 1; - unsigned int gisa : 27; /* gisa origin */ - unsigned int _res5 : 1; - unsigned int isc : 3; /* irq sub class */ +union ap_qirq_ctrl { + unsigned long value; + struct { + unsigned int : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int : 1; + unsigned int isc : 3; /* irq sub class */ + }; }; /** * ap_aqic(): Control interruption for a specific AP. * @qid: The AP queue number * @qirqctrl: struct ap_qirq_ctrl (64 bit value) - * @ind: The notification indicator byte + * @pa_ind: Physical address of the notification indicator byte * * Returns AP queue status. */ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, - struct ap_qirq_ctrl qirqctrl, - void *ind) + union ap_qirq_ctrl qirqctrl, + phys_addr_t pa_ind) { unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */ - union { - unsigned long value; - struct ap_qirq_ctrl qirqctrl; - struct ap_queue_status status; - } reg1; - unsigned long reg2 = virt_to_phys(ind); + union ap_queue_status_reg reg1; + unsigned long reg2 = pa_ind; - reg1.qirqctrl = qirqctrl; + reg1.value = qirqctrl.value; asm volatile( " lgr 0,%[reg0]\n" /* qid param into gr0 */ @@ -251,9 +316,9 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, " lgr 2,%[reg2]\n" /* ni addr into gr2 */ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - : [reg1] "+&d" (reg1) + : [reg1] "+&d" (reg1.value) : [reg0] "d" (reg0), [reg2] "d" (reg2) - : "cc", "0", "1", "2"); + : "cc", "memory", "0", "1", "2"); return reg1.status; } @@ -276,7 +341,7 @@ union ap_qact_ap_info { }; /** - * ap_qact(): Query AP combatibility type. + * ap_qact(): Query AP compatibility type. * @qid: The AP queue number * @apinfo: On input the info about the AP queue. On output the * alternate AP queue info provided by the qact function @@ -288,10 +353,7 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, union ap_qact_ap_info *apinfo) { unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); - union { - unsigned long value; - struct ap_queue_status status; - } reg1; + union ap_queue_status_reg reg1; unsigned long reg2; reg1.value = apinfo->val; @@ -302,13 +364,66 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ " lgr %[reg2],2\n" /* qact out info into reg2 */ - : [reg1] "+&d" (reg1), [reg2] "=&d" (reg2) + : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2) : [reg0] "d" (reg0) : "cc", "0", "1", "2"); apinfo->val = reg2; return reg1.status; } +/* + * ap_bapq(): SE bind AP queue. + * @qid: The AP queue number + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may case a specification exception. + */ +static inline struct ap_queue_status ap_bapq(ap_qid_t qid) +{ + unsigned long reg0 = qid | (7UL << 24); /* fc 7 is BAPQ */ + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + + return reg1.status; +} + +/* + * ap_aapq(): SE associate AP queue. + * @qid: The AP queue number + * @sec_idx: The secret index + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may case a specification exception. + */ +static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx) +{ + unsigned long reg0 = qid | (8UL << 24); /* fc 8 is AAPQ */ + unsigned long reg2 = sec_idx; + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " lgr 2,%[reg2]\n" /* secret index into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "0", "1", "2"); + + return reg1.status; +} + /** * ap_nqap(): Send message to adjunct processor queue. * @qid: The AP queue number @@ -327,7 +442,7 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, { unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */ union register_pair nqap_r1, nqap_r2; - struct ap_queue_status reg1; + union ap_queue_status_reg reg1; nqap_r1.even = (unsigned int)(psmid >> 32); nqap_r1.odd = psmid & 0xffffffff; @@ -339,21 +454,22 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" " brc 2,0b\n" /* handle partial completion */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ - : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), [nqap_r2] "+&d" (nqap_r2.pair) : [nqap_r1] "d" (nqap_r1.pair) : "cc", "memory", "0", "1"); - return reg1; + return reg1.status; } /** * ap_dqap(): Receive message from adjunct processor queue. * @qid: The AP queue number * @psmid: Pointer to program supplied message identifier - * @msg: The message text - * @length: The message length - * @reslength: Resitual length on return - * @resgr0: input: gr0 value (only used if != 0), output: resitual gr0 content + * @msg: Pointer to message buffer + * @msglen: Message buffer size + * @length: Pointer to length of actually written bytes + * @reslength: Residual length on return + * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content * * Returns AP queue status structure. * Condition code 1 on DQAP means the receive has taken place @@ -377,20 +493,21 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, * *resgr0 is to be used instead of qid to further process this entry. */ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, - unsigned long long *psmid, - void *msg, size_t length, + unsigned long *psmid, + void *msg, size_t msglen, + size_t *length, size_t *reslength, unsigned long *resgr0) { unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL; - struct ap_queue_status reg1; + union ap_queue_status_reg reg1; unsigned long reg2; union register_pair rp1, rp2; rp1.even = 0UL; rp1.odd = 0UL; rp2.even = (unsigned long)msg; - rp2.odd = (unsigned long)length; + rp2.odd = (unsigned long)msglen; asm volatile( " lgr 0,%[reg0]\n" /* qid param into gr0 */ @@ -402,8 +519,9 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ - : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2), - [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair) + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair), + [rp2] "+&d" (rp2.pair) : : "cc", "memory", "0", "1", "2"); @@ -415,27 +533,20 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, * Signal the caller that this dqap is only partially received * with a special status response code 0xFF and *resgr0 updated */ - reg1.response_code = 0xFF; + reg1.status.response_code = 0xFF; if (resgr0) *resgr0 = reg0; } else { - *psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd; + *psmid = (rp1.even << 32) + rp1.odd; if (resgr0) *resgr0 = 0; } - return reg1; -} + /* update *length with the nr of bytes stored into the msg buffer */ + if (length) + *length = msglen - rp2.odd; -/* - * Interface to tell the AP bus code that a configuration - * change has happened. The bus code should at least do - * an ap bus resource rescan. - */ -#if IS_ENABLED(CONFIG_ZCRYPT) -void ap_bus_cfg_chg(void); -#else -static inline void ap_bus_cfg_chg(void){} -#endif + return reg1.status; +} #endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h index c5bd9f4437e5..99b2902c10fd 100644 --- a/arch/s390/include/asm/appldata.h +++ b/arch/s390/include/asm/appldata.h @@ -8,8 +8,9 @@ #ifndef _ASM_S390_APPLDATA_H #define _ASM_S390_APPLDATA_H +#include <linux/io.h> +#include <asm/machine.h> #include <asm/diag.h> -#include <asm/io.h> #define APPLDATA_START_INTERVAL_REC 0x80 #define APPLDATA_STOP_REC 0x81 @@ -48,19 +49,19 @@ static inline int appldata_asm(struct appldata_parameter_list *parm_list, { int ry; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -EOPNOTSUPP; parm_list->diag = 0xdc; parm_list->function = fn; parm_list->parlist_length = sizeof(*parm_list); parm_list->buffer_length = length; - parm_list->product_id_addr = (unsigned long) id; + parm_list->product_id_addr = virt_to_phys(id); parm_list->buffer_addr = virt_to_phys(buffer); diag_stat_inc(DIAG_STAT_X0DC); asm volatile( " diag %1,%0,0xdc" : "=d" (ry) - : "d" (parm_list), "m" (*parm_list), "m" (*id) + : "d" (virt_to_phys(parm_list)), "m" (*parm_list), "m" (*id) : "cc"); return ry; } diff --git a/arch/s390/include/asm/arch_hweight.h b/arch/s390/include/asm/arch_hweight.h new file mode 100644 index 000000000000..aca08b0acbc1 --- /dev/null +++ b/arch/s390/include/asm/arch_hweight.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_ARCH_HWEIGHT_H +#define _ASM_S390_ARCH_HWEIGHT_H + +#include <linux/types.h> +#include <asm/march.h> + +static __always_inline unsigned long popcnt_z196(unsigned long w) +{ + unsigned long cnt; + + asm volatile(".insn rrf,0xb9e10000,%[cnt],%[w],0,0" + : [cnt] "=d" (cnt) + : [w] "d" (w) + : "cc"); + return cnt; +} + +static __always_inline unsigned long popcnt_z15(unsigned long w) +{ + unsigned long cnt; + + asm volatile(".insn rrf,0xb9e10000,%[cnt],%[w],8,0" + : [cnt] "=d" (cnt) + : [w] "d" (w) + : "cc"); + return cnt; +} + +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ + if (__is_defined(MARCH_HAS_Z15_FEATURES)) + return popcnt_z15(w); + if (__is_defined(MARCH_HAS_Z196_FEATURES)) { + w = popcnt_z196(w); + w += w >> 32; + w += w >> 16; + w += w >> 8; + return w & 0xff; + } + return __sw_hweight64(w); +} + +static __always_inline unsigned int __arch_hweight32(unsigned int w) +{ + if (__is_defined(MARCH_HAS_Z15_FEATURES)) + return popcnt_z15(w); + if (__is_defined(MARCH_HAS_Z196_FEATURES)) { + w = popcnt_z196(w); + w += w >> 16; + w += w >> 8; + return w & 0xff; + } + return __sw_hweight32(w); +} + +static __always_inline unsigned int __arch_hweight16(unsigned int w) +{ + if (__is_defined(MARCH_HAS_Z15_FEATURES)) + return popcnt_z15((unsigned short)w); + if (__is_defined(MARCH_HAS_Z196_FEATURES)) { + w = popcnt_z196(w); + w += w >> 8; + return w & 0xff; + } + return __sw_hweight16(w); +} + +static __always_inline unsigned int __arch_hweight8(unsigned int w) +{ + if (__is_defined(MARCH_HAS_Z196_FEATURES)) + return popcnt_z196((unsigned char)w); + return __sw_hweight8(w); +} + +#endif /* _ASM_S390_ARCH_HWEIGHT_H */ diff --git a/arch/s390/include/asm/asce.h b/arch/s390/include/asm/asce.h new file mode 100644 index 000000000000..f6dfaaba735a --- /dev/null +++ b/arch/s390/include/asm/asce.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_ASCE_H +#define _ASM_S390_ASCE_H + +#include <linux/thread_info.h> +#include <linux/irqflags.h> +#include <asm/lowcore.h> +#include <asm/ctlreg.h> + +static inline bool enable_sacf_uaccess(void) +{ + unsigned long flags; + + if (test_thread_flag(TIF_ASCE_PRIMARY)) + return true; + local_irq_save(flags); + local_ctl_load(1, &get_lowcore()->kernel_asce); + set_thread_flag(TIF_ASCE_PRIMARY); + local_irq_restore(flags); + return false; +} + +static inline void disable_sacf_uaccess(bool previous) +{ + unsigned long flags; + + if (previous) + return; + local_irq_save(flags); + local_ctl_load(1, &get_lowcore()->user_asce); + clear_thread_flag(TIF_ASCE_PRIMARY); + local_irq_restore(flags); +} + +#endif /* _ASM_S390_ASCE_H */ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h index b74f1070ddb2..d23ea0c94e4e 100644 --- a/arch/s390/include/asm/asm-extable.h +++ b/arch/s390/include/asm/asm-extable.h @@ -9,9 +9,13 @@ #define EX_TYPE_NONE 0 #define EX_TYPE_FIXUP 1 #define EX_TYPE_BPF 2 -#define EX_TYPE_UA_STORE 3 -#define EX_TYPE_UA_LOAD_MEM 4 +#define EX_TYPE_UA_FAULT 3 #define EX_TYPE_UA_LOAD_REG 5 +#define EX_TYPE_UA_LOAD_REGPAIR 6 +#define EX_TYPE_ZEROPAD 7 +#define EX_TYPE_FPC 8 +#define EX_TYPE_UA_MVCOS_TO 9 +#define EX_TYPE_UA_MVCOS_FROM 10 #define EX_DATA_REG_ERR_SHIFT 0 #define EX_DATA_REG_ERR GENMASK(3, 0) @@ -22,18 +26,9 @@ #define EX_DATA_LEN_SHIFT 8 #define EX_DATA_LEN GENMASK(11, 8) -#define __EX_TABLE(_section, _fault, _target, _type) \ - stringify_in_c(.section _section,"a";) \ - stringify_in_c(.align 4;) \ - stringify_in_c(.long (_fault) - .;) \ - stringify_in_c(.long (_target) - .;) \ - stringify_in_c(.short (_type);) \ - stringify_in_c(.short 0;) \ - stringify_in_c(.previous) - -#define __EX_TABLE_UA(_section, _fault, _target, _type, _regerr, _regaddr, _len)\ +#define __EX_TABLE(_section, _fault, _target, _type, _regerr, _regaddr, _len) \ stringify_in_c(.section _section,"a";) \ - stringify_in_c(.align 4;) \ + stringify_in_c(.balign 4;) \ stringify_in_c(.long (_fault) - .;) \ stringify_in_c(.long (_target) - .;) \ stringify_in_c(.short (_type);) \ @@ -71,18 +66,30 @@ stringify_in_c(.previous) #define EX_TABLE(_fault, _target) \ - __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP) + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) #define EX_TABLE_AMODE31(_fault, _target) \ - __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP) + __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) -#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \ - __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0) - -#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \ - __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len) +#define EX_TABLE_UA_FAULT(_fault, _target, _regerr) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_FAULT, _regerr, _regerr, 0) #define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ - __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) + +#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0) + +#define EX_TABLE_ZEROPAD(_fault, _target, _regdata, _regaddr) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_ZEROPAD, _regdata, _regaddr, 0) + +#define EX_TABLE_FPC(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FPC, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_UA_MVCOS_TO(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_MVCOS_TO, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_UA_MVCOS_FROM(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_MVCOS_FROM, __stringify(%%r0), __stringify(%%r0), 0) #endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h index c37eb921bfbf..f662eb4b9246 100644 --- a/arch/s390/include/asm/asm-prototypes.h +++ b/arch/s390/include/asm/asm-prototypes.h @@ -3,7 +3,12 @@ #include <linux/kvm_host.h> #include <linux/ftrace.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> +#include <asm/nospec-branch.h> #include <asm-generic/asm-prototypes.h> +__int128_t __ashlti3(__int128_t a, int b); +__int128_t __ashrti3(__int128_t a, int b); +__int128_t __lshrti3(__int128_t a, int b); + #endif /* _ASM_S390_PROTOTYPES_H */ diff --git a/arch/s390/include/asm/asm.h b/arch/s390/include/asm/asm.h new file mode 100644 index 000000000000..e9062b01e2a2 --- /dev/null +++ b/arch/s390/include/asm/asm.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_H +#define _ASM_S390_ASM_H + +#include <linux/stringify.h> + +/* + * Helper macros to be used for flag output operand handling. + * Inline assemblies must use four of the five supplied macros: + * + * Use CC_IPM(sym) at the end of the inline assembly; this extracts the + * condition code and program mask with the ipm instruction and writes it to + * the variable with symbolic name [sym] if the compiler has no support for + * flag output operands. If the compiler has support for flag output operands + * this generates no code. + * + * Use CC_OUT(sym, var) at the output operand list of an inline assembly. This + * defines an output operand with symbolic name [sym] for the variable + * [var]. [var] must be an int variable and [sym] must be identical with [sym] + * used with CC_IPM(). + * + * Use either CC_CLOBBER or CC_CLOBBER_LIST() for the clobber list. Use + * CC_CLOBBER if the clobber list contains only "cc", otherwise use + * CC_CLOBBER_LIST() and add all clobbers as argument to the macro. + * + * Use CC_TRANSFORM() to convert the variable [var] which contains the + * extracted condition code. If the condition code is extracted with ipm, the + * [var] also contains the program mask. CC_TRANSFORM() moves the condition + * code to the two least significant bits and sets all other bits to zero. + */ +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_ASM_FLAG_OUTPUT_BROKEN)) + +#define __HAVE_ASM_FLAG_OUTPUTS__ + +#define CC_IPM(sym) +#define CC_OUT(sym, var) "=@cc" (var) +#define CC_TRANSFORM(cc) ({ cc; }) +#define CC_CLOBBER +#define CC_CLOBBER_LIST(...) __VA_ARGS__ + +#else + +#define CC_IPM(sym) " ipm %[" __stringify(sym) "]\n" +#define CC_OUT(sym, var) [sym] "=d" (var) +#define CC_TRANSFORM(cc) ({ (cc) >> 28; }) +#define CC_CLOBBER "cc" +#define CC_CLOBBER_LIST(...) "cc", __VA_ARGS__ + +#endif + +#endif /* _ASM_S390_ASM_H */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 7138d189cc42..b36dd6a1d652 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,46 +15,76 @@ #include <asm/barrier.h> #include <asm/cmpxchg.h> -static inline int arch_atomic_read(const atomic_t *v) +static __always_inline int arch_atomic_read(const atomic_t *v) { - return __atomic_read(v); + return __atomic_read(&v->counter); } #define arch_atomic_read arch_atomic_read -static inline void arch_atomic_set(atomic_t *v, int i) +static __always_inline void arch_atomic_set(atomic_t *v, int i) { - __atomic_set(v, i); + __atomic_set(&v->counter, i); } #define arch_atomic_set arch_atomic_set -static inline int arch_atomic_add_return(int i, atomic_t *v) +static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter) + i; } #define arch_atomic_add_return arch_atomic_add_return -static inline int arch_atomic_fetch_add(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter); } #define arch_atomic_fetch_add arch_atomic_fetch_add -static inline void arch_atomic_add(int i, atomic_t *v) +static __always_inline void arch_atomic_add(int i, atomic_t *v) { __atomic_add(i, &v->counter); } #define arch_atomic_add arch_atomic_add +static __always_inline void arch_atomic_inc(atomic_t *v) +{ + __atomic_add_const(1, &v->counter); +} +#define arch_atomic_inc arch_atomic_inc + +static __always_inline void arch_atomic_dec(atomic_t *v) +{ + __atomic_add_const(-1, &v->counter); +} +#define arch_atomic_dec arch_atomic_dec + +static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) +{ + return __atomic_add_and_test_barrier(-i, &v->counter); +} +#define arch_atomic_sub_and_test arch_atomic_sub_and_test + +static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) +{ + return __atomic_add_const_and_test_barrier(-1, &v->counter); +} +#define arch_atomic_dec_and_test arch_atomic_dec_and_test + +static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) +{ + return __atomic_add_const_and_test_barrier(1, &v->counter); +} +#define arch_atomic_inc_and_test arch_atomic_inc_and_test + #define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v) #define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v) #define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) #define ATOMIC_OPS(op) \ -static inline void arch_atomic_##op(int i, atomic_t *v) \ +static __always_inline void arch_atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ -static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ +static __always_inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ return __atomic_##op##_barrier(i, &v->counter); \ } @@ -72,62 +102,112 @@ ATOMIC_OPS(xor) #define arch_atomic_fetch_or arch_atomic_fetch_or #define arch_atomic_fetch_xor arch_atomic_fetch_xor -#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) +static __always_inline int arch_atomic_xchg(atomic_t *v, int new) +{ + return arch_xchg(&v->counter, new); +} +#define arch_atomic_xchg arch_atomic_xchg -static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { - return __atomic_cmpxchg(&v->counter, old, new); + return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic_cmpxchg arch_atomic_cmpxchg +static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return arch_try_cmpxchg(&v->counter, old, new); +} +#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg + #define ATOMIC64_INIT(i) { (i) } -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { - return __atomic64_read(v); + return __atomic64_read((long *)&v->counter); } #define arch_atomic64_read arch_atomic64_read -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { - __atomic64_set(v, i); + __atomic64_set((long *)&v->counter, i); } #define arch_atomic64_set arch_atomic64_set -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter) + i; } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline void arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __atomic64_add(i, (long *)&v->counter); } #define arch_atomic64_add arch_atomic64_add -#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) +static __always_inline void arch_atomic64_inc(atomic64_t *v) +{ + __atomic64_add_const(1, (long *)&v->counter); +} +#define arch_atomic64_inc arch_atomic64_inc + +static __always_inline void arch_atomic64_dec(atomic64_t *v) +{ + __atomic64_add_const(-1, (long *)&v->counter); +} +#define arch_atomic64_dec arch_atomic64_dec -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { - return __atomic64_cmpxchg((long *)&v->counter, old, new); + return __atomic64_add_and_test_barrier(-i, (long *)&v->counter); +} +#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test + +static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) +{ + return __atomic64_add_const_and_test_barrier(-1, (long *)&v->counter); +} +#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test + +static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) +{ + return __atomic64_add_const_and_test_barrier(1, (long *)&v->counter); +} +#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test + +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) +{ + return arch_xchg(&v->counter, new); +} +#define arch_atomic64_xchg arch_atomic64_xchg + +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -#define ATOMIC64_OPS(op) \ -static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ -{ \ - __atomic64_##op(i, (long *)&v->counter); \ -} \ -static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ -{ \ - return __atomic64_##op##_barrier(i, (long *)&v->counter); \ +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + +#define ATOMIC64_OPS(op) \ +static __always_inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ +{ \ + __atomic64_##op(i, (long *)&v->counter); \ +} \ +static __always_inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ +{ \ + return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } ATOMIC64_OPS(and) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 50510e08b893..21c26d842832 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -8,44 +8,60 @@ #ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ -static inline int __atomic_read(const atomic_t *v) +#include <linux/limits.h> +#include <asm/march.h> +#include <asm/asm.h> + +static __always_inline int __atomic_read(const int *ptr) { - int c; + int val; asm volatile( - " l %0,%1\n" - : "=d" (c) : "R" (v->counter)); - return c; + " l %[val],%[ptr]\n" + : [val] "=d" (val) : [ptr] "R" (*ptr)); + return val; } -static inline void __atomic_set(atomic_t *v, int i) +static __always_inline void __atomic_set(int *ptr, int val) { - asm volatile( - " st %1,%0\n" - : "=R" (v->counter) : "d" (i)); + if (__builtin_constant_p(val) && val >= S16_MIN && val <= S16_MAX) { + asm volatile( + " mvhi %[ptr],%[val]\n" + : [ptr] "=Q" (*ptr) : [val] "K" (val)); + } else { + asm volatile( + " st %[val],%[ptr]\n" + : [ptr] "=R" (*ptr) : [val] "d" (val)); + } } -static inline s64 __atomic64_read(const atomic64_t *v) +static __always_inline long __atomic64_read(const long *ptr) { - s64 c; + long val; asm volatile( - " lg %0,%1\n" - : "=d" (c) : "RT" (v->counter)); - return c; + " lg %[val],%[ptr]\n" + : [val] "=d" (val) : [ptr] "RT" (*ptr)); + return val; } -static inline void __atomic64_set(atomic64_t *v, s64 i) +static __always_inline void __atomic64_set(long *ptr, long val) { - asm volatile( - " stg %1,%0\n" - : "=RT" (v->counter) : "d" (i)); + if (__builtin_constant_p(val) && val >= S16_MIN && val <= S16_MAX) { + asm volatile( + " mvghi %[ptr],%[val]\n" + : [ptr] "=Q" (*ptr) : [val] "K" (val)); + } else { + asm volatile( + " stg %[val],%[ptr]\n" + : [ptr] "=RT" (*ptr) : [val] "d" (val)); + } } -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES #define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ -static inline op_type op_name(op_type val, op_type *ptr) \ +static __always_inline op_type op_name(op_type val, op_type *ptr) \ { \ op_type old; \ \ @@ -58,7 +74,7 @@ static inline op_type op_name(op_type val, op_type *ptr) \ } \ #define __ATOMIC_OPS(op_name, op_type, op_string) \ - __ATOMIC_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_OP(op_name, op_type, op_string, "") \ __ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") __ATOMIC_OPS(__atomic_add, int, "laa") @@ -84,7 +100,7 @@ static __always_inline void op_name(op_type val, op_type *ptr) \ } #define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ - __ATOMIC_CONST_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_CONST_OP(op_name, op_type, op_string, "") \ __ATOMIC_CONST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") __ATOMIC_CONST_OPS(__atomic_add_const, int, "asi") @@ -93,10 +109,10 @@ __ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") #undef __ATOMIC_CONST_OPS #undef __ATOMIC_CONST_OP -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#else /* MARCH_HAS_Z196_FEATURES */ #define __ATOMIC_OP(op_name, op_string) \ -static inline int op_name(int val, int *ptr) \ +static __always_inline int op_name(int val, int *ptr) \ { \ int old, new; \ \ @@ -122,7 +138,7 @@ __ATOMIC_OPS(__atomic_xor, "xr") #undef __ATOMIC_OPS #define __ATOMIC64_OP(op_name, op_string) \ -static inline long op_name(long val, long *ptr) \ +static __always_inline long op_name(long val, long *ptr) \ { \ long old, new; \ \ @@ -147,55 +163,83 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") #undef __ATOMIC64_OPS -#define __atomic_add_const(val, ptr) __atomic_add(val, ptr) -#define __atomic_add_const_barrier(val, ptr) __atomic_add(val, ptr) -#define __atomic64_add_const(val, ptr) __atomic64_add(val, ptr) -#define __atomic64_add_const_barrier(val, ptr) __atomic64_add(val, ptr) +#define __atomic_add_const(val, ptr) ((void)__atomic_add(val, ptr)) +#define __atomic_add_const_barrier(val, ptr) ((void)__atomic_add(val, ptr)) +#define __atomic64_add_const(val, ptr) ((void)__atomic64_add(val, ptr)) +#define __atomic64_add_const_barrier(val, ptr) ((void)__atomic64_add(val, ptr)) -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#endif /* MARCH_HAS_Z196_FEATURES */ -static inline int __atomic_cmpxchg(int *ptr, int old, int new) -{ - asm volatile( - " cs %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+Q" (*ptr) - : [new] "d" (new) - : "cc", "memory"); - return old; -} +#if defined(MARCH_HAS_Z196_FEATURES) && defined(__HAVE_ASM_FLAG_OUTPUTS__) -static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) -{ - int old_expected = old; +#define __ATOMIC_TEST_OP(op_name, op_type, op_string, op_barrier) \ +static __always_inline bool op_name(op_type val, op_type *ptr) \ +{ \ + op_type tmp; \ + int cc; \ + \ + asm volatile( \ + op_string " %[tmp],%[val],%[ptr]\n" \ + op_barrier \ + : "=@cc" (cc), [tmp] "=d" (tmp), [ptr] "+QS" (*ptr) \ + : [val] "d" (val) \ + : "memory"); \ + return (cc == 0) || (cc == 2); \ +} \ - asm volatile( - " cs %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+Q" (*ptr) - : [new] "d" (new) - : "cc", "memory"); - return old == old_expected; -} +#define __ATOMIC_TEST_OPS(op_name, op_type, op_string) \ + __ATOMIC_TEST_OP(op_name, op_type, op_string, "") \ + __ATOMIC_TEST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") -static inline long __atomic64_cmpxchg(long *ptr, long old, long new) -{ - asm volatile( - " csg %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+QS" (*ptr) - : [new] "d" (new) - : "cc", "memory"); - return old; +__ATOMIC_TEST_OPS(__atomic_add_and_test, int, "laal") +__ATOMIC_TEST_OPS(__atomic64_add_and_test, long, "laalg") + +#undef __ATOMIC_TEST_OPS +#undef __ATOMIC_TEST_OP + +#define __ATOMIC_CONST_TEST_OP(op_name, op_type, op_string, op_barrier) \ +static __always_inline bool op_name(op_type val, op_type *ptr) \ +{ \ + int cc; \ + \ + asm volatile( \ + op_string " %[ptr],%[val]\n" \ + op_barrier \ + : "=@cc" (cc), [ptr] "+QS" (*ptr) \ + : [val] "i" (val) \ + : "memory"); \ + return (cc == 0) || (cc == 2); \ } -static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) -{ - long old_expected = old; +#define __ATOMIC_CONST_TEST_OPS(op_name, op_type, op_string) \ + __ATOMIC_CONST_TEST_OP(op_name, op_type, op_string, "") \ + __ATOMIC_CONST_TEST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") - asm volatile( - " csg %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+QS" (*ptr) - : [new] "d" (new) - : "cc", "memory"); - return old == old_expected; +__ATOMIC_CONST_TEST_OPS(__atomic_add_const_and_test, int, "alsi") +__ATOMIC_CONST_TEST_OPS(__atomic64_add_const_and_test, long, "algsi") + +#undef __ATOMIC_CONST_TEST_OPS +#undef __ATOMIC_CONST_TEST_OP + +#else /* defined(MARCH_HAS_Z196_FEATURES) && defined(__HAVE_ASM_FLAG_OUTPUTS__) */ + +#define __ATOMIC_TEST_OP(op_name, op_func, op_type) \ +static __always_inline bool op_name(op_type val, op_type *ptr) \ +{ \ + return op_func(val, ptr) == -val; \ } +__ATOMIC_TEST_OP(__atomic_add_and_test, __atomic_add, int) +__ATOMIC_TEST_OP(__atomic_add_and_test_barrier, __atomic_add_barrier, int) +__ATOMIC_TEST_OP(__atomic_add_const_and_test, __atomic_add, int) +__ATOMIC_TEST_OP(__atomic_add_const_and_test_barrier, __atomic_add_barrier, int) +__ATOMIC_TEST_OP(__atomic64_add_and_test, __atomic64_add, long) +__ATOMIC_TEST_OP(__atomic64_add_and_test_barrier, __atomic64_add_barrier, long) +__ATOMIC_TEST_OP(__atomic64_add_const_and_test, __atomic64_add, long) +__ATOMIC_TEST_OP(__atomic64_add_const_and_test_barrier, __atomic64_add_barrier, long) + +#undef __ATOMIC_TEST_OP + +#endif /* defined(MARCH_HAS_Z196_FEATURES) && defined(__HAVE_ASM_FLAG_OUTPUTS__) */ + #endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 82de2a7c4160..d82130d7f2b6 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -8,13 +8,15 @@ #ifndef __ASM_BARRIER_H #define __ASM_BARRIER_H +#include <asm/march.h> + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking * to devices. */ -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ #define __ASM_BCR_SERIALIZE "bcr 14,0\n" #else diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 191dc7898b0f..a5ca0a947691 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -36,179 +36,45 @@ #include <linux/typecheck.h> #include <linux/compiler.h> #include <linux/types.h> -#include <asm/atomic_ops.h> -#include <asm/barrier.h> - -#define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) - -static inline unsigned long * -__bitops_word(unsigned long nr, const volatile unsigned long *ptr) -{ - unsigned long addr; - - addr = (unsigned long)ptr + ((nr ^ (nr & (BITS_PER_LONG - 1))) >> 3); - return (unsigned long *)addr; -} - -static inline unsigned long __bitops_mask(unsigned long nr) -{ - return 1UL << (nr & (BITS_PER_LONG - 1)); -} - -static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - __atomic64_or(mask, (long *)addr); -} - -static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - __atomic64_and(~mask, (long *)addr); -} - -static __always_inline void arch_change_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - __atomic64_xor(mask, (long *)addr); -} - -static inline bool arch_test_and_set_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = __atomic64_or_barrier(mask, (long *)addr); - return old & mask; -} - -static inline bool arch_test_and_clear_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = __atomic64_and_barrier(~mask, (long *)addr); - return old & mask; -} - -static inline bool arch_test_and_change_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = __atomic64_xor_barrier(mask, (long *)addr); - return old & mask; -} - -static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - *addr |= mask; -} - -static inline void arch___clear_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - *addr &= ~mask; -} - -static inline void arch___change_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - *addr ^= mask; -} - -static inline bool arch___test_and_set_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = *addr; - *addr |= mask; - return old & mask; -} - -static inline bool arch___test_and_clear_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = *addr; - *addr &= ~mask; - return old & mask; -} - -static inline bool arch___test_and_change_bit(unsigned long nr, - volatile unsigned long *ptr) -{ - unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - unsigned long old; - - old = *addr; - *addr ^= mask; - return old & mask; -} - -static inline bool arch_test_bit(unsigned long nr, - const volatile unsigned long *ptr) -{ - const volatile unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask = __bitops_mask(nr); - - return *addr & mask; -} - -static inline bool arch_test_and_set_bit_lock(unsigned long nr, - volatile unsigned long *ptr) -{ - if (arch_test_bit(nr, ptr)) - return true; - return arch_test_and_set_bit(nr, ptr); -} - -static inline void arch_clear_bit_unlock(unsigned long nr, - volatile unsigned long *ptr) -{ - smp_mb__before_atomic(); - arch_clear_bit(nr, ptr); -} - -static inline void arch___clear_bit_unlock(unsigned long nr, - volatile unsigned long *ptr) -{ - smp_mb(); - arch___clear_bit(nr, ptr); +#include <asm/asm.h> + +#define arch___set_bit generic___set_bit +#define arch___clear_bit generic___clear_bit +#define arch___change_bit generic___change_bit +#define arch___test_and_set_bit generic___test_and_set_bit +#define arch___test_and_clear_bit generic___test_and_clear_bit +#define arch___test_and_change_bit generic___test_and_change_bit +#define arch_test_bit_acquire generic_test_bit_acquire + +static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsigned long *ptr) +{ +#ifdef __HAVE_ASM_FLAG_OUTPUTS__ + const volatile unsigned char *addr; + unsigned long mask; + int cc; + + /* + * With CONFIG_PROFILE_ALL_BRANCHES enabled gcc fails to + * handle __builtin_constant_p() in some cases. + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && __builtin_constant_p(nr)) { + addr = (const volatile unsigned char *)ptr; + addr += (nr ^ (BITS_PER_LONG - BITS_PER_BYTE)) / BITS_PER_BYTE; + mask = 1UL << (nr & (BITS_PER_BYTE - 1)); + asm volatile( + " tm %[addr],%[mask]\n" + : "=@cc" (cc) + : [addr] "Q" (*addr), [mask] "I" (mask) + ); + return cc == 3; + } +#endif + return generic_test_bit(nr, ptr); } -#include <asm-generic/bitops/instrumented-atomic.h> -#include <asm-generic/bitops/instrumented-non-atomic.h> -#include <asm-generic/bitops/instrumented-lock.h> +#include <asm-generic/bitops/atomic.h> +#include <asm-generic/bitops/non-instrumented-non-atomic.h> +#include <asm-generic/bitops/lock.h> /* * Functions which use MSB0 bit numbering. @@ -374,8 +240,9 @@ static inline int fls(unsigned int word) return fls64(word); } +#include <asm/arch_hweight.h> +#include <asm-generic/bitops/const_hweight.h> #include <asm-generic/bitops/ffz.h> -#include <asm-generic/bitops/hweight.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index f7eed27b3220..f55f8227058e 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_S390_BOOT_DATA_H +#include <linux/string.h> #include <asm/setup.h> #include <asm/ipl.h> @@ -15,4 +16,54 @@ extern unsigned long ipl_cert_list_size; extern unsigned long early_ipl_comp_list_addr; extern unsigned long early_ipl_comp_list_size; +extern char boot_rb[PAGE_SIZE * 2]; +extern bool boot_earlyprintk; +extern size_t boot_rb_off; +extern char bootdebug_filter[128]; +extern bool bootdebug; + +#define boot_rb_foreach(cb) \ + do { \ + size_t off = boot_rb_off + strlen(boot_rb + boot_rb_off) + 1; \ + size_t len; \ + for (; off < sizeof(boot_rb) && (len = strlen(boot_rb + off)); off += len + 1) \ + cb(boot_rb + off); \ + for (off = 0; off < boot_rb_off && (len = strlen(boot_rb + off)); off += len + 1) \ + cb(boot_rb + off); \ + } while (0) + +/* + * bootdebug_filter is a comma separated list of strings, + * where each string can be a prefix of the message. + */ +static inline bool bootdebug_filter_match(const char *buf) +{ + char *p = bootdebug_filter, *s; + char *end; + + if (!*p) + return true; + + end = p + strlen(p); + while (p < end) { + p = skip_spaces(p); + s = memscan(p, ',', end - p); + if (!strncmp(p, buf, s - p)) + return true; + p = s + 1; + } + return false; +} + +static inline const char *skip_timestamp(const char *buf) +{ +#ifdef CONFIG_PRINTK_TIME + const char *p = memchr(buf, ']', strlen(buf)); + + if (p && p[1] == ' ') + return p + 2; +#endif + return buf; +} + #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index aebe1e22c7be..c500d45fb465 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -14,7 +14,7 @@ ".section .rodata.str,\"aMS\",@progbits,1\n" \ "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ - ".section __bug_table,\"awM\",@progbits,%2\n" \ + ".section __bug_table,\"aw\"\n" \ "2: .long 0b-.\n" \ " .long 1b-.\n" \ " .short %0,%1\n" \ @@ -30,7 +30,7 @@ #define __EMIT_BUG(x) do { \ asm_inline volatile( \ "0: mc 0,0\n" \ - ".section __bug_table,\"awM\",@progbits,%1\n" \ + ".section __bug_table,\"aw\"\n" \ "1: .long 0b-.\n" \ " .short %0\n" \ " .org 1b+%1\n" \ diff --git a/arch/s390/include/asm/bugs.h b/arch/s390/include/asm/bugs.h deleted file mode 100644 index aa42a179be33..000000000000 --- a/arch/s390/include/asm/bugs.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * Derived from "include/asm-i386/bugs.h" - * Copyright (C) 1994 Linus Torvalds - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ - -static inline void check_bugs(void) -{ - /* s390 has no bugs ... */ -} diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index d4e90f2ba77e..e3afcece375e 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -15,6 +15,7 @@ #include <asm/fcx.h> #include <asm/irq.h> #include <asm/schid.h> +#include <linux/mutex.h> /* structs from asm/cio.h */ struct irb; @@ -87,6 +88,7 @@ struct ccw_device { spinlock_t *ccwlock; /* private: */ struct ccw_device_private *private; /* cio private information */ + struct mutex reg_mutex; /* public: */ struct ccw_device_id id; struct ccw_driver *drv; @@ -208,15 +210,15 @@ extern void ccw_device_get_id(struct ccw_device *, struct ccw_dev_id *); #define get_ccwdev_lock(x) (x)->ccwlock #define to_ccwdev(n) container_of(n, struct ccw_device, dev) -#define to_ccwdrv(n) container_of(n, struct ccw_driver, driver) +#define to_ccwdrv(n) container_of_const(n, struct ccw_driver, driver) extern struct ccw_device *ccw_device_create_console(struct ccw_driver *); extern void ccw_device_destroy_console(struct ccw_device *); extern int ccw_device_enable_console(struct ccw_device *); extern void ccw_device_wait_idle(struct ccw_device *); -extern int ccw_device_force_console(struct ccw_device *); -extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); +extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size, + dma32_t *dma_handle); extern void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size); diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index cdd19d326345..d86dea5900e7 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -12,28 +12,19 @@ #ifndef _S390_CHECKSUM_H #define _S390_CHECKSUM_H -#include <linux/uaccess.h> +#include <linux/instrumented.h> +#include <linux/kmsan-checks.h> #include <linux/in6.h> -/* - * Computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit). - * - * Returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic. - * - * This function must be called with even lengths, except - * for the last fragment, which may be odd. - * - * It's best to have buff aligned on a 32-bit boundary. - */ -static inline __wsum csum_partial(const void *buff, int len, __wsum sum) +static inline __wsum cksm(const void *buff, int len, __wsum sum) { union register_pair rp = { - .even = (unsigned long) buff, - .odd = (unsigned long) len, + .even = (unsigned long)buff, + .odd = (unsigned long)len, }; + instrument_read(buff, len); + kmsan_check_memory(buff, len); asm volatile( "0: cksm %[sum],%[rp]\n" " jo 0b\n" @@ -41,6 +32,11 @@ static inline __wsum csum_partial(const void *buff, int len, __wsum sum) return sum; } +__wsum csum_partial(const void *buff, int len, __wsum sum); + +#define _HAVE_ARCH_CSUM_AND_COPY +__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len); + /* * Fold a partial checksum without adding pseudo headers. */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h index bb48ea380c0d..bb78159d8042 100644 --- a/arch/s390/include/asm/chsc.h +++ b/arch/s390/include/asm/chsc.h @@ -11,6 +11,9 @@ #include <uapi/asm/chsc.h> +/* struct from linux/notifier.h */ +struct notifier_block; + /** * Operation codes for CHSC PNSO: * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport @@ -66,4 +69,16 @@ struct chsc_pnso_area { struct chsc_pnso_naid_l2 entries[]; } __packed __aligned(PAGE_SIZE); +/* + * notifier interface - registered notifiers gets called on + * the following events: + * - ap config changed (CHSC_NOTIFY_AP_CFG) + */ +enum chsc_notify_type { + CHSC_NOTIFY_AP_CFG = 3, +}; + +int chsc_notifier_register(struct notifier_block *nb); +int chsc_notifier_unregister(struct notifier_block *nb); + #endif /* _ASM_S390_CHSC_H */ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 1c4f585dd39b..b6b619f340a5 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -7,6 +7,7 @@ #include <linux/bitops.h> #include <linux/genalloc.h> +#include <asm/dma-types.h> #include <asm/types.h> #include <asm/tpi.h> @@ -32,7 +33,7 @@ struct ccw1 { __u8 cmd_code; __u8 flags; __u16 count; - __u32 cda; + dma32_t cda; } __attribute__ ((packed,aligned(8))); /** @@ -152,8 +153,8 @@ struct sublog { struct esw0 { struct sublog sublog; struct erw erw; - __u32 faddr[2]; - __u32 saddr; + dma32_t faddr[2]; + dma32_t saddr; } __attribute__ ((packed)); /** @@ -364,6 +365,8 @@ extern struct device *cio_get_dma_css_dev(void); void *cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, size_t size); +void *__cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, + size_t size, dma32_t *dma_handle); void cio_gp_dma_free(struct gen_pool *gp_dma, void *cpu_addr, size_t size); void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index 84c3f0d576c5..a9e2006033b7 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -11,194 +11,263 @@ #include <linux/mmdebug.h> #include <linux/types.h> #include <linux/bug.h> +#include <asm/asm.h> -void __xchg_called_with_bad_pointer(void); +void __cmpxchg_called_with_bad_pointer(void); -static __always_inline unsigned long __xchg(unsigned long x, - unsigned long address, int size) +static __always_inline u32 __cs_asm(u64 ptr, u32 old, u32 new) { - unsigned long old; - int shift; + asm volatile( + " cs %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+Q" (*(u32 *)ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +static __always_inline u64 __csg_asm(u64 ptr, u64 old, u64 new) +{ + asm volatile( + " csg %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+QS" (*(u64 *)ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +static inline u8 __arch_cmpxchg1(u64 ptr, u8 old, u8 new) +{ + union { + u8 b[4]; + u32 w; + } old32, new32; + u32 prev; + int i; + + i = ptr & 3; + ptr &= ~0x3; + prev = READ_ONCE(*(u32 *)ptr); + do { + old32.w = prev; + if (old32.b[i] != old) + return old32.b[i]; + new32.w = old32.w; + new32.b[i] = new; + prev = __cs_asm(ptr, old32.w, new32.w); + } while (prev != old32.w); + return old; +} + +static inline u16 __arch_cmpxchg2(u64 ptr, u16 old, u16 new) +{ + union { + u16 b[2]; + u32 w; + } old32, new32; + u32 prev; + int i; + + i = (ptr & 3) >> 1; + ptr &= ~0x3; + prev = READ_ONCE(*(u32 *)ptr); + do { + old32.w = prev; + if (old32.b[i] != old) + return old32.b[i]; + new32.w = old32.w; + new32.b[i] = new; + prev = __cs_asm(ptr, old32.w, new32.w); + } while (prev != old32.w); + return old; +} +static __always_inline u64 __arch_cmpxchg(u64 ptr, u64 old, u64 new, int size) +{ switch (size) { - case 1: - shift = (3 ^ (address & 3)) << 3; - address ^= address & 3; - asm volatile( - " l %0,%1\n" - "0: lr 0,%0\n" - " nr 0,%3\n" - " or 0,%2\n" - " cs %0,0,%1\n" - " jl 0b\n" - : "=&d" (old), "+Q" (*(int *) address) - : "d" ((x & 0xff) << shift), "d" (~(0xff << shift)) - : "memory", "cc", "0"); - return old >> shift; - case 2: - shift = (2 ^ (address & 2)) << 3; - address ^= address & 2; - asm volatile( - " l %0,%1\n" - "0: lr 0,%0\n" - " nr 0,%3\n" - " or 0,%2\n" - " cs %0,0,%1\n" - " jl 0b\n" - : "=&d" (old), "+Q" (*(int *) address) - : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift)) - : "memory", "cc", "0"); - return old >> shift; - case 4: - asm volatile( - " l %0,%1\n" - "0: cs %0,%2,%1\n" - " jl 0b\n" - : "=&d" (old), "+Q" (*(int *) address) - : "d" (x) - : "memory", "cc"); - return old; - case 8: - asm volatile( - " lg %0,%1\n" - "0: csg %0,%2,%1\n" - " jl 0b\n" - : "=&d" (old), "+QS" (*(long *) address) - : "d" (x) - : "memory", "cc"); - return old; + case 1: return __arch_cmpxchg1(ptr, old & 0xff, new & 0xff); + case 2: return __arch_cmpxchg2(ptr, old & 0xffff, new & 0xffff); + case 4: return __cs_asm(ptr, old & 0xffffffff, new & 0xffffffff); + case 8: return __csg_asm(ptr, old, new); + default: __cmpxchg_called_with_bad_pointer(); } - __xchg_called_with_bad_pointer(); - return x; + return old; } -#define arch_xchg(ptr, x) \ +#define arch_cmpxchg(ptr, o, n) \ ({ \ - __typeof__(*(ptr)) __ret; \ + (__typeof__(*(ptr)))__arch_cmpxchg((unsigned long)(ptr), \ + (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr))); \ +}) + +#define arch_cmpxchg64 arch_cmpxchg +#define arch_cmpxchg_local arch_cmpxchg +#define arch_cmpxchg64_local arch_cmpxchg + +#ifdef __HAVE_ASM_FLAG_OUTPUTS__ + +#define arch_try_cmpxchg(ptr, oldp, new) \ +({ \ + __typeof__(ptr) __oldp = (__typeof__(ptr))(oldp); \ + __typeof__(*(ptr)) __old = *__oldp; \ + __typeof__(*(ptr)) __new = (new); \ + __typeof__(*(ptr)) __prev; \ + int __cc; \ \ - __ret = (__typeof__(*(ptr))) \ - __xchg((unsigned long)(x), (unsigned long)(ptr), \ - sizeof(*(ptr))); \ - __ret; \ + switch (sizeof(*(ptr))) { \ + case 1: \ + case 2: { \ + __prev = arch_cmpxchg((ptr), (__old), (__new)); \ + __cc = (__prev != __old); \ + if (unlikely(__cc)) \ + *__oldp = __prev; \ + break; \ + } \ + case 4: { \ + asm volatile( \ + " cs %[__old],%[__new],%[__ptr]\n" \ + : [__old] "+d" (*__oldp), \ + [__ptr] "+Q" (*(ptr)), \ + "=@cc" (__cc) \ + : [__new] "d" (__new) \ + : "memory"); \ + break; \ + } \ + case 8: { \ + asm volatile( \ + " csg %[__old],%[__new],%[__ptr]\n" \ + : [__old] "+d" (*__oldp), \ + [__ptr] "+QS" (*(ptr)), \ + "=@cc" (__cc) \ + : [__new] "d" (__new) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_called_with_bad_pointer(); \ + } \ + likely(__cc == 0); \ }) -void __cmpxchg_called_with_bad_pointer(void); +#else /* __HAVE_ASM_FLAG_OUTPUTS__ */ + +#define arch_try_cmpxchg(ptr, oldp, new) \ +({ \ + __typeof__((ptr)) __oldp = (__typeof__(ptr))(oldp); \ + __typeof__(*(ptr)) __old = *__oldp; \ + __typeof__(*(ptr)) __new = (new); \ + __typeof__(*(ptr)) __prev; \ + \ + __prev = arch_cmpxchg((ptr), (__old), (__new)); \ + if (unlikely(__prev != __old)) \ + *__oldp = __prev; \ + likely(__prev == __old); \ +}) + +#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */ + +#define arch_try_cmpxchg64 arch_try_cmpxchg +#define arch_try_cmpxchg_local arch_try_cmpxchg +#define arch_try_cmpxchg64_local arch_try_cmpxchg + +void __xchg_called_with_bad_pointer(void); + +static inline u8 __arch_xchg1(u64 ptr, u8 x) +{ + int shift = (3 ^ (ptr & 3)) << 3; + u32 mask, old, new; + + ptr &= ~0x3; + mask = ~(0xff << shift); + old = READ_ONCE(*(u32 *)ptr); + do { + new = old & mask; + new |= x << shift; + } while (!arch_try_cmpxchg((u32 *)ptr, &old, new)); + return old >> shift; +} -static __always_inline unsigned long __cmpxchg(unsigned long address, - unsigned long old, - unsigned long new, int size) +static inline u16 __arch_xchg2(u64 ptr, u16 x) { - unsigned long prev, tmp; - int shift; + int shift = (2 ^ (ptr & 2)) << 3; + u32 mask, old, new; + ptr &= ~0x3; + mask = ~(0xffff << shift); + old = READ_ONCE(*(u32 *)ptr); + do { + new = old & mask; + new |= x << shift; + } while (!arch_try_cmpxchg((u32 *)ptr, &old, new)); + return old >> shift; +} + +static __always_inline u64 __arch_xchg(u64 ptr, u64 x, int size) +{ switch (size) { case 1: - shift = (3 ^ (address & 3)) << 3; - address ^= address & 3; - asm volatile( - " l %0,%2\n" - "0: nr %0,%5\n" - " lr %1,%0\n" - " or %0,%3\n" - " or %1,%4\n" - " cs %0,%1,%2\n" - " jnl 1f\n" - " xr %1,%0\n" - " nr %1,%5\n" - " jnz 0b\n" - "1:" - : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address) - : "d" ((old & 0xff) << shift), - "d" ((new & 0xff) << shift), - "d" (~(0xff << shift)) - : "memory", "cc"); - return prev >> shift; + return __arch_xchg1(ptr, x & 0xff); case 2: - shift = (2 ^ (address & 2)) << 3; - address ^= address & 2; - asm volatile( - " l %0,%2\n" - "0: nr %0,%5\n" - " lr %1,%0\n" - " or %0,%3\n" - " or %1,%4\n" - " cs %0,%1,%2\n" - " jnl 1f\n" - " xr %1,%0\n" - " nr %1,%5\n" - " jnz 0b\n" - "1:" - : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address) - : "d" ((old & 0xffff) << shift), - "d" ((new & 0xffff) << shift), - "d" (~(0xffff << shift)) - : "memory", "cc"); - return prev >> shift; - case 4: - asm volatile( - " cs %0,%3,%1\n" - : "=&d" (prev), "+Q" (*(int *) address) - : "0" (old), "d" (new) - : "memory", "cc"); - return prev; - case 8: - asm volatile( - " csg %0,%3,%1\n" - : "=&d" (prev), "+QS" (*(long *) address) - : "0" (old), "d" (new) - : "memory", "cc"); - return prev; + return __arch_xchg2(ptr, x & 0xffff); + case 4: { + u32 old = READ_ONCE(*(u32 *)ptr); + + do { + } while (!arch_try_cmpxchg((u32 *)ptr, &old, x & 0xffffffff)); + return old; } - __cmpxchg_called_with_bad_pointer(); - return old; + case 8: { + u64 old = READ_ONCE(*(u64 *)ptr); + + do { + } while (!arch_try_cmpxchg((u64 *)ptr, &old, x)); + return old; + } + } + __xchg_called_with_bad_pointer(); + return x; } -#define arch_cmpxchg(ptr, o, n) \ +#define arch_xchg(ptr, x) \ ({ \ - __typeof__(*(ptr)) __ret; \ - \ - __ret = (__typeof__(*(ptr))) \ - __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr))); \ - __ret; \ + (__typeof__(*(ptr)))__arch_xchg((unsigned long)(ptr), \ + (unsigned long)(x), \ + sizeof(*(ptr))); \ }) -#define arch_cmpxchg64 arch_cmpxchg -#define arch_cmpxchg_local arch_cmpxchg -#define arch_cmpxchg64_local arch_cmpxchg +#define system_has_cmpxchg128() 1 -#define system_has_cmpxchg_double() 1 +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +#define arch_cmpxchg128 arch_cmpxchg128 +#define arch_cmpxchg128_local arch_cmpxchg128 + +#ifdef __HAVE_ASM_FLAG_OUTPUTS__ -static __always_inline int __cmpxchg_double(unsigned long p1, unsigned long p2, - unsigned long o1, unsigned long o2, - unsigned long n1, unsigned long n2) +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new) { - union register_pair old = { .even = o1, .odd = o2, }; - union register_pair new = { .even = n1, .odd = n2, }; int cc; asm volatile( " cdsg %[old],%[new],%[ptr]\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=&d" (cc), [old] "+&d" (old.pair) - : [new] "d" (new.pair), - [ptr] "QS" (*(unsigned long *)p1), "Q" (*(unsigned long *)p2) - : "memory", "cc"); - return !cc; + : [old] "+d" (*oldp), [ptr] "+QS" (*ptr), "=@cc" (cc) + : [new] "d" (new) + : "memory"); + return likely(cc == 0); } -#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \ -({ \ - typeof(p1) __p1 = (p1); \ - typeof(p2) __p2 = (p2); \ - \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\ - __cmpxchg_double((unsigned long)__p1, (unsigned long)__p2, \ - (unsigned long)(o1), (unsigned long)(o2), \ - (unsigned long)(n1), (unsigned long)(n2)); \ -}) +#define arch_try_cmpxchg128 arch_try_cmpxchg128 +#define arch_try_cmpxchg128_local arch_try_cmpxchg128 + +#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */ #endif /* __ASM_CMPXCHG_H */ diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index a386070f1d56..3cb9d813f022 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -112,7 +112,7 @@ struct compat_statfs64 { u32 f_namelen; u32 f_frsize; u32 f_flags; - u32 f_spare[4]; + u32 f_spare[5]; }; /* diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 646b12981f20..54cb97603ec0 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -2,7 +2,7 @@ /* * CP Assist for Cryptographic Functions (CPACF) * - * Copyright IBM Corp. 2003, 2017 + * Copyright IBM Corp. 2003, 2023 * Author(s): Thomas Spatzier * Jan Glauber * Harald Freudenberger (freude@de.ibm.com) @@ -12,6 +12,7 @@ #define _ASM_S390_CPACF_H #include <asm/facility.h> +#include <linux/kmsan-checks.h> /* * Instruction opcodes for the CPACF instructions @@ -53,6 +54,10 @@ #define CPACF_KM_XTS_256 0x34 #define CPACF_KM_PXTS_128 0x3a #define CPACF_KM_PXTS_256 0x3c +#define CPACF_KM_XTS_128_FULL 0x52 +#define CPACF_KM_XTS_256_FULL 0x54 +#define CPACF_KM_PXTS_128_FULL 0x5a +#define CPACF_KM_PXTS_256_FULL 0x5c /* * Function codes for the KMC (CIPHER MESSAGE WITH CHAINING) @@ -120,18 +125,31 @@ #define CPACF_KMAC_DEA 0x01 #define CPACF_KMAC_TDEA_128 0x02 #define CPACF_KMAC_TDEA_192 0x03 +#define CPACF_KMAC_HMAC_SHA_224 0x70 +#define CPACF_KMAC_HMAC_SHA_256 0x71 +#define CPACF_KMAC_HMAC_SHA_384 0x72 +#define CPACF_KMAC_HMAC_SHA_512 0x73 /* * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT) * instruction */ -#define CPACF_PCKMO_QUERY 0x00 -#define CPACF_PCKMO_ENC_DES_KEY 0x01 -#define CPACF_PCKMO_ENC_TDES_128_KEY 0x02 -#define CPACF_PCKMO_ENC_TDES_192_KEY 0x03 -#define CPACF_PCKMO_ENC_AES_128_KEY 0x12 -#define CPACF_PCKMO_ENC_AES_192_KEY 0x13 -#define CPACF_PCKMO_ENC_AES_256_KEY 0x14 +#define CPACF_PCKMO_QUERY 0x00 +#define CPACF_PCKMO_ENC_DES_KEY 0x01 +#define CPACF_PCKMO_ENC_TDES_128_KEY 0x02 +#define CPACF_PCKMO_ENC_TDES_192_KEY 0x03 +#define CPACF_PCKMO_ENC_AES_128_KEY 0x12 +#define CPACF_PCKMO_ENC_AES_192_KEY 0x13 +#define CPACF_PCKMO_ENC_AES_256_KEY 0x14 +#define CPACF_PCKMO_ENC_AES_XTS_128_DOUBLE_KEY 0x15 +#define CPACF_PCKMO_ENC_AES_XTS_256_DOUBLE_KEY 0x16 +#define CPACF_PCKMO_ENC_ECC_P256_KEY 0x20 +#define CPACF_PCKMO_ENC_ECC_P384_KEY 0x21 +#define CPACF_PCKMO_ENC_ECC_P521_KEY 0x22 +#define CPACF_PCKMO_ENC_ECC_ED25519_KEY 0x28 +#define CPACF_PCKMO_ENC_ECC_ED448_KEY 0x29 +#define CPACF_PCKMO_ENC_HMAC_512_KEY 0x76 +#define CPACF_PCKMO_ENC_HMAC_1024_KEY 0x7a /* * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION) @@ -159,30 +177,126 @@ #define CPACF_KMA_LAAD 0x200 /* Last-AAD */ #define CPACF_KMA_HS 0x400 /* Hash-subkey Supplied */ +/* + * Flags for the KIMD/KLMD (COMPUTE INTERMEDIATE/LAST MESSAGE DIGEST) + * instructions + */ +#define CPACF_KIMD_NIP 0x8000 +#define CPACF_KLMD_DUFOP 0x4000 +#define CPACF_KLMD_NIP 0x8000 + +/* + * Function codes for KDSA (COMPUTE DIGITAL SIGNATURE AUTHENTICATION) + * instruction + */ +#define CPACF_KDSA_QUERY 0x00 +#define CPACF_KDSA_ECDSA_VERIFY_P256 0x01 +#define CPACF_KDSA_ECDSA_VERIFY_P384 0x02 +#define CPACF_KDSA_ECDSA_VERIFY_P521 0x03 +#define CPACF_KDSA_ECDSA_SIGN_P256 0x09 +#define CPACF_KDSA_ECDSA_SIGN_P384 0x0a +#define CPACF_KDSA_ECDSA_SIGN_P521 0x0b +#define CPACF_KDSA_ENC_ECDSA_SIGN_P256 0x11 +#define CPACF_KDSA_ENC_ECDSA_SIGN_P384 0x12 +#define CPACF_KDSA_ENC_ECDSA_SIGN_P521 0x13 +#define CPACF_KDSA_EDDSA_VERIFY_ED25519 0x20 +#define CPACF_KDSA_EDDSA_VERIFY_ED448 0x24 +#define CPACF_KDSA_EDDSA_SIGN_ED25519 0x28 +#define CPACF_KDSA_EDDSA_SIGN_ED448 0x2c +#define CPACF_KDSA_ENC_EDDSA_SIGN_ED25519 0x30 +#define CPACF_KDSA_ENC_EDDSA_SIGN_ED448 0x34 + +#define CPACF_FC_QUERY 0x00 +#define CPACF_FC_QUERY_AUTH_INFO 0x7F + typedef struct { unsigned char bytes[16]; } cpacf_mask_t; +typedef struct { unsigned char bytes[256]; } cpacf_qai_t; -/** - * cpacf_query() - check if a specific CPACF function is available - * @opcode: the opcode of the crypto instruction - * @func: the function code to test for - * - * Executes the query function for the given crypto instruction @opcode - * and checks if @func is available - * - * Returns 1 if @func is available for @opcode, 0 otherwise +/* + * Prototype for a not existing function to produce a link + * error if __cpacf_query() or __cpacf_check_opcode() is used + * with an invalid compile time const opcode. */ -static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +void __cpacf_bad_opcode(void); + +static __always_inline void __cpacf_query_rre(u32 opc, u8 r1, u8 r2, + u8 *pb, u8 fc) { asm volatile( - " lghi 0,0\n" /* query function */ - " lgr 1,%[mask]\n" - " spm 0\n" /* pckmo doesn't change the cc */ - /* Parameter regs are ignored, but must be nonzero and unique */ - "0: .insn rrf,%[opc] << 16,2,4,6,0\n" - " brc 1,0b\n" /* handle partial completion */ - : "=m" (*mask) - : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode) - : "cc", "0", "1"); + " la %%r1,%[pb]\n" + " lghi %%r0,%[fc]\n" + " .insn rre,%[opc] << 16,%[r1],%[r2]\n" + : [pb] "=R" (*pb) + : [opc] "i" (opc), [fc] "i" (fc), + [r1] "i" (r1), [r2] "i" (r2) + : "cc", "memory", "r0", "r1"); +} + +static __always_inline void __cpacf_query_rrf(u32 opc, u8 r1, u8 r2, u8 r3, + u8 m4, u8 *pb, u8 fc) +{ + asm volatile( + " la %%r1,%[pb]\n" + " lghi %%r0,%[fc]\n" + " .insn rrf,%[opc] << 16,%[r1],%[r2],%[r3],%[m4]\n" + : [pb] "=R" (*pb) + : [opc] "i" (opc), [fc] "i" (fc), [r1] "i" (r1), + [r2] "i" (r2), [r3] "i" (r3), [m4] "i" (m4) + : "cc", "memory", "r0", "r1"); +} + +static __always_inline void __cpacf_query_insn(unsigned int opcode, void *pb, + u8 fc) +{ + switch (opcode) { + case CPACF_KDSA: + __cpacf_query_rre(CPACF_KDSA, 0, 2, pb, fc); + break; + case CPACF_KIMD: + __cpacf_query_rre(CPACF_KIMD, 0, 2, pb, fc); + break; + case CPACF_KLMD: + __cpacf_query_rre(CPACF_KLMD, 0, 2, pb, fc); + break; + case CPACF_KM: + __cpacf_query_rre(CPACF_KM, 2, 4, pb, fc); + break; + case CPACF_KMA: + __cpacf_query_rrf(CPACF_KMA, 2, 4, 6, 0, pb, fc); + break; + case CPACF_KMAC: + __cpacf_query_rre(CPACF_KMAC, 0, 2, pb, fc); + break; + case CPACF_KMC: + __cpacf_query_rre(CPACF_KMC, 2, 4, pb, fc); + break; + case CPACF_KMCTR: + __cpacf_query_rrf(CPACF_KMCTR, 2, 4, 6, 0, pb, fc); + break; + case CPACF_KMF: + __cpacf_query_rre(CPACF_KMF, 2, 4, pb, fc); + break; + case CPACF_KMO: + __cpacf_query_rre(CPACF_KMO, 2, 4, pb, fc); + break; + case CPACF_PCC: + __cpacf_query_rre(CPACF_PCC, 0, 0, pb, fc); + break; + case CPACF_PCKMO: + __cpacf_query_rre(CPACF_PCKMO, 0, 0, pb, fc); + break; + case CPACF_PRNO: + __cpacf_query_rre(CPACF_PRNO, 2, 4, pb, fc); + break; + default: + __cpacf_bad_opcode(); + } +} + +static __always_inline void __cpacf_query(unsigned int opcode, + cpacf_mask_t *mask) +{ + __cpacf_query_insn(opcode, mask, CPACF_FC_QUERY); } static __always_inline int __cpacf_check_opcode(unsigned int opcode) @@ -205,11 +319,25 @@ static __always_inline int __cpacf_check_opcode(unsigned int opcode) return test_facility(57); /* check for MSA5 */ case CPACF_KMA: return test_facility(146); /* check for MSA8 */ + case CPACF_KDSA: + return test_facility(155); /* check for MSA9 */ default: - BUG(); + __cpacf_bad_opcode(); + return 0; } } +/** + * cpacf_query() - Query the function code mask for this CPACF opcode + * @opcode: the opcode of the crypto instruction + * @mask: ptr to struct cpacf_mask_t + * + * Executes the query function for the given crypto instruction @opcode + * and checks if @func is available + * + * On success 1 is returned and the mask is filled with the function + * code mask for this CPACF opcode, otherwise 0 is returned. + */ static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { if (__cpacf_check_opcode(opcode)) { @@ -225,7 +353,8 @@ static inline int cpacf_test_func(cpacf_mask_t *mask, unsigned int func) return (mask->bytes[func >> 3] & (0x80 >> (func & 7))) != 0; } -static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int func) +static __always_inline int cpacf_query_func(unsigned int opcode, + unsigned int func) { cpacf_mask_t mask; @@ -234,6 +363,32 @@ static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int fu return 0; } +static __always_inline void __cpacf_qai(unsigned int opcode, cpacf_qai_t *qai) +{ + __cpacf_query_insn(opcode, qai, CPACF_FC_QUERY_AUTH_INFO); +} + +/** + * cpacf_qai() - Get the query authentication information for a CPACF opcode + * @opcode: the opcode of the crypto instruction + * @mask: ptr to struct cpacf_qai_t + * + * Executes the query authentication information function for the given crypto + * instruction @opcode and checks if @func is available + * + * On success 1 is returned and the mask is filled with the query authentication + * information for this CPACF opcode, otherwise 0 is returned. + */ +static __always_inline int cpacf_qai(unsigned int opcode, cpacf_qai_t *qai) +{ + if (cpacf_query_func(opcode, CPACF_FC_QUERY_AUTH_INFO)) { + __cpacf_qai(opcode, qai); + return 1; + } + memset(qai, 0, sizeof(*qai)); + return 0; +} + /** * cpacf_km() - executes the KM (CIPHER MESSAGE) instruction * @func: the function code passed to KM; see CPACF_KM_xxx defines @@ -316,7 +471,7 @@ static inline void cpacf_kimd(unsigned long func, void *param, asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" - "0: .insn rre,%[opc] << 16,0,%[src]\n" + "0: .insn rrf,%[opc] << 16,0,%[src],8,0\n" " brc 1,0b\n" /* handle partial completion */ : [src] "+&d" (s.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), @@ -341,7 +496,7 @@ static inline void cpacf_klmd(unsigned long func, void *param, asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" - "0: .insn rre,%[opc] << 16,0,%[src]\n" + "0: .insn rrf,%[opc] << 16,0,%[src],8,0\n" " brc 1,0b\n" /* handle partial completion */ : [src] "+&d" (s.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)param), @@ -350,29 +505,30 @@ static inline void cpacf_klmd(unsigned long func, void *param, } /** - * cpacf_kmac() - executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) - * instruction - * @func: the function code passed to KM; see CPACF_KMAC_xxx defines + * _cpacf_kmac() - executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction and updates flags in gr0 + * @gr0: pointer to gr0 (fc and flags) passed to KMAC; see CPACF_KMAC_xxx defines * @param: address of parameter block; see POP for details on each func * @src: address of source memory area * @src_len: length of src operand in bytes * * Returns 0 for the query func, number of processed bytes for digest funcs */ -static inline int cpacf_kmac(unsigned long func, void *param, - const u8 *src, long src_len) +static inline int _cpacf_kmac(unsigned long *gr0, void *param, + const u8 *src, long src_len) { union register_pair s; s.even = (unsigned long)src; s.odd = (unsigned long)src_len; asm volatile( - " lgr 0,%[fc]\n" + " lgr 0,%[r0]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+&d" (s.pair) - : [fc] "d" (func), [pba] "d" ((unsigned long)param), + " lgr %[r0],0\n" + : [r0] "+d" (*gr0), [src] "+&d" (s.pair) + : [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_KMAC) : "cc", "memory", "0", "1"); @@ -380,6 +536,22 @@ static inline int cpacf_kmac(unsigned long func, void *param, } /** + * cpacf_kmac() - executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction + * @func: function code passed to KMAC; see CPACF_KMAC_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for digest funcs + */ +static inline int cpacf_kmac(unsigned long func, void *param, + const u8 *src, long src_len) +{ + return _cpacf_kmac(&func, param, src, src_len); +} + +/** * cpacf_kmctr() - executes the KMCTR (CIPHER MESSAGE WITH COUNTER) instruction * @func: the function code passed to KMCTR; see CPACF_KMCTR_xxx defines * @param: address of parameter block; see POP for details on each func @@ -468,6 +640,8 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) : "cc", "memory", "0"); + kmsan_unpoison_memory(ucbuf, ucbuf_len); + kmsan_unpoison_memory(cbuf, cbuf_len); } /** @@ -475,18 +649,30 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, * instruction * @func: the function code passed to PCC; see CPACF_KM_xxx defines * @param: address of parameter block; see POP for details on each func + * + * Returns the condition code, this is + * 0 - cc code 0 (normal completion) + * 1 - cc code 1 (protected key wkvp mismatch or src operand out of range) + * 2 - cc code 2 (something invalid, scalar multiply infinity, ...) + * Condition code 3 (partial completion) is handled within the asm code + * and never returned. */ -static inline void cpacf_pcc(unsigned long func, void *param) +static inline int cpacf_pcc(unsigned long func, void *param) { + int cc; + asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ " brc 1,0b\n" /* handle partial completion */ - : + CC_IPM(cc) + : CC_OUT(cc, cc) : [fc] "d" (func), [pba] "d" ((unsigned long)param), [opc] "i" (CPACF_PCC) - : "cc", "memory", "0", "1"); + : CC_CLOBBER_LIST("memory", "0", "1")); + + return CC_TRANSFORM(cc); } /** diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h deleted file mode 100644 index f87a4788c19c..000000000000 --- a/arch/s390/include/asm/cpu_mcf.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Counter facility support definitions for the Linux perf - * - * Copyright IBM Corp. 2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#ifndef _ASM_S390_CPU_MCF_H -#define _ASM_S390_CPU_MCF_H - -#include <linux/perf_event.h> -#include <asm/cpu_mf.h> - -enum cpumf_ctr_set { - CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ - CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ - CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ - CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ - CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ - - /* Maximum number of counter sets */ - CPUMF_CTR_SET_MAX, -}; - -#define CPUMF_LCCTL_ENABLE_SHIFT 16 -#define CPUMF_LCCTL_ACTCTL_SHIFT 0 - -static inline void ctr_set_enable(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; -} - -static inline void ctr_set_disable(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); -} - -static inline void ctr_set_start(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; -} - -static inline void ctr_set_stop(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) -{ - switch (set) { - case CPUMF_CTR_SET_BASIC: - return stcctm(BASIC, range, dest); - case CPUMF_CTR_SET_USER: - return stcctm(PROBLEM_STATE, range, dest); - case CPUMF_CTR_SET_CRYPTO: - return stcctm(CRYPTO_ACTIVITY, range, dest); - case CPUMF_CTR_SET_EXT: - return stcctm(EXTENDED, range, dest); - case CPUMF_CTR_SET_MT_DIAG: - return stcctm(MT_DIAG_CLEARING, range, dest); - case CPUMF_CTR_SET_MAX: - return 3; - } - return 3; -} - -struct cpu_cf_events { - struct cpumf_ctr_info info; - atomic_t ctr_set[CPUMF_CTR_SET_MAX]; - atomic64_t alert; - u64 state; /* For perf_event_open SVC */ - u64 dev_state; /* For /dev/hwctr */ - unsigned int flags; - size_t used; /* Bytes used in data */ - size_t usedss; /* Bytes used in start/stop */ - unsigned char start[PAGE_SIZE]; /* Counter set at event add */ - unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ - unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ - unsigned int sets; /* # Counter set saved in memory */ -}; -DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events); - -bool kernel_cpumcf_avail(void); -int __kernel_cpumcf_begin(void); -unsigned long kernel_cpumcf_alert(int clear); -void __kernel_cpumcf_end(void); - -static inline int kernel_cpumcf_begin(void) -{ - if (!cpum_cf_avail()) - return -ENODEV; - - preempt_disable(); - return __kernel_cpumcf_begin(); -} -static inline void kernel_cpumcf_end(void) -{ - __kernel_cpumcf_end(); - preempt_enable(); -} - -/* Return true if store counter set multiple instruction is available */ -static inline int stccm_avail(void) -{ - return test_facility(142); -} - -size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info); -int cfset_online_cpu(unsigned int cpu); -int cfset_offline_cpu(unsigned int cpu); -#endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index feaba12dbecb..1798fbd59068 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -10,8 +10,10 @@ #define _ASM_S390_CPU_MF_H #include <linux/errno.h> +#include <linux/kmsan-checks.h> #include <asm/asm-extable.h> #include <asm/facility.h> +#include <asm/asm.h> asm(".include \"asm/cpu_mf-insn.h\"\n"); @@ -42,7 +44,6 @@ static inline int cpum_sf_avail(void) return test_facility(40) && test_facility(68); } - struct cpumf_ctr_info { u16 cfvn; u16 auth_ctl; @@ -131,19 +132,21 @@ struct hws_combined_entry { struct hws_diag_entry diag; /* Diagnostic-sampling data entry */ } __packed; -struct hws_trailer_entry { - union { - struct { - unsigned int f:1; /* 0 - Block Full Indicator */ - unsigned int a:1; /* 1 - Alert request control */ - unsigned int t:1; /* 2 - Timestamp format */ - unsigned int :29; /* 3 - 31: Reserved */ - unsigned int bsdes:16; /* 32-47: size of basic SDE */ - unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ - }; - unsigned long long flags; /* 0 - 63: All indicators */ +union hws_trailer_header { + struct { + unsigned int f:1; /* 0 - Block Full Indicator */ + unsigned int a:1; /* 1 - Alert request control */ + unsigned int t:1; /* 2 - Timestamp format */ + unsigned int :29; /* 3 - 31: Reserved */ + unsigned int bsdes:16; /* 32-47: size of basic SDE */ + unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ + unsigned long long overflow; /* 64 - Overflow Count */ }; - unsigned long long overflow; /* 64 - sample Overflow count */ + u128 val; +}; + +struct hws_trailer_entry { + union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */ unsigned char timestamp[16]; /* 16 - 31 timestamp */ unsigned long long reserved1; /* 32 -Reserved */ unsigned long long reserved2; /* */ @@ -168,7 +171,7 @@ static inline int qctri(struct cpumf_ctr_info *info) { int rc = -EINVAL; - asm volatile ( + asm_inline volatile ( "0: qctri %1\n" "1: lhi %0,0\n" "2:\n" @@ -182,12 +185,13 @@ static inline int lcctl(u64 ctl) { int cc; - asm volatile ( - " lcctl %1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) : "Q" (ctl) : "cc"); - return cc; + asm_inline volatile ( + " lcctl %[ctl]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [ctl] "Q" (ctl) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } /* Extract CPU counter */ @@ -196,13 +200,14 @@ static inline int __ecctr(u64 ctr, u64 *content) u64 _content; int cc; - asm volatile ( - " ecctr %0,%2\n" - " ipm %1\n" - " srl %1,28\n" - : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc"); + asm_inline volatile ( + " ecctr %[_content],%[ctr]\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [_content] "=d" (_content) + : [ctr] "d" (ctr) + : CC_CLOBBER); *content = _content; - return cc; + return CC_TRANSFORM(cc); } /* Extract CPU counter */ @@ -232,13 +237,17 @@ static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest) int cc; asm volatile ( - " STCCTM %2,%3,%1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) - : "Q" (*dest), "d" (range), "i" (set) - : "cc", "memory"); - return cc; + " STCCTM %[range],%[set],%[dest]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [dest] "Q" (*dest), [range] "d" (range), [set] "i" (set) + : CC_CLOBBER_LIST("memory")); + /* + * If cc == 2, less than RANGE counters are stored, but it's not easy + * to tell how many. Always unpoison the whole range for simplicity. + */ + kmsan_unpoison_memory(dest, range * sizeof(u64)); + return CC_TRANSFORM(cc); } /* Query sampling information */ @@ -258,74 +267,20 @@ static inline int qsi(struct hws_qsi_info_block *info) /* Load sampling controls */ static inline int lsctl(struct hws_lsctl_request_block *req) { - int cc; + int cc, exception; - cc = 1; + exception = 1; asm volatile( - "0: lsctl 0(%1)\n" - "1: ipm %0\n" - " srl %0,28\n" + "0: lsctl %[req]\n" + "1: lhi %[exc],0\n" "2:\n" + CC_IPM(cc) EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+d" (cc), "+a" (req) - : "m" (*req) - : "cc", "memory"); - - return cc ? -EINVAL : 0; -} - -/* Sampling control helper functions */ - -#include <linux/time.h> - -static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, - unsigned long freq) -{ - return (USEC_PER_SEC / freq) * qsi->cpu_speed; -} - -static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, - unsigned long rate) -{ - return USEC_PER_SEC * qsi->cpu_speed / rate; -} - -#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL -#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL - -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->t) - return *((unsigned long long *) &te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *) &te->timestamp[0]); -} - -/* Return pointer to trailer entry of an sample data block */ -static inline unsigned long *trailer_entry_ptr(unsigned long v) -{ - void *ret; - - ret = (void *) v; - ret += PAGE_SIZE; - ret -= sizeof(struct hws_trailer_entry); - - return (unsigned long *) ret; -} - -/* Return true if the entry in the sample data block table (sdbt) - * is a link to the next sdbt */ -static inline int is_link_entry(unsigned long *s) -{ - return *s & 0x1ul ? 1 : 0; -} - -/* Return pointer to the linked sdbt */ -static inline unsigned long *get_next_sdbt(unsigned long *s) -{ - return (unsigned long *) (*s & ~0x1ul); + : CC_OUT(cc, cc), [exc] "+d" (exception) + : [req] "Q" (*req) + : CC_CLOBBER); + if (exception || CC_TRANSFORM(cc)) + return -EINVAL; + return 0; } #endif /* _ASM_S390_CPU_MF_H */ diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h index 14cfd48d598e..6c6a99660e78 100644 --- a/arch/s390/include/asm/cpufeature.h +++ b/arch/s390/include/asm/cpufeature.h @@ -2,29 +2,37 @@ /* * Module interface for CPU features * - * Copyright IBM Corp. 2015 + * Copyright IBM Corp. 2015, 2022 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ #ifndef __ASM_S390_CPUFEATURE_H #define __ASM_S390_CPUFEATURE_H -#include <asm/elf.h> +#include <asm/facility.h> -/* Hardware features on Linux on z Systems are indicated by facility bits that - * are mapped to the so-called machine flags. Particular machine flags are - * then used to define ELF hardware capabilities; most notably hardware flags - * that are essential for user space / glibc. - * - * Restrict the set of exposed CPU features to ELF hardware capabilities for - * now. Additional machine flags can be indicated by values larger than - * MAX_ELF_HWCAP_FEATURES. - */ -#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap)) -#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES +enum { + S390_CPU_FEATURE_MSA, + S390_CPU_FEATURE_VXRS, + S390_CPU_FEATURE_UV, + S390_CPU_FEATURE_D288, + MAX_CPU_FEATURES +}; -#define cpu_feature(feat) ilog2(HWCAP_ ## feat) +#define cpu_feature(feature) (feature) int cpu_have_feature(unsigned int nr); +#define cpu_has_bear() test_facility(193) +#define cpu_has_edat1() test_facility(8) +#define cpu_has_edat2() test_facility(78) +#define cpu_has_gs() test_facility(133) +#define cpu_has_idte() test_facility(3) +#define cpu_has_nx() test_facility(130) +#define cpu_has_rdp() test_facility(194) +#define cpu_has_seq_insn() test_facility(85) +#define cpu_has_tlb_lc() test_facility(51) +#define cpu_has_topology() test_facility(11) +#define cpu_has_vx() test_facility(129) + #endif /* __ASM_S390_CPUFEATURE_H */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 1d389847b588..30bb3ec4e5fc 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -11,30 +11,11 @@ #include <linux/types.h> #include <asm/timex.h> -#define CPUTIME_PER_USEC 4096ULL -#define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC) - -/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ - -#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) - -/* - * Convert cputime to microseconds. - */ -static inline u64 cputime_to_usecs(const u64 cputime) -{ - return cputime >> 12; -} - /* * Convert cputime to nanoseconds. */ #define cputime_to_nsecs(cputime) tod_to_ns(cputime) -u64 arch_cpu_idle_time(int cpu); - -#define arch_idle_time(cpu) arch_cpu_idle_time(cpu) - void account_idle_time_irq(void); #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index 638137d46c85..a03f64033760 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -25,7 +25,7 @@ struct css_general_char { u64 : 2; u64 : 3; - u64 aif_osa : 1; /* bit 67 */ + u64 aif_qdio : 1;/* bit 67 */ u64 : 12; u64 eadm_rf : 1; /* bit 80 */ u64 : 1; diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h deleted file mode 100644 index 267a8f88e143..000000000000 --- a/arch/s390/include/asm/ctl_reg.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 1999, 2009 - * - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef __ASM_CTL_REG_H -#define __ASM_CTL_REG_H - -#include <linux/bits.h> - -#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10) -#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35) -#define CR0_FETCH_PROTECTION_OVERRIDE BIT(63 - 38) -#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(63 - 39) -#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49) -#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50) -#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52) -#define CR0_CPU_TIMER_SUBMASK BIT(63 - 53) -#define CR0_SERVICE_SIGNAL_SUBMASK BIT(63 - 54) -#define CR0_UNUSED_56 BIT(63 - 56) -#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57) -#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58) - -#define CR14_UNUSED_32 BIT(63 - 32) -#define CR14_UNUSED_33 BIT(63 - 33) -#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35) -#define CR14_RECOVERY_SUBMASK BIT(63 - 36) -#define CR14_DEGRADATION_SUBMASK BIT(63 - 37) -#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(63 - 38) -#define CR14_WARNING_SUBMASK BIT(63 - 39) - -#ifndef __ASSEMBLY__ - -#include <linux/bug.h> - -#define __ctl_load(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " lctlg %1,%2,%0\n" \ - : \ - : "Q" (*(addrtype *)(&array)), "i" (low), "i" (high) \ - : "memory"); \ -} while (0) - -#define __ctl_store(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " stctg %1,%2,%0\n" \ - : "=Q" (*(addrtype *)(&array)) \ - : "i" (low), "i" (high)); \ -} while (0) - -static __always_inline void __ctl_set_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg |= 1UL << bit; - __ctl_load(reg, cr, cr); -} - -static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg &= ~(1UL << bit); - __ctl_load(reg, cr, cr); -} - -void smp_ctl_set_clear_bit(int cr, int bit, bool set); - -static inline void ctl_set_bit(int cr, int bit) -{ - smp_ctl_set_clear_bit(cr, bit, true); -} - -static inline void ctl_clear_bit(int cr, int bit) -{ - smp_ctl_set_clear_bit(cr, bit, false); -} - -union ctlreg0 { - unsigned long val; - struct { - unsigned long : 8; - unsigned long tcx : 1; /* Transactional-Execution control */ - unsigned long pifo : 1; /* Transactional-Execution Program- - Interruption-Filtering Override */ - unsigned long : 3; - unsigned long ccc : 1; /* Cryptography counter control */ - unsigned long : 18; - unsigned long : 3; - unsigned long lap : 1; /* Low-address-protection control */ - unsigned long : 4; - unsigned long edat : 1; /* Enhanced-DAT-enablement control */ - unsigned long : 2; - unsigned long iep : 1; /* Instruction-Execution-Protection */ - unsigned long : 1; - unsigned long afp : 1; /* AFP-register control */ - unsigned long vx : 1; /* Vector enablement control */ - unsigned long : 7; - unsigned long sssm : 1; /* Service signal subclass mask */ - unsigned long : 9; - }; -}; - -union ctlreg2 { - unsigned long val; - struct { - unsigned long : 33; - unsigned long ducto : 25; - unsigned long : 1; - unsigned long gse : 1; - unsigned long : 1; - unsigned long tds : 1; - unsigned long tdc : 2; - }; -}; - -union ctlreg5 { - unsigned long val; - struct { - unsigned long : 33; - unsigned long pasteo: 25; - unsigned long : 6; - }; -}; - -union ctlreg15 { - unsigned long val; - struct { - unsigned long lsea : 61; - unsigned long : 3; - }; -}; - -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h new file mode 100644 index 000000000000..e6527f51ad0b --- /dev/null +++ b/arch/s390/include/asm/ctlreg.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#ifndef __ASM_S390_CTLREG_H +#define __ASM_S390_CTLREG_H + +#include <linux/bits.h> + +#define CR0_TRANSACTIONAL_EXECUTION_BIT (63 - 8) +#define CR0_CLOCK_COMPARATOR_SIGN_BIT (63 - 10) +#define CR0_CRYPTOGRAPHY_COUNTER_BIT (63 - 13) +#define CR0_PAI_EXTENSION_BIT (63 - 14) +#define CR0_CPUMF_EXTRACTION_AUTH_BIT (63 - 15) +#define CR0_WARNING_TRACK_BIT (63 - 30) +#define CR0_LOW_ADDRESS_PROTECTION_BIT (63 - 35) +#define CR0_FETCH_PROTECTION_OVERRIDE_BIT (63 - 38) +#define CR0_STORAGE_PROTECTION_OVERRIDE_BIT (63 - 39) +#define CR0_EDAT_BIT (63 - 40) +#define CR0_INSTRUCTION_EXEC_PROTECTION_BIT (63 - 43) +#define CR0_VECTOR_BIT (63 - 46) +#define CR0_MALFUNCTION_ALERT_SUBMASK_BIT (63 - 48) +#define CR0_EMERGENCY_SIGNAL_SUBMASK_BIT (63 - 49) +#define CR0_EXTERNAL_CALL_SUBMASK_BIT (63 - 50) +#define CR0_CLOCK_COMPARATOR_SUBMASK_BIT (63 - 52) +#define CR0_CPU_TIMER_SUBMASK_BIT (63 - 53) +#define CR0_SERVICE_SIGNAL_SUBMASK_BIT (63 - 54) +#define CR0_UNUSED_56_BIT (63 - 56) +#define CR0_INTERRUPT_KEY_SUBMASK_BIT (63 - 57) +#define CR0_MEASUREMENT_ALERT_SUBMASK_BIT (63 - 58) +#define CR0_ETR_SUBMASK_BIT (63 - 59) +#define CR0_IUCV_BIT (63 - 62) + +#define CR0_TRANSACTIONAL_EXECUTION BIT(CR0_TRANSACTIONAL_EXECUTION_BIT) +#define CR0_CLOCK_COMPARATOR_SIGN BIT(CR0_CLOCK_COMPARATOR_SIGN_BIT) +#define CR0_CRYPTOGRAPHY_COUNTER BIT(CR0_CRYPTOGRAPHY_COUNTER_BIT) +#define CR0_PAI_EXTENSION BIT(CR0_PAI_EXTENSION_BIT) +#define CR0_CPUMF_EXTRACTION_AUTH BIT(CR0_CPUMF_EXTRACTION_AUTH_BIT) +#define CR0_WARNING_TRACK BIT(CR0_WARNING_TRACK_BIT) +#define CR0_LOW_ADDRESS_PROTECTION BIT(CR0_LOW_ADDRESS_PROTECTION_BIT) +#define CR0_FETCH_PROTECTION_OVERRIDE BIT(CR0_FETCH_PROTECTION_OVERRIDE_BIT) +#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(CR0_STORAGE_PROTECTION_OVERRIDE_BIT) +#define CR0_EDAT BIT(CR0_EDAT_BIT) +#define CR0_INSTRUCTION_EXEC_PROTECTION BIT(CR0_INSTRUCTION_EXEC_PROTECTION_BIT) +#define CR0_VECTOR BIT(CR0_VECTOR_BIT) +#define CR0_MALFUNCTION_ALERT_SUBMASK BIT(CR0_MALFUNCTION_ALERT_SUBMASK_BIT) +#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(CR0_EMERGENCY_SIGNAL_SUBMASK_BIT) +#define CR0_EXTERNAL_CALL_SUBMASK BIT(CR0_EXTERNAL_CALL_SUBMASK_BIT) +#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(CR0_CLOCK_COMPARATOR_SUBMASK_BIT) +#define CR0_CPU_TIMER_SUBMASK BIT(CR0_CPU_TIMER_SUBMASK_BIT) +#define CR0_SERVICE_SIGNAL_SUBMASK BIT(CR0_SERVICE_SIGNAL_SUBMASK_BIT) +#define CR0_UNUSED_56 BIT(CR0_UNUSED_56_BIT) +#define CR0_INTERRUPT_KEY_SUBMASK BIT(CR0_INTERRUPT_KEY_SUBMASK_BIT) +#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(CR0_MEASUREMENT_ALERT_SUBMASK_BIT) +#define CR0_ETR_SUBMASK BIT(CR0_ETR_SUBMASK_BIT) +#define CR0_IUCV BIT(CR0_IUCV_BIT) + +#define CR2_MIO_ADDRESSING_BIT (63 - 58) +#define CR2_GUARDED_STORAGE_BIT (63 - 59) + +#define CR2_MIO_ADDRESSING BIT(CR2_MIO_ADDRESSING_BIT) +#define CR2_GUARDED_STORAGE BIT(CR2_GUARDED_STORAGE_BIT) + +#define CR14_UNUSED_32_BIT (63 - 32) +#define CR14_UNUSED_33_BIT (63 - 33) +#define CR14_CHANNEL_REPORT_SUBMASK_BIT (63 - 35) +#define CR14_RECOVERY_SUBMASK_BIT (63 - 36) +#define CR14_DEGRADATION_SUBMASK_BIT (63 - 37) +#define CR14_EXTERNAL_DAMAGE_SUBMASK_BIT (63 - 38) +#define CR14_WARNING_SUBMASK_BIT (63 - 39) + +#define CR14_UNUSED_32 BIT(CR14_UNUSED_32_BIT) +#define CR14_UNUSED_33 BIT(CR14_UNUSED_33_BIT) +#define CR14_CHANNEL_REPORT_SUBMASK BIT(CR14_CHANNEL_REPORT_SUBMASK_BIT) +#define CR14_RECOVERY_SUBMASK BIT(CR14_RECOVERY_SUBMASK_BIT) +#define CR14_DEGRADATION_SUBMASK BIT(CR14_DEGRADATION_SUBMASK_BIT) +#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT) +#define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT) + +#ifndef __ASSEMBLY__ + +#include <linux/bug.h> + +struct ctlreg { + unsigned long val; +}; + +#define __local_ctl_load(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " lctlg %[_low],%[_high],%[_arr]\n" \ + : \ + : [_arr] "Q" (*(struct addrtype *)(&array)), \ + [_low] "i" (low), [_high] "i" (high) \ + : "memory"); \ +} while (0) + +#define __local_ctl_store(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " stctg %[_low],%[_high],%[_arr]\n" \ + : [_arr] "=Q" (*(struct addrtype *)(&array)) \ + : [_low] "i" (low), [_high] "i" (high)); \ +} while (0) + +static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " lctlg %[cr],%[cr],%[reg]\n" + : + : [reg] "Q" (*reg), [cr] "i" (cr) + : "memory"); +} + +static __always_inline void local_ctl_store(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " stctg %[cr],%[cr],%[reg]\n" + : [reg] "=Q" (*reg) + : [cr] "i" (cr)); +} + +static __always_inline struct ctlreg local_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val |= 1UL << bit; + local_ctl_load(cr, &new); + return old; +} + +static __always_inline struct ctlreg local_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val &= ~(1UL << bit); + local_ctl_load(cr, &new); + return old; +} + +struct lowcore; + +void system_ctlreg_lock(void); +void system_ctlreg_unlock(void); +void system_ctlreg_init_save_area(struct lowcore *lc); +void system_ctlreg_modify(unsigned int cr, unsigned long data, int request); + +enum { + CTLREG_SET_BIT, + CTLREG_CLEAR_BIT, + CTLREG_LOAD, +}; + +static inline void system_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_SET_BIT); +} + +static inline void system_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_CLEAR_BIT); +} + +static inline void system_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + system_ctlreg_modify(cr, reg->val, CTLREG_LOAD); +} + +union ctlreg0 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 8; + unsigned long tcx : 1; /* Transactional-Execution control */ + unsigned long pifo : 1; /* Transactional-Execution Program- + Interruption-Filtering Override */ + unsigned long : 3; + unsigned long ccc : 1; /* Cryptography counter control */ + unsigned long pec : 1; /* PAI extension control */ + unsigned long : 15; + unsigned long wti : 1; /* Warning-track */ + unsigned long : 4; + unsigned long lap : 1; /* Low-address-protection control */ + unsigned long : 4; + unsigned long edat : 1; /* Enhanced-DAT-enablement control */ + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; + unsigned long afp : 1; /* AFP-register control */ + unsigned long vx : 1; /* Vector enablement control */ + unsigned long : 7; + unsigned long sssm : 1; /* Service signal subclass mask */ + unsigned long : 9; + }; +}; + +union ctlreg2 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long ducto : 25; + unsigned long : 1; + unsigned long gse : 1; + unsigned long : 1; + unsigned long tds : 1; + unsigned long tdc : 2; + }; +}; + +union ctlreg5 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long pasteo: 25; + unsigned long : 6; + }; +}; + +union ctlreg15 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long lsea : 61; + unsigned long : 3; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_CTLREG_H */ diff --git a/arch/s390/include/asm/current.h b/arch/s390/include/asm/current.h index 68f84315277c..f9529f7cf62c 100644 --- a/arch/s390/include/asm/current.h +++ b/arch/s390/include/asm/current.h @@ -11,9 +11,25 @@ #define _S390_CURRENT_H #include <asm/lowcore.h> +#include <asm/machine.h> struct task_struct; -#define current ((struct task_struct *const)S390_lowcore.current_task) +static __always_inline struct task_struct *get_current(void) +{ + unsigned long ptr, lc_current; + + lc_current = offsetof(struct lowcore, current_task); + asm_inline( + ALTERNATIVE(" lg %[ptr],%[offzero](%%r0)\n", + " lg %[ptr],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [ptr] "=d" (ptr) + : [offzero] "i" (lc_current), + [offalt] "i" (lc_current + LOWCORE_ALT_ADDRESS)); + return (struct task_struct *)ptr; +} + +#define current get_current() #endif /* !(_S390_CURRENT_H) */ diff --git a/arch/s390/include/asm/dat-bits.h b/arch/s390/include/asm/dat-bits.h new file mode 100644 index 000000000000..8d65eec2f124 --- /dev/null +++ b/arch/s390/include/asm/dat-bits.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAT table and related structures + * + * Copyright IBM Corp. 2024 + * + */ + +#ifndef _S390_DAT_BITS_H +#define _S390_DAT_BITS_H + +union asce { + unsigned long val; + struct { + unsigned long rsto: 52;/* Region- or Segment-Table Origin */ + unsigned long : 2; + unsigned long g : 1; /* Subspace Group control */ + unsigned long p : 1; /* Private Space control */ + unsigned long s : 1; /* Storage-Alteration-Event control */ + unsigned long x : 1; /* Space-Switch-Event control */ + unsigned long r : 1; /* Real-Space control */ + unsigned long : 1; + unsigned long dt : 2; /* Designation-Type control */ + unsigned long tl : 2; /* Region- or Segment-Table Length */ + }; +}; + +enum { + ASCE_TYPE_SEGMENT = 0, + ASCE_TYPE_REGION3 = 1, + ASCE_TYPE_REGION2 = 2, + ASCE_TYPE_REGION1 = 3 +}; + +union region1_table_entry { + unsigned long val; + struct { + unsigned long rto: 52;/* Region-Table Origin */ + unsigned long : 2; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Region-Second-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long : 1; + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Region-Second-Table Length */ + }; +}; + +union region2_table_entry { + unsigned long val; + struct { + unsigned long rto: 52;/* Region-Table Origin */ + unsigned long : 2; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Region-Third-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long : 1; + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Region-Third-Table Length */ + }; +}; + +struct region3_table_entry_fc0 { + unsigned long sto: 52;/* Segment-Table Origin */ + unsigned long : 1; + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Segment-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr : 1; /* Common-Region Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Segment-Table Length */ +}; + +struct region3_table_entry_fc1 { + unsigned long rfaa: 33;/* Region-Frame Absolute Address */ + unsigned long : 14; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc : 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr : 1; /* Common-Region Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +union region3_table_entry { + unsigned long val; + struct region3_table_entry_fc0 fc0; + struct region3_table_entry_fc1 fc1; + struct { + unsigned long : 53; + unsigned long fc: 1; /* Format-Control */ + unsigned long : 4; + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr: 1; /* Common-Region Bit */ + unsigned long tt: 2; /* Table-Type Bits */ + unsigned long : 2; + }; +}; + +struct segment_table_entry_fc0 { + unsigned long pto: 53;/* Page-Table Origin */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +struct segment_table_entry_fc1 { + unsigned long sfaa: 44;/* Segment-Frame Absolute Address */ + unsigned long : 3; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc : 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +union segment_table_entry { + unsigned long val; + struct segment_table_entry_fc0 fc0; + struct segment_table_entry_fc1 fc1; + struct { + unsigned long : 53; + unsigned long fc: 1; /* Format-Control */ + unsigned long : 4; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs: 1; /* Common-Segment Bit */ + unsigned long tt: 2; /* Table-Type Bits */ + unsigned long : 2; + }; +}; + +union page_table_entry { + unsigned long val; + struct { + unsigned long pfra: 52;/* Page-Frame Real Address */ + unsigned long z : 1; /* Zero Bit */ + unsigned long i : 1; /* Page-Invalid Bit */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 8; + }; +}; + +enum { + TABLE_TYPE_SEGMENT = 0, + TABLE_TYPE_REGION3 = 1, + TABLE_TYPE_REGION2 = 2, + TABLE_TYPE_REGION1 = 3 +}; + +#endif /* _S390_DAT_BITS_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 77f24262c25c..6375276d94ea 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -4,8 +4,8 @@ * * Copyright IBM Corp. 1999, 2020 */ -#ifndef DEBUG_H -#define DEBUG_H +#ifndef _ASM_S390_DEBUG_H +#define _ASM_S390_DEBUG_H #include <linux/string.h> #include <linux/spinlock.h> @@ -66,14 +66,15 @@ typedef int (debug_header_proc_t) (debug_info_t *id, struct debug_view *view, int area, debug_entry_t *entry, - char *out_buf); + char *out_buf, size_t out_buf_size); typedef int (debug_format_proc_t) (debug_info_t *id, struct debug_view *view, char *out_buf, + size_t out_buf_size, const char *in_buf); typedef int (debug_prolog_proc_t) (debug_info_t *id, struct debug_view *view, - char *out_buf); + char *out_buf, size_t out_buf_size); typedef int (debug_input_proc_t) (debug_info_t *id, struct debug_view *view, struct file *file, @@ -81,8 +82,13 @@ typedef int (debug_input_proc_t) (debug_info_t *id, size_t in_buf_size, loff_t *offset); int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); + int area, debug_entry_t *entry, + char *out_buf, size_t out_buf_size); +#define DEBUG_SPRINTF_MAX_ARGS 10 +int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, size_t out_buf_size, + const char *inbuf); struct debug_view { char name[DEBUG_MAX_NAME_LEN]; debug_prolog_proc_t *prolog_proc; @@ -112,6 +118,9 @@ debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas, int buf_size, umode_t mode, uid_t uid, gid_t gid); +ssize_t debug_dump(debug_info_t *id, struct debug_view *view, + char *buf, size_t buf_size, bool reverse); + void debug_unregister(debug_info_t *id); void debug_set_level(debug_info_t *id, int new_level); @@ -222,7 +231,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) @@ -350,7 +359,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) @@ -487,4 +496,4 @@ void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas); #endif /* MODULE */ -#endif /* DEBUG_H */ +#endif /* _ASM_S390_DEBUG_H */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 56e99c286d12..8db8db3b1018 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -12,6 +12,8 @@ #include <linux/if_ether.h> #include <linux/percpu.h> #include <asm/asm-extable.h> +#include <asm/sclp.h> +#include <asm/cio.h> enum diag_stat_enum { DIAG_STAT_X008, @@ -20,6 +22,7 @@ enum diag_stat_enum { DIAG_STAT_X014, DIAG_STAT_X044, DIAG_STAT_X064, + DIAG_STAT_X08C, DIAG_STAT_X09C, DIAG_STAT_X0DC, DIAG_STAT_X204, @@ -33,7 +36,11 @@ enum diag_stat_enum { DIAG_STAT_X2FC, DIAG_STAT_X304, DIAG_STAT_X308, + DIAG_STAT_X310, DIAG_STAT_X318, + DIAG_STAT_X320, + DIAG_STAT_X324, + DIAG_STAT_X49C, DIAG_STAT_X500, NR_DIAG_STAT }; @@ -41,6 +48,13 @@ enum diag_stat_enum { void diag_stat_inc(enum diag_stat_enum nr); void diag_stat_inc_norecursion(enum diag_stat_enum nr); +struct hypfs_diag0c_entry; + +/* + * Diagnose 0c: Pseudo Timer + */ +void diag0c(struct hypfs_diag0c_entry *data); + /* * Diagnose 10: Release page range */ @@ -52,7 +66,7 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) end_addr = pfn_to_phys(start_pfn + num_pfn - 1); diag_stat_inc(DIAG_STAT_X010); - asm volatile( + asm_inline volatile( "0: diag %0,%1,0x10\n" "1: nopr %%r7\n" EX_TABLE(0b, 1b) @@ -79,10 +93,20 @@ struct diag210 { u8 vrdccrty; /* real device type (output) */ u8 vrdccrmd; /* real device model (output) */ u8 vrdccrft; /* real device feature (output) */ -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); extern int diag210(struct diag210 *addr); +struct diag8c { + u8 flags; + u8 num_partitions; + u16 width; + u16 height; + u8 data[]; +} __packed __aligned(4); + +extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno); + /* bit is set in flags, when physical cpu info is included in diag 204 data */ #define DIAG204_LPAR_PHYS_FLG 0x80 #define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ @@ -96,6 +120,10 @@ enum diag204_sc { DIAG204_SUBC_STIB7 = 7 }; +#define DIAG204_SUBCODE_MASK 0xffff +#define DIAG204_BIF_BIT 0x80000000 +#define DIAG204_BUSY_WAIT (HZ / 10) + /* The two available diag 204 data formats */ enum diag204_format { DIAG204_INFO_SIMPLE = 0, @@ -304,6 +332,11 @@ union diag318_info { }; }; +static inline bool diag204_has_bif(void) +{ + return sclp.has_diag204_bif; +} + int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); int diag26c(void *req, void *resp, enum diag26c_sc subcode); @@ -316,9 +349,10 @@ struct hypfs_diag0c_entry; */ struct diag_ops { int (*diag210)(struct diag210 *addr); - int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); + int (*diag26c)(unsigned long rx, unsigned long rx1, enum diag26c_sc subcode); int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); - void (*diag0c)(struct hypfs_diag0c_entry *entry); + int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + void (*diag0c)(unsigned long rx); void (*diag308_reset)(void); }; @@ -326,9 +360,18 @@ extern struct diag_ops diag_amode31_ops; extern struct diag210 *__diag210_tmp_amode31; int _diag210_amode31(struct diag210 *addr); -int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode); +int _diag26c_amode31(unsigned long rx, unsigned long rx1, enum diag26c_sc subcode); int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode); -void _diag0c_amode31(struct hypfs_diag0c_entry *entry); +void _diag0c_amode31(unsigned long rx); void _diag308_reset_amode31(void); +int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + +/* diag 49c subcodes */ +enum diag49c_sc { + DIAG49C_SUBC_ACK = 0, + DIAG49C_SUBC_REG = 1 +}; + +int diag49c(unsigned long subcode); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/diag288.h b/arch/s390/include/asm/diag288.h new file mode 100644 index 000000000000..5e1b43cea9d6 --- /dev/null +++ b/arch/s390/include/asm/diag288.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_DIAG288_H +#define _ASM_S390_DIAG288_H + +#include <asm/asm-extable.h> +#include <asm/types.h> + +#define MIN_INTERVAL 15 /* Minimal time supported by diag288 */ +#define MAX_INTERVAL 3600 /* One hour should be enough - pure estimation */ + +#define WDT_DEFAULT_TIMEOUT 30 + +/* Function codes - init, change, cancel */ +#define WDT_FUNC_INIT 0 +#define WDT_FUNC_CHANGE 1 +#define WDT_FUNC_CANCEL 2 +#define WDT_FUNC_CONCEAL 0x80000000 + +/* Action codes for LPAR watchdog */ +#define LPARWDT_RESTART 0 + +static inline int __diag288(unsigned int func, unsigned int timeout, + unsigned long action, unsigned int len) +{ + union register_pair r1 = { .even = func, .odd = timeout, }; + union register_pair r3 = { .even = action, .odd = len, }; + int rc = -EINVAL; + + asm volatile( + " diag %[r1],%[r3],0x288\n" + "0: lhi %[rc],0\n" + "1:" + EX_TABLE(0b, 1b) + : [rc] "+d" (rc) + : [r1] "d" (r1.pair), [r3] "d" (r3.pair) + : "cc", "memory"); + return rc; +} + +#endif /* _ASM_S390_DIAG288_H */ diff --git a/arch/s390/include/asm/dma-types.h b/arch/s390/include/asm/dma-types.h new file mode 100644 index 000000000000..5c5734e6946c --- /dev/null +++ b/arch/s390/include/asm/dma-types.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_DMA_TYPES_H_ +#define _ASM_S390_DMA_TYPES_H_ + +#include <linux/types.h> +#include <linux/io.h> + +/* + * typedef dma32_t + * Contains a 31 bit absolute address to a DMA capable piece of storage. + * + * For CIO, DMA addresses are always absolute addresses. These addresses tend + * to be used in architectured memory blocks (like ORB, IDAW, MIDAW). Under + * certain circumstances 31 bit wide addresses must be used because the + * address must fit in 31 bits. + * + * This type is to be used when such fields can be modelled as 32 bit wide. + */ +typedef u32 __bitwise dma32_t; + +/* + * typedef dma64_t + * Contains a 64 bit absolute address to a DMA capable piece of storage. + * + * For CIO, DMA addresses are always absolute addresses. These addresses tend + * to be used in architectured memory blocks (like ORB, IDAW, MIDAW). + * + * This type is to be used to model such 64 bit wide fields. + */ +typedef u64 __bitwise dma64_t; + +/* + * Although DMA addresses should be obtained using the DMA API, in cases when + * it is known that the first argument holds a virtual address that points to + * DMA-able 31 bit addressable storage, then this function can be safely used. + */ +static inline dma32_t virt_to_dma32(void *ptr) +{ + return (__force dma32_t)__pa32(ptr); +} + +static inline void *dma32_to_virt(dma32_t addr) +{ + return __va((__force unsigned long)addr); +} + +static inline dma32_t u32_to_dma32(u32 addr) +{ + return (__force dma32_t)addr; +} + +static inline u32 dma32_to_u32(dma32_t addr) +{ + return (__force u32)addr; +} + +static inline dma32_t dma32_add(dma32_t a, u32 b) +{ + return (__force dma32_t)((__force u32)a + b); +} + +static inline dma32_t dma32_and(dma32_t a, u32 b) +{ + return (__force dma32_t)((__force u32)a & b); +} + +/* + * Although DMA addresses should be obtained using the DMA API, in cases when + * it is known that the first argument holds a virtual address that points to + * DMA-able storage, then this function can be safely used. + */ +static inline dma64_t virt_to_dma64(void *ptr) +{ + return (__force dma64_t)__pa(ptr); +} + +static inline void *dma64_to_virt(dma64_t addr) +{ + return __va((__force unsigned long)addr); +} + +static inline dma64_t u64_to_dma64(u64 addr) +{ + return (__force dma64_t)addr; +} + +static inline u64 dma64_to_u64(dma64_t addr) +{ + return (__force u64)addr; +} + +static inline dma64_t dma64_add(dma64_t a, u64 b) +{ + return (__force dma64_t)((__force u64)a + b); +} + +static inline dma64_t dma64_and(dma64_t a, u64 b) +{ + return (__force dma64_t)((__force u64)a & b); +} + +#endif /* _ASM_S390_DMA_TYPES_H_ */ diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h index 6f26f35d4a71..7fe3e31956d7 100644 --- a/arch/s390/include/asm/dma.h +++ b/arch/s390/include/asm/dma.h @@ -2,19 +2,13 @@ #ifndef _ASM_S390_DMA_H #define _ASM_S390_DMA_H -#include <asm/io.h> +#include <linux/io.h> /* * MAX_DMA_ADDRESS is ambiguous because on s390 its completely unrelated * to DMA. It _is_ used for the s390 memory zone split at 2GB caused * by the 31 bit heritage. */ -#define MAX_DMA_ADDRESS 0x80000000 - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif +#define MAX_DMA_ADDRESS __va(0x80000000) #endif /* _ASM_S390_DMA_H */ diff --git a/arch/s390/include/asm/dwarf.h b/arch/s390/include/asm/dwarf.h index 4f21ae561e4d..390906b8e386 100644 --- a/arch/s390/include/asm/dwarf.h +++ b/arch/s390/include/asm/dwarf.h @@ -9,6 +9,7 @@ #define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset #define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset #define CFI_RESTORE .cfi_restore +#define CFI_REL_OFFSET .cfi_rel_offset #ifdef CONFIG_AS_CFI_VAL_OFFSET #define CFI_VAL_OFFSET .cfi_val_offset diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index 06f795855af7..c4589ec4505e 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/device.h> #include <linux/blk_types.h> +#include <asm/dma-types.h> struct arqb { u64 data; @@ -45,7 +46,7 @@ struct msb { u16:12; u16 bs:4; u32 blk_count; - u64 data_addr; + dma64_t data_addr; u64 scm_addr; u64:64; } __packed; @@ -54,7 +55,7 @@ struct aidaw { u8 flags; u32 :24; u32 :32; - u64 data_addr; + dma64_t data_addr; } __packed; #define MSB_OC_CLEAR 0 diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index efb50fc6866c..7164cb658435 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -22,18 +22,18 @@ extern __u8 _ebc_toupper[256]; /* EBCDIC -> uppercase */ static inline void codepage_convert(const __u8 *codepage, volatile char *addr, unsigned long nr) { - if (nr-- <= 0) + if (!nr--) return; asm volatile( - " bras 1,1f\n" - " tr 0(1,%0),0(%2)\n" - "0: tr 0(256,%0),0(%2)\n" + " j 2f\n" + "0: tr 0(1,%0),0(%2)\n" + "1: tr 0(256,%0),0(%2)\n" " la %0,256(%0)\n" - "1: ahi %1,-256\n" - " jnm 0b\n" - " ex %1,0(1)" + "2: aghi %1,-256\n" + " jnm 1b\n" + " exrl %1,0b" : "+&a" (addr), "+&a" (nr) - : "a" (codepage) : "cc", "memory", "1"); + : "a" (codepage) : "cc", "memory"); } #define ASCEBC(addr,nr) codepage_convert(_ascebc, addr, nr) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 70a30ae258b7..a03df312081e 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -91,6 +91,14 @@ /* Keep this the last entry. */ #define R_390_NUM 61 +/* + * HWCAP flags - for AT_HWCAP + * + * Bits 32-63 are reserved for use by libc. + * Bit 31 is reserved and will be used by libc to determine if a second + * argument is passed to IFUNC resolvers. This will be implemented when + * there is a need for AT_HWCAP2. + */ enum { HWCAP_NR_ESAN3 = 0, HWCAP_NR_ZARCH = 1, @@ -150,9 +158,6 @@ enum { #define ELF_DATA ELFDATA2MSB #define ELF_ARCH EM_S390 -/* s390 specific phdr types */ -#define PT_S390_PGSTE 0x70000000 - /* * ELF register definitions.. */ @@ -183,35 +188,6 @@ typedef s390_compat_regs compat_elf_gregset_t; && (x)->e_ident[EI_CLASS] == ELF_CLASS) #define compat_start_thread start_thread31 -struct arch_elf_state { - int rc; -}; - -#define INIT_ARCH_ELF_STATE { .rc = 0 } - -#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0) -#ifdef CONFIG_PGSTE -#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ -({ \ - struct arch_elf_state *_state = state; \ - if ((phdr)->p_type == PT_S390_PGSTE && \ - !page_table_allocate_pgste && \ - !test_thread_flag(TIF_PGSTE) && \ - !current->mm->context.alloc_pgste) { \ - set_thread_flag(TIF_PGSTE); \ - set_pt_regs_flag(task_pt_regs(current), \ - PIF_EXECVE_PGSTE_RESTART); \ - _state->rc = -EAGAIN; \ - } \ - _state->rc; \ -}) -#else -#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ -({ \ - (state)->rc; \ -}) -#endif - /* For SVR4/S390 the function pointer to be registered with `atexit` is passed in R14. */ #define ELF_PLAT_INIT(_r, load_addr) \ diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 000de2b1e67a..35555c944630 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -8,7 +8,7 @@ #include <linux/processor.h> #include <linux/uaccess.h> #include <asm/timex.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> #include <asm/pai.h> #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) @@ -41,8 +41,7 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { - if (test_cpu_flag(CIF_FPU)) - __load_fpu_regs(); + load_user_fpu_regs(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) debug_user_asce(1); @@ -55,14 +54,9 @@ static __always_inline void arch_exit_to_user_mode(void) static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { - choose_random_kstack_offset(get_tod_clock_fast() & 0xff); + choose_random_kstack_offset(get_tod_clock_fast()); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare -static inline bool on_thread_stack(void) -{ - return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); -} - #endif diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h index 568fd81bb77b..e0a06060afdd 100644 --- a/arch/s390/include/asm/extmem.h +++ b/arch/s390/include/asm/extmem.h @@ -8,6 +8,13 @@ #define _ASM_S390X_DCSS_H #ifndef __ASSEMBLY__ +/* + * DCSS segment is defined as a contiguous range of pages using DEFSEG command. + * The range start and end is a page number with a value less than or equal to + * 0x7ffffff (see CP Commands and Utilities Reference). + */ +#define MAX_DCSS_ADDR (512UL * SZ_1G) + /* possible values for segment type as returned by segment_info */ #define SEG_TYPE_SW 0 #define SEG_TYPE_EW 1 diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index 94b6919026df..5f5b1aa6c233 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -14,13 +14,12 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/preempt.h> - +#include <asm/alternative.h> #include <asm/lowcore.h> #define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8) extern u64 stfle_fac_list[16]; -extern u64 alt_stfle_fac_list[16]; static inline void __set_facility(unsigned long nr, void *facilities) { @@ -40,33 +39,56 @@ static inline void __clear_facility(unsigned long nr, void *facilities) ptr[nr >> 3] &= ~(0x80 >> (nr & 7)); } -static inline int __test_facility(unsigned long nr, void *facilities) +static __always_inline bool __test_facility(unsigned long nr, void *facilities) { unsigned char *ptr; if (nr >= MAX_FACILITY_BIT) - return 0; + return false; ptr = (unsigned char *) facilities + (nr >> 3); return (*ptr & (0x80 >> (nr & 7))) != 0; } /* + * __test_facility_constant() generates a single instruction branch. If the + * tested facility is available (likely) the branch is patched into a nop. + * + * Do not use this function unless you know what you are doing. All users are + * supposed to use test_facility() which will do the right thing. + */ +static __always_inline bool __test_facility_constant(unsigned long nr) +{ + asm goto( + ALTERNATIVE("brcl 15,%l[l_no]", "brcl 0,0", ALT_FACILITY(%[nr])) + : + : [nr] "i" (nr) + : + : l_no); + return true; +l_no: + return false; +} + +/* * The test_facility function uses the bit ordering where the MSB is bit 0. * That makes it easier to query facility bits with the bit number as * documented in the Principles of Operation. */ -static inline int test_facility(unsigned long nr) +static __always_inline bool test_facility(unsigned long nr) { unsigned long facilities_als[] = { FACILITIES_ALS }; - if (__builtin_constant_p(nr) && nr < sizeof(facilities_als) * 8) { - if (__test_facility(nr, &facilities_als)) - return 1; + if (!__is_defined(__DECOMPRESSOR) && __builtin_constant_p(nr)) { + if (nr < sizeof(facilities_als) * 8) { + if (__test_facility(nr, &facilities_als)) + return true; + } + return __test_facility_constant(nr); } return __test_facility(nr, &stfle_fac_list); } -static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) +static inline unsigned long __stfle_asm(u64 *fac_list, int size) { unsigned long reg0 = size - 1; @@ -74,7 +96,7 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) " lgr 0,%[reg0]\n" " .insn s,0xb2b00000,%[list]\n" /* stfle */ " lgr %[reg0],0\n" - : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list) + : [reg0] "+&d" (reg0), [list] "+Q" (*fac_list) : : "memory", "cc", "0"); return reg0; @@ -82,33 +104,39 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) /** * stfle - Store facility list extended - * @stfle_fac_list: array where facility list can be stored + * @fac_list: array where facility list can be stored * @size: size of passed in array in double words */ -static inline void __stfle(u64 *stfle_fac_list, int size) +static inline void __stfle(u64 *fac_list, int size) { unsigned long nr; u32 stfl_fac_list; asm volatile( " stfl 0(0)\n" - : "=m" (S390_lowcore.stfl_fac_list)); - stfl_fac_list = S390_lowcore.stfl_fac_list; - memcpy(stfle_fac_list, &stfl_fac_list, 4); + : "=m" (get_lowcore()->stfl_fac_list)); + stfl_fac_list = get_lowcore()->stfl_fac_list; + memcpy(fac_list, &stfl_fac_list, 4); nr = 4; /* bytes stored by stfl */ if (stfl_fac_list & 0x01000000) { /* More facility bits available with stfle */ - nr = __stfle_asm(stfle_fac_list, size); + nr = __stfle_asm(fac_list, size); nr = min_t(unsigned long, (nr + 1) * 8, size * 8); } - memset((char *) stfle_fac_list + nr, 0, size * 8 - nr); + memset((char *)fac_list + nr, 0, size * 8 - nr); } -static inline void stfle(u64 *stfle_fac_list, int size) +static inline void stfle(u64 *fac_list, int size) { preempt_disable(); - __stfle(stfle_fac_list, size); + __stfle(fac_list, size); preempt_enable(); } +/** + * stfle_size - Actual size of the facility list as specified by stfle + * (number of double words) + */ +unsigned int stfle_size(void); + #endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/include/asm/fault.h b/arch/s390/include/asm/fault.h new file mode 100644 index 000000000000..d326f56603d6 --- /dev/null +++ b/arch/s390/include/asm/fault.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_FAULT_H +#define _ASM_S390_FAULT_H + +union teid { + unsigned long val; + struct { + unsigned long addr : 52; /* Translation-exception Address */ + unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; + unsigned long b60 : 1; + unsigned long b61 : 1; + unsigned long as : 2; /* ASCE Identifier */ + }; +}; + +enum { + TEID_FSI_UNKNOWN = 0, /* Unknown whether fetch or store */ + TEID_FSI_STORE = 1, /* Exception was due to store operation */ + TEID_FSI_FETCH = 2 /* Exception was due to fetch operation */ +}; + +#endif /* _ASM_S390_FAULT_H */ diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h index b8a028a36173..80f82a739b45 100644 --- a/arch/s390/include/asm/fcx.h +++ b/arch/s390/include/asm/fcx.h @@ -10,6 +10,7 @@ #define _ASM_S390_FCX_H #include <linux/types.h> +#include <asm/dma-types.h> #define TCW_FORMAT_DEFAULT 0 #define TCW_TIDAW_FORMAT_DEFAULT 0 @@ -43,16 +44,16 @@ struct tcw { u32 r:1; u32 w:1; u32 :16; - u64 output; - u64 input; - u64 tsb; - u64 tccb; + dma64_t output; + dma64_t input; + dma64_t tsb; + dma64_t tccb; u32 output_count; u32 input_count; u32 :32; u32 :32; u32 :32; - u32 intrg; + dma32_t intrg; } __attribute__ ((packed, aligned(64))); #define TIDAW_FLAGS_LAST (1 << (7 - 0)) @@ -73,7 +74,7 @@ struct tidaw { u32 flags:8; u32 :24; u32 count; - u64 addr; + dma64_t addr; } __attribute__ ((packed, aligned(16))); /** @@ -286,7 +287,7 @@ struct tccb_tcat { */ struct tccb { struct tccb_tcah tcah; - u8 tca[0]; + u8 tca[]; } __attribute__ ((packed, aligned(8))); struct tcw *tcw_get_intrg(struct tcw *tcw); diff --git a/arch/s390/include/asm/fprobe.h b/arch/s390/include/asm/fprobe.h new file mode 100644 index 000000000000..5ef600b372f4 --- /dev/null +++ b/arch/s390/include/asm/fprobe.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_FPROBE_H +#define _ASM_S390_FPROBE_H + +#include <asm-generic/fprobe.h> + +#undef FPROBE_HEADER_MSB_PATTERN +#define FPROBE_HEADER_MSB_PATTERN 0 + +#endif /* _ASM_S390_FPROBE_H */ diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/fpu-insn-asm.h index 95480ed9149e..d296322be4bc 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/fpu-insn-asm.h @@ -9,11 +9,14 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#ifndef __ASM_S390_VX_INSN_H -#define __ASM_S390_VX_INSN_H +#ifndef __ASM_S390_FPU_INSN_ASM_H +#define __ASM_S390_FPU_INSN_ASM_H -#ifdef __ASSEMBLY__ +#ifndef __ASM_S390_FPU_INSN_H +#error only <asm/fpu-insn.h> can be included directly +#endif +#ifdef __ASSEMBLY__ /* Macros to generate vector instruction byte code */ @@ -192,10 +195,26 @@ /* RXB - Compute most significant bit used vector registers * * @rxb: Operand to store computed RXB value - * @v1: First vector register designated operand - * @v2: Second vector register designated operand - * @v3: Third vector register designated operand - * @v4: Fourth vector register designated operand + * @v1: Vector register designated operand whose MSB is stored in + * RXB bit 0 (instruction bit 36) and whose remaining bits + * are stored in instruction bits 8-11. + * @v2: Vector register designated operand whose MSB is stored in + * RXB bit 1 (instruction bit 37) and whose remaining bits + * are stored in instruction bits 12-15. + * @v3: Vector register designated operand whose MSB is stored in + * RXB bit 2 (instruction bit 38) and whose remaining bits + * are stored in instruction bits 16-19. + * @v4: Vector register designated operand whose MSB is stored in + * RXB bit 3 (instruction bit 39) and whose remaining bits + * are stored in instruction bits 32-35. + * + * Note: In most vector instruction formats [1] V1, V2, V3, and V4 directly + * correspond to @v1, @v2, @v3, and @v4. But there are exceptions, such as but + * not limited to the vector instruction formats VRR-g, VRR-h, VRS-a, VRS-d, + * and VSI. + * + * [1] IBM z/Architecture Principles of Operation, chapter "Program + * Execution, section "Instructions", subsection "Instruction Formats". */ .macro RXB rxb v1 v2=0 v3=0 v4=0 \rxb = 0 @@ -220,6 +239,9 @@ * @v2: Second vector register designated operand (for RXB) * @v3: Third vector register designated operand (for RXB) * @v4: Fourth vector register designated operand (for RXB) + * + * Note: For @v1, @v2, @v3, and @v4 also refer to the RXB macro + * description for further details. */ .macro MRXB m v1 v2=0 v3=0 v4=0 rxb = 0 @@ -235,6 +257,9 @@ * @v2: Second vector register designated operand (for RXB) * @v3: Third vector register designated operand (for RXB) * @v4: Fourth vector register designated operand (for RXB) + * + * Note: For @v1, @v2, @v3, and @v4 also refer to the RXB macro + * description for further details. */ .macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 MRXB \m, \v1, \v2, \v3, \v4 @@ -347,7 +372,7 @@ VX_NUM v3, \vr .word 0xE700 | (r1 << 4) | (v3&15) .word (b2 << 12) | (\disp) - MRXBOPC \m, 0x21, v3 + MRXBOPC \m, 0x21, 0, v3 .endm .macro VLGVB gr, vr, disp, base="%r0" VLGV \gr, \vr, \disp, \base, 0 @@ -382,6 +407,28 @@ MRXBOPC 0, 0x0E, v1 .endm +/* VECTOR STORE BYTE REVERSED ELEMENTS */ + .macro VSTBR vr1, disp, index="%r0", base, m + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE600 | ((v1&15) << 4) | (x2&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x0E, v1 +.endm +.macro VSTBRH vr1, disp, index="%r0", base + VSTBR \vr1, \disp, \index, \base, 1 +.endm +.macro VSTBRF vr1, disp, index="%r0", base + VSTBR \vr1, \disp, \index, \base, 2 +.endm +.macro VSTBRG vr1, disp, index="%r0", base + VSTBR \vr1, \disp, \index, \base, 3 +.endm +.macro VSTBRQ vr1, disp, index="%r0", base + VSTBR \vr1, \disp, \index, \base, 4 +.endm + /* VECTOR STORE MULTIPLE */ .macro VSTM vfrom, vto, disp, base, hint=3 VX_NUM v1, \vfrom @@ -496,6 +543,25 @@ VMRL \vr1, \vr2, \vr3, 3 .endm +/* VECTOR LOAD WITH LENGTH */ +.macro VLL v, gr, disp, base + VX_NUM v1, \v + GR_NUM b2, \base + GR_NUM r3, \gr + .word 0xE700 | ((v1&15) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x37, v1 +.endm + +/* VECTOR STORE WITH LENGTH */ +.macro VSTL v, gr, disp, base + VX_NUM v1, \v + GR_NUM b2, \base + GR_NUM r3, \gr + .word 0xE700 | ((v1&15) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x3f, v1 +.endm /* Vector integer instructions */ @@ -509,6 +575,16 @@ MRXBOPC 0, 0x68, v1, v2, v3 .endm +/* VECTOR CHECKSUM */ +.macro VCKSM vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x66, v1, v2, v3 +.endm + /* VECTOR EXCLUSIVE OR */ .macro VX vr1, vr2, vr3 VX_NUM v1, \vr1 @@ -675,4 +751,4 @@ .endm #endif /* __ASSEMBLY__ */ -#endif /* __ASM_S390_VX_INSN_H */ +#endif /* __ASM_S390_FPU_INSN_ASM_H */ diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h new file mode 100644 index 000000000000..f668bffd6dd3 --- /dev/null +++ b/arch/s390/include/asm/fpu-insn.h @@ -0,0 +1,479 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for Floating Point and Vector Instructions + * + */ + +#ifndef __ASM_S390_FPU_INSN_H +#define __ASM_S390_FPU_INSN_H + +#include <asm/fpu-insn-asm.h> + +#ifndef __ASSEMBLY__ + +#include <linux/instrumented.h> +#include <asm/asm-extable.h> + +asm(".include \"asm/fpu-insn-asm.h\"\n"); + +/* + * Various small helper functions, which can and should be used within + * kernel fpu code sections. Each function represents only one floating + * point or vector instruction (except for helper functions which require + * exception handling). + * + * This allows to use floating point and vector instructions like C + * functions, which has the advantage that all supporting code, like + * e.g. loops, can be written in easy to read C code. + * + * Each of the helper functions provides support for code instrumentation, + * like e.g. KASAN. Therefore instrumentation is also covered automatically + * when using these functions. + * + * In order to ensure that code generated with the helper functions stays + * within kernel fpu sections, which are guarded with kernel_fpu_begin() + * and kernel_fpu_end() calls, each function has a mandatory "memory" + * barrier. + */ + +static __always_inline void fpu_cefbr(u8 f1, s32 val) +{ + asm volatile("cefbr %[f1],%[val]\n" + : + : [f1] "I" (f1), [val] "d" (val) + : "memory"); +} + +static __always_inline unsigned long fpu_cgebr(u8 f2, u8 mode) +{ + unsigned long val; + + asm volatile("cgebr %[val],%[mode],%[f2]\n" + : [val] "=d" (val) + : [f2] "I" (f2), [mode] "I" (mode) + : "memory"); + return val; +} + +static __always_inline void fpu_debr(u8 f1, u8 f2) +{ + asm volatile("debr %[f1],%[f2]\n" + : + : [f1] "I" (f1), [f2] "I" (f2) + : "memory"); +} + +static __always_inline void fpu_ld(unsigned short fpr, freg_t *reg) +{ + instrument_read(reg, sizeof(*reg)); + asm volatile("ld %[fpr],%[reg]\n" + : + : [fpr] "I" (fpr), [reg] "Q" (reg->ui) + : "memory"); +} + +static __always_inline void fpu_ldgr(u8 f1, u32 val) +{ + asm volatile("ldgr %[f1],%[val]\n" + : + : [f1] "I" (f1), [val] "d" (val) + : "memory"); +} + +static __always_inline void fpu_lfpc(unsigned int *fpc) +{ + instrument_read(fpc, sizeof(*fpc)); + asm volatile("lfpc %[fpc]" + : + : [fpc] "Q" (*fpc) + : "memory"); +} + +/** + * fpu_lfpc_safe - Load floating point control register safely. + * @fpc: new value for floating point control register + * + * Load floating point control register. This may lead to an exception, + * since a saved value may have been modified by user space (ptrace, + * signal return, kvm registers) to an invalid value. In such a case + * set the floating point control register to zero. + */ +static inline void fpu_lfpc_safe(unsigned int *fpc) +{ + instrument_read(fpc, sizeof(*fpc)); + asm_inline volatile( + " lfpc %[fpc]\n" + "0: nopr %%r7\n" + EX_TABLE_FPC(0b, 0b) + : + : [fpc] "Q" (*fpc) + : "memory"); +} + +static __always_inline void fpu_std(unsigned short fpr, freg_t *reg) +{ + instrument_write(reg, sizeof(*reg)); + asm volatile("std %[fpr],%[reg]\n" + : [reg] "=Q" (reg->ui) + : [fpr] "I" (fpr) + : "memory"); +} + +static __always_inline void fpu_sfpc(unsigned int fpc) +{ + asm volatile("sfpc %[fpc]" + : + : [fpc] "d" (fpc) + : "memory"); +} + +static __always_inline void fpu_stfpc(unsigned int *fpc) +{ + instrument_write(fpc, sizeof(*fpc)); + asm volatile("stfpc %[fpc]" + : [fpc] "=Q" (*fpc) + : + : "memory"); +} + +static __always_inline void fpu_vab(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VAB %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +static __always_inline void fpu_vcksm(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VCKSM %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +static __always_inline void fpu_vesravb(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VESRAVB %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +static __always_inline void fpu_vgfmag(u8 v1, u8 v2, u8 v3, u8 v4) +{ + asm volatile("VGFMAG %[v1],%[v2],%[v3],%[v4]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3), [v4] "I" (v4) + : "memory"); +} + +static __always_inline void fpu_vgfmg(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VGFMG %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vl(u8 v1, const void *vxr) +{ + instrument_read(vxr, sizeof(__vector128)); + asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n" + : + : [vxr] "Q" (*(__vector128 *)vxr), + [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vl(u8 v1, const void *vxr) +{ + instrument_read(vxr, sizeof(__vector128)); + asm volatile( + " la 1,%[vxr]\n" + " VL %[v1],0,,1\n" + : + : [vxr] "R" (*(__vector128 *)vxr), + [v1] "I" (v1) + : "memory", "1"); +} + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vleib(u8 v, s16 val, u8 index) +{ + asm volatile("VLEIB %[v],%[val],%[index]" + : + : [v] "I" (v), [val] "K" (val), [index] "I" (index) + : "memory"); +} + +static __always_inline void fpu_vleig(u8 v, s16 val, u8 index) +{ + asm volatile("VLEIG %[v],%[val],%[index]" + : + : [v] "I" (v), [val] "K" (val), [index] "I" (index) + : "memory"); +} + +static __always_inline u64 fpu_vlgvf(u8 v, u16 index) +{ + u64 val; + + asm volatile("VLGVF %[val],%[v],%[index]" + : [val] "=d" (val) + : [v] "I" (v), [index] "L" (index) + : "memory"); + return val; +} + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) +{ + unsigned int size; + + size = min(index + 1, sizeof(__vector128)); + instrument_read(vxr, size); + asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n" + : + : [vxr] "Q" (*(u8 *)vxr), + [index] "d" (index), + [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr) +{ + unsigned int size; + + size = min(index + 1, sizeof(__vector128)); + instrument_read(vxr, size); + asm volatile( + " la 1,%[vxr]\n" + " VLL %[v1],%[index],0,1\n" + : + : [vxr] "R" (*(u8 *)vxr), + [index] "d" (index), + [v1] "I" (v1) + : "memory", "1"); +} + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +#define fpu_vlm(_v1, _v3, _vxrs) \ +({ \ + unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \ + struct { \ + __vector128 _v[(_v3) - (_v1) + 1]; \ + } *_v = (void *)(_vxrs); \ + \ + instrument_read(_v, size); \ + asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + : \ + : [vxrs] "Q" (*_v), \ + [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory"); \ + (_v3) - (_v1) + 1; \ +}) + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#define fpu_vlm(_v1, _v3, _vxrs) \ +({ \ + unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \ + struct { \ + __vector128 _v[(_v3) - (_v1) + 1]; \ + } *_v = (void *)(_vxrs); \ + \ + instrument_read(_v, size); \ + asm volatile( \ + " la 1,%[vxrs]\n" \ + " VLM %[v1],%[v3],0,1\n" \ + : \ + : [vxrs] "R" (*_v), \ + [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory", "1"); \ + (_v3) - (_v1) + 1; \ +}) + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vlr(u8 v1, u8 v2) +{ + asm volatile("VLR %[v1],%[v2]" + : + : [v1] "I" (v1), [v2] "I" (v2) + : "memory"); +} + +static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index) +{ + asm volatile("VLVGF %[v],%[val],%[index]" + : + : [v] "I" (v), [val] "d" (val), [index] "L" (index) + : "memory"); +} + +static __always_inline void fpu_vn(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VN %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +static __always_inline void fpu_vperm(u8 v1, u8 v2, u8 v3, u8 v4) +{ + asm volatile("VPERM %[v1],%[v2],%[v3],%[v4]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3), [v4] "I" (v4) + : "memory"); +} + +static __always_inline void fpu_vrepib(u8 v1, s16 i2) +{ + asm volatile("VREPIB %[v1],%[i2]" + : + : [v1] "I" (v1), [i2] "K" (i2) + : "memory"); +} + +static __always_inline void fpu_vsrlb(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VSRLB %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vst(u8 v1, const void *vxr) +{ + instrument_write(vxr, sizeof(__vector128)); + asm volatile("VST %[v1],%O[vxr],,%R[vxr]\n" + : [vxr] "=Q" (*(__vector128 *)vxr) + : [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vst(u8 v1, const void *vxr) +{ + instrument_write(vxr, sizeof(__vector128)); + asm volatile( + " la 1,%[vxr]\n" + " VST %[v1],0,,1\n" + : [vxr] "=R" (*(__vector128 *)vxr) + : [v1] "I" (v1) + : "memory", "1"); +} + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) +{ + unsigned int size; + + size = min(index + 1, sizeof(__vector128)); + instrument_write(vxr, size); + asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]\n" + : [vxr] "=Q" (*(u8 *)vxr) + : [index] "d" (index), [v1] "I" (v1) + : "memory"); +} + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr) +{ + unsigned int size; + + size = min(index + 1, sizeof(__vector128)); + instrument_write(vxr, size); + asm volatile( + " la 1,%[vxr]\n" + " VSTL %[v1],%[index],0,1\n" + : [vxr] "=R" (*(u8 *)vxr) + : [index] "d" (index), [v1] "I" (v1) + : "memory", "1"); +} + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#ifdef CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS + +#define fpu_vstm(_v1, _v3, _vxrs) \ +({ \ + unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \ + struct { \ + __vector128 _v[(_v3) - (_v1) + 1]; \ + } *_v = (void *)(_vxrs); \ + \ + instrument_write(_v, size); \ + asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \ + : [vxrs] "=Q" (*_v) \ + : [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory"); \ + (_v3) - (_v1) + 1; \ +}) + +#else /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#define fpu_vstm(_v1, _v3, _vxrs) \ +({ \ + unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \ + struct { \ + __vector128 _v[(_v3) - (_v1) + 1]; \ + } *_v = (void *)(_vxrs); \ + \ + instrument_write(_v, size); \ + asm volatile( \ + " la 1,%[vxrs]\n" \ + " VSTM %[v1],%[v3],0,1\n" \ + : [vxrs] "=R" (*_v) \ + : [v1] "I" (_v1), [v3] "I" (_v3) \ + : "memory", "1"); \ + (_v3) - (_v1) + 1; \ +}) + +#endif /* CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +static __always_inline void fpu_vupllf(u8 v1, u8 v2) +{ + asm volatile("VUPLLF %[v1],%[v2]" + : + : [v1] "I" (v1), [v2] "I" (v2) + : "memory"); +} + +static __always_inline void fpu_vx(u8 v1, u8 v2, u8 v3) +{ + asm volatile("VX %[v1],%[v2],%[v3]" + : + : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3) + : "memory"); +} + +static __always_inline void fpu_vzero(u8 v) +{ + asm volatile("VZERO %[v]" + : + : [v] "I" (v) + : "memory"); +} + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_FPU_INSN_H */ diff --git a/arch/s390/include/asm/fpu-types.h b/arch/s390/include/asm/fpu-types.h new file mode 100644 index 000000000000..8d58d5a95399 --- /dev/null +++ b/arch/s390/include/asm/fpu-types.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * FPU data structures + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef _ASM_S390_FPU_TYPES_H +#define _ASM_S390_FPU_TYPES_H + +#include <asm/sigcontext.h> + +struct fpu { + u32 fpc; + __vector128 vxrs[__NUM_VXRS] __aligned(8); +}; + +struct kernel_fpu_hdr { + int mask; + u32 fpc; +}; + +struct kernel_fpu { + struct kernel_fpu_hdr hdr; + __vector128 vxrs[] __aligned(8); +}; + +#define KERNEL_FPU_STRUCT(vxr_size) \ +struct kernel_fpu_##vxr_size { \ + struct kernel_fpu_hdr hdr; \ + __vector128 vxrs[vxr_size] __aligned(8); \ +} + +KERNEL_FPU_STRUCT(8); +KERNEL_FPU_STRUCT(16); +KERNEL_FPU_STRUCT(32); + +#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name) \ + struct kernel_fpu_##vxr_size name __uninitialized + +#define DECLARE_KERNEL_FPU_ONSTACK8(name) \ + DECLARE_KERNEL_FPU_ONSTACK(8, name) + +#define DECLARE_KERNEL_FPU_ONSTACK16(name) \ + DECLARE_KERNEL_FPU_ONSTACK(16, name) + +#define DECLARE_KERNEL_FPU_ONSTACK32(name) \ + DECLARE_KERNEL_FPU_ONSTACK(32, name) + +#endif /* _ASM_S390_FPU_TYPES_H */ diff --git a/arch/s390/include/asm/fpu.h b/arch/s390/include/asm/fpu.h new file mode 100644 index 000000000000..960c6c67ad6c --- /dev/null +++ b/arch/s390/include/asm/fpu.h @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * In-kernel FPU support functions + * + * + * Consider these guidelines before using in-kernel FPU functions: + * + * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel + * use of floating-point or vector registers and instructions. + * + * 2. For kernel_fpu_begin(), specify the vector register range you want to + * use with the KERNEL_VXR_* constants. Consider these usage guidelines: + * + * a) If your function typically runs in process-context, use the lower + * half of the vector registers, for example, specify KERNEL_VXR_LOW. + * b) If your function typically runs in soft-irq or hard-irq context, + * prefer using the upper half of the vector registers, for example, + * specify KERNEL_VXR_HIGH. + * + * If you adhere to these guidelines, an interrupted process context + * does not require to save and restore vector registers because of + * disjoint register ranges. + * + * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions + * includes logic to save and restore up to 16 vector registers at once. + * + * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different + * struct kernel_fpu states. Vector registers that are in use by outer + * levels are saved and restored. You can minimize the save and restore + * effort by choosing disjoint vector register ranges. + * + * 5. To use vector floating-point instructions, specify the KERNEL_FPC + * flag to save and restore floating-point controls in addition to any + * vector register range. + * + * 6. To use floating-point registers and instructions only, specify the + * KERNEL_FPR flag. This flag triggers a save and restore of vector + * registers V0 to V15 and floating-point controls. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef _ASM_S390_FPU_H +#define _ASM_S390_FPU_H + +#include <linux/cpufeature.h> +#include <linux/processor.h> +#include <linux/preempt.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <asm/sigcontext.h> +#include <asm/fpu-types.h> +#include <asm/fpu-insn.h> + +enum { + KERNEL_FPC_BIT = 0, + KERNEL_VXR_V0V7_BIT, + KERNEL_VXR_V8V15_BIT, + KERNEL_VXR_V16V23_BIT, + KERNEL_VXR_V24V31_BIT, +}; + +#define KERNEL_FPC BIT(KERNEL_FPC_BIT) +#define KERNEL_VXR_V0V7 BIT(KERNEL_VXR_V0V7_BIT) +#define KERNEL_VXR_V8V15 BIT(KERNEL_VXR_V8V15_BIT) +#define KERNEL_VXR_V16V23 BIT(KERNEL_VXR_V16V23_BIT) +#define KERNEL_VXR_V24V31 BIT(KERNEL_VXR_V24V31_BIT) + +#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7 | KERNEL_VXR_V8V15) +#define KERNEL_VXR_MID (KERNEL_VXR_V8V15 | KERNEL_VXR_V16V23) +#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23 | KERNEL_VXR_V24V31) + +#define KERNEL_VXR (KERNEL_VXR_LOW | KERNEL_VXR_HIGH) +#define KERNEL_FPR (KERNEL_FPC | KERNEL_VXR_LOW) + +void load_fpu_state(struct fpu *state, int flags); +void save_fpu_state(struct fpu *state, int flags); +void __kernel_fpu_begin(struct kernel_fpu *state, int flags); +void __kernel_fpu_end(struct kernel_fpu *state, int flags); + +static __always_inline void save_vx_regs(__vector128 *vxrs) +{ + fpu_vstm(0, 15, &vxrs[0]); + fpu_vstm(16, 31, &vxrs[16]); +} + +static __always_inline void load_vx_regs(__vector128 *vxrs) +{ + fpu_vlm(0, 15, &vxrs[0]); + fpu_vlm(16, 31, &vxrs[16]); +} + +static __always_inline void __save_fp_regs(freg_t *fprs, unsigned int offset) +{ + fpu_std(0, &fprs[0 * offset]); + fpu_std(1, &fprs[1 * offset]); + fpu_std(2, &fprs[2 * offset]); + fpu_std(3, &fprs[3 * offset]); + fpu_std(4, &fprs[4 * offset]); + fpu_std(5, &fprs[5 * offset]); + fpu_std(6, &fprs[6 * offset]); + fpu_std(7, &fprs[7 * offset]); + fpu_std(8, &fprs[8 * offset]); + fpu_std(9, &fprs[9 * offset]); + fpu_std(10, &fprs[10 * offset]); + fpu_std(11, &fprs[11 * offset]); + fpu_std(12, &fprs[12 * offset]); + fpu_std(13, &fprs[13 * offset]); + fpu_std(14, &fprs[14 * offset]); + fpu_std(15, &fprs[15 * offset]); +} + +static __always_inline void __load_fp_regs(freg_t *fprs, unsigned int offset) +{ + fpu_ld(0, &fprs[0 * offset]); + fpu_ld(1, &fprs[1 * offset]); + fpu_ld(2, &fprs[2 * offset]); + fpu_ld(3, &fprs[3 * offset]); + fpu_ld(4, &fprs[4 * offset]); + fpu_ld(5, &fprs[5 * offset]); + fpu_ld(6, &fprs[6 * offset]); + fpu_ld(7, &fprs[7 * offset]); + fpu_ld(8, &fprs[8 * offset]); + fpu_ld(9, &fprs[9 * offset]); + fpu_ld(10, &fprs[10 * offset]); + fpu_ld(11, &fprs[11 * offset]); + fpu_ld(12, &fprs[12 * offset]); + fpu_ld(13, &fprs[13 * offset]); + fpu_ld(14, &fprs[14 * offset]); + fpu_ld(15, &fprs[15 * offset]); +} + +static __always_inline void save_fp_regs(freg_t *fprs) +{ + __save_fp_regs(fprs, sizeof(freg_t) / sizeof(freg_t)); +} + +static __always_inline void load_fp_regs(freg_t *fprs) +{ + __load_fp_regs(fprs, sizeof(freg_t) / sizeof(freg_t)); +} + +static __always_inline void save_fp_regs_vx(__vector128 *vxrs) +{ + freg_t *fprs = (freg_t *)&vxrs[0].high; + + __save_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t)); +} + +static __always_inline void load_fp_regs_vx(__vector128 *vxrs) +{ + freg_t *fprs = (freg_t *)&vxrs[0].high; + + __load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t)); +} + +static inline void load_user_fpu_regs(void) +{ + struct thread_struct *thread = ¤t->thread; + + if (!thread->ufpu_flags) + return; + load_fpu_state(&thread->ufpu, thread->ufpu_flags); + thread->ufpu_flags = 0; +} + +static __always_inline void __save_user_fpu_regs(struct thread_struct *thread, int flags) +{ + save_fpu_state(&thread->ufpu, flags); + __atomic_or(flags, &thread->ufpu_flags); +} + +static inline void save_user_fpu_regs(void) +{ + struct thread_struct *thread = ¤t->thread; + int mask, flags; + + mask = __atomic_or(KERNEL_FPC | KERNEL_VXR, &thread->kfpu_flags); + flags = ~READ_ONCE(thread->ufpu_flags) & (KERNEL_FPC | KERNEL_VXR); + if (flags) + __save_user_fpu_regs(thread, flags); + barrier(); + WRITE_ONCE(thread->kfpu_flags, mask); +} + +static __always_inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags) +{ + struct thread_struct *thread = ¤t->thread; + int mask, uflags; + + mask = __atomic_or(flags, &thread->kfpu_flags); + state->hdr.mask = mask; + uflags = READ_ONCE(thread->ufpu_flags); + if ((uflags & flags) != flags) + __save_user_fpu_regs(thread, ~uflags & flags); + if (mask & flags) + __kernel_fpu_begin(state, flags); +} + +static __always_inline void _kernel_fpu_end(struct kernel_fpu *state, int flags) +{ + int mask = state->hdr.mask; + + if (mask & flags) + __kernel_fpu_end(state, flags); + barrier(); + WRITE_ONCE(current->thread.kfpu_flags, mask); +} + +void __kernel_fpu_invalid_size(void); + +static __always_inline void kernel_fpu_check_size(int flags, unsigned int size) +{ + unsigned int cnt = 0; + + if (flags & KERNEL_VXR_V0V7) + cnt += 8; + if (flags & KERNEL_VXR_V8V15) + cnt += 8; + if (flags & KERNEL_VXR_V16V23) + cnt += 8; + if (flags & KERNEL_VXR_V24V31) + cnt += 8; + if (cnt != size) + __kernel_fpu_invalid_size(); +} + +#define kernel_fpu_begin(state, flags) \ +{ \ + typeof(state) s = (state); \ + int _flags = (flags); \ + \ + kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs)); \ + _kernel_fpu_begin((struct kernel_fpu *)s, _flags); \ +} + +#define kernel_fpu_end(state, flags) \ +{ \ + typeof(state) s = (state); \ + int _flags = (flags); \ + \ + kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs)); \ + _kernel_fpu_end((struct kernel_fpu *)s, _flags); \ +} + +static inline void save_kernel_fpu_regs(struct thread_struct *thread) +{ + if (!thread->kfpu_flags) + return; + save_fpu_state(&thread->kfpu, thread->kfpu_flags); +} + +static inline void restore_kernel_fpu_regs(struct thread_struct *thread) +{ + if (!thread->kfpu_flags) + return; + load_fpu_state(&thread->kfpu, thread->kfpu_flags); +} + +static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + fprs[i].ui = vxrs[i].high; +} + +static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + vxrs[i].high = fprs[i].ui; +} + +static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + fpregs->pad = 0; + fpregs->fpc = fpu->fpc; + convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); +} + +static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + fpu->fpc = fpregs->fpc; + convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); +} + +#endif /* _ASM_S390_FPU_H */ diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h deleted file mode 100644 index b714ed0ef688..000000000000 --- a/arch/s390/include/asm/fpu/api.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * In-kernel FPU support functions - * - * - * Consider these guidelines before using in-kernel FPU functions: - * - * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel - * use of floating-point or vector registers and instructions. - * - * 2. For kernel_fpu_begin(), specify the vector register range you want to - * use with the KERNEL_VXR_* constants. Consider these usage guidelines: - * - * a) If your function typically runs in process-context, use the lower - * half of the vector registers, for example, specify KERNEL_VXR_LOW. - * b) If your function typically runs in soft-irq or hard-irq context, - * prefer using the upper half of the vector registers, for example, - * specify KERNEL_VXR_HIGH. - * - * If you adhere to these guidelines, an interrupted process context - * does not require to save and restore vector registers because of - * disjoint register ranges. - * - * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions - * includes logic to save and restore up to 16 vector registers at once. - * - * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different - * struct kernel_fpu states. Vector registers that are in use by outer - * levels are saved and restored. You can minimize the save and restore - * effort by choosing disjoint vector register ranges. - * - * 5. To use vector floating-point instructions, specify the KERNEL_FPC - * flag to save and restore floating-point controls in addition to any - * vector register range. - * - * 6. To use floating-point registers and instructions only, specify the - * KERNEL_FPR flag. This flag triggers a save and restore of vector - * registers V0 to V15 and floating-point controls. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ - -#ifndef _ASM_S390_FPU_API_H -#define _ASM_S390_FPU_API_H - -#include <linux/preempt.h> -#include <asm/asm-extable.h> - -void save_fpu_regs(void); -void load_fpu_regs(void); -void __load_fpu_regs(void); - -static inline int test_fp_ctl(u32 fpc) -{ - u32 orig_fpc; - int rc; - - asm volatile( - " efpc %1\n" - " sfpc %2\n" - "0: sfpc %1\n" - " la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc), "=&d" (orig_fpc) - : "d" (fpc), "0" (-EINVAL)); - return rc; -} - -#define KERNEL_FPC 1 -#define KERNEL_VXR_V0V7 2 -#define KERNEL_VXR_V8V15 4 -#define KERNEL_VXR_V16V23 8 -#define KERNEL_VXR_V24V31 16 - -#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15) -#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23) -#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31) - -#define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH) -#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_V0V7) - -struct kernel_fpu; - -/* - * Note the functions below must be called with preemption disabled. - * Do not enable preemption before calling __kernel_fpu_end() to prevent - * an corruption of an existing kernel FPU state. - * - * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions. - */ -void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags); -void __kernel_fpu_end(struct kernel_fpu *state, u32 flags); - - -static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags) -{ - preempt_disable(); - state->mask = S390_lowcore.fpu_flags; - if (!test_cpu_flag(CIF_FPU)) - /* Save user space FPU state and register contents */ - save_fpu_regs(); - else if (state->mask & flags) - /* Save FPU/vector register in-use by the kernel */ - __kernel_fpu_begin(state, flags); - S390_lowcore.fpu_flags |= flags; -} - -static inline void kernel_fpu_end(struct kernel_fpu *state, u32 flags) -{ - S390_lowcore.fpu_flags = state->mask; - if (state->mask & flags) - /* Restore FPU/vector register in-use by the kernel */ - __kernel_fpu_end(state, flags); - preempt_enable(); -} - -#endif /* _ASM_S390_FPU_API_H */ diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h deleted file mode 100644 index 4a71dbbf76fb..000000000000 --- a/arch/s390/include/asm/fpu/internal.h +++ /dev/null @@ -1,62 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * FPU state and register content conversion primitives - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ - -#ifndef _ASM_S390_FPU_INTERNAL_H -#define _ASM_S390_FPU_INTERNAL_H - -#include <linux/string.h> -#include <asm/ctl_reg.h> -#include <asm/fpu/types.h> - -static inline void save_vx_regs(__vector128 *vxrs) -{ - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */ - : "=Q" (*(struct vx_array *) vxrs) : : "1"); -} - -static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) -{ - int i; - - for (i = 0; i < __NUM_FPRS; i++) - fprs[i] = *(freg_t *)(vxrs + i); -} - -static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) -{ - int i; - - for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(vxrs + i) = fprs[i]; -} - -static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) -{ - fpregs->pad = 0; - fpregs->fpc = fpu->fpc; - if (MACHINE_HAS_VX) - convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); - else - memcpy((freg_t *)&fpregs->fprs, fpu->fprs, - sizeof(fpregs->fprs)); -} - -static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) -{ - fpu->fpc = fpregs->fpc; - if (MACHINE_HAS_VX) - convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); - else - memcpy(fpu->fprs, (freg_t *)&fpregs->fprs, - sizeof(fpregs->fprs)); -} - -#endif /* _ASM_S390_FPU_INTERNAL_H */ diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h deleted file mode 100644 index d889e9436865..000000000000 --- a/arch/s390/include/asm/fpu/types.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * FPU data structures - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - */ - -#ifndef _ASM_S390_FPU_TYPES_H -#define _ASM_S390_FPU_TYPES_H - -#include <asm/sigcontext.h> - -struct fpu { - __u32 fpc; /* Floating-point control */ - void *regs; /* Pointer to the current save area */ - union { - /* Floating-point register save area */ - freg_t fprs[__NUM_FPRS]; - /* Vector register save area */ - __vector128 vxrs[__NUM_VXRS]; - }; -}; - -/* VX array structure for address operand constraints in inline assemblies */ -struct vx_array { __vector128 _[__NUM_VXRS]; }; - -/* In-kernel FPU state structure */ -struct kernel_fpu { - u32 mask; - u32 fpc; - union { - freg_t fprs[__NUM_FPRS]; - __vector128 vxrs[__NUM_VXRS]; - }; -}; - -#endif /* _ASM_S390_FPU_TYPES_H */ diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 6f80ec9c04be..185331e91f83 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -2,18 +2,28 @@ #ifndef _ASM_S390_FTRACE_H #define _ASM_S390_FTRACE_H -#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #define ARCH_SUPPORTS_FTRACE_OPS 1 #define MCOUNT_INSN_SIZE 6 #ifndef __ASSEMBLY__ +#include <asm/stacktrace.h> -#ifdef CONFIG_CC_IS_CLANG -/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ -#define ftrace_return_address(n) 0UL -#else -#define ftrace_return_address(n) __builtin_return_address(n) -#endif +static __always_inline unsigned long return_address(unsigned int n) +{ + struct stack_frame *sf; + + if (!n) + return (unsigned long)__builtin_return_address(0); + + sf = (struct stack_frame *)current_frame_address(); + do { + sf = (struct stack_frame *)sf->back_chain; + if (!sf) + return 0; + } while (--n); + return sf->gprs[8]; +} +#define ftrace_return_address(n) return_address(n) void ftrace_caller(void); @@ -29,6 +39,7 @@ struct dyn_arch_ftrace { }; struct module; struct dyn_ftrace; +struct ftrace_ops; bool ftrace_need_init_nop(void); #define ftrace_need_init_nop ftrace_need_init_nop @@ -40,26 +51,46 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } +#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip)) -struct ftrace_regs { - struct pt_regs regs; -}; +#include <linux/ftrace_regs.h> static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) return regs; return NULL; } -static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, - unsigned long ip) +static __always_inline void +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, + unsigned long ip) +{ + arch_ftrace_regs(fregs)->regs.psw.addr = ip; +} + +#undef ftrace_regs_get_frame_pointer +static __always_inline unsigned long +ftrace_regs_get_frame_pointer(struct ftrace_regs *fregs) { - fregs->regs.psw.addr = ip; + return ftrace_regs_get_stack_pointer(fregs); } +static __always_inline unsigned long +ftrace_regs_get_return_address(const struct ftrace_regs *fregs) +{ + return arch_ftrace_regs(fregs)->regs.gprs[14]; +} + +#define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ + (_regs)->psw.mask = 0; \ + (_regs)->psw.addr = arch_ftrace_regs(fregs)->regs.psw.addr; \ + (_regs)->gprs[15] = arch_ftrace_regs(fregs)->regs.gprs[15]; \ + } while (0) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS /* * When an ftrace registered caller is tracing a function that is * also set by a register_ftrace_direct() call, it needs to be @@ -67,10 +98,12 @@ static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *f * place the direct caller in the ORIG_GPR2 part of pt_regs. This * tells the ftrace_caller that there's a direct caller. */ -static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; regs->orig_gpr2 = addr; } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /* * Even though the system call numbers are identical for s390/s390x a @@ -97,6 +130,10 @@ static inline bool arch_syscall_match_sym_name(const char *sym, return !strcmp(sym + 7, name) || !strcmp(sym + 8, name); } +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#define ftrace_graph_func ftrace_graph_func + #endif /* __ASSEMBLY__ */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index e08c882dccaa..942f21c39697 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -2,79 +2,101 @@ #ifndef _ASM_S390_FUTEX_H #define _ASM_S390_FUTEX_H +#include <linux/instrumented.h> #include <linux/uaccess.h> #include <linux/futex.h> #include <asm/asm-extable.h> #include <asm/mmu_context.h> #include <asm/errno.h> -#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ - asm volatile( \ - " sacf 256\n" \ - "0: l %1,0(%6)\n" \ - "1:"insn \ - "2: cs %1,%2,0(%6)\n" \ - "3: jl 1b\n" \ - " lhi %0,0\n" \ - "4: sacf 768\n" \ - EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ - : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ - "=m" (*uaddr) \ - : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ - "m" (*uaddr) : "cc"); +#define FUTEX_OP_FUNC(name, insn) \ +static uaccess_kmsan_or_inline int \ +__futex_atomic_##name(int oparg, int *old, u32 __user *uaddr) \ +{ \ + bool sacf_flag; \ + int rc, new; \ + \ + instrument_copy_from_user_before(old, uaddr, sizeof(*old)); \ + sacf_flag = enable_sacf_uaccess(); \ + asm_inline volatile( \ + " sacf 256\n" \ + "0: l %[old],%[uaddr]\n" \ + "1:"insn \ + "2: cs %[old],%[new],%[uaddr]\n" \ + "3: jl 1b\n" \ + " lhi %[rc],0\n" \ + "4: sacf 768\n" \ + EX_TABLE_UA_FAULT(0b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(2b, 4b, %[rc]) \ + EX_TABLE_UA_FAULT(3b, 4b, %[rc]) \ + : [rc] "=d" (rc), [old] "=&d" (*old), \ + [new] "=&d" (new), [uaddr] "+Q" (*uaddr) \ + : [oparg] "d" (oparg) \ + : "cc"); \ + disable_sacf_uaccess(sacf_flag); \ + if (!rc) \ + instrument_copy_from_user_after(old, uaddr, sizeof(*old), 0); \ + return rc; \ +} + +FUTEX_OP_FUNC(set, "lr %[new],%[oparg]\n") +FUTEX_OP_FUNC(add, "lr %[new],%[old]\n ar %[new],%[oparg]\n") +FUTEX_OP_FUNC(or, "lr %[new],%[old]\n or %[new],%[oparg]\n") +FUTEX_OP_FUNC(and, "lr %[new],%[old]\n nr %[new],%[oparg]\n") +FUTEX_OP_FUNC(xor, "lr %[new],%[old]\n xr %[new],%[oparg]\n") -static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, - u32 __user *uaddr) +static inline +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int oldval = 0, newval, ret; + int old, rc; switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("lr %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_set(oparg, &old, uaddr); break; case FUTEX_OP_ADD: - __futex_atomic_op("lr %2,%1\nar %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_add(oparg, &old, uaddr); break; case FUTEX_OP_OR: - __futex_atomic_op("lr %2,%1\nor %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_or(oparg, &old, uaddr); break; case FUTEX_OP_ANDN: - __futex_atomic_op("lr %2,%1\nnr %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_and(~oparg, &old, uaddr); break; case FUTEX_OP_XOR: - __futex_atomic_op("lr %2,%1\nxr %2,%5\n", - ret, oldval, newval, uaddr, oparg); + rc = __futex_atomic_xor(oparg, &old, uaddr); break; default: - ret = -ENOSYS; + rc = -ENOSYS; } - - if (!ret) - *oval = oldval; - - return ret; + if (!rc) + *oval = old; + return rc; } -static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, - u32 oldval, u32 newval) +static uaccess_kmsan_or_inline +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - int ret; + bool sacf_flag; + int rc; - asm volatile( - " sacf 256\n" - "0: cs %1,%4,0(%5)\n" - "1: la %0,0\n" - "2: sacf 768\n" - EX_TABLE(0b,2b) EX_TABLE(1b,2b) - : "=d" (ret), "+d" (oldval), "=m" (*uaddr) - : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) + instrument_copy_from_user_before(uval, uaddr, sizeof(*uval)); + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " sacf 256\n" + "0: cs %[old],%[new],%[uaddr]\n" + "1: lhi %[rc],0\n" + "2: sacf 768\n" + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) + : [rc] "=d" (rc), [old] "+d" (oldval), [uaddr] "+Q" (*uaddr) + : [new] "d" (newval) : "cc", "memory"); + disable_sacf_uaccess(sacf_flag); *uval = oldval; - return ret; + instrument_copy_from_user_after(uval, uaddr, sizeof(*uval), 0); + return rc; } #endif /* _ASM_S390_FUTEX_H */ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 40264f60b0da..66c5808fd011 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -17,13 +17,12 @@ #define GMAP_NOTIFY_MPROT 0x1 /* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ +#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list - * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries @@ -35,7 +34,6 @@ * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures - * @pt_list: list of all page tables used in the shadow guest address space * @shadow_lock: spinlock to protect the shadow gmap list * @parent: pointer to the parent gmap for shadow guest address spaces * @orig_asce: ASCE for which the shadow page table has been created @@ -45,7 +43,6 @@ */ struct gmap { struct list_head list; - struct list_head crst_list; struct mm_struct *mm; struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; @@ -61,7 +58,6 @@ struct gmap { /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; - struct list_head pt_list; spinlock_t shadow_lock; struct gmap *parent; unsigned long orig_asce; @@ -106,26 +102,20 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); void gmap_remove(struct gmap *gmap); struct gmap *gmap_get(struct gmap *gmap); void gmap_put(struct gmap *gmap); +void gmap_free(struct gmap *gmap); +struct gmap *gmap_alloc(unsigned long limit); -void gmap_enable(struct gmap *gmap); -void gmap_disable(struct gmap *gmap); -struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -unsigned long gmap_translate(struct gmap *, unsigned long gaddr); int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); -void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level); -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +void gmap_unshadow(struct gmap *sg); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, @@ -134,18 +124,51 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake); -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, int *fake); int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -int gmap_mprotect_notify(struct gmap *, unsigned long start, - unsigned long len, int prot); +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); -int gmap_mark_unmergeable(void); -void s390_reset_acc(struct mm_struct *mm); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. + */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h new file mode 100644 index 000000000000..5356446a61c4 --- /dev/null +++ b/arch/s390/include/asm/gmap_helpers.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2025 + */ + +#ifndef _ASM_S390_GMAP_HELPERS_H +#define _ASM_S390_GMAP_HELPERS_H + +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); +int gmap_helper_disable_cow_sharing(void); + +#endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index 58668ffb5488..a5b45388c91f 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -13,9 +13,9 @@ #include <asm/lowcore.h> -#define local_softirq_pending() (S390_lowcore.softirq_pending) -#define set_softirq_pending(x) (S390_lowcore.softirq_pending = (x)) -#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x)) +#define local_softirq_pending() (get_lowcore()->softirq_pending) +#define set_softirq_pending(x) (get_lowcore()->softirq_pending = (x)) +#define or_softirq_pending(x) (get_lowcore()->softirq_pending |= (x)) #define __ARCH_IRQ_STAT #define __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/arch/s390/include/asm/hiperdispatch.h b/arch/s390/include/asm/hiperdispatch.h new file mode 100644 index 000000000000..27e23aa27a24 --- /dev/null +++ b/arch/s390/include/asm/hiperdispatch.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2024 + */ + +#ifndef _ASM_HIPERDISPATCH_H +#define _ASM_HIPERDISPATCH_H + +void hd_reset_state(void); +void hd_add_core(int cpu); +void hd_disable_hiperdispatch(void); +int hd_enable_hiperdispatch(void); + +#endif /* _ASM_HIPERDISPATCH_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index f22beda9e6d5..931fcc413598 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -9,38 +9,41 @@ #ifndef _ASM_S390_HUGETLB_H #define _ASM_S390_HUGETLB_H +#include <linux/cpufeature.h> #include <linux/pgtable.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> -#define hugetlb_free_pgd_range free_pgd_range -#define hugepages_supported() (MACHINE_HAS_EDAT1) +#define hugepages_supported() cpu_has_edat1() +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(pte_t *ptep); -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) +#define __HAVE_ARCH_HUGE_PTEP_GET +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) { - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; + return __huge_ptep_get_and_clear(mm, addr, ptep); } -static inline void arch_clear_hugepage_flags(struct page *page) +static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); } -#define arch_clear_hugepage_flags arch_clear_hugepage_flags +#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags +#define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { @@ -50,94 +53,54 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int changed = !pte_same(huge_ptep_get(ptep), pte); + int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); + if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); -} - -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline int huge_pte_none_mostly(pte_t pte) -{ - return huge_pte_none(pte); -} - -static inline int huge_pte_write(pte_t pte) -{ - return pte_write(pte); -} - -static inline int huge_pte_dirty(pte_t pte) -{ - return pte_dirty(pte); -} + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); -static inline pte_t huge_pte_mkwrite(pte_t pte) -{ - return pte_mkwrite(pte); -} - -static inline pte_t huge_pte_mkdirty(pte_t pte) -{ - return pte_mkdirty(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) -{ - return pte_modify(pte, newprot); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } +#define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return 0; } -static inline bool gigantic_page_runtime_supported(void) -{ - return true; -} +#include <asm-generic/hugetlb.h> #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 40eae2c08d61..ac68c657b28c 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* +/* * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Bugreports.to..: <Linux390@de.ibm.com> @@ -17,47 +17,65 @@ #include <linux/err.h> #include <linux/types.h> #include <linux/slab.h> -#include <asm/cio.h> #include <linux/uaccess.h> +#include <asm/dma-types.h> +#include <asm/cio.h> + +#define IDA_SIZE_SHIFT 12 +#define IDA_BLOCK_SIZE (1UL << IDA_SIZE_SHIFT) -#define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ -#define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) +#define IDA_2K_SIZE_SHIFT 11 +#define IDA_2K_BLOCK_SIZE (1UL << IDA_2K_SIZE_SHIFT) /* * Test if an address/length pair needs an idal list. */ -static inline int -idal_is_needed(void *vaddr, unsigned int length) +static inline bool idal_is_needed(void *vaddr, unsigned int length) { - return ((__pa(vaddr) + length - 1) >> 31) != 0; -} + dma64_t paddr = virt_to_dma64(vaddr); + return (((__force unsigned long)(paddr) + length - 1) >> 31) != 0; +} /* * Return the number of idal words needed for an address/length pair. */ static inline unsigned int idal_nr_words(void *vaddr, unsigned int length) { - return ((__pa(vaddr) & (IDA_BLOCK_SIZE-1)) + length + - (IDA_BLOCK_SIZE-1)) >> IDA_SIZE_LOG; + unsigned int cidaw; + + cidaw = (unsigned long)vaddr & (IDA_BLOCK_SIZE - 1); + cidaw += length + IDA_BLOCK_SIZE - 1; + cidaw >>= IDA_SIZE_SHIFT; + return cidaw; +} + +/* + * Return the number of 2K IDA words needed for an address/length pair. + */ +static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length) +{ + unsigned int cidaw; + + cidaw = (unsigned long)vaddr & (IDA_2K_BLOCK_SIZE - 1); + cidaw += length + IDA_2K_BLOCK_SIZE - 1; + cidaw >>= IDA_2K_SIZE_SHIFT; + return cidaw; } /* * Create the list of idal words for an address/length pair. */ -static inline unsigned long *idal_create_words(unsigned long *idaws, - void *vaddr, unsigned int length) +static inline dma64_t *idal_create_words(dma64_t *idaws, void *vaddr, unsigned int length) { - unsigned long paddr; + dma64_t paddr = virt_to_dma64(vaddr); unsigned int cidaw; - paddr = __pa(vaddr); - cidaw = ((paddr & (IDA_BLOCK_SIZE-1)) + length + - (IDA_BLOCK_SIZE-1)) >> IDA_SIZE_LOG; *idaws++ = paddr; - paddr &= -IDA_BLOCK_SIZE; + cidaw = idal_nr_words(vaddr, length); + paddr = dma64_and(paddr, -IDA_BLOCK_SIZE); while (--cidaw > 0) { - paddr += IDA_BLOCK_SIZE; + paddr = dma64_add(paddr, IDA_BLOCK_SIZE); *idaws++ = paddr; } return idaws; @@ -67,36 +85,33 @@ static inline unsigned long *idal_create_words(unsigned long *idaws, * Sets the address of the data in CCW. * If necessary it allocates an IDAL and sets the appropriate flags. */ -static inline int -set_normalized_cda(struct ccw1 * ccw, void *vaddr) +static inline int set_normalized_cda(struct ccw1 *ccw, void *vaddr) { unsigned int nridaws; - unsigned long *idal; + dma64_t *idal; if (ccw->flags & CCW_FLAG_IDA) return -EINVAL; nridaws = idal_nr_words(vaddr, ccw->count); if (nridaws > 0) { - idal = kmalloc(nridaws * sizeof(unsigned long), - GFP_ATOMIC | GFP_DMA ); - if (idal == NULL) + idal = kcalloc(nridaws, sizeof(*idal), GFP_ATOMIC | GFP_DMA); + if (!idal) return -ENOMEM; idal_create_words(idal, vaddr, ccw->count); ccw->flags |= CCW_FLAG_IDA; vaddr = idal; } - ccw->cda = (__u32)(unsigned long) vaddr; + ccw->cda = virt_to_dma32(vaddr); return 0; } /* * Releases any allocated IDAL related to the CCW. */ -static inline void -clear_normalized_cda(struct ccw1 * ccw) +static inline void clear_normalized_cda(struct ccw1 *ccw) { if (ccw->flags & CCW_FLAG_IDA) { - kfree((void *)(unsigned long) ccw->cda); + kfree(dma32_to_virt(ccw->cda)); ccw->flags &= ~CCW_FLAG_IDA; } ccw->cda = 0; @@ -108,125 +123,138 @@ clear_normalized_cda(struct ccw1 * ccw) struct idal_buffer { size_t size; size_t page_order; - void *data[]; + dma64_t data[]; }; /* * Allocate an idal buffer */ -static inline struct idal_buffer * -idal_buffer_alloc(size_t size, int page_order) +static inline struct idal_buffer *idal_buffer_alloc(size_t size, int page_order) { - struct idal_buffer *ib; int nr_chunks, nr_ptrs, i; + struct idal_buffer *ib; + void *vaddr; - nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; - nr_chunks = (4096 << page_order) >> IDA_SIZE_LOG; + nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_SHIFT; + nr_chunks = (PAGE_SIZE << page_order) >> IDA_SIZE_SHIFT; ib = kmalloc(struct_size(ib, data, nr_ptrs), GFP_DMA | GFP_KERNEL); - if (ib == NULL) + if (!ib) return ERR_PTR(-ENOMEM); ib->size = size; ib->page_order = page_order; for (i = 0; i < nr_ptrs; i++) { - if ((i & (nr_chunks - 1)) != 0) { - ib->data[i] = ib->data[i-1] + IDA_BLOCK_SIZE; - continue; - } - ib->data[i] = (void *) - __get_free_pages(GFP_KERNEL, page_order); - if (ib->data[i] != NULL) + if (i & (nr_chunks - 1)) { + ib->data[i] = dma64_add(ib->data[i - 1], IDA_BLOCK_SIZE); continue; - // Not enough memory - while (i >= nr_chunks) { - i -= nr_chunks; - free_pages((unsigned long) ib->data[i], - ib->page_order); } - kfree(ib); - return ERR_PTR(-ENOMEM); + vaddr = (void *)__get_free_pages(GFP_KERNEL, page_order); + if (!vaddr) + goto error; + ib->data[i] = virt_to_dma64(vaddr); } return ib; +error: + while (i >= nr_chunks) { + i -= nr_chunks; + vaddr = dma64_to_virt(ib->data[i]); + free_pages((unsigned long)vaddr, ib->page_order); + } + kfree(ib); + return ERR_PTR(-ENOMEM); } /* * Free an idal buffer. */ -static inline void -idal_buffer_free(struct idal_buffer *ib) +static inline void idal_buffer_free(struct idal_buffer *ib) { int nr_chunks, nr_ptrs, i; + void *vaddr; - nr_ptrs = (ib->size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; - nr_chunks = (4096 << ib->page_order) >> IDA_SIZE_LOG; - for (i = 0; i < nr_ptrs; i += nr_chunks) - free_pages((unsigned long) ib->data[i], ib->page_order); + nr_ptrs = (ib->size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_SHIFT; + nr_chunks = (PAGE_SIZE << ib->page_order) >> IDA_SIZE_SHIFT; + for (i = 0; i < nr_ptrs; i += nr_chunks) { + vaddr = dma64_to_virt(ib->data[i]); + free_pages((unsigned long)vaddr, ib->page_order); + } kfree(ib); } /* * Test if a idal list is really needed. */ -static inline int -__idal_buffer_is_needed(struct idal_buffer *ib) +static inline bool __idal_buffer_is_needed(struct idal_buffer *ib) { - return ib->size > (4096ul << ib->page_order) || - idal_is_needed(ib->data[0], ib->size); + if (ib->size > (PAGE_SIZE << ib->page_order)) + return true; + return idal_is_needed(dma64_to_virt(ib->data[0]), ib->size); } /* * Set channel data address to idal buffer. */ -static inline void -idal_buffer_set_cda(struct idal_buffer *ib, struct ccw1 *ccw) +static inline void idal_buffer_set_cda(struct idal_buffer *ib, struct ccw1 *ccw) { + void *vaddr; + if (__idal_buffer_is_needed(ib)) { - // setup idals; - ccw->cda = (u32)(addr_t) ib->data; + /* Setup idals */ + ccw->cda = virt_to_dma32(ib->data); ccw->flags |= CCW_FLAG_IDA; - } else - // we do not need idals - use direct addressing - ccw->cda = (u32)(addr_t) ib->data[0]; + } else { + /* + * No idals needed - use direct addressing. Convert from + * dma64_t to virt and then to dma32_t only because of type + * checking. The physical address is known to be below 2GB. + */ + vaddr = dma64_to_virt(ib->data[0]); + ccw->cda = virt_to_dma32(vaddr); + } ccw->count = ib->size; } /* * Copy count bytes from an idal buffer to user memory */ -static inline size_t -idal_buffer_to_user(struct idal_buffer *ib, void __user *to, size_t count) +static inline size_t idal_buffer_to_user(struct idal_buffer *ib, void __user *to, size_t count) { size_t left; + void *vaddr; int i; BUG_ON(count > ib->size); for (i = 0; count > IDA_BLOCK_SIZE; i++) { - left = copy_to_user(to, ib->data[i], IDA_BLOCK_SIZE); + vaddr = dma64_to_virt(ib->data[i]); + left = copy_to_user(to, vaddr, IDA_BLOCK_SIZE); if (left) return left + count - IDA_BLOCK_SIZE; - to = (void __user *) to + IDA_BLOCK_SIZE; + to = (void __user *)to + IDA_BLOCK_SIZE; count -= IDA_BLOCK_SIZE; } - return copy_to_user(to, ib->data[i], count); + vaddr = dma64_to_virt(ib->data[i]); + return copy_to_user(to, vaddr, count); } /* * Copy count bytes from user memory to an idal buffer */ -static inline size_t -idal_buffer_from_user(struct idal_buffer *ib, const void __user *from, size_t count) +static inline size_t idal_buffer_from_user(struct idal_buffer *ib, const void __user *from, size_t count) { size_t left; + void *vaddr; int i; BUG_ON(count > ib->size); for (i = 0; count > IDA_BLOCK_SIZE; i++) { - left = copy_from_user(ib->data[i], from, IDA_BLOCK_SIZE); + vaddr = dma64_to_virt(ib->data[i]); + left = copy_from_user(vaddr, from, IDA_BLOCK_SIZE); if (left) return left + count - IDA_BLOCK_SIZE; - from = (void __user *) from + IDA_BLOCK_SIZE; + from = (void __user *)from + IDA_BLOCK_SIZE; count -= IDA_BLOCK_SIZE; } - return copy_from_user(ib->data[i], from, count); + vaddr = dma64_to_virt(ib->data[i]); + return copy_from_user(vaddr, from, count); } #endif diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 5cea629c548e..09f763b9eb40 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -10,16 +10,12 @@ #include <linux/types.h> #include <linux/device.h> -#include <linux/seqlock.h> struct s390_idle_data { - seqcount_t seqcount; unsigned long idle_count; unsigned long idle_time; unsigned long clock_idle_enter; - unsigned long clock_idle_exit; unsigned long timer_idle_enter; - unsigned long timer_idle_exit; unsigned long mt_cycles_enter[8]; }; @@ -27,6 +23,5 @@ extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); -void psw_idle_exit(void); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index e3882b012bfa..faddb9aef3b8 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -16,17 +16,24 @@ #include <asm/pci_io.h> #define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr xlate_dev_mem_ptr void *xlate_dev_mem_ptr(phys_addr_t phys); #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr unxlate_dev_mem_ptr void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); -void __iomem *ioremap(phys_addr_t addr, size_t size); -void __iomem *ioremap_wc(phys_addr_t addr, size_t size); -void __iomem *ioremap_wt(phys_addr_t addr, size_t size); -void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_writecombine(PAGE_KERNEL)) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range -#define ioremap ioremap -#define ioremap_wt ioremap_wt -#define ioremap_wc ioremap_wc - #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) @@ -70,6 +73,21 @@ static inline void ioport_unmap(void __iomem *p) #define __raw_writel zpci_write_u32 #define __raw_writeq zpci_write_u64 +/* combine single writes by using store-block insn */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 4); +} +#define __iowrite32_copy __iowrite32_copy + +static inline void __iowrite64_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 8); +} +#define __iowrite64_copy __iowrite64_copy + #endif /* CONFIG_PCI */ #include <asm-generic/io.h> diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index a405b6bb89fb..b0d00032479d 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -22,6 +22,7 @@ struct ipl_parameter_block { struct ipl_pb0_common common; struct ipl_pb0_fcp fcp; struct ipl_pb0_ccw ccw; + struct ipl_pb0_eckd eckd; struct ipl_pb0_nvme nvme; char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; }; @@ -41,6 +42,10 @@ struct ipl_parameter_block { sizeof(struct ipl_pb0_ccw)) #define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) +#define IPL_BP_ECKD_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_eckd)) +#define IPL_BP0_ECKD_LEN (sizeof(struct ipl_pb0_eckd)) + #define IPL_MAX_SUPPORTED_VERSION (0) #define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) @@ -68,6 +73,8 @@ enum ipl_type { IPL_TYPE_NSS = 16, IPL_TYPE_NVME = 32, IPL_TYPE_NVME_DUMP = 64, + IPL_TYPE_ECKD = 128, + IPL_TYPE_ECKD_DUMP = 256, }; struct ipl_info @@ -79,6 +86,9 @@ struct ipl_info } ccw; struct { struct ccw_dev_id dev_id; + } eckd; + struct { + struct ccw_dev_id dev_id; u64 wwpn; u64 lun; } fcp; @@ -99,6 +109,7 @@ extern void set_os_info_reipl_block(void); static inline bool is_ipl_type_dump(void) { return (ipl_info.type == IPL_TYPE_FCP_DUMP) || + (ipl_info.type == IPL_TYPE_ECKD_DUMP) || (ipl_info.type == IPL_TYPE_NVME_DUMP); } diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 89902f754740..bde6a496df5f 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -31,6 +31,7 @@ #include <linux/percpu.h> #include <linux/cache.h> #include <linux/types.h> +#include <asm/ctlreg.h> enum interruption_class { IRQEXT_CLK, @@ -46,13 +47,13 @@ enum interruption_class { IRQEXT_CMS, IRQEXT_CMC, IRQEXT_FTP, + IRQEXT_WTI, IRQIO_CIO, IRQIO_DAS, IRQIO_C15, IRQIO_C70, IRQIO_TAP, IRQIO_VMR, - IRQIO_LCS, IRQIO_CTC, IRQIO_ADM, IRQIO_CSC, @@ -98,20 +99,21 @@ int unregister_external_irq(u16 code, ext_int_handler_t handler); enum irq_subclass { IRQ_SUBCLASS_MEASUREMENT_ALERT = 5, IRQ_SUBCLASS_SERVICE_SIGNAL = 9, + IRQ_SUBCLASS_WARNING_TRACK = 33, }; #define CR0_IRQ_SUBCLASS_MASK \ - ((1UL << (63 - 30)) /* Warning Track */ | \ - (1UL << (63 - 48)) /* Malfunction Alert */ | \ - (1UL << (63 - 49)) /* Emergency Signal */ | \ - (1UL << (63 - 50)) /* External Call */ | \ - (1UL << (63 - 52)) /* Clock Comparator */ | \ - (1UL << (63 - 53)) /* CPU Timer */ | \ - (1UL << (63 - 54)) /* Service Signal */ | \ - (1UL << (63 - 57)) /* Interrupt Key */ | \ - (1UL << (63 - 58)) /* Measurement Alert */ | \ - (1UL << (63 - 59)) /* Timing Alert */ | \ - (1UL << (63 - 62))) /* IUCV */ + (CR0_WARNING_TRACK | \ + CR0_MALFUNCTION_ALERT_SUBMASK | \ + CR0_EMERGENCY_SIGNAL_SUBMASK | \ + CR0_EXTERNAL_CALL_SUBMASK | \ + CR0_CLOCK_COMPARATOR_SUBMASK | \ + CR0_CPU_TIMER_SUBMASK | \ + CR0_SERVICE_SIGNAL_SUBMASK | \ + CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK | \ + CR0_ETR_SUBMASK | \ + CR0_IUCV) void irq_subclass_register(enum irq_subclass subclass); void irq_subclass_unregister(enum irq_subclass subclass); diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h index 603783766d0a..f00c9f610d5a 100644 --- a/arch/s390/include/asm/irq_work.h +++ b/arch/s390/include/asm/irq_work.h @@ -7,6 +7,4 @@ static inline bool arch_irq_work_has_interrupt(void) return true; } -void arch_irq_work_raise(void); - #endif /* _ASM_S390_IRQ_WORK_H */ diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h index 02427b205c11..bcab456dfb80 100644 --- a/arch/s390/include/asm/irqflags.h +++ b/arch/s390/include/asm/irqflags.h @@ -37,12 +37,18 @@ static __always_inline void __arch_local_irq_ssm(unsigned long flags) asm volatile("ssm %0" : : "Q" (flags) : "memory"); } -static __always_inline unsigned long arch_local_save_flags(void) +#ifdef CONFIG_KMSAN +#define arch_local_irq_attributes noinline notrace __no_sanitize_memory __maybe_unused +#else +#define arch_local_irq_attributes __always_inline +#endif + +static arch_local_irq_attributes unsigned long arch_local_save_flags(void) { return __arch_local_irq_stnsm(0xff); } -static __always_inline unsigned long arch_local_irq_save(void) +static arch_local_irq_attributes unsigned long arch_local_irq_save(void) { return __arch_local_irq_stnsm(0xfc); } @@ -52,7 +58,12 @@ static __always_inline void arch_local_irq_disable(void) arch_local_irq_save(); } -static __always_inline void arch_local_irq_enable(void) +static arch_local_irq_attributes void arch_local_irq_enable_external(void) +{ + __arch_local_irq_stosm(0x01); +} + +static arch_local_irq_attributes void arch_local_irq_enable(void) { __arch_local_irq_stosm(0x03); } diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 895f774bbcc5..bf78cf381dfc 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -25,7 +25,7 @@ */ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,%l[label]\n" + asm goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 15,%l[label]\n" + asm goto("0: brcl 15,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 2768d5db181f..0cffead0f2f2 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -2,7 +2,7 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#include <asm/pgtable.h> +#include <linux/const.h> #ifdef CONFIG_KASAN @@ -13,39 +13,6 @@ #define KASAN_SHADOW_START KASAN_SHADOW_OFFSET #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) -extern void kasan_early_init(void); -extern void kasan_copy_shadow_mapping(void); -extern void kasan_free_early_identity(void); - -/* - * Estimate kasan memory requirements, which it will reserve - * at the very end of available physical memory. To estimate - * that, we take into account that kasan would require - * 1/8 of available physical memory (for shadow memory) + - * creating page tables for the whole memory + shadow memory - * region (1 + 1/8). To keep page tables estimates simple take - * the double of combined ptes size. - * - * physmem parameter has to be already adjusted if not entire physical memory - * would be used (e.g. due to effect of "mem=" option). - */ -static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) -{ - unsigned long kasan_needs; - unsigned long pages; - /* for shadow memory */ - kasan_needs = round_up(physmem / 8, PAGE_SIZE); - /* for paging structures */ - pages = DIV_ROUND_UP(physmem + kasan_needs, PAGE_SIZE); - kasan_needs += DIV_ROUND_UP(pages, _PAGE_ENTRIES) * _PAGE_TABLE_SIZE * 2; - - return kasan_needs; -} -#else -static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow_mapping(void) { } -static inline void kasan_free_early_identity(void) { } -static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) { return 0; } #endif #endif diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 1bd08eb56d5f..9084b750350d 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -94,6 +94,9 @@ void arch_kexec_protect_crashkres(void); void arch_kexec_unprotect_crashkres(void); #define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres + +bool is_kdump_kernel(void); +#define is_kdump_kernel is_kdump_kernel #endif #ifdef CONFIG_KEXEC_FILE diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h index d55ba878378b..e95e35eb8a3f 100644 --- a/arch/s390/include/asm/kfence.h +++ b/arch/s390/include/asm/kfence.h @@ -12,30 +12,19 @@ void __kernel_map_pages(struct page *page, int numpages, int enable); static __always_inline bool arch_kfence_init_pool(void) { - return true; -} - -#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) - -/* - * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(), - * but earlier where page table allocations still happen with memblock. - * Reason is that arch_kfence_init_pool() gets called when the system - * is still in a limbo state - disabling and enabling bottom halves is - * not yet allowed, but that is what our page_table_alloc() would do. - */ -static __always_inline void kfence_split_mapping(void) -{ #ifdef CONFIG_KFENCE unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT; set_memory_4k((unsigned long)__kfence_pool, pool_pages); #endif + return true; } +#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) + static inline bool kfence_protect_page(unsigned long addr, bool protect) { - __kernel_map_pages(virt_to_page(addr), 1, !protect); + __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); return true; } diff --git a/arch/s390/include/asm/kmsan.h b/arch/s390/include/asm/kmsan.h new file mode 100644 index 000000000000..f73e181d09ae --- /dev/null +++ b/arch/s390/include/asm/kmsan.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_KMSAN_H +#define _ASM_S390_KMSAN_H + +#include <asm/lowcore.h> +#include <asm/page.h> +#include <linux/kmsan.h> +#include <linux/mmzone.h> +#include <linux/stddef.h> + +#ifndef MODULE + +static inline bool is_lowcore_addr(void *addr) +{ + return addr >= (void *)get_lowcore() && + addr < (void *)(get_lowcore() + 1); +} + +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + if (is_lowcore_addr(addr)) { + /* + * Different lowcores accessed via S390_lowcore are described + * by the same struct page. Resolve the prefix manually in + * order to get a distinct struct page. + */ + addr += (void *)lowcore_ptr[raw_smp_processor_id()] - + (void *)get_lowcore(); + if (KMSAN_WARN_ON(is_lowcore_addr(addr))) + return NULL; + return kmsan_get_metadata(addr, is_origin); + } + return NULL; +} + +static inline bool kmsan_virt_addr_valid(void *addr) +{ + bool ret; + + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = virt_addr_valid(addr); + preempt_enable_no_resched(); + + return ret; +} + +#endif /* !MODULE */ + +#endif /* _ASM_S390_KMSAN_H */ diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 598095f4b924..01f1682a73b7 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -15,6 +15,7 @@ * <grundym@us.ibm.com> */ #include <linux/types.h> +#include <asm/ctlreg.h> #include <asm-generic/kprobes.h> #define BREAKPOINT_INSTRUCTION 0x0002 @@ -65,17 +66,13 @@ struct prev_kprobe { struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_saved_imask; - unsigned long kprobe_saved_ctl[3]; + struct ctlreg kprobe_saved_ctl[3]; struct prev_kprobe prev_kprobe; }; void arch_remove_kprobe(struct kprobe *p); -void __kretprobe_trampoline(void); -void trampoline_probe_handler(struct pt_regs *regs); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); #define flush_insn_slot(p) do { } while (0) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 766028d54a3e..cb89e54ada25 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -15,20 +15,22 @@ #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kvm_types.h> -#include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> +#include <linux/pci.h> +#include <linux/mmu_notifier.h> +#include <asm/kvm_host_types.h> #include <asm/debug.h> #include <asm/cpu.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> #include <asm/isc.h> #include <asm/guarded_storage.h> -#define KVM_S390_BSCA_CPU_SLOTS 64 -#define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 +#define KVM_INTERNAL_MEM_SLOTS 1 + /* * These seem to be used for allocating ->chip in the routing table, which we * don't use. 1 is as small as we can get to reduce the needed memory. If we @@ -48,321 +50,6 @@ #define KVM_REQ_REFRESH_GUEST_PREFIX \ KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define SIGP_CTRL_C 0x80 -#define SIGP_CTRL_SCN_MASK 0x3f - -union bsca_sigp_ctrl { - __u8 value; - struct { - __u8 c : 1; - __u8 r : 1; - __u8 scn : 6; - }; -}; - -union esca_sigp_ctrl { - __u16 value; - struct { - __u8 c : 1; - __u8 reserved: 7; - __u8 scn; - }; -}; - -struct esca_entry { - union esca_sigp_ctrl sigp_ctrl; - __u16 reserved1[3]; - __u64 sda; - __u64 reserved2[6]; -}; - -struct bsca_entry { - __u8 reserved0; - union bsca_sigp_ctrl sigp_ctrl; - __u16 reserved[3]; - __u64 sda; - __u64 reserved2[2]; -}; - -union ipte_control { - unsigned long val; - struct { - unsigned long k : 1; - unsigned long kh : 31; - unsigned long kg : 32; - }; -}; - -struct bsca_block { - union ipte_control ipte_control; - __u64 reserved[5]; - __u64 mcn; - __u64 reserved2; - struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; -}; - -struct esca_block { - union ipte_control ipte_control; - __u64 reserved1[7]; - __u64 mcn[4]; - __u64 reserved2[20]; - struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; -}; - -/* - * This struct is used to store some machine check info from lowcore - * for machine checks that happen while the guest is running. - * This info in host's lowcore might be overwritten by a second machine - * check from host when host is in the machine check's high-level handling. - * The size is 24 bytes. - */ -struct mcck_volatile_info { - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 reserved; -}; - -#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ - CR0_MEASUREMENT_ALERT_SUBMASK) -#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ - CR14_EXTERNAL_DAMAGE_SUBMASK) - -#define SIDAD_SIZE_MASK 0xff -#define sida_origin(sie_block) \ - ((sie_block)->sidad & PAGE_MASK) -#define sida_size(sie_block) \ - ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) - -#define CPUSTAT_STOPPED 0x80000000 -#define CPUSTAT_WAIT 0x10000000 -#define CPUSTAT_ECALL_PEND 0x08000000 -#define CPUSTAT_STOP_INT 0x04000000 -#define CPUSTAT_IO_INT 0x02000000 -#define CPUSTAT_EXT_INT 0x01000000 -#define CPUSTAT_RUNNING 0x00800000 -#define CPUSTAT_RETAINED 0x00400000 -#define CPUSTAT_TIMING_SUB 0x00020000 -#define CPUSTAT_SIE_SUB 0x00010000 -#define CPUSTAT_RRF 0x00008000 -#define CPUSTAT_SLSV 0x00004000 -#define CPUSTAT_SLSR 0x00002000 -#define CPUSTAT_ZARCH 0x00000800 -#define CPUSTAT_MCDS 0x00000100 -#define CPUSTAT_KSS 0x00000200 -#define CPUSTAT_SM 0x00000080 -#define CPUSTAT_IBS 0x00000040 -#define CPUSTAT_GED2 0x00000010 -#define CPUSTAT_G 0x00000008 -#define CPUSTAT_GED 0x00000004 -#define CPUSTAT_J 0x00000002 -#define CPUSTAT_P 0x00000001 - -struct kvm_s390_sie_block { - atomic_t cpuflags; /* 0x0000 */ - __u32 : 1; /* 0x0004 */ - __u32 prefix : 18; - __u32 : 1; - __u32 ibc : 12; - __u8 reserved08[4]; /* 0x0008 */ -#define PROG_IN_SIE (1<<0) - __u32 prog0c; /* 0x000c */ - union { - __u8 reserved10[16]; /* 0x0010 */ - struct { - __u64 pv_handle_cpu; - __u64 pv_handle_config; - }; - }; -#define PROG_BLOCK_SIE (1<<0) -#define PROG_REQUEST (1<<1) - atomic_t prog20; /* 0x0020 */ - __u8 reserved24[4]; /* 0x0024 */ - __u64 cputm; /* 0x0028 */ - __u64 ckc; /* 0x0030 */ - __u64 epoch; /* 0x0038 */ - __u32 svcc; /* 0x0040 */ -#define LCTL_CR0 0x8000 -#define LCTL_CR6 0x0200 -#define LCTL_CR9 0x0040 -#define LCTL_CR10 0x0020 -#define LCTL_CR11 0x0010 -#define LCTL_CR14 0x0002 - __u16 lctl; /* 0x0044 */ - __s16 icpua; /* 0x0046 */ -#define ICTL_OPEREXC 0x80000000 -#define ICTL_PINT 0x20000000 -#define ICTL_LPSW 0x00400000 -#define ICTL_STCTL 0x00040000 -#define ICTL_ISKE 0x00004000 -#define ICTL_SSKE 0x00002000 -#define ICTL_RRBE 0x00001000 -#define ICTL_TPROT 0x00000200 - __u32 ictl; /* 0x0048 */ -#define ECA_CEI 0x80000000 -#define ECA_IB 0x40000000 -#define ECA_SIGPI 0x10000000 -#define ECA_MVPGI 0x01000000 -#define ECA_AIV 0x00200000 -#define ECA_VX 0x00020000 -#define ECA_PROTEXCI 0x00002000 -#define ECA_APIE 0x00000008 -#define ECA_SII 0x00000001 - __u32 eca; /* 0x004c */ -#define ICPT_INST 0x04 -#define ICPT_PROGI 0x08 -#define ICPT_INSTPROGI 0x0C -#define ICPT_EXTREQ 0x10 -#define ICPT_EXTINT 0x14 -#define ICPT_IOREQ 0x18 -#define ICPT_WAIT 0x1c -#define ICPT_VALIDITY 0x20 -#define ICPT_STOP 0x28 -#define ICPT_OPEREXC 0x2C -#define ICPT_PARTEXEC 0x38 -#define ICPT_IOINST 0x40 -#define ICPT_KSS 0x5c -#define ICPT_MCHKREQ 0x60 -#define ICPT_INT_ENABLE 0x64 -#define ICPT_PV_INSTR 0x68 -#define ICPT_PV_NOTIFY 0x6c -#define ICPT_PV_PREF 0x70 - __u8 icptcode; /* 0x0050 */ - __u8 icptstatus; /* 0x0051 */ - __u16 ihcpu; /* 0x0052 */ - __u8 reserved54; /* 0x0054 */ -#define IICTL_CODE_NONE 0x00 -#define IICTL_CODE_MCHK 0x01 -#define IICTL_CODE_EXT 0x02 -#define IICTL_CODE_IO 0x03 -#define IICTL_CODE_RESTART 0x04 -#define IICTL_CODE_SPECIFICATION 0x10 -#define IICTL_CODE_OPERAND 0x11 - __u8 iictl; /* 0x0055 */ - __u16 ipa; /* 0x0056 */ - __u32 ipb; /* 0x0058 */ - __u32 scaoh; /* 0x005c */ -#define FPF_BPBC 0x20 - __u8 fpf; /* 0x0060 */ -#define ECB_GS 0x40 -#define ECB_TE 0x10 -#define ECB_SPECI 0x08 -#define ECB_SRSI 0x04 -#define ECB_HOSTPROTINT 0x02 - __u8 ecb; /* 0x0061 */ -#define ECB2_CMMA 0x80 -#define ECB2_IEP 0x20 -#define ECB2_PFMFI 0x08 -#define ECB2_ESCA 0x04 - __u8 ecb2; /* 0x0062 */ -#define ECB3_DEA 0x08 -#define ECB3_AES 0x04 -#define ECB3_RI 0x01 - __u8 ecb3; /* 0x0063 */ - __u32 scaol; /* 0x0064 */ - __u8 sdf; /* 0x0068 */ - __u8 epdx; /* 0x0069 */ - __u8 cpnc; /* 0x006a */ - __u8 reserved6b; /* 0x006b */ - __u32 todpr; /* 0x006c */ -#define GISA_FORMAT1 0x00000001 - __u32 gd; /* 0x0070 */ - __u8 reserved74[12]; /* 0x0074 */ - __u64 mso; /* 0x0080 */ - __u64 msl; /* 0x0088 */ - psw_t gpsw; /* 0x0090 */ - __u64 gg14; /* 0x00a0 */ - __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[8]; /* 0x00b0 */ -#define HPID_KVM 0x4 -#define HPID_VSIE 0x5 - __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[7]; /* 0x00b9 */ - union { - struct { - __u32 eiparams; /* 0x00c0 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ - }; - __u64 mcic; /* 0x00c0 */ - } __packed; - __u32 reservedc8; /* 0x00c8 */ - union { - struct { - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - }; - __u32 edc; /* 0x00cc */ - } __packed; - union { - struct { - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ - }; - __u64 faddr; /* 0x00d0 */ - } __packed; - __u64 peraddr; /* 0x00d8 */ - __u8 eai; /* 0x00e0 */ - __u8 peraid; /* 0x00e1 */ - __u8 oai; /* 0x00e2 */ - __u8 armid; /* 0x00e3 */ - __u8 reservede4[4]; /* 0x00e4 */ - union { - __u64 tecmc; /* 0x00e8 */ - struct { - __u16 subchannel_id; /* 0x00e8 */ - __u16 subchannel_nr; /* 0x00ea */ - __u32 io_int_parm; /* 0x00ec */ - __u32 io_int_word; /* 0x00f0 */ - }; - } __packed; - __u8 reservedf4[8]; /* 0x00f4 */ -#define CRYCB_FORMAT_MASK 0x00000003 -#define CRYCB_FORMAT0 0x00000000 -#define CRYCB_FORMAT1 0x00000001 -#define CRYCB_FORMAT2 0x00000003 - __u32 crycbd; /* 0x00fc */ - __u64 gcr[16]; /* 0x0100 */ - union { - __u64 gbea; /* 0x0180 */ - __u64 sidad; - }; - __u8 reserved188[8]; /* 0x0188 */ - __u64 sdnxo; /* 0x0190 */ - __u8 reserved198[8]; /* 0x0198 */ - __u32 fac; /* 0x01a0 */ - __u8 reserved1a4[20]; /* 0x01a4 */ - __u64 cbrlo; /* 0x01b8 */ - __u8 reserved1c0[8]; /* 0x01c0 */ -#define ECD_HOSTREGMGMT 0x20000000 -#define ECD_MEF 0x08000000 -#define ECD_ETOKENF 0x02000000 -#define ECD_ECC 0x00200000 - __u32 ecd; /* 0x01c8 */ - __u8 reserved1cc[18]; /* 0x01cc */ - __u64 pp; /* 0x01de */ - __u8 reserved1e6[2]; /* 0x01e6 */ - __u64 itdba; /* 0x01e8 */ - __u64 riccbd; /* 0x01f0 */ - __u64 gvrd; /* 0x01f8 */ -} __packed __aligned(512); - -struct kvm_s390_itdb { - __u8 data[256]; -}; - -struct sie_page { - struct kvm_s390_sie_block sie_block; - struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[360]; /* 0x0218 */ - __u64 pv_grregs[16]; /* 0x0380 */ - __u8 reserved400[512]; /* 0x0400 */ - struct kvm_s390_itdb itdb; /* 0x0600 */ - __u8 reserved700[2304]; /* 0x0700 */ -}; - struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 exit_userspace; @@ -410,6 +97,7 @@ struct kvm_vcpu_stat { u64 instruction_io_other; u64 instruction_lpsw; u64 instruction_lpswe; + u64 instruction_lpswey; u64 instruction_pfmf; u64 instruction_ptff; u64 instruction_sck; @@ -510,6 +198,9 @@ struct kvm_vcpu_stat { #define PGM_REGION_FIRST_TRANS 0x39 #define PGM_REGION_SECOND_TRANS 0x3a #define PGM_REGION_THIRD_TRANS 0x3b +#define PGM_SECURE_STORAGE_ACCESS 0x3d +#define PGM_NON_SECURE_STORAGE_ACCESS 0x3e +#define PGM_SECURE_STORAGE_VIOLATION 0x3f #define PGM_MONITOR 0x40 #define PGM_PER 0x80 #define PGM_CRYPTO_OPERATION 0x119 @@ -726,13 +417,10 @@ struct kvm_vcpu_arch { struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct gs_cb *host_gscb; - struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; struct gmap *gmap; - /* backup location for the currently enabled gmap when scheduled out */ - struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; @@ -748,6 +436,8 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + /* Indicator if the access registers have been loaded from guest */ + bool acrs_loaded; struct kvm_s390_pv_vcpu pv; union diag318_info diag318_info; }; @@ -759,6 +449,14 @@ struct kvm_vm_stat { u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; + u64 aen_forward; + u64 gmap_shadow_create; + u64 gmap_shadow_reuse; + u64 gmap_shadow_r1_entry; + u64 gmap_shadow_r2_entry; + u64 gmap_shadow_r3_entry; + u64 gmap_shadow_sg_entry; + u64 gmap_shadow_pg_entry; }; struct kvm_arch_memory_slot { @@ -793,12 +491,14 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ - __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64]; struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; unsigned short ibc; + /* subset of available UV-features for pv-guests enabled by user space */ + struct kvm_s390_vm_cpu_uv_feat uv_feat_guest; }; typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); @@ -896,12 +596,14 @@ struct sie_page2 { u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; +struct vsie_page; + struct kvm_s390_vsie { struct mutex mutex; struct radix_tree_root addr_to_page; int page_count; int next; - struct page *pages[KVM_MAX_VCPUS]; + struct vsie_page *pages[KVM_MAX_VCPUS]; }; struct kvm_s390_gisa_iam { @@ -923,6 +625,10 @@ struct kvm_s390_pv { u64 guest_len; unsigned long stor_base; void *stor_var; + bool dumping; + void *set_aside; + struct list_head need_cleanup; + struct mmu_notifier mmu_notifier; }; struct kvm_arch{ @@ -939,6 +645,7 @@ struct kvm_arch{ int use_cmma; int use_pfmfi; int use_skf; + int use_zpci_interp; int user_cpu_state_ctrl; int user_sigp; int user_stsi; @@ -962,6 +669,8 @@ struct kvm_arch{ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -994,15 +703,22 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); -extern int sie64a(struct kvm_s390_sie_block *, u64 *); +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa, + unsigned long gasce); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa, unsigned long gasce) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa, gasce); +} + extern char sie_exit; +bool kvm_s390_pv_is_protected(struct kvm *kvm); +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} @@ -1012,4 +728,14 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +struct zpci_kvm_hook { + int (*kvm_register)(void *opaque, struct kvm *kvm); + void (*kvm_unregister)(void *opaque); +}; + +extern struct zpci_kvm_hook zpci_kvm_hook; + #endif diff --git a/arch/s390/include/asm/kvm_host_types.h b/arch/s390/include/asm/kvm_host_types.h new file mode 100644 index 000000000000..1394d3fb648f --- /dev/null +++ b/arch/s390/include/asm/kvm_host_types.h @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_KVM_HOST_TYPES_H +#define _ASM_KVM_HOST_TYPES_H + +#include <linux/atomic.h> +#include <linux/types.h> + +#define KVM_S390_BSCA_CPU_SLOTS 64 +#define KVM_S390_ESCA_CPU_SLOTS 248 + +#define SIGP_CTRL_C 0x80 +#define SIGP_CTRL_SCN_MASK 0x3f + +union bsca_sigp_ctrl { + __u8 value; + struct { + __u8 c : 1; + __u8 r : 1; + __u8 scn : 6; + }; +}; + +union esca_sigp_ctrl { + __u16 value; + struct { + __u8 c : 1; + __u8 reserved: 7; + __u8 scn; + }; +}; + +struct esca_entry { + union esca_sigp_ctrl sigp_ctrl; + __u16 reserved1[3]; + __u64 sda; + __u64 reserved2[6]; +}; + +struct bsca_entry { + __u8 reserved0; + union bsca_sigp_ctrl sigp_ctrl; + __u16 reserved[3]; + __u64 sda; + __u64 reserved2[2]; +}; + +union ipte_control { + unsigned long val; + struct { + unsigned long k : 1; + unsigned long kh : 31; + unsigned long kg : 32; + }; +}; + +/* + * Utility is defined as two bytes but having it four bytes wide + * generates more efficient code. Since the following bytes are + * reserved this makes no functional difference. + */ +union sca_utility { + __u32 val; + struct { + __u32 mtcr : 1; + __u32 : 31; + }; +}; + +struct bsca_block { + union ipte_control ipte_control; + __u64 reserved[5]; + __u64 mcn; + union sca_utility utility; + __u8 reserved2[4]; + struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; +}; + +struct esca_block { + union ipte_control ipte_control; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[4]; + __u64 mcn[4]; + __u64 reserved3[20]; + struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; +}; + +/* + * This struct is used to store some machine check info from lowcore + * for machine checks that happen while the guest is running. + * This info in host's lowcore might be overwritten by a second machine + * check from host when host is in the machine check's high-level handling. + * The size is 24 bytes. + */ +struct mcck_volatile_info { + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 reserved; +}; + +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + +#define SIDAD_SIZE_MASK 0xff +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + +struct kvm_s390_sie_block { + atomic_t cpuflags; /* 0x0000 */ + __u32 : 1; /* 0x0004 */ + __u32 prefix : 18; + __u32 : 1; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE (1<<0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE (1<<0) +#define PROG_REQUEST (1<<1) + atomic_t prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 0x2C +#define ICPT_PARTEXEC 0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + psw_t gpsw; /* 0x0090 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 +#define ECD_HMAC 0x00004000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +struct kvm_s390_itdb { + __u8 data[256]; +}; + +struct sie_page { + struct kvm_s390_sie_block sie_block; + struct mcck_volatile_info mcck_info; /* 0x0200 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ + struct kvm_s390_itdb itdb; /* 0x0600 */ + __u8 reserved700[2304]; /* 0x0700 */ +}; + +#endif /* _ASM_KVM_HOST_TYPES_H */ diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index c76777b15fec..df3fb7d8227b 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -4,7 +4,7 @@ #include <linux/stringify.h> -#define __ALIGN .align 16, 0x07 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07 #define __ALIGN_STR __stringify(__ALIGN) #endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 26fe5e535728..e99e9c87b1ce 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -10,13 +10,20 @@ #define _ASM_S390_LOWCORE_H #include <linux/types.h> +#include <asm/machine.h> #include <asm/ptrace.h> +#include <asm/ctlreg.h> #include <asm/cpu.h> #include <asm/types.h> +#include <asm/alternative.h> #define LC_ORDER 1 #define LC_PAGES 2 +#define LOWCORE_ALT_ADDRESS _AC(0x70000, UL) + +#ifndef __ASSEMBLY__ + struct pgm_tdb { u64 data[32]; }; @@ -92,12 +99,11 @@ struct lowcore { psw_t io_new_psw; /* 0x01f0 */ /* Save areas. */ - __u64 save_area_sync[8]; /* 0x0200 */ - __u64 save_area_async[8]; /* 0x0240 */ + __u64 save_area[8]; /* 0x0200 */ + __u8 pad_0x0240[0x0280-0x0240]; /* 0x0240 */ __u64 save_area_restart[1]; /* 0x0280 */ - /* CPU flags. */ - __u64 cpu_flags; /* 0x0288 */ + __u64 pcpu; /* 0x0288 */ /* Return psws. */ psw_t return_psw; /* 0x0290 */ @@ -118,10 +124,10 @@ struct lowcore { __u64 avg_steal_timer; /* 0x0300 */ __u64 last_update_timer; /* 0x0308 */ __u64 last_update_clock; /* 0x0310 */ - __u64 int_clock; /* 0x0318*/ - __u64 mcck_clock; /* 0x0320 */ + __u64 int_clock; /* 0x0318 */ + __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ __u64 clock_comparator; /* 0x0328 */ - __u64 boot_clock[2]; /* 0x0330 */ + __u8 pad_0x0330[0x0340-0x0330]; /* 0x0330 */ /* Current process. */ __u64 current_task; /* 0x0340 */ @@ -139,8 +145,8 @@ struct lowcore { __u32 restart_flags; /* 0x0384 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0388 */ - __u64 user_asce; /* 0x0390 */ + struct ctlreg kernel_asce; /* 0x0388 */ + struct ctlreg user_asce; /* 0x0390 */ /* * The lpp and current_pid fields form a @@ -156,12 +162,9 @@ struct lowcore { __s32 preempt_count; /* 0x03a8 */ __u32 spinlock_lockval; /* 0x03ac */ __u32 spinlock_index; /* 0x03b0 */ - __u32 fpu_flags; /* 0x03b4 */ + __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ __u64 percpu_offset; /* 0x03b8 */ - __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */ - __u64 machine_flags; /* 0x03c8 */ - __u64 gmap; /* 0x03d0 */ - __u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */ + __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ __u32 return_lpswe; /* 0x0400 */ __u32 return_mcck_lpswe; /* 0x0404 */ @@ -199,18 +202,33 @@ struct lowcore { __u32 clock_comp_save_area[2]; /* 0x1330 */ __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ - __u64 cregs_save_area[16]; /* 0x1380 */ + struct ctlreg cregs_save_area[16]; /* 0x1380 */ __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ /* Cryptography-counter designation */ __u64 ccd; /* 0x1500 */ - __u8 pad_0x1508[0x1800-0x1508]; /* 0x1508 */ + /* AI-extension counter designation */ + __u64 aicd; /* 0x1508 */ + __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */ /* Transaction abort diagnostic block */ struct pgm_tdb pgm_tdb; /* 0x1800 */ __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ } __packed __aligned(8192); -#define S390_lowcore (*((struct lowcore *) 0)) +static __always_inline struct lowcore *get_lowcore(void) +{ + struct lowcore *lc; + + if (__is_defined(__DECOMPRESSOR)) + return NULL; + asm_inline( + ALTERNATIVE(" lghi %[lc],0", + " llilh %[lc],%[alt]", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [lc] "=d" (lc) + : [alt] "i" (LOWCORE_ALT_ADDRESS >> 16)); + return lc; +} extern struct lowcore *lowcore_ptr[]; @@ -219,12 +237,19 @@ static inline void set_prefix(__u32 address) asm volatile("spx %0" : : "Q" (address) : "memory"); } -static inline __u32 store_prefix(void) -{ - __u32 address; +#else /* __ASSEMBLY__ */ - asm volatile("stpx %0" : "=Q" (address)); - return address; -} +.macro GET_LC reg + ALTERNATIVE "lghi \reg,0", \ + __stringify(llilh \reg, LOWCORE_ALT_ADDRESS >> 16), \ + ALT_FEATURE(MFEATURE_LOWCORE) +.endm + +.macro STMG_LC start, end, savearea + ALTERNATIVE "stmg \start, \end, \savearea", \ + __stringify(stmg \start, \end, LOWCORE_ALT_ADDRESS + \savearea), \ + ALT_FEATURE(MFEATURE_LOWCORE) +.endm +#endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h new file mode 100644 index 000000000000..50225940d971 --- /dev/null +++ b/arch/s390/include/asm/maccess.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_MACCESS_H +#define __ASM_S390_MACCESS_H + +#include <linux/types.h> + +#define MEMCPY_REAL_SIZE PAGE_SIZE +#define MEMCPY_REAL_MASK PAGE_MASK + +struct iov_iter; + +extern unsigned long __memcpy_real_area; +extern pte_t *memcpy_real_ptep; +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); +int memcpy_real(void *dest, unsigned long src, size_t count); +#ifdef CONFIG_CRASH_DUMP +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +#endif + +#endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/machine.h b/arch/s390/include/asm/machine.h new file mode 100644 index 000000000000..8abe5afdbfc4 --- /dev/null +++ b/arch/s390/include/asm/machine.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2024 + */ + +#ifndef __ASM_S390_MACHINE_H +#define __ASM_S390_MACHINE_H + +#include <linux/const.h> + +#define MFEATURE_LOWCORE 0 +#define MFEATURE_PCI_MIO 1 +#define MFEATURE_SCC 2 +#define MFEATURE_TLB_GUEST 3 +#define MFEATURE_TX 4 +#define MFEATURE_ESOP 5 +#define MFEATURE_DIAG9C 6 +#define MFEATURE_VM 7 +#define MFEATURE_KVM 8 +#define MFEATURE_LPAR 9 +#define MFEATURE_DIAG288 10 + +#ifndef __ASSEMBLY__ + +#include <linux/bitops.h> +#include <asm/alternative.h> + +extern unsigned long machine_features[1]; + +#define MAX_MFEATURE_BIT (sizeof(machine_features) * BITS_PER_BYTE) + +static inline void __set_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return; + __set_bit(nr, mfeatures); +} + +static inline void set_machine_feature(unsigned int nr) +{ + __set_machine_feature(nr, machine_features); +} + +static inline void __clear_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return; + __clear_bit(nr, mfeatures); +} + +static inline void clear_machine_feature(unsigned int nr) +{ + __clear_machine_feature(nr, machine_features); +} + +static bool __test_machine_feature(unsigned int nr, unsigned long *mfeatures) +{ + if (nr >= MAX_MFEATURE_BIT) + return false; + return test_bit(nr, mfeatures); +} + +static bool test_machine_feature(unsigned int nr) +{ + return __test_machine_feature(nr, machine_features); +} + +static __always_inline bool __test_machine_feature_constant(unsigned int nr) +{ + asm goto( + ALTERNATIVE("brcl 15,%l[l_no]", "brcl 0,0", ALT_FEATURE(%[nr])) + : + : [nr] "i" (nr) + : + : l_no); + return true; +l_no: + return false; +} + +#define DEFINE_MACHINE_HAS_FEATURE(name, feature) \ +static __always_inline bool machine_has_##name(void) \ +{ \ + if (!__is_defined(__DECOMPRESSOR) && __builtin_constant_p(feature)) \ + return __test_machine_feature_constant(feature); \ + return test_machine_feature(feature); \ +} + +DEFINE_MACHINE_HAS_FEATURE(relocated_lowcore, MFEATURE_LOWCORE) +DEFINE_MACHINE_HAS_FEATURE(scc, MFEATURE_SCC) +DEFINE_MACHINE_HAS_FEATURE(tlb_guest, MFEATURE_TLB_GUEST) +DEFINE_MACHINE_HAS_FEATURE(tx, MFEATURE_TX) +DEFINE_MACHINE_HAS_FEATURE(esop, MFEATURE_ESOP) +DEFINE_MACHINE_HAS_FEATURE(diag9c, MFEATURE_DIAG9C) +DEFINE_MACHINE_HAS_FEATURE(vm, MFEATURE_VM) +DEFINE_MACHINE_HAS_FEATURE(kvm, MFEATURE_KVM) +DEFINE_MACHINE_HAS_FEATURE(lpar, MFEATURE_LPAR) + +#define machine_is_vm machine_has_vm +#define machine_is_kvm machine_has_kvm +#define machine_is_lpar machine_has_lpar + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_MACHINE_H */ diff --git a/arch/s390/include/asm/march.h b/arch/s390/include/asm/march.h new file mode 100644 index 000000000000..11a71bd14954 --- /dev/null +++ b/arch/s390/include/asm/march.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_S390_MARCH_H +#define __ASM_S390_MARCH_H + +#include <linux/kconfig.h> + +#define MARCH_HAS_Z10_FEATURES 1 + +#ifndef __DECOMPRESSOR + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#define MARCH_HAS_Z196_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES +#define MARCH_HAS_ZEC12_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES +#define MARCH_HAS_Z13_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES +#define MARCH_HAS_Z14_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_Z15_FEATURES +#define MARCH_HAS_Z15_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_Z16_FEATURES +#define MARCH_HAS_Z16_FEATURES 1 +#endif + +#ifdef CONFIG_HAVE_MARCH_Z17_FEATURES +#define MARCH_HAS_Z17_FEATURES 1 +#endif + +#endif /* __DECOMPRESSOR */ + +#endif /* __ASM_S390_MARCH_H */ diff --git a/arch/s390/include/asm/mem_detect.h b/arch/s390/include/asm/mem_detect.h deleted file mode 100644 index a7c922a69050..000000000000 --- a/arch/s390/include/asm/mem_detect.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_MEM_DETECT_H -#define _ASM_S390_MEM_DETECT_H - -#include <linux/types.h> - -enum mem_info_source { - MEM_DETECT_NONE = 0, - MEM_DETECT_SCLP_STOR_INFO, - MEM_DETECT_DIAG260, - MEM_DETECT_SCLP_READ_INFO, - MEM_DETECT_BIN_SEARCH -}; - -struct mem_detect_block { - u64 start; - u64 end; -}; - -/* - * Storage element id is defined as 1 byte (up to 256 storage elements). - * In practise only storage element id 0 and 1 are used). - * According to architecture one storage element could have as much as - * 1020 subincrements. 255 mem_detect_blocks are embedded in mem_detect_info. - * If more mem_detect_blocks are required, a block of memory from already - * known mem_detect_block is taken (entries_extended points to it). - */ -#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ - -struct mem_detect_info { - u32 count; - u8 info_source; - struct mem_detect_block entries[MEM_INLINED_ENTRIES]; - struct mem_detect_block *entries_extended; -}; -extern struct mem_detect_info mem_detect; - -void add_mem_detect_block(u64 start, u64 end); - -static inline int __get_mem_detect_block(u32 n, unsigned long *start, - unsigned long *end) -{ - if (n >= mem_detect.count) { - *start = 0; - *end = 0; - return -1; - } - - if (n < MEM_INLINED_ENTRIES) { - *start = (unsigned long)mem_detect.entries[n].start; - *end = (unsigned long)mem_detect.entries[n].end; - } else { - *start = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].start; - *end = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].end; - } - return 0; -} - -/** - * for_each_mem_detect_block - early online memory range iterator - * @i: an integer used as loop variable - * @p_start: ptr to unsigned long for start address of the range - * @p_end: ptr to unsigned long for end address of the range - * - * Walks over detected online memory ranges. - */ -#define for_each_mem_detect_block(i, p_start, p_end) \ - for (i = 0, __get_mem_detect_block(i, p_start, p_end); \ - i < mem_detect.count; \ - i++, __get_mem_detect_block(i, p_start, p_end)) - -static inline void get_mem_detect_reserved(unsigned long *start, - unsigned long *size) -{ - *start = (unsigned long)mem_detect.entries_extended; - if (mem_detect.count > MEM_INLINED_ENTRIES) - *size = (mem_detect.count - MEM_INLINED_ENTRIES) * sizeof(struct mem_detect_block); - else - *size = 0; -} - -static inline unsigned long get_mem_detect_end(void) -{ - unsigned long start; - unsigned long end; - - if (mem_detect.count) { - __get_mem_detect_block(mem_detect.count - 1, &start, &end); - return end; - } - return 0; -} - -#endif diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 08a8b96606d7..b85e13505a0f 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,8 +4,8 @@ #ifndef __ASSEMBLY__ -int set_memory_encrypted(unsigned long addr, int numpages); -int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 82aae78e1315..f07e49b419ab 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -11,49 +11,35 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - struct list_head pgtable_list; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ - atomic_t is_protected; + atomic_t protected_count; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. As they are only * written once, they can be read without a lock. - * - * The mmu context allocates 4K page tables. */ - unsigned int alloc_pgste:1; /* The mmu context uses extended page tables. */ unsigned int has_pgste:1; /* The mmu context uses storage keys. */ unsigned int uses_skeys:1; /* The mmu context uses CMM. */ unsigned int uses_cmm:1; + /* + * The mmu context allows COW-sharing of memory pages (KSM, zeropage). + * Note that COW-sharing during fork() is currently always allowed. + */ + unsigned int allow_cow_sharing:1; /* The gmaps associated with this context are allowed to use huge pages. */ unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), -static inline int tprot(unsigned long addr) -{ - int rc = -EFAULT; - - asm volatile( - " tprot 0(%1),0\n" - "0: ipm %0\n" - " srl %0,28\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "a" (addr) : "cc"); - return rc; -} - #endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c7937f369e62..d9b8501bc93d 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,7 +12,8 @@ #include <linux/uaccess.h> #include <linux/mm_types.h> #include <asm/tlbflush.h> -#include <asm/ctl_reg.h> +#include <asm/ctlreg.h> +#include <asm/asce.h> #include <asm-generic/mm_hooks.h> #define init_new_context init_new_context @@ -22,20 +23,17 @@ static inline int init_new_context(struct task_struct *tsk, unsigned long asce_type, init_entry; spin_lock_init(&mm->context.lock); - INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); - atomic_set(&mm->context.is_protected, 0); + atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE - mm->context.alloc_pgste = page_table_allocate_pgste || - test_thread_flag(TIF_PGSTE) || - (current->mm && current->mm->context.alloc_pgste); mm->context.has_pgste = 0; mm->context.uses_skeys = 0; mm->context.uses_cmm = 0; + mm->context.allow_cow_sharing = 1; mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { @@ -76,12 +74,13 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct * int cpu = smp_processor_id(); if (next == &init_mm) - S390_lowcore.user_asce = s390_invalid_asce; + get_lowcore()->user_asce = s390_invalid_asce; else - S390_lowcore.user_asce = next->context.asce; + get_lowcore()->user_asce.val = next->context.asce; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - /* Clear previous user-ASCE from CR7 */ - __ctl_load(s390_invalid_asce, 7, 7); + /* Clear previous user-ASCE from CR1 and CR7 */ + local_ctl_load(1, &s390_invalid_asce); + local_ctl_load(7, &s390_invalid_asce); if (prev != next) cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } @@ -102,6 +101,7 @@ static inline void finish_arch_post_lock_switch(void) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + unsigned long flags; if (mm) { preempt_disable(); @@ -111,16 +111,26 @@ static inline void finish_arch_post_lock_switch(void) __tlb_flush_mm_lazy(mm); preempt_enable(); } - __ctl_load(S390_lowcore.user_asce, 7, 7); + local_irq_save(flags); + if (test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &get_lowcore()->kernel_asce); + else + local_ctl_load(1, &get_lowcore()->user_asce); + local_ctl_load(7, &get_lowcore()->user_asce); + local_irq_restore(flags); } #define activate_mm activate_mm static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { - switch_mm(prev, next, current); + switch_mm_irqs_off(prev, next, current); cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); - __ctl_load(S390_lowcore.user_asce, 7, 7); + if (test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &get_lowcore()->kernel_asce); + else + local_ctl_load(1, &get_lowcore()->user_asce); + local_ctl_load(7, &get_lowcore()->user_asce); } #include <asm-generic/mmu_context.h> diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h deleted file mode 100644 index 73e3e7c6976c..000000000000 --- a/arch/s390/include/asm/mmzone.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Copyright IBM Corp. 2015 - */ - -#ifndef _ASM_S390_MMZONE_H -#define _ASM_S390_MMZONE_H - -#ifdef CONFIG_NUMA - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) - -#endif /* CONFIG_NUMA */ -#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h index 9f1eea15872c..916ab59e458a 100644 --- a/arch/s390/include/asm/module.h +++ b/arch/s390/include/asm/module.h @@ -38,4 +38,18 @@ struct mod_arch_specific { #endif /* CONFIG_FUNCTION_TRACER */ }; +static inline const Elf_Shdr *find_section(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + const char *name) +{ + const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + const Elf_Shdr *s, *se; + + for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { + if (strcmp(name, secstrs + s->sh_name) == 0) + return s; + } + return NULL; +} + #endif /* _ASM_S390_MODULE_H */ diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h new file mode 100644 index 000000000000..399343ed9ffb --- /dev/null +++ b/arch/s390/include/asm/msi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MSI_H +#define _ASM_S390_MSI_H +#include <asm-generic/msi.h> + +/* + * Work around S390 not using irq_domain at all so we can't set + * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works: + * + * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/ + * + * Note this is less isolated than the ARM/x86 versions as userspace can trigger + * MSI belonging to kernel devices within the same gisa. + */ +#define arch_is_isolated_msi() true + +#endif diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index af1cd3a6f406..227466ce9e41 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -101,9 +101,8 @@ void nmi_alloc_mcesa_early(u64 *mcesad); int nmi_alloc_mcesa(u64 *mcesad); void nmi_free_mcesa(u64 *mcesad); -void s390_handle_mcck(struct pt_regs *regs); -void __s390_handle_mcck(void); -int s390_do_machine_check(struct pt_regs *regs); +void s390_handle_mcck(void); +void s390_do_machine_check(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_NMI_H */ diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index 82725cf783c7..c7c96282f011 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -5,8 +5,17 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <asm/facility.h> extern int nospec_disable; +extern int nobp; + +static inline bool nobp_enabled(void) +{ + if (__is_defined(__DECOMPRESSOR)) + return false; + return nobp && test_facility(82); +} void nospec_init_branches(void); void nospec_auto_detect(void); @@ -17,6 +26,22 @@ static inline bool nospec_uses_trampoline(void) return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; } +void __s390_indirect_jump_r1(void); +void __s390_indirect_jump_r2(void); +void __s390_indirect_jump_r3(void); +void __s390_indirect_jump_r4(void); +void __s390_indirect_jump_r5(void); +void __s390_indirect_jump_r6(void); +void __s390_indirect_jump_r7(void); +void __s390_indirect_jump_r8(void); +void __s390_indirect_jump_r9(void); +void __s390_indirect_jump_r10(void); +void __s390_indirect_jump_r11(void); +void __s390_indirect_jump_r12(void); +void __s390_indirect_jump_r13(void); +void __s390_indirect_jump_r14(void); +void __s390_indirect_jump_r15(void); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 7e9e99523e95..cb15dd25bf21 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -2,6 +2,7 @@ #ifndef _ASM_S390_NOSPEC_ASM_H #define _ASM_S390_NOSPEC_ASM_H +#include <linux/linkage.h> #include <asm/dwarf.h> #ifdef __ASSEMBLY__ @@ -15,24 +16,25 @@ */ .macro __THUNK_PROLOG_NAME name #ifdef CONFIG_EXPOLINE_EXTERN - .pushsection .text,"ax",@progbits - .align 16,0x07 + SYM_CODE_START(\name) #else .pushsection .text.\name,"axG",@progbits,\name,comdat -#endif .globl \name .hidden \name .type \name,@function \name: CFI_STARTPROC +#endif .endm .macro __THUNK_EPILOG_NAME name - CFI_ENDPROC #ifdef CONFIG_EXPOLINE_EXTERN - .size \name, .-\name -#endif + SYM_CODE_END(\name) + EXPORT_SYMBOL(\name) +#else + CFI_ENDPROC .popsection +#endif .endm .macro __THUNK_PROLOG_BR r1 diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index 147a8d547ef9..3ee9e8f5ceae 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -8,15 +8,34 @@ #ifndef _ASM_S390_OS_INFO_H #define _ASM_S390_OS_INFO_H +#include <linux/uio.h> + #define OS_INFO_VERSION_MAJOR 1 #define OS_INFO_VERSION_MINOR 1 #define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */ #define OS_INFO_VMCOREINFO 0 #define OS_INFO_REIPL_BLOCK 1 +#define OS_INFO_FLAGS_ENTRY 2 +#define OS_INFO_RESERVED 3 +#define OS_INFO_IDENTITY_BASE 4 +#define OS_INFO_KASLR_OFFSET 5 +#define OS_INFO_KASLR_OFF_PHYS 6 +#define OS_INFO_VMEMMAP 7 +#define OS_INFO_AMODE31_START 8 +#define OS_INFO_AMODE31_END 9 +#define OS_INFO_IMAGE_START 10 +#define OS_INFO_IMAGE_END 11 +#define OS_INFO_IMAGE_PHYS 12 +#define OS_INFO_MAX 13 + +#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) struct os_info_entry { - u64 addr; + union { + u64 addr; + u64 val; + }; u64 size; u32 csum; } __packed; @@ -28,18 +47,24 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[2]; - u8 reserved[4024]; + struct os_info_entry entry[OS_INFO_MAX]; + u8 reserved[3804]; } __packed; void os_info_init(void); -void os_info_entry_add(int nr, void *ptr, u64 len); +void os_info_entry_add_data(int nr, void *ptr, u64 len); +void os_info_entry_add_val(int nr, u64 val); void os_info_crashkernel_add(unsigned long base, unsigned long size); u32 os_info_csum(struct os_info *os_info); #ifdef CONFIG_CRASH_DUMP void *os_info_old_entry(int nr, unsigned long *size); -int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +static inline unsigned long os_info_old_value(int nr) +{ + unsigned long size; + + return (unsigned long)os_info_old_entry(nr, &size); +} #else static inline void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index c33c4deb545f..794fdb21500a 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -7,6 +7,8 @@ #ifndef PAGE_STATES_H #define PAGE_STATES_H +#include <asm/page.h> + #define ESSA_GET_STATE 0 #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 @@ -18,4 +20,60 @@ #define ESSA_MAX ESSA_SET_STABLE_NODAT +extern int cmma_flag; + +static __always_inline unsigned long essa(unsigned long paddr, unsigned char cmd) +{ + unsigned long rc; + + asm volatile( + " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" + : [rc] "=d" (rc) + : [paddr] "d" (paddr), + [cmd] "i" (cmd)); + return rc; +} + +static __always_inline void __set_page_state(void *addr, unsigned long num_pages, unsigned char cmd) +{ + unsigned long paddr = __pa(addr) & PAGE_MASK; + + while (num_pages--) { + essa(paddr, cmd); + paddr += PAGE_SIZE; + } +} + +static inline void __set_page_unused(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_UNUSED); +} + +static inline void __set_page_stable_dat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE); +} + +static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT); +} + +static inline void __arch_set_page_nodat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + __set_page_stable_dat(addr, num_pages); + else + __set_page_stable_nodat(addr, num_pages); +} + +static inline void __arch_set_page_dat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + __set_page_stable_dat(addr, num_pages); +} + #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 61dea67bb9c7..4e5dbabdf202 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -10,16 +10,11 @@ #include <linux/const.h> #include <asm/types.h> +#include <asm/asm.h> -#define _PAGE_SHIFT 12 -#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) -#define _PAGE_MASK (~(_PAGE_SIZE - 1)) +#include <vdso/page.h> -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT _PAGE_SHIFT -#define PAGE_SIZE _PAGE_SIZE -#define PAGE_MASK _PAGE_MASK -#define PAGE_DEFAULT_ACC 0 +#define PAGE_DEFAULT_ACC _AC(0, UL) /* storage-protection override */ #define PAGE_SPO_ACC 9 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) @@ -73,13 +68,14 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) -/* - * These are used to make use of C type-checking.. - */ +#ifdef CONFIG_STRICT_MM_TYPECHECKS +#define STRICT_MM_TYPECHECKS +#endif + +#ifdef STRICT_MM_TYPECHECKS typedef struct { unsigned long pgprot; } pgprot_t; typedef struct { unsigned long pgste; } pgste_t; @@ -88,43 +84,48 @@ typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pud; } pud_t; typedef struct { unsigned long p4d; } p4d_t; typedef struct { unsigned long pgd; } pgd_t; -typedef pte_t *pgtable_t; - -#define pgprot_val(x) ((x).pgprot) -#define pgste_val(x) ((x).pgste) -static inline unsigned long pte_val(pte_t pte) -{ - return pte.pte; +#define DEFINE_PGVAL_FUNC(name) \ +static __always_inline unsigned long name ## _val(name ## _t name) \ +{ \ + return name.name; \ } -static inline unsigned long pmd_val(pmd_t pmd) -{ - return pmd.pmd; -} +#else /* STRICT_MM_TYPECHECKS */ -static inline unsigned long pud_val(pud_t pud) -{ - return pud.pud; -} +typedef unsigned long pgprot_t; +typedef unsigned long pgste_t; +typedef unsigned long pte_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long p4d_t; +typedef unsigned long pgd_t; -static inline unsigned long p4d_val(p4d_t p4d) -{ - return p4d.p4d; +#define DEFINE_PGVAL_FUNC(name) \ +static __always_inline unsigned long name ## _val(name ## _t name) \ +{ \ + return name; \ } -static inline unsigned long pgd_val(pgd_t pgd) -{ - return pgd.pgd; -} +#endif /* STRICT_MM_TYPECHECKS */ + +DEFINE_PGVAL_FUNC(pgprot) +DEFINE_PGVAL_FUNC(pgste) +DEFINE_PGVAL_FUNC(pte) +DEFINE_PGVAL_FUNC(pmd) +DEFINE_PGVAL_FUNC(pud) +DEFINE_PGVAL_FUNC(p4d) +DEFINE_PGVAL_FUNC(pgd) + +typedef pte_t *pgtable_t; +#define __pgprot(x) ((pgprot_t) { (x) } ) #define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) #define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) #define __p4d(x) ((p4d_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) static inline void page_set_storage_key(unsigned long addr, unsigned char skey, int mapped) @@ -149,11 +150,12 @@ static inline int page_reset_referenced(unsigned long addr) int cc; asm volatile( - " rrbe 0,%1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) : "a" (addr) : "cc"); - return cc; + " rrbe 0,%[addr]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [addr] "a" (addr) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } /* Bits int the storage key */ @@ -163,9 +165,9 @@ static inline int page_reset_referenced(unsigned long addr) #define _PAGE_ACC_BITS 0xf0 /* HW access control bits */ struct page; +struct folio; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); -void arch_set_page_dat(struct page *page, int order); static inline int devmem_is_allowed(unsigned long pfn) { @@ -175,37 +177,114 @@ static inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE -#if IS_ENABLED(CONFIG_PGSTE) -int arch_make_page_accessible(struct page *page); -#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +int arch_make_folio_accessible(struct folio *folio); +#define HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE + +struct vm_layout { + unsigned long kaslr_offset; + unsigned long kaslr_offset_phys; + unsigned long identity_base; + unsigned long identity_size; +}; + +extern struct vm_layout vm_layout; + +#define __kaslr_offset vm_layout.kaslr_offset +#define __kaslr_offset_phys vm_layout.kaslr_offset_phys +#ifdef CONFIG_RANDOMIZE_IDENTITY_BASE +#define __identity_base vm_layout.identity_base +#else +#define __identity_base 0UL #endif +#define ident_map_size vm_layout.identity_size -#endif /* !__ASSEMBLY__ */ +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + +extern int __kaslr_enabled; +static inline int kaslr_enabled(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return __kaslr_enabled; + return 0; +} + +#define __PAGE_OFFSET __identity_base +#define PAGE_OFFSET __PAGE_OFFSET -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL +#ifdef __DECOMPRESSOR -#define __pa(x) ((unsigned long)(x)) +#define __pa_nodebug(x) ((unsigned long)(x)) +#define __pa(x) __pa_nodebug(x) +#define __pa32(x) __pa(x) #define __va(x) ((void *)(unsigned long)(x)) +#else /* __DECOMPRESSOR */ + +static inline unsigned long __pa_nodebug(unsigned long x) +{ + if (x < __kaslr_offset) + return x - __identity_base; + return x - __kaslr_offset + __kaslr_offset_phys; +} + +#ifdef CONFIG_DEBUG_VIRTUAL + +unsigned long __phys_addr(unsigned long x, bool is_31bit); + +#else /* CONFIG_DEBUG_VIRTUAL */ + +static inline unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + return __pa_nodebug(x); +} + +#endif /* CONFIG_DEBUG_VIRTUAL */ + +#define __pa(x) __phys_addr((unsigned long)(x), false) +#define __pa32(x) __phys_addr((unsigned long)(x), true) +#define __va(x) ((void *)((unsigned long)(x) + __identity_base)) + +#endif /* __DECOMPRESSOR */ + #define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) -#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) -#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) +#define phys_to_folio(phys) page_folio(phys_to_page(phys)) +#define folio_to_phys(page) pfn_to_phys(folio_pfn(folio)) + +static inline void *pfn_to_virt(unsigned long pfn) +{ + return __va(pfn_to_phys(pfn)); +} + +static inline unsigned long virt_to_pfn(const void *kaddr) +{ + return phys_to_pfn(__pa(kaddr)); +} -#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn)) -#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr))) #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) +#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#endif /* !__ASSEMBLY__ */ + #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> +#define AMODE31_SIZE (3 * PAGE_SIZE) + +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE +#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) + +#define TEXT_OFFSET 0x100000 + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h index 5b7e33ac6f0b..ebeabd0aaa51 100644 --- a/arch/s390/include/asm/pai.h +++ b/arch/s390/include/asm/pai.h @@ -11,13 +11,16 @@ #include <linux/jump_label.h> #include <asm/lowcore.h> #include <asm/ptrace.h> +#include <asm/asm.h> struct qpaci_info_block { u64 header; struct { u64 : 8; - u64 num_cc : 8; /* # of supported crypto counters */ - u64 : 48; + u64 num_cc : 8; /* # of supported crypto counters */ + u64 : 9; + u64 num_nnpa : 7; /* # of supported NNPA counters */ + u64 : 32; }; }; @@ -31,17 +34,18 @@ static inline int qpaci(struct qpaci_info_block *info) " lgr 0,%[size]\n" " .insn s,0xb28f0000,%[info]\n" " lgr %[size],0\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size) + CC_IPM(cc) + : CC_OUT(cc, cc), [info] "=Q" (*info), [size] "+&d" (size) : - : "0", "cc", "memory"); - return cc ? (size + 1) * sizeof(u64) : 0; + : CC_CLOBBER_LIST("0", "memory")); + return CC_TRANSFORM(cc) ? (size + 1) * sizeof(u64) : 0; } #define PAI_CRYPTO_BASE 0x1000 /* First event number */ #define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ #define PAI_CRYPTO_KERNEL_OFFSET 2048 +#define PAI_NNPA_BASE 0x1800 /* First event number */ +#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */ DECLARE_STATIC_KEY_FALSE(pai_key); @@ -51,11 +55,11 @@ static __always_inline void pai_kernel_enter(struct pt_regs *regs) return; if (!static_branch_unlikely(&pai_key)) return; - if (!S390_lowcore.ccd) + if (!get_lowcore()->ccd) return; if (!user_mode(regs)) return; - WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET); + WRITE_ONCE(get_lowcore()->ccd, get_lowcore()->ccd | PAI_CRYPTO_KERNEL_OFFSET); } static __always_inline void pai_kernel_exit(struct pt_regs *regs) @@ -64,11 +68,15 @@ static __always_inline void pai_kernel_exit(struct pt_regs *regs) return; if (!static_branch_unlikely(&pai_key)) return; - if (!S390_lowcore.ccd) + if (!get_lowcore()->ccd) return; if (!user_mode(regs)) return; - WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET); + WRITE_ONCE(get_lowcore()->ccd, get_lowcore()->ccd & ~PAI_CRYPTO_KERNEL_OFFSET); } +#define PAI_SAVE_AREA(x) ((x)->hw.event_base) +#define PAI_CPU_MASK(x) ((x)->hw.addr_filters) +#define PAI_SWLIST(x) (&(x)->hw.tp_list) + #endif diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index fdb9745ee998..41f900f693d9 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -6,11 +6,14 @@ #include <linux/mutex.h> #include <linux/iommu.h> #include <linux/pci_hotplug.h> -#include <asm-generic/pci.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/pci_insn.h> #include <asm/sclp.h> +#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 +#define arch_can_pci_mmap_wc() 1 + #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 @@ -96,7 +99,7 @@ struct zpci_bar_struct { u8 size; /* order 2 exponent */ }; -struct s390_domain; +struct kvm_zdev; #define ZPCI_FUNCTIONS_PER_BUS 256 struct zpci_bus { @@ -106,9 +109,10 @@ struct zpci_bus { struct list_head resources; struct list_head bus_next; struct resource bus_resource; - int pchid; + int topo; /* TID if topo_is_tid, PCHID otherwise */ int domain_nr; - bool multifunction; + u8 multifunction : 1; + u8 topo_is_tid : 1; enum pci_bus_speed max_bus_speed; }; @@ -116,28 +120,36 @@ struct zpci_bus { struct zpci_dev { struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ - struct list_head bus_next; + struct list_head iommu_list; struct kref kref; + struct rcu_head rcu; struct hotplug_slot hotplug_slot; + struct mutex state_lock; /* protect state changes */ enum zpci_state state; u32 fid; /* function ID, used by sclp */ u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ + u16 rid; /* RID as supplied by firmware */ + u16 tid; /* Topology for which RID is valid */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u8 port; + u8 fidparm; + u8 dtsm; /* Supported DT mask */ u8 rid_available : 1; u8 has_hp_slot : 1; u8 has_resources : 1; u8 is_physfn : 1; u8 util_str_avail : 1; u8 irqs_registered : 1; - u8 reserved : 2; + u8 tid_avail : 1; + u8 rtr_avail : 1; /* Relaxed translation allowed */ unsigned int devfn; /* DEVFN part of the RID*/ - struct mutex lock; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ @@ -152,16 +164,8 @@ struct zpci_dev { /* DMA stuff */ unsigned long *dma_table; - spinlock_t dma_table_lock; int tlb_refresh; - spinlock_t iommu_bitmap_lock; - unsigned long *iommu_bitmap; - unsigned long *lazy_bitmap; - unsigned long iommu_size; - unsigned long iommu_pages; - unsigned int next_bit; - struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; @@ -173,20 +177,21 @@ struct zpci_dev { u64 dma_mask; /* DMA address space mask */ /* Function measurement block */ + struct mutex fmb_lock; struct zpci_fmb *fmb; u16 fmb_update; /* update interval */ u16 fmb_length; - /* software counters */ - atomic64_t allocated_pages; - atomic64_t mapped_pages; - atomic64_t unmapped_pages; u8 version; enum pci_bus_speed max_bus_speed; struct dentry *debugfs_dev; - struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + /* IOMMU and passthrough */ + struct iommu_domain *s390_domain; /* attached IOMMU domain */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; + spinlock_t dom_lock; /* protect s390_domain change */ }; static inline bool zdev_enabled(struct zpci_dev *zdev) @@ -194,31 +199,44 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) return (zdev->fh & (1UL << 31)) ? true : false; } -extern const struct attribute_group *zpci_attr_groups[]; +extern const struct attribute_group zpci_attr_group; +extern const struct attribute_group pfip_attr_group; +extern const struct attribute_group zpci_ident_attr_group; + +#define ARCH_PCI_DEV_GROUPS &zpci_attr_group, \ + &pfip_attr_group, \ + &zpci_ident_attr_group, + extern unsigned int s390_pci_force_floating __initdata; extern unsigned int s390_pci_no_rid; +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; + /* ----------------------------------------------------------------------------- Prototypes ----------------------------------------------------------------------------- */ /* Base stuff */ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); +int zpci_add_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); +int zpci_reenable_device(struct zpci_dev *zdev); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); void zpci_device_reserved(struct zpci_dev *zdev); bool zpci_is_device_configured(struct zpci_dev *zdev); +int zpci_scan_devices(void); int zpci_hot_reset_device(struct zpci_dev *zdev); -int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); +int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); void zpci_update_fh(struct zpci_dev *zdev, u32 fh); /* CLP */ int clp_setup_writeback_mio(void); -int clp_scan_pci_devices(void); +int clp_scan_pci_devices(struct list_head *scan_list); int clp_query_pci_fn(struct zpci_dev *zdev); int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as); int clp_disable_fh(struct zpci_dev *zdev, u32 *fh); @@ -231,6 +249,7 @@ void update_uid_checking(bool new); /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); +int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status); #ifdef CONFIG_PCI static inline bool zpci_use_mio(struct zpci_dev *zdev) diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 1f4b666e85ee..7ebff39c84b3 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -50,6 +50,9 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +/* PCI function type numbers */ +#define PCI_FUNC_TYPE_ISM 0x5 /* ISM device */ + extern bool zpci_unique_uid; struct clp_rsp_slpc_pci { @@ -107,7 +110,8 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 3; + u16 : 2; + u16 tid_avail : 1; u16 rid_avail : 1; u16 is_physfn : 1; u16 reserved1 : 1; @@ -119,16 +123,18 @@ struct clp_rsp_query_pci { u16 pchid; __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ - u16 : 12; - u16 port : 4; + u8 fidparm; + u8 reserved3 : 4; + u8 port : 4; u8 fmb_len; u8 pft; /* pci function type */ u64 sdma; /* start dma as */ u64 edma; /* end dma as */ #define ZPCI_RID_MASK_DEVFN 0x00ff u16 rid; /* BUS/DEVFN PCI address */ - u16 reserved0; - u32 reserved[10]; + u32 reserved0; + u16 tid; + u32 reserved[9]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ u32 reserved2[16]; @@ -150,12 +156,16 @@ struct clp_rsp_query_pci_grp { u16 : 4; u16 noi : 12; /* number of interrupts */ u8 version; - u8 : 6; + u8 : 2; + u8 rtr : 1; /* Relaxed translation requirement */ + u8 : 3; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ - u16 reserved2; + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ u16 mui; - u16 : 16; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; u16 maxfaal; u16 : 4; u16 dnoi : 12; @@ -173,7 +183,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas; /* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gisa; /* GISA designation */ } __packed; /* Set PCI function response */ diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 91e63426bdc5..d12e17201661 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -25,6 +25,7 @@ enum zpci_ioat_dtype { #define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) #define ZPCI_TABLE_SIZE_RT (1UL << 42) +#define ZPCI_TABLE_SIZE_RS (1UL << 53) #define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) #define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) @@ -55,6 +56,8 @@ enum zpci_ioat_dtype { #define ZPCI_PT_BITS 8 #define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT) #define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS) +#define ZPCI_RS_SHIFT (ZPCI_RT_SHIFT + ZPCI_TABLE_BITS) +#define ZPCI_RF_SHIFT (ZPCI_RS_SHIFT + ZPCI_TABLE_BITS) #define ZPCI_RTE_FLAG_MASK 0x3fffUL #define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK) @@ -82,116 +85,16 @@ enum zpci_ioat_dtype { #define ZPCI_TABLE_VALID_MASK 0x20 #define ZPCI_TABLE_PROT_MASK 0x200 -static inline unsigned int calc_rtx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_sx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_px(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; -} - -static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) -{ - *entry &= ZPCI_PTE_FLAG_MASK; - *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); -} - -static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) -{ - *entry &= ZPCI_RTE_FLAG_MASK; - *entry |= (sto & ZPCI_RTE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_RTX; -} - -static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) -{ - *entry &= ZPCI_STE_FLAG_MASK; - *entry |= (pto & ZPCI_STE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_SX; -} - -static inline void validate_rt_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry &= ~ZPCI_TABLE_OFFSET_MASK; - *entry |= ZPCI_TABLE_VALID; - *entry |= ZPCI_TABLE_LEN_RTX; -} - -static inline void validate_st_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_VALID; -} - -static inline void invalidate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_INVALID; -} - -static inline void validate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_VALID; -} - -static inline void entry_set_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_PROTECTED; -} - -static inline void entry_clr_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_UNPROTECTED; -} - -static inline int reg_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; -} - -static inline int pt_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; -} - -static inline unsigned long *get_rt_sto(unsigned long entry) -{ - if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) - return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); - else - return NULL; - -} - -static inline unsigned long *get_st_pto(unsigned long entry) -{ - if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) - return phys_to_virt(entry & ZPCI_STE_ADDR_MASK); - else - return NULL; -} - -/* Prototypes */ -void dma_free_seg_table(unsigned long); -unsigned long *dma_alloc_cpu_table(void); -void dma_cleanup_tables(unsigned long *); -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr); -void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags); - -extern const struct dma_map_ops s390_pci_dma_ops; +struct zpci_iommu_ctrs { + atomic64_t mapped_pages; + atomic64_t unmapped_pages; + atomic64_t global_rpcits; + atomic64_t sync_map_rpcits; + atomic64_t sync_rpcits; +}; + +struct zpci_dev; +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev); #endif diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..e5f57cfe1d45 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -98,6 +98,15 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + /* directed interruption information block */ struct zpci_diib { u32 : 1; @@ -119,9 +128,20 @@ struct zpci_cdiib { u64 : 64; } __packed __aligned(8); +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + union zpci_sic_iib { struct zpci_diib diib; struct zpci_cdiib cdiib; + struct zpci_aipb aipb; }; DECLARE_STATIC_KEY_FALSE(have_mio); @@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 *data, u64 req, u64 offset); void zpci_barrier(void); -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); - -static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) -{ - union zpci_sic_iib iib = {{0}}; - - return __zpci_set_irq_ctrl(ctl, isc, &iib); -} +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); #endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index 287bb88f7698..43a5ea4ee20f 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -11,6 +11,8 @@ /* I/O size constraints */ #define ZPCI_MAX_READ_SIZE 8 #define ZPCI_MAX_WRITE_SIZE 128 +#define ZPCI_BOUNDARY_SIZE (1 << 12) +#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1) /* I/O Map */ #define ZPCI_IOMAP_SHIFT 48 @@ -125,28 +127,30 @@ out: int zpci_write_block(volatile void __iomem *dst, const void *src, unsigned long len); -static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) +static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max) { - int count = len > max ? max : len, size = 1; + int offset = dst & ZPCI_BOUNDARY_MASK; + int size; - while (!(src & 0x1) && !(dst & 0x1) && ((size << 1) <= count)) { - dst = dst >> 1; - src = src >> 1; - size = size << 1; - } - return size; + size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max); + if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8)) + return size; + + if (size >= 8) + return 8; + return rounddown_pow_of_two(size); } static inline int zpci_memcpy_fromio(void *dst, const volatile void __iomem *src, - unsigned long n) + size_t n) { int size, rc = 0; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64) dst, n, - ZPCI_MAX_READ_SIZE); + size = zpci_get_max_io_size((u64 __force) src, + (u64) dst, n, + ZPCI_MAX_READ_SIZE); rc = zpci_read_single(dst, src, size); if (rc) break; @@ -158,7 +162,7 @@ static inline int zpci_memcpy_fromio(void *dst, } static inline int zpci_memcpy_toio(volatile void __iomem *dst, - const void *src, unsigned long n) + const void *src, size_t n) { int size, rc = 0; @@ -166,9 +170,9 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64) src, n, - ZPCI_MAX_WRITE_SIZE); + size = zpci_get_max_io_size((u64 __force) dst, + (u64) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = zpci_write_block(dst, src, size); else @@ -183,7 +187,7 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, } static inline int zpci_memset_io(volatile void __iomem *dst, - unsigned char val, size_t count) + int val, size_t count) { u8 *src = kmalloc(count, GFP_KERNEL); int rc; diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index cb5fc0690435..84f6b8357b45 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -4,12 +4,13 @@ #include <linux/preempt.h> #include <asm/cmpxchg.h> +#include <asm/march.h> /* * s390 uses its own implementation for per cpu data, the offset of * the cpu local data area is cached in the cpu's lowcore memory. */ -#define __my_cpu_offset S390_lowcore.percpu_offset +#define __my_cpu_offset get_lowcore()->percpu_offset /* * For 64 bit module code, the module may be more than 4G above the @@ -31,7 +32,7 @@ pcp_op_T__ *ptr__; \ preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ - prev__ = *ptr__; \ + prev__ = READ_ONCE(*ptr__); \ do { \ old__ = prev__; \ new__ = old__ op (val); \ @@ -50,7 +51,7 @@ #define this_cpu_or_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) #define this_cpu_or_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) -#ifndef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifndef MARCH_HAS_Z196_FEATURES #define this_cpu_add_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) #define this_cpu_add_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) @@ -61,7 +62,7 @@ #define this_cpu_or_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) #define this_cpu_or_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#else /* MARCH_HAS_Z196_FEATURES */ #define arch_this_cpu_add(pcp, val, op1, op2, szcast) \ { \ @@ -129,7 +130,7 @@ #define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lao") #define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, "laog") -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#endif /* MARCH_HAS_Z196_FEATURES */ #define arch_this_cpu_cmpxchg(pcp, oval, nval) \ ({ \ @@ -148,6 +149,22 @@ #define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n) + +#define this_cpu_cmpxchg128(pcp, oval, nval) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + u128 old__, new__, ret__; \ + pcp_op_T__ *ptr__; \ + old__ = oval; \ + new__ = nval; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = cmpxchg128((void *)ptr__, old__, new__); \ + preempt_enable_notrace(); \ + ret__; \ +}) + #define arch_this_cpu_xchg(pcp, nval) \ ({ \ typeof(pcp) *ptr__; \ @@ -164,24 +181,6 @@ #define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval) #define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval) -#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \ -({ \ - typeof(pcp1) *p1__; \ - typeof(pcp2) *p2__; \ - int ret__; \ - \ - preempt_disable_notrace(); \ - p1__ = raw_cpu_ptr(&(pcp1)); \ - p2__ = raw_cpu_ptr(&(pcp2)); \ - ret__ = __cmpxchg_double((unsigned long)p1__, (unsigned long)p2__, \ - (unsigned long)(o1), (unsigned long)(o2), \ - (unsigned long)(n1), (unsigned long)(n2)); \ - preempt_enable_notrace(); \ - ret__; \ -}) - -#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double - #include <asm-generic/percpu.h> #endif /* __ARCH_S390_PERCPU__ */ diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index b9da71632827..e53894cedf08 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -37,9 +37,9 @@ extern ssize_t cpumf_events_sysfs_show(struct device *dev, /* Perf callbacks */ struct pt_regs; -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -extern unsigned long perf_misc_flags(struct pt_regs *regs); -#define perf_misc_flags(regs) perf_misc_flags(regs) +extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); +#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) #define perf_arch_bpf_user_pt_regs(regs) ®s->user_regs /* Perf pt_regs extension for sample-data-entry indicators */ @@ -48,33 +48,8 @@ struct perf_sf_sde_regs { unsigned long reserved:63; /* reserved */ }; -/* Perf PMU definitions for the counter facility */ -#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ - -/* Perf PMU definitions for the sampling facility */ -#define PERF_CPUM_SF_MAX_CTR 2 -#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ -#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ -#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ -#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ -#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ -#define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ - PERF_CPUM_SF_DIAG_MODE) -#define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ -#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ - -#define REG_NONE 0 -#define REG_OVERFLOW 1 -#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) -#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) -#define TEAR_REG(hwc) ((hwc)->last_tag) -#define SAMPL_RATE(hwc) ((hwc)->event_base) -#define SAMPL_FLAGS(hwc) ((hwc)->config_base) -#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) -#define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) -#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) - #define perf_arch_fetch_caller_regs(regs, __ip) do { \ + (regs)->psw.mask = 0; \ (regs)->psw.addr = (__ip); \ (regs)->gprs[15] = (unsigned long)__builtin_frame_address(0) - \ offsetof(struct stack_frame, back_chain); \ diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h new file mode 100644 index 000000000000..a1bee4a1e470 --- /dev/null +++ b/arch/s390/include/asm/pfault.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_PFAULT_H +#define _ASM_S390_PFAULT_H + +#include <linux/errno.h> + +int __pfault_init(void); +void __pfault_fini(void); + +static inline int pfault_init(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + return __pfault_init(); + return -EOPNOTSUPP; +} + +static inline void pfault_fini(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + __pfault_fini(); +} + +#endif /* _ASM_S390_PFAULT_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 17eb618f1348..5345398df653 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -23,11 +23,9 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); -struct page *page_table_alloc_pgste(struct mm_struct *mm); +struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); -void page_table_free_pgste(struct page *page); -extern int page_table_allocate_pgste; +void page_table_free_pgste(struct ptdesc *ptdesc); static inline void crst_table_init(unsigned long *crst, unsigned long entry) { @@ -54,29 +52,42 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION2_ENTRY_EMPTY); + if (!table) + return NULL; + crst_table_init(table, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(table)); + return (p4d_t *) table; } static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { - if (!mm_p4d_folded(mm)) - crst_table_free(mm, (unsigned long *) p4d); + if (mm_p4d_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(p4d)); + crst_table_free(mm, (unsigned long *) p4d); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION3_ENTRY_EMPTY); + + if (!table) + return NULL; + crst_table_init(table, _REGION3_ENTRY_EMPTY); + pagetable_pud_ctor(virt_to_ptdesc(table)); + return (pud_t *) table; } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - if (!mm_pud_folded(mm)) - crst_table_free(mm, (unsigned long *) pud); + if (mm_pud_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(pud)); + crst_table_free(mm, (unsigned long *) pud); } static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) @@ -86,7 +97,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(mm, virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +108,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } @@ -118,11 +129,18 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return (pgd_t *) crst_table_alloc(mm); + unsigned long *table = crst_table_alloc(mm); + + if (!table) + return NULL; + pagetable_pgd_ctor(virt_to_ptdesc(table)); + + return (pgd_t *) table; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + pagetable_dtor(virt_to_ptdesc(pgd)); crst_table_free(mm, (unsigned long *) pgd); } @@ -143,6 +161,10 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a397b072a580..6d8bc27a366e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -14,17 +14,19 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <linux/cpufeature.h> #include <linux/page-flags.h> #include <linux/radix-tree.h> #include <linux/atomic.h> -#include <asm/sections.h> +#include <asm/ctlreg.h> #include <asm/bug.h> #include <asm/page.h> #include <asm/uv.h> extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); -extern unsigned long s390_invalid_asce; +extern struct ctlreg s390_invalid_asce; enum { PG_DIRECT_MAP_4K = 0, @@ -41,14 +43,12 @@ static inline void update_page_count(int level, long count) atomic_long_add(count, &direct_pages_count[level]); } -struct seq_file; -void arch_report_meminfo(struct seq_file *m); - /* * The S390 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -85,16 +85,14 @@ extern unsigned long zero_page_mask; * happen without trampolines and in addition the placement within a * 2GB frame is branch prediction unit friendly. */ -extern unsigned long __bootdata_preserved(VMALLOC_START); -extern unsigned long __bootdata_preserved(VMALLOC_END); +extern unsigned long VMALLOC_START; +extern unsigned long VMALLOC_END; #define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN) -extern struct page *__bootdata_preserved(vmemmap); -extern unsigned long __bootdata_preserved(vmemmap_size); - -#define VMEM_MAX_PHYS ((unsigned long) vmemmap) +extern struct page *vmemmap; +extern unsigned long vmemmap_size; -extern unsigned long __bootdata_preserved(MODULES_VADDR); -extern unsigned long __bootdata_preserved(MODULES_END); +extern unsigned long MODULES_VADDR; +extern unsigned long MODULES_END; #define MODULES_VADDR MODULES_VADDR #define MODULES_END MODULES_END #define MODULES_LEN (1UL << 31) @@ -109,6 +107,26 @@ static inline int is_module_addr(void *addr) return 1; } +#ifdef CONFIG_KMSAN +#define KMSAN_VMALLOC_SIZE (VMALLOC_END - VMALLOC_START) +#define KMSAN_VMALLOC_SHADOW_START VMALLOC_END +#define KMSAN_VMALLOC_SHADOW_END (KMSAN_VMALLOC_SHADOW_START + KMSAN_VMALLOC_SIZE) +#define KMSAN_VMALLOC_ORIGIN_START KMSAN_VMALLOC_SHADOW_END +#define KMSAN_VMALLOC_ORIGIN_END (KMSAN_VMALLOC_ORIGIN_START + KMSAN_VMALLOC_SIZE) +#define KMSAN_MODULES_SHADOW_START KMSAN_VMALLOC_ORIGIN_END +#define KMSAN_MODULES_SHADOW_END (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#define KMSAN_MODULES_ORIGIN_START KMSAN_MODULES_SHADOW_END +#define KMSAN_MODULES_ORIGIN_END (KMSAN_MODULES_ORIGIN_START + MODULES_LEN) +#endif + +#ifdef CONFIG_RANDOMIZE_BASE +#define KASLR_LEN (1UL << 31) +#else +#define KASLR_LEN 0UL +#endif + +void setup_protection_map(void); + /* * A 64 bit pagetable entry of S390 has following format: * | PFRA |0IPC| OS | @@ -181,6 +199,8 @@ static inline int is_module_addr(void *addr) #define _PAGE_SOFT_DIRTY 0x000 #endif +#define _PAGE_SW_BITS 0xffUL /* All SW bits */ + #define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ /* Set of bits not changed in pte_modify */ @@ -188,6 +208,12 @@ static inline int is_module_addr(void *addr) _PAGE_YOUNG | _PAGE_SOFT_DIRTY) /* + * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT + * HW bit and all SW bits. + */ +#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS) + +/* * handle_pte_fault uses pte_present and pte_none to find out the pte type * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to * distinguish present from not-present ptes. It is changed only with the page @@ -253,28 +279,40 @@ static inline int is_module_addr(void *addr) #define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID) #define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH) #define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID) -#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) +#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH | \ + _REGION3_ENTRY_PRESENT) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) +#define _REGION3_ENTRY_HARDWARE_BITS 0xfffffffffffff6ffUL +#define _REGION3_ENTRY_HARDWARE_BITS_LARGE 0xffffffff8001073cUL #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_COMM 0x0010 /* Common-Region, marks swap entry */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */ -#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */ +#define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ +#define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */ #else #define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ #endif #define _REGION_ENTRY_BITS 0xfffffffffffff22fUL +/* + * SW region present bit. For non-leaf region-third-table entries, bits 62-63 + * indicate the TABLE LENGTH and both must be set to 1. But such entries + * would always be considered as present, so it is safe to use bit 63 as + * PRESENT bit for PUD. + */ +#define _REGION3_ENTRY_PRESENT 0x0001 + /* Bits in the segment table entry */ -#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL -#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL -#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL +#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe3fUL +#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe3cUL +#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff1073cUL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ #define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ @@ -282,21 +320,29 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ -#define _SEGMENT_ENTRY (0) +#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PRESENT) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ + +#define _SEGMENT_ENTRY_COMM 0x0010 /* Common-Segment, marks swap entry */ #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ -#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ +#define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0002 /* SW segment soft dirty bit */ #else #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ + +/* Common bits in region and segment table entries, for swap entries */ +#define _RST_ENTRY_COMM 0x0010 /* Common-Region/Segment, marks swap entry */ +#define _RST_ENTRY_INVALID 0x0020 /* invalid region/segment table entry */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -312,7 +358,7 @@ static inline int is_module_addr(void *addr) #define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) #define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) #define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) -#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) +#define _PAGE_INDEX (0xffUL << PAGE_SHIFT) #define _REGION1_SIZE (1UL << _REGION1_SHIFT) #define _REGION2_SIZE (1UL << _REGION2_SHIFT) @@ -375,9 +421,10 @@ static inline int is_module_addr(void *addr) #define PGSTE_HC_BIT 0x0020000000000000UL #define PGSTE_GR_BIT 0x0004000000000000UL #define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ +#define PGSTE_ST2_MASK 0x0000ffff00000000UL +#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -399,101 +446,107 @@ static inline int is_module_addr(void *addr) /* * Page protection definitions. */ -#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT) -#define PAGE_RO __pgprot(_PAGE_PRESENT | _PAGE_READ | \ +#define __PAGE_NONE (_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT) +#define __PAGE_RO (_PAGE_PRESENT | _PAGE_READ | \ _PAGE_NOEXEC | _PAGE_INVALID | _PAGE_PROTECT) -#define PAGE_RX __pgprot(_PAGE_PRESENT | _PAGE_READ | \ +#define __PAGE_RX (_PAGE_PRESENT | _PAGE_READ | \ _PAGE_INVALID | _PAGE_PROTECT) -#define PAGE_RW __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ +#define __PAGE_RW (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ _PAGE_NOEXEC | _PAGE_INVALID | _PAGE_PROTECT) -#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ +#define __PAGE_RWX (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ _PAGE_INVALID | _PAGE_PROTECT) - -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ +#define __PAGE_SHARED (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ _PAGE_YOUNG | _PAGE_DIRTY | _PAGE_NOEXEC) -#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ +#define __PAGE_KERNEL (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ _PAGE_YOUNG | _PAGE_DIRTY | _PAGE_NOEXEC) -#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_YOUNG | \ +#define __PAGE_KERNEL_RO (_PAGE_PRESENT | _PAGE_READ | _PAGE_YOUNG | \ _PAGE_PROTECT | _PAGE_NOEXEC) -#define PAGE_KERNEL_EXEC __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ - _PAGE_YOUNG | _PAGE_DIRTY) -/* - * On s390 the page table entry has an invalid bit and a read-only bit. - * Read permission implies execute permission and write permission - * implies read permission. - */ - /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_RO -#define __P010 PAGE_RO -#define __P011 PAGE_RO -#define __P100 PAGE_RX -#define __P101 PAGE_RX -#define __P110 PAGE_RX -#define __P111 PAGE_RX - -#define __S000 PAGE_NONE -#define __S001 PAGE_RO -#define __S010 PAGE_RW -#define __S011 PAGE_RW -#define __S100 PAGE_RX -#define __S101 PAGE_RX -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX +extern unsigned long page_noexec_mask; + +#define __pgprot_page_mask(x) __pgprot((x) & page_noexec_mask) + +#define PAGE_NONE __pgprot_page_mask(__PAGE_NONE) +#define PAGE_RO __pgprot_page_mask(__PAGE_RO) +#define PAGE_RX __pgprot_page_mask(__PAGE_RX) +#define PAGE_RW __pgprot_page_mask(__PAGE_RW) +#define PAGE_RWX __pgprot_page_mask(__PAGE_RWX) +#define PAGE_SHARED __pgprot_page_mask(__PAGE_SHARED) +#define PAGE_KERNEL __pgprot_page_mask(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot_page_mask(__PAGE_KERNEL_RO) /* * Segment entry (large page) protection definitions. */ -#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \ +#define __SEGMENT_NONE (_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_INVALID | \ _SEGMENT_ENTRY_PROTECT) -#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define __SEGMENT_RO (_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define __SEGMENT_RX (_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ) -#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \ +#define __SEGMENT_RW (_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \ +#define __SEGMENT_RWX (_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE) -#define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ +#define __SEGMENT_KERNEL (_SEGMENT_ENTRY | \ _SEGMENT_ENTRY_LARGE | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE | \ _SEGMENT_ENTRY_YOUNG | \ _SEGMENT_ENTRY_DIRTY | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY | \ +#define __SEGMENT_KERNEL_RO (_SEGMENT_ENTRY | \ _SEGMENT_ENTRY_LARGE | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_YOUNG | \ _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_KERNEL_EXEC __pgprot(_SEGMENT_ENTRY | \ - _SEGMENT_ENTRY_LARGE | \ - _SEGMENT_ENTRY_READ | \ - _SEGMENT_ENTRY_WRITE | \ - _SEGMENT_ENTRY_YOUNG | \ - _SEGMENT_ENTRY_DIRTY) + +extern unsigned long segment_noexec_mask; + +#define __pgprot_segment_mask(x) __pgprot((x) & segment_noexec_mask) + +#define SEGMENT_NONE __pgprot_segment_mask(__SEGMENT_NONE) +#define SEGMENT_RO __pgprot_segment_mask(__SEGMENT_RO) +#define SEGMENT_RX __pgprot_segment_mask(__SEGMENT_RX) +#define SEGMENT_RW __pgprot_segment_mask(__SEGMENT_RW) +#define SEGMENT_RWX __pgprot_segment_mask(__SEGMENT_RWX) +#define SEGMENT_KERNEL __pgprot_segment_mask(__SEGMENT_KERNEL) +#define SEGMENT_KERNEL_RO __pgprot_segment_mask(__SEGMENT_KERNEL_RO) /* * Region3 entry (large page) protection definitions. */ -#define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ - _REGION3_ENTRY_LARGE | \ - _REGION3_ENTRY_READ | \ - _REGION3_ENTRY_WRITE | \ - _REGION3_ENTRY_YOUNG | \ +#define __REGION3_KERNEL (_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ _REGION3_ENTRY_DIRTY | \ _REGION_ENTRY_NOEXEC) -#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ - _REGION3_ENTRY_LARGE | \ - _REGION3_ENTRY_READ | \ - _REGION3_ENTRY_YOUNG | \ - _REGION_ENTRY_PROTECT | \ - _REGION_ENTRY_NOEXEC) +#define __REGION3_KERNEL_RO (_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_YOUNG | \ + _REGION_ENTRY_PROTECT | \ + _REGION_ENTRY_NOEXEC) + +extern unsigned long region_noexec_mask; + +#define __pgprot_region_mask(x) __pgprot((x) & region_noexec_mask) + +#define REGION3_KERNEL __pgprot_region_mask(__REGION3_KERNEL) +#define REGION3_KERNEL_RO __pgprot_region_mask(__REGION3_KERNEL_RO) static inline bool mm_p4d_folded(struct mm_struct *mm) { @@ -525,19 +578,20 @@ static inline int mm_has_pgste(struct mm_struct *mm) static inline int mm_is_protected(struct mm_struct *mm) { #ifdef CONFIG_PGSTE - if (unlikely(atomic_read(&mm->context.is_protected))) + if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif return 0; } -static inline int mm_alloc_pgste(struct mm_struct *mm) +static inline pgste_t clear_pgste_bit(pgste_t pgste, unsigned long mask) { -#ifdef CONFIG_PGSTE - if (unlikely(mm->context.alloc_pgste)) - return 1; -#endif - return 0; + return __pgste(pgste_val(pgste) & ~mask); +} + +static inline pgste_t set_pgste_bit(pgste_t pgste, unsigned long mask) +{ + return __pgste(pgste_val(pgste) | mask); } static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) @@ -571,10 +625,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) } /* - * In the case that a guest uses storage keys - * faults should no longer be backed by zero pages + * As soon as the guest uses storage keys or enables PV, we deduplicate all + * mapped shared zeropages and prevent new shared zeropages from getting + * mapped. */ -#define mm_forbids_zeropage mm_has_pgste +#define mm_forbids_zeropage mm_forbids_zeropage +static inline int mm_forbids_zeropage(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (!mm->context.allow_cow_sharing) + return 1; +#endif + return 0; +} + static inline int mm_uses_skeys(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -596,7 +660,15 @@ static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) : "cc"); } -static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new) +/** + * cspg() - Compare and Swap and Purge (CSPG) + * @ptr: Pointer to the value to be exchanged + * @old: The expected old value + * @new: The new value + * + * Return: True if compare and swap was successful, otherwise false. + */ +static inline bool cspg(unsigned long *ptr, unsigned long old, unsigned long new) { union register_pair r1 = { .even = old, .odd = new, }; unsigned long address = (unsigned long)ptr | 1; @@ -606,6 +678,7 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new : [r1] "+&d" (r1.pair), "+m" (*ptr) : [address] "d" (address) : "cc"); + return old == r1.even; } #define CRDTE_DTT_PAGE 0x00UL @@ -614,7 +687,18 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new #define CRDTE_DTT_REGION2 0x18UL #define CRDTE_DTT_REGION1 0x1cUL -static inline void crdte(unsigned long old, unsigned long new, +/** + * crdte() - Compare and Replace DAT Table Entry + * @old: The expected old value + * @new: The new value + * @table: Pointer to the value to be exchanged + * @dtt: Table type of the table to be exchanged + * @address: The address mapped by the entry to be replaced + * @asce: The ASCE of this entry + * + * Return: True if compare and replace was successful, otherwise false. + */ +static inline bool crdte(unsigned long old, unsigned long new, unsigned long *table, unsigned long dtt, unsigned long address, unsigned long asce) { @@ -625,6 +709,7 @@ static inline void crdte(unsigned long old, unsigned long new, : [r1] "+&d" (r1.pair) : [r2] "d" (r2.pair), [asce] "a" (asce) : "memory", "cc"); + return old == r1.even; } /* @@ -700,7 +785,7 @@ static inline int pud_present(pud_t pud) { if (pud_folded(pud)) return 1; - return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; + return (pud_val(pud) & _REGION3_ENTRY_PRESENT) != 0; } static inline int pud_none(pud_t pud) @@ -710,23 +795,28 @@ static inline int pud_none(pud_t pud) return pud_val(pud) == _REGION3_ENTRY_EMPTY; } -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; - return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); + return (pud_present(pud) && (pud_val(pud) & _REGION3_ENTRY_LARGE) != 0); } -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pmd) +static inline int pmd_present(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0; +} + +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pmd) { - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; + return (pmd_present(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0); } static inline int pmd_bad(pmd_t pmd) { - if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0 || pmd_large(pmd)) + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0 || pmd_leaf(pmd)) return 1; return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; } @@ -735,7 +825,7 @@ static inline int pud_bad(pud_t pud) { unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; - if (type > _REGION_ENTRY_TYPE_R3 || pud_large(pud)) + if (type > _REGION_ENTRY_TYPE_R3 || pud_leaf(pud)) return 1; if (type < _REGION_ENTRY_TYPE_R3) return 0; @@ -753,11 +843,6 @@ static inline int p4d_bad(p4d_t p4d) return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } -static inline int pmd_present(pmd_t pmd) -{ - return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; -} - static inline int pmd_none(pmd_t pmd) { return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; @@ -775,11 +860,13 @@ static inline int pud_write(pud_t pud) return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; } +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0; @@ -823,13 +910,12 @@ static inline int pte_protnone(pte_t pte) static inline int pmd_protnone(pmd_t pmd) { - /* pmd_large(pmd) implies pmd_present(pmd) */ - return pmd_large(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); + /* pmd_leaf(pmd) implies pmd_present(pmd) */ + return pmd_leaf(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } @@ -908,6 +994,7 @@ static inline int pte_unused(pte_t pte) * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set. */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; @@ -1007,7 +1094,7 @@ static inline pte_t pte_wrprotect(pte_t pte) return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE)); if (pte_val(pte) & _PAGE_DIRTY) @@ -1061,6 +1148,19 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto; + + pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + : "+m" (*ptep) + : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), + [asce] "a" (asce), [m4] "i" (local)); +} + static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) @@ -1140,7 +1240,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + uv_convert_from_secure_pte(res); return res; } @@ -1158,7 +1258,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res)) - uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + uv_convert_from_secure_pte(res); return res; } @@ -1182,9 +1282,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } else { res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - /* At this point the reference through the mapping is still present */ - if (mm_is_protected(mm) && pte_present(res)) - uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_pte(res)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. + */ + uv_convert_from_secure_pte(res); return res; } @@ -1198,6 +1311,44 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte)); } +/* + * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE + * bits in the comparison. Those might change e.g. because of dirty and young + * tracking. + */ +static inline int pte_allow_rdp(pte_t old, pte_t new) +{ + /* + * Only allow changes from RO to RW + */ + if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) + return 0; + + return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK); +} + +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + /* + * RDP might not have propagated the PTE protection reset to all CPUs, + * so there could be spurious TLB protection faults. + * NOTE: This will also be called when a racing pagetable update on + * another thread already installed the correct PTE. Both cases cannot + * really be distinguished. + * Therefore, only do the local TLB flush when RDP can be used, and the + * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead. + * A local RDP can be used to do the flush. + */ + if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT)) + __ptep_rdp(address, ptep, 0, 0, 1); +} +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault + +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new); + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1205,7 +1356,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); + if (cpu_has_rdp() && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); + else + ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } @@ -1250,24 +1404,37 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); #define pgprot_writecombine pgprot_writecombine pgprot_t pgprot_writecombine(pgprot_t prot); -#define pgprot_writethrough pgprot_writethrough -pgprot_t pgprot_writethrough(pgprot_t prot); +#define PFN_PTE_SHIFT PAGE_SHIFT /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - set_pte(ptep, entry); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, @@ -1278,21 +1445,9 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) pte_t __pte; __pte = __pte(physpage | pgprot_val(pgprot)); - if (!MACHINE_HAS_NX) - __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC)); return pte_mkyoung(__pte); } -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) -{ - unsigned long physpage = page_to_phys(page); - pte_t __pte = mk_pte_phys(physpage, pgprot); - - if (pte_write(__pte) && PageDirty(page)) - __pte = pte_mkdirty(__pte); - return __pte; -} - #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) @@ -1306,7 +1461,7 @@ static inline unsigned long pmd_deref(pmd_t pmd) unsigned long origin_mask; origin_mask = _SEGMENT_ENTRY_ORIGIN; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; return (unsigned long)__va(pmd_val(pmd) & origin_mask); } @@ -1321,11 +1476,12 @@ static inline unsigned long pud_deref(pud_t pud) unsigned long origin_mask; origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) + if (pud_leaf(pud)) origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; return (unsigned long)__va(pud_val(pud) & origin_mask); } +#define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { return __pa(pud_deref(pud)) >> PAGE_SHIFT; @@ -1423,7 +1579,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) @@ -1646,8 +1802,6 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { - if (!MACHINE_HAS_NX) - entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); set_pmd(pmdp, entry); } @@ -1689,8 +1843,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + pmd_t pmd; + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); + pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } @@ -1713,17 +1869,16 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, #define pmdp_collapse_flush pmdp_collapse_flush #define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; + return pmd_leaf(pmd); } #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { - return MACHINE_HAS_EDAT1 ? 1 : 0; + return cpu_has_edat1() ? 1 : 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1777,10 +1932,59 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) +/* + * 64 bit swap entry format for REGION3 and SEGMENT table entries (RSTE) + * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste + * as invalid. + * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010 + * | offset |Xtype |11TT|S0| + * |0000000000111111111122222222223333333333444444444455|555555|5566|66| + * |0123456789012345678901234567890123456789012345678901|234567|8901|23| + * + * Bits 0-51 store the offset. + * Bits 53-57 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT. + * Bit 52 (X) is unused. + */ + +#define __SWP_OFFSET_MASK_RSTE ((1UL << 52) - 1) +#define __SWP_OFFSET_SHIFT_RSTE 12 +#define __SWP_TYPE_MASK_RSTE ((1UL << 5) - 1) +#define __SWP_TYPE_SHIFT_RSTE 6 + +/* + * TT bits set to 0x00 == SEGMENT. For REGION3 entries, caller must add R3 + * bits 0x01. See also __set_huge_pte_at(). + */ +static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset) +{ + unsigned long rste; + + rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM; + rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE; + rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE; + return rste; +} + +static inline unsigned long __swp_type_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE; +} + +static inline unsigned long __swp_offset_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE; +} + +#define __rste_to_swp_entry(rste) ((swp_entry_t) { rste }) extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); +extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); +extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); +extern void vmem_unmap_4k_page(unsigned long addr); +extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); @@ -1792,4 +1996,18 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) +static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) +{ + unsigned long *pgstes, res; + + pgstes = pgt + _PAGE_ENTRIES; + + res = (pgstes[0] & PGSTE_ST2_MASK) << 16; + res |= pgstes[1] & PGSTE_ST2_MASK; + res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; + res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; + + return res; +} + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h new file mode 100644 index 000000000000..7ef3bbec98b0 --- /dev/null +++ b/arch/s390/include/asm/physmem_info.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MEM_DETECT_H +#define _ASM_S390_MEM_DETECT_H + +#include <linux/types.h> +#include <asm/page.h> + +enum physmem_info_source { + MEM_DETECT_NONE = 0, + MEM_DETECT_SCLP_STOR_INFO, + MEM_DETECT_DIAG260, + MEM_DETECT_DIAG500_STOR_LIMIT, + MEM_DETECT_SCLP_READ_INFO, + MEM_DETECT_BIN_SEARCH +}; + +struct physmem_range { + u64 start; + u64 end; +}; + +enum reserved_range_type { + RR_DECOMPRESSOR, + RR_INITRD, + RR_VMLINUX, + RR_AMODE31, + RR_IPLREPORT, + RR_CERT_COMP_LIST, + RR_MEM_DETECT_EXT, + RR_VMEM, + RR_MAX +}; + +struct reserved_range { + unsigned long start; + unsigned long end; + struct reserved_range *chain; +}; + +/* + * Storage element id is defined as 1 byte (up to 256 storage elements). + * In practise only storage element id 0 and 1 are used). + * According to architecture one storage element could have as much as + * 1020 subincrements. 255 physmem_ranges are embedded in physmem_info. + * If more physmem_ranges are required, a block of memory from already + * known physmem_range is taken (online_extended points to it). + */ +#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ + +struct physmem_info { + u32 range_count; + u8 info_source; + unsigned long usable; + struct reserved_range reserved[RR_MAX]; + struct physmem_range online[MEM_INLINED_ENTRIES]; + struct physmem_range *online_extended; +}; + +extern struct physmem_info physmem_info; + +void add_physmem_online_range(u64 start, u64 end); + +static inline int __get_physmem_range(u32 n, unsigned long *start, + unsigned long *end, bool respect_usable_limit) +{ + if (n >= physmem_info.range_count) { + *start = 0; + *end = 0; + return -1; + } + + if (n < MEM_INLINED_ENTRIES) { + *start = (unsigned long)physmem_info.online[n].start; + *end = (unsigned long)physmem_info.online[n].end; + } else { + *start = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].start; + *end = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].end; + } + + if (respect_usable_limit && physmem_info.usable) { + if (*start >= physmem_info.usable) + return -1; + if (*end > physmem_info.usable) + *end = physmem_info.usable; + } + return 0; +} + +/** + * for_each_physmem_usable_range - early online memory range iterator + * @i: an integer used as loop variable + * @p_start: ptr to unsigned long for start address of the range + * @p_end: ptr to unsigned long for end address of the range + * + * Walks over detected online memory ranges below usable limit. + */ +#define for_each_physmem_usable_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, true); i++) + +/* Walks over all detected online memory ranges disregarding usable limit. */ +#define for_each_physmem_online_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, false); i++) + +static inline const char *get_physmem_info_source(void) +{ + switch (physmem_info.info_source) { + case MEM_DETECT_SCLP_STOR_INFO: + return "sclp storage info"; + case MEM_DETECT_DIAG260: + return "diag260"; + case MEM_DETECT_DIAG500_STOR_LIMIT: + return "diag500 storage limit"; + case MEM_DETECT_SCLP_READ_INFO: + return "sclp read info"; + case MEM_DETECT_BIN_SEARCH: + return "binary search"; + } + return "none"; +} + +#define RR_TYPE_NAME(t) case RR_ ## t: return #t +static inline const char *get_rr_type_name(enum reserved_range_type t) +{ + switch (t) { + RR_TYPE_NAME(DECOMPRESSOR); + RR_TYPE_NAME(INITRD); + RR_TYPE_NAME(VMLINUX); + RR_TYPE_NAME(AMODE31); + RR_TYPE_NAME(IPLREPORT); + RR_TYPE_NAME(CERT_COMP_LIST); + RR_TYPE_NAME(MEM_DETECT_EXT); + RR_TYPE_NAME(VMEM); + default: + return "UNKNOWN"; + } +} + +#define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \ + for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \ + range && range->end; range = range->chain ? __va(range->chain) : NULL, \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t, + struct reserved_range *range) +{ + if (!range) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + if (range->chain) + return __va(range->chain); + while (++*t < RR_MAX) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + return NULL; +} + +#define for_each_physmem_reserved_range(t, range, p_start, p_end) \ + for (t = 0, range = __physmem_reserved_next(&t, NULL), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0; \ + range; range = __physmem_reserved_next(&t, range), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline unsigned long get_physmem_reserved(enum reserved_range_type type, + unsigned long *addr, unsigned long *size) +{ + *addr = physmem_info.reserved[type].start; + *size = physmem_info.reserved[type].end - physmem_info.reserved[type].start; + return *size; +} + +#define AMODE31_START (physmem_info.reserved[RR_AMODE31].start) +#define AMODE31_END (physmem_info.reserved[RR_AMODE31].end) + +#endif diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h index dd3d20c332ac..b7b59faf16f4 100644 --- a/arch/s390/include/asm/pkey.h +++ b/arch/s390/include/asm/pkey.h @@ -2,7 +2,7 @@ /* * Kernelspace interface to the pkey device driver * - * Copyright IBM Corp. 2016,2019 + * Copyright IBM Corp. 2016, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -20,9 +20,22 @@ * @param key pointer to a buffer containing the key blob * @param keylen size of the key blob in bytes * @param protkey pointer to buffer receiving the protected key + * @param xflags additional execution flags (see PKEY_XFLAG_* definitions below) + * As of now the only supported flag is PKEY_XFLAG_NOMEMALLOC. * @return 0 on success, negative errno value on failure */ -int pkey_keyblob2pkey(const u8 *key, u32 keylen, - struct pkey_protkey *protkey); +int pkey_key2protkey(const u8 *key, u32 keylen, + u8 *protkey, u32 *protkeylen, u32 *protkeytype, + u32 xflags); + +/* + * If this flag is given in the xflags parameter, the pkey implementation + * is not allowed to allocate memory but instead should fall back to use + * preallocated memory or simple fail with -ENOMEM. + * This flag is for protected key derive within a cipher or similar + * which must not allocate memory which would cause io operations - see + * also the CRYPTO_ALG_ALLOCATES_MEMORY flag in crypto.h. + */ +#define PKEY_XFLAG_NOMEMALLOC 0x0001 #endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index bf15da0fedbc..6ccd033acfe5 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -5,46 +5,62 @@ #include <asm/current.h> #include <linux/thread_info.h> #include <asm/atomic_ops.h> - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#include <asm/cmpxchg.h> +#include <asm/march.h> /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 + +/* + * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such + * that a decrement hitting 0 means we can and should reschedule. + */ #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) -static inline int preempt_count(void) +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ +static __always_inline int preempt_count(void) { - return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; + return READ_ONCE(get_lowcore()->preempt_count) & ~PREEMPT_NEED_RESCHED; } -static inline void preempt_count_set(int pc) +static __always_inline void preempt_count_set(int pc) { int old, new; + old = READ_ONCE(get_lowcore()->preempt_count); do { - old = READ_ONCE(S390_lowcore.preempt_count); - new = (old & PREEMPT_NEED_RESCHED) | - (pc & ~PREEMPT_NEED_RESCHED); - } while (__atomic_cmpxchg(&S390_lowcore.preempt_count, - old, new) != old); + new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); + } while (!arch_try_cmpxchg(&get_lowcore()->preempt_count, &old, new)); } -static inline void set_preempt_need_resched(void) +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * short instruction sequence. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) { - __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); + __atomic_and(~PREEMPT_NEED_RESCHED, &get_lowcore()->preempt_count); } -static inline void clear_preempt_need_resched(void) +static __always_inline void clear_preempt_need_resched(void) { - __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); + __atomic_or(PREEMPT_NEED_RESCHED, &get_lowcore()->preempt_count); } -static inline bool test_preempt_need_resched(void) +static __always_inline bool test_preempt_need_resched(void) { - return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); + return !(READ_ONCE(get_lowcore()->preempt_count) & PREEMPT_NEED_RESCHED); } -static inline void __preempt_count_add(int val) +static __always_inline void __preempt_count_add(int val) { /* * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES @@ -52,88 +68,59 @@ static inline void __preempt_count_add(int val) */ if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { - __atomic_add_const(val, &S390_lowcore.preempt_count); + __atomic_add_const(val, &get_lowcore()->preempt_count); return; } } - __atomic_add(val, &S390_lowcore.preempt_count); + __atomic_add(val, &get_lowcore()->preempt_count); } -static inline void __preempt_count_sub(int val) +static __always_inline void __preempt_count_sub(int val) { __preempt_count_add(-val); } -static inline bool __preempt_count_dec_and_test(void) +/* + * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule + * a decrement which hits zero means we have no preempt_count and should + * reschedule. + */ +static __always_inline bool __preempt_count_dec_and_test(void) { - return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; + return __atomic_add_const_and_test(-1, &get_lowcore()->preempt_count); } -static inline bool should_resched(int preempt_offset) +/* + * Returns true when we need to resched and can (barring IRQ state). + */ +static __always_inline bool should_resched(int preempt_offset) { - return unlikely(READ_ONCE(S390_lowcore.preempt_count) == - preempt_offset); + return unlikely(READ_ONCE(get_lowcore()->preempt_count) == preempt_offset); } -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define PREEMPT_ENABLED (0) - -static inline int preempt_count(void) -{ - return READ_ONCE(S390_lowcore.preempt_count); -} - -static inline void preempt_count_set(int pc) -{ - S390_lowcore.preempt_count = pc; -} - -static inline void set_preempt_need_resched(void) -{ -} - -static inline void clear_preempt_need_resched(void) -{ -} +#define init_task_preempt_count(p) do { } while (0) +/* Deferred to CPU bringup time */ +#define init_idle_preempt_count(p, cpu) do { } while (0) -static inline bool test_preempt_need_resched(void) -{ - return false; -} +#ifdef CONFIG_PREEMPTION -static inline void __preempt_count_add(int val) -{ - S390_lowcore.preempt_count += val; -} +void preempt_schedule(void); +void preempt_schedule_notrace(void); -static inline void __preempt_count_sub(int val) -{ - S390_lowcore.preempt_count -= val; -} +#ifdef CONFIG_PREEMPT_DYNAMIC -static inline bool __preempt_count_dec_and_test(void) -{ - return !--S390_lowcore.preempt_count && tif_need_resched(); -} +void dynamic_preempt_schedule(void); +void dynamic_preempt_schedule_notrace(void); +#define __preempt_schedule() dynamic_preempt_schedule() +#define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() -static inline bool should_resched(int preempt_offset) -{ - return unlikely(preempt_count() == preempt_offset && - tif_need_resched()); -} +#else /* CONFIG_PREEMPT_DYNAMIC */ -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#define __preempt_schedule() preempt_schedule() +#define __preempt_schedule_notrace() preempt_schedule_notrace() -#define init_task_preempt_count(p) do { } while (0) -/* Deferred to CPU bringup time */ -#define init_idle_preempt_count(p, cpu) do { } while (0) +#endif /* CONFIG_PREEMPT_DYNAMIC */ -#ifdef CONFIG_PREEMPTION -extern void preempt_schedule(void); -#define __preempt_schedule() preempt_schedule() -extern void preempt_schedule_notrace(void); -#define __preempt_schedule_notrace() preempt_schedule_notrace() #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e34949..6c8063cb8fe7 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -15,13 +15,11 @@ #include <linux/bits.h> #define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ -#define CIF_FPU 3 /* restore FPU registers */ #define CIF_ENABLED_WAIT 5 /* in enabled wait state */ #define CIF_MCCK_GUEST 6 /* machine check happening in guest */ #define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ #define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) -#define _CIF_FPU BIT(CIF_FPU) #define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT) #define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST) #define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU) @@ -33,40 +31,68 @@ #include <linux/cpumask.h> #include <linux/linkage.h> #include <linux/irqflags.h> +#include <linux/bitops.h> +#include <asm/fpu-types.h> #include <asm/cpu.h> #include <asm/page.h> #include <asm/ptrace.h> #include <asm/setup.h> #include <asm/runtime_instr.h> -#include <asm/fpu/types.h> -#include <asm/fpu/internal.h> #include <asm/irqflags.h> +#include <asm/alternative.h> +#include <asm/fault.h> + +struct pcpu { + unsigned long ec_mask; /* bit mask for ec_xxx functions */ + unsigned long ec_clk; /* sigp timestamp for ec_xxx */ + unsigned long flags; /* per CPU flags */ + unsigned long capacity; /* cpu capacity for scheduler */ + signed char state; /* physical cpu state */ + signed char polarization; /* physical polarization */ + u16 address; /* physical cpu address */ +}; + +DECLARE_PER_CPU(struct pcpu, pcpu_devices); typedef long (*sys_call_ptr_t)(struct pt_regs *regs); -static inline void set_cpu_flag(int flag) +static __always_inline struct pcpu *this_pcpu(void) +{ + return (struct pcpu *)(get_lowcore()->pcpu); +} + +static __always_inline void set_cpu_flag(int flag) +{ + set_bit(flag, &this_pcpu()->flags); +} + +static __always_inline void clear_cpu_flag(int flag) +{ + clear_bit(flag, &this_pcpu()->flags); +} + +static __always_inline bool test_cpu_flag(int flag) { - S390_lowcore.cpu_flags |= (1UL << flag); + return test_bit(flag, &this_pcpu()->flags); } -static inline void clear_cpu_flag(int flag) +static __always_inline bool test_and_set_cpu_flag(int flag) { - S390_lowcore.cpu_flags &= ~(1UL << flag); + return test_and_set_bit(flag, &this_pcpu()->flags); } -static inline int test_cpu_flag(int flag) +static __always_inline bool test_and_clear_cpu_flag(int flag) { - return !!(S390_lowcore.cpu_flags & (1UL << flag)); + return test_and_clear_bit(flag, &this_pcpu()->flags); } /* * Test CIF flag of another CPU. The caller needs to ensure that * CPU hotplug can not happen, e.g. by disabling preemption. */ -static inline int test_cpu_flag_of(int flag, int cpu) +static __always_inline bool test_cpu_flag_of(int flag, int cpu) { - struct lowcore *lc = lowcore_ptr[cpu]; - return !!(lc->cpu_flags & (1UL << flag)); + return test_bit(flag, &per_cpu(pcpu_devices, cpu).flags); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) @@ -76,13 +102,21 @@ static inline void get_cpu_id(struct cpuid *ptr) asm volatile("stidp %0" : "=Q" (*ptr)); } +static __always_inline unsigned long get_cpu_timer(void) +{ + unsigned long timer; + + asm volatile("stpt %[timer]" : [timer] "=Q" (timer)); + return timer; +} + void s390_adjust_jiffies(void); void s390_update_cpu_mhz(void); void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; extern void execve_tail(void); -extern void __bpon(void); +unsigned long vdso_text_size(void); unsigned long vdso_size(void); /* @@ -102,6 +136,40 @@ unsigned long vdso_size(void); #define HAVE_ARCH_PICK_MMAP_LAYOUT +#define __stackleak_poison __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + unsigned long tmp, count; + + count = erase_high - erase_low; + if (!count) + return; + asm volatile( + " cghi %[count],8\n" + " je 2f\n" + " aghi %[count],-(8+1)\n" + " srlg %[tmp],%[count],8\n" + " ltgr %[tmp],%[tmp]\n" + " jz 1f\n" + "0: stg %[poison],0(%[addr])\n" + " mvc 8(256-8,%[addr]),0(%[addr])\n" + " la %[addr],256(%[addr])\n" + " brctg %[tmp],0b\n" + "1: stg %[poison],0(%[addr])\n" + " exrl %[count],3f\n" + " j 4f\n" + "2: stg %[poison],0(%[addr])\n" + " j 4f\n" + "3: mvc 8(1,%[addr]),0(%[addr])\n" + "4:\n" + : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp) + : [poison] "d" (poison) + : "memory", "cc" + ); +} + /* * Thread structure */ @@ -114,10 +182,10 @@ struct thread_struct { unsigned long hardirq_timer; /* task cputime in hardirq context */ unsigned long softirq_timer; /* task cputime in softirq context */ const sys_call_ptr_t *sys_call_table; /* system call table address */ - unsigned long gmap_addr; /* address of last gmap fault. */ - unsigned int gmap_write_flag; /* gmap fault write indication */ + union teid gmap_teid; /* address and flags of last gmap fault */ unsigned int gmap_int_code; /* int code of last gmap fault */ - unsigned int gmap_pfault; /* signal of a pending guest pfault */ + int ufpu_flags; /* user fpu flags */ + int kfpu_flags; /* kernel fpu flags */ /* Per-thread information related to debugging */ struct per_regs per_user; /* User specified PER registers */ @@ -133,11 +201,8 @@ struct thread_struct { struct gs_cb *gs_cb; /* Current guarded storage cb */ struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */ - /* - * Warning: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ - struct fpu fpu; /* FP and VX register save area */ + struct fpu ufpu; /* User FP and VX register save area */ + struct fpu kfpu; /* Kernel FP and VX register save area */ }; /* Flag to disable transactions. */ @@ -156,7 +221,6 @@ typedef struct thread_struct thread_struct; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ - .fpu.regs = (void *) init_task.thread.fpu.fprs, \ .last_break = 1, \ } @@ -177,7 +241,6 @@ typedef struct thread_struct thread_struct; execve_tail(); \ } while (0) -/* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct seq_file; @@ -186,9 +249,6 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); @@ -202,7 +262,23 @@ unsigned long __get_wchan(struct task_struct *p); /* Has task runtime instrumentation enabled ? */ #define is_ri_task(tsk) (!!(tsk)->thread.ri_cb) -register unsigned long current_stack_pointer asm("r15"); +/* avoid using global register due to gcc bug in versions < 8.4 */ +#define current_stack_pointer (__current_stack_pointer()) + +static __always_inline unsigned long __current_stack_pointer(void) +{ + unsigned long sp; + + asm volatile("lgr %0,15" : "=d" (sp)); + return sp; +} + +static __always_inline bool on_thread_stack(void) +{ + unsigned long ksp = get_lowcore()->kernel_stack; + + return !((ksp ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} static __always_inline unsigned short stap(void) { @@ -244,8 +320,8 @@ static inline void __load_psw(psw_t psw) */ static __always_inline void __load_psw_mask(unsigned long mask) { + psw_t psw __uninitialized; unsigned long addr; - psw_t psw; psw.mask = mask; @@ -268,14 +344,36 @@ static inline unsigned long __extract_psw(void) return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); } -static inline void local_mcck_enable(void) +static inline unsigned long __local_mcck_save(void) { - __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); + unsigned long mask = __extract_psw(); + + __load_psw_mask(mask & ~PSW_MASK_MCHECK); + return mask & PSW_MASK_MCHECK; +} + +#define local_mcck_save(mflags) \ +do { \ + typecheck(unsigned long, mflags); \ + mflags = __local_mcck_save(); \ +} while (0) + +static inline void local_mcck_restore(unsigned long mflags) +{ + unsigned long mask = __extract_psw(); + + mask &= ~PSW_MASK_MCHECK; + __load_psw_mask(mask | mflags); } static inline void local_mcck_disable(void) { - __load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK); + __local_mcck_save(); +} + +static inline void local_mcck_enable(void) +{ + __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); } /* @@ -306,31 +404,20 @@ static __always_inline void __noreturn disabled_wait(void) #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL -extern int memcpy_real(void *, unsigned long, size_t); -extern void memcpy_absolute(void *, void *, size_t); - -#define put_abs_lowcore(member, x) do { \ - unsigned long __abs_address = offsetof(struct lowcore, member); \ - __typeof__(((struct lowcore *)0)->member) __tmp = (x); \ - \ - memcpy_absolute(__va(__abs_address), &__tmp, sizeof(__tmp)); \ -} while (0) - -#define get_abs_lowcore(x, member) do { \ - unsigned long __abs_address = offsetof(struct lowcore, member); \ - __typeof__(((struct lowcore *)0)->member) *__ptr = &(x); \ - \ - memcpy_absolute(__ptr, __va(__abs_address), sizeof(*__ptr)); \ -} while (0) - -extern int s390_isolate_bp(void); -extern int s390_isolate_bp_guest(void); - static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { return arch_irqs_disabled_flags(regs->psw.mask); } +static __always_inline void bpon(void) +{ + asm_inline volatile( + ALTERNATIVE(" nop\n", + " .insn rrf,0xb2e80000,0,0,13,0\n", + ALT_SPEC(82)) + ); +} + #endif /* __ASSEMBLY__ */ #endif /* __ASM_S390_PROCESSOR_H */ diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h deleted file mode 100644 index f960b2896606..000000000000 --- a/arch/s390/include/asm/ptdump.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _ASM_S390_PTDUMP_H -#define _ASM_S390_PTDUMP_H - -void ptdump_check_wx(void); - -static inline void debug_checkwx(void) -{ - if (IS_ENABLED(CONFIG_DEBUG_WX)) - ptdump_check_wx(); -} - -#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 8bae33ab320a..62c0ab4a4b9d 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -9,28 +9,53 @@ #include <linux/bits.h> #include <uapi/asm/ptrace.h> +#include <asm/thread_info.h> #include <asm/tpi.h> #define PIF_SYSCALL 0 /* inside a system call */ -#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ #define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ #define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ #define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ #define _PIF_SYSCALL BIT(PIF_SYSCALL) -#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) #define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) #define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) #define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) -#ifndef __ASSEMBLY__ +#define PSW32_MASK_PER _AC(0x40000000, UL) +#define PSW32_MASK_DAT _AC(0x04000000, UL) +#define PSW32_MASK_IO _AC(0x02000000, UL) +#define PSW32_MASK_EXT _AC(0x01000000, UL) +#define PSW32_MASK_KEY _AC(0x00F00000, UL) +#define PSW32_MASK_BASE _AC(0x00080000, UL) /* Always one */ +#define PSW32_MASK_MCHECK _AC(0x00040000, UL) +#define PSW32_MASK_WAIT _AC(0x00020000, UL) +#define PSW32_MASK_PSTATE _AC(0x00010000, UL) +#define PSW32_MASK_ASC _AC(0x0000C000, UL) +#define PSW32_MASK_CC _AC(0x00003000, UL) +#define PSW32_MASK_PM _AC(0x00000f00, UL) +#define PSW32_MASK_RI _AC(0x00000080, UL) + +#define PSW32_ADDR_AMODE _AC(0x80000000, UL) +#define PSW32_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW32_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 20) + +#define PSW32_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW32_ASC_ACCREG _AC(0x00004000, UL) +#define PSW32_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW32_ASC_HOME _AC(0x0000C000, UL) + +#define PSW_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 52) #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) +#ifndef __ASSEMBLY__ + struct psw_bits { unsigned long : 1; unsigned long per : 1; /* PER-Mask */ @@ -71,30 +96,6 @@ enum { &(*(struct psw_bits *)(&(__psw))); \ })) -#define PSW32_MASK_PER 0x40000000UL -#define PSW32_MASK_DAT 0x04000000UL -#define PSW32_MASK_IO 0x02000000UL -#define PSW32_MASK_EXT 0x01000000UL -#define PSW32_MASK_KEY 0x00F00000UL -#define PSW32_MASK_BASE 0x00080000UL /* Always one */ -#define PSW32_MASK_MCHECK 0x00040000UL -#define PSW32_MASK_WAIT 0x00020000UL -#define PSW32_MASK_PSTATE 0x00010000UL -#define PSW32_MASK_ASC 0x0000C000UL -#define PSW32_MASK_CC 0x00003000UL -#define PSW32_MASK_PM 0x00000f00UL -#define PSW32_MASK_RI 0x00000080UL - -#define PSW32_ADDR_AMODE 0x80000000UL -#define PSW32_ADDR_INSN 0x7FFFFFFFUL - -#define PSW32_DEFAULT_KEY (((u32)PAGE_DEFAULT_ACC) << 20) - -#define PSW32_ASC_PRIMARY 0x00000000UL -#define PSW32_ASC_ACCREG 0x00004000UL -#define PSW32_ASC_SECONDARY 0x00008000UL -#define PSW32_ASC_HOME 0x0000C000UL - typedef struct { unsigned int mask; unsigned int addr; @@ -126,7 +127,6 @@ struct pt_regs { struct tpi_info tpi_info; }; unsigned long flags; - unsigned long cr1; unsigned long last_break; }; @@ -201,6 +201,10 @@ static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag) return ret; } +struct task_struct; + +void update_cr_regs(struct task_struct *task); + /* * These are defined as per linux/ptrace.h, which see. */ @@ -225,8 +229,44 @@ static inline void instruction_pointer_set(struct pt_regs *regs, int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); -unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); -unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); + +static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->gprs[15]; +} + +static __always_inline unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) +{ + if (offset >= NUM_GPRS) + return 0; + return regs->gprs[offset]; +} + +static __always_inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + unsigned long ksp = kernel_stack_pointer(regs); + + return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs:pt_regs which contains kernel stack pointer. + * @n:stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +static __always_inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long addr; + + addr = kernel_stack_pointer(regs) + n * sizeof(long); + if (!regs_within_kernel_stack(regs, addr)) + return 0; + return READ_ONCE_NOCHECK(addr); +} /** * regs_get_kernel_argument() - get Nth function argument in kernel @@ -247,11 +287,6 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, return regs_get_kernel_stack_nth(regs, argoffset + n); } -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - return regs->gprs[15]; -} - static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->gprs[2] = rc; diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 2f983e0b95e0..69c4ead0c332 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -9,8 +9,9 @@ #define __QDIO_H__ #include <linux/interrupt.h> -#include <asm/cio.h> +#include <asm/dma-types.h> #include <asm/ccwdev.h> +#include <asm/cio.h> /* only use 4 queues to save some cachelines */ #define QDIO_MAX_QUEUES_PER_IRQ 4 @@ -34,9 +35,9 @@ * @dkey: access key for SLSB */ struct qdesfmt0 { - u64 sliba; - u64 sla; - u64 slsba; + dma64_t sliba; + dma64_t sla; + dma64_t slsba; u32 : 32; u32 akey : 4; u32 bkey : 4; @@ -74,7 +75,7 @@ struct qdr { /* private: */ u32 res[9]; /* public: */ - u64 qiba; + dma64_t qiba; u32 : 32; u32 qkey : 4; u32 : 28; @@ -146,7 +147,7 @@ struct qaob { u8 flags; u16 cbtbs; u8 sb_count; - u64 sba[QDIO_MAX_ELEMENTS_PER_BUFFER]; + dma64_t sba[QDIO_MAX_ELEMENTS_PER_BUFFER]; u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; u64 user0; u64 res4[2]; @@ -208,7 +209,7 @@ struct qdio_buffer_element { u8 scount; u8 sflags; u32 length; - u64 addr; + dma64_t addr; } __attribute__ ((packed, aligned(16))); /** @@ -224,7 +225,7 @@ struct qdio_buffer { * @sbal: absolute SBAL address */ struct sl_element { - u64 sbal; + dma64_t sbal; } __attribute__ ((packed)); /** diff --git a/arch/s390/include/asm/runtime-const.h b/arch/s390/include/asm/runtime-const.h new file mode 100644 index 000000000000..17878b1d048c --- /dev/null +++ b/arch/s390/include/asm/runtime-const.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_RUNTIME_CONST_H +#define _ASM_S390_RUNTIME_CONST_H + +#include <linux/uaccess.h> + +#define runtime_const_ptr(sym) \ +({ \ + typeof(sym) __ret; \ + \ + asm_inline( \ + "0: iihf %[__ret],%[c1]\n" \ + " iilf %[__ret],%[c2]\n" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n" \ + ".long 0b - .\n" \ + ".popsection" \ + : [__ret] "=d" (__ret) \ + : [c1] "i" (0x01234567UL), \ + [c2] "i" (0x89abcdefUL)); \ + __ret; \ +}) + +#define runtime_const_shift_right_32(val, sym) \ +({ \ + unsigned int __ret = (val); \ + \ + asm_inline( \ + "0: srl %[__ret],12\n" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n" \ + ".long 0b - .\n" \ + ".popsection" \ + : [__ret] "+d" (__ret)); \ + __ret; \ +}) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* 32-bit immediate for iihf and iilf in bits in I2 field */ +static inline void __runtime_fixup_32(u32 *p, unsigned int val) +{ + s390_kernel_write(p, &val, sizeof(val)); +} + +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + __runtime_fixup_32(where + 2, val >> 32); + __runtime_fixup_32(where + 8, val); +} + +/* Immediate value is lower 12 bits of D2 field of srl */ +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + u32 insn = *(u32 *)where; + + insn &= 0xfffff000; + insn |= (val & 63); + s390_kernel_write(where, &insn, sizeof(insn)); +} + +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif /* _ASM_S390_RUNTIME_CONST_H */ diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h new file mode 100644 index 000000000000..91fc24520e82 --- /dev/null +++ b/arch/s390/include/asm/rwonce.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_S390_RWONCE_H +#define __ASM_S390_RWONCE_H + +#include <linux/compiler_types.h> + +/* + * Use READ_ONCE_ALIGNED_128() for 128-bit block concurrent (atomic) read + * accesses. Note that x must be 128-bit aligned, otherwise a specification + * exception is generated. + */ +#define READ_ONCE_ALIGNED_128(x) \ +({ \ + union { \ + typeof(x) __x; \ + __uint128_t val; \ + } __u; \ + \ + BUILD_BUG_ON(sizeof(x) != 16); \ + asm volatile( \ + " lpq %[val],%[_x]\n" \ + : [val] "=d" (__u.val) \ + : [_x] "QS" (x) \ + : "memory"); \ + __u.__x; \ +}) + +#include <asm-generic/rwonce.h> + +#endif /* __ASM_S390_RWONCE_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 236b34b75ddb..1e62919bacf4 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -16,7 +16,13 @@ /* 24 + 16 * SCLP_MAX_CORES */ #define EXT_SCCB_READ_CPU (3 * PAGE_SIZE) +#define SCLP_ERRNOTIFY_AQ_RESET 0 +#define SCLP_ERRNOTIFY_AQ_REPAIR 1 +#define SCLP_ERRNOTIFY_AQ_INFO_LOG 2 +#define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3 + #ifndef __ASSEMBLY__ +#include <linux/uio.h> #include <asm/chpid.h> #include <asm/cpu.h> @@ -71,6 +77,7 @@ struct sclp_info { unsigned char has_core_type : 1; unsigned char has_sprp : 1; unsigned char has_hvs : 1; + unsigned char has_wti : 1; unsigned char has_esca : 1; unsigned char has_sief2 : 1; unsigned char has_64bscao : 1; @@ -83,11 +90,20 @@ struct sclp_info { unsigned char has_ibs : 1; unsigned char has_skey : 1; unsigned char has_kss : 1; + unsigned char has_diag204_bif : 1; unsigned char has_gisaf : 1; + unsigned char has_diag310 : 1; unsigned char has_diag318 : 1; + unsigned char has_diag320 : 1; + unsigned char has_diag324 : 1; unsigned char has_sipl : 1; + unsigned char has_sipl_eckd : 1; unsigned char has_dirq : 1; unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; @@ -102,6 +118,34 @@ struct sclp_info { }; extern struct sclp_info sclp; +struct sccb_header { + u16 length; + u8 function_code; + u8 control_mask[3]; + u16 response_code; +} __packed; + +struct evbuf_header { + u16 length; + u8 type; + u8 flags; + u16 _reserved; +} __packed; + +struct err_notify_evbuf { + struct evbuf_header header; + u8 action; + u8 atype; + u32 fh; + u32 fid; + u8 data[]; +} __packed; + +struct err_notify_sccb { + struct sccb_header header; + struct err_notify_evbuf evbuf; +} __packed; + struct zpci_report_error_header { u8 version; /* Interface version byte */ u8 action; /* Action qualifier byte @@ -124,9 +168,12 @@ int sclp_early_read_storage_info(void); int sclp_early_get_core_info(struct sclp_core_info *info); void sclp_early_get_ipl_info(struct sclp_ipl_info *info); void sclp_early_detect(void); +void sclp_early_detect_machine_features(void); void sclp_early_printk(const char *s); void __sclp_early_printk(const char *s, unsigned int len); +void sclp_emergency_printk(const char *s); +int sclp_init(void); int sclp_early_get_memsize(unsigned long *mem); int sclp_early_get_hsa_size(unsigned long *hsa_size); int _sclp_get_core_info(struct sclp_core_info *info); @@ -142,8 +189,7 @@ int sclp_pci_deconfigure(u32 fid); int sclp_ap_configure(u32 apid); int sclp_ap_deconfigure(u32 apid); int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); -int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); -int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); +size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count); void sclp_ocf_cpc_name_copy(char *dst); static inline int sclp_get_core_info(struct sclp_core_info *info, int early) diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index 7ce584aff5bb..56003e26cdbf 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -11,6 +11,7 @@ #include <linux/types.h> #include <asm/css_chars.h> +#include <asm/dma-types.h> #include <asm/cio.h> /** @@ -53,7 +54,7 @@ struct cmd_scsw { __u32 fctl : 3; __u32 actl : 7; __u32 stctl : 5; - __u32 cpa; + dma32_t cpa; __u32 dstat : 8; __u32 cstat : 8; __u32 count : 16; @@ -93,7 +94,7 @@ struct tm_scsw { u32 fctl:3; u32 actl:7; u32 stctl:5; - u32 tcw; + dma32_t tcw; u32 dstat:8; u32 cstat:8; u32 fcxs:8; @@ -125,7 +126,7 @@ struct eadm_scsw { u32 fctl:3; u32 actl:7; u32 stctl:5; - u32 aob; + dma32_t aob; u32 dstat:8; u32 cstat:8; u32:16; @@ -215,6 +216,11 @@ union scsw { #define SNS2_ENV_DATA_PRESENT 0x10 #define SNS2_INPRECISE_END 0x04 +/* + * architectured values for PPRC errors + */ +#define SNS7_INVALID_ON_SEC 0x0e + /** * scsw_is_tm - check for transport mode scsw * @scsw: pointer to scsw diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 3fecaa4e8b74..0486e6ef62bf 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -23,7 +23,7 @@ */ #define __bootdata_preserved(var) __section(".boot.preserved.data." #var) var -extern unsigned long __samode31, __eamode31; -extern unsigned long __stext_amode31, __etext_amode31; +extern char *__samode31, *__eamode31; +extern char *__stext_amode31, *__etext_amode31; #endif diff --git a/arch/s390/include/asm/serial.h b/arch/s390/include/asm/serial.h deleted file mode 100644 index aaf85a69061c..000000000000 --- a/arch/s390/include/asm/serial.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_SERIAL_H -#define _ASM_S390_SERIAL_H - -#define BASE_BAUD 0 - -#endif /* _ASM_S390_SERIAL_H */ diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index 950d87bd997a..94092f4ae764 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -6,37 +6,63 @@ extern struct mutex cpa_mutex; -#define SET_MEMORY_RO 1UL -#define SET_MEMORY_RW 2UL -#define SET_MEMORY_NX 4UL -#define SET_MEMORY_X 8UL -#define SET_MEMORY_4K 16UL +enum { + _SET_MEMORY_RO_BIT, + _SET_MEMORY_RW_BIT, + _SET_MEMORY_NX_BIT, + _SET_MEMORY_X_BIT, + _SET_MEMORY_4K_BIT, + _SET_MEMORY_INV_BIT, + _SET_MEMORY_DEF_BIT, +}; -int __set_memory(unsigned long addr, int numpages, unsigned long flags); +#define SET_MEMORY_RO BIT(_SET_MEMORY_RO_BIT) +#define SET_MEMORY_RW BIT(_SET_MEMORY_RW_BIT) +#define SET_MEMORY_NX BIT(_SET_MEMORY_NX_BIT) +#define SET_MEMORY_X BIT(_SET_MEMORY_X_BIT) +#define SET_MEMORY_4K BIT(_SET_MEMORY_4K_BIT) +#define SET_MEMORY_INV BIT(_SET_MEMORY_INV_BIT) +#define SET_MEMORY_DEF BIT(_SET_MEMORY_DEF_BIT) -static inline int set_memory_ro(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RO); -} +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags); -static inline int set_memory_rw(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RW); -} +#define set_memory_rox set_memory_rox -static inline int set_memory_nx(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_NX); +/* + * Generate two variants of each set_memory() function: + * + * set_memory_yy(unsigned long addr, int numpages); + * __set_memory_yy(void *start, void *end); + * + * The second variant exists for both convenience to avoid the usual + * (unsigned long) casts, but unlike the first variant it can also be used + * for areas larger than 8TB, which may happen at memory initialization. + */ +#define __SET_MEMORY_FUNC(fname, flags) \ +static inline int fname(unsigned long addr, int numpages) \ +{ \ + return __set_memory(addr, numpages, (flags)); \ +} \ + \ +static inline int __##fname(void *start, void *end) \ +{ \ + unsigned long numpages; \ + \ + numpages = (end - start) >> PAGE_SHIFT; \ + return __set_memory((unsigned long)start, numpages, (flags)); \ } -static inline int set_memory_x(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_X); -} +__SET_MEMORY_FUNC(set_memory_ro, SET_MEMORY_RO) +__SET_MEMORY_FUNC(set_memory_rw, SET_MEMORY_RW) +__SET_MEMORY_FUNC(set_memory_nx, SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_x, SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) -static inline int set_memory_4k(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_4K); -} +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); +bool kernel_page_present(struct page *page); #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 77e6506898f5..031e881b4d88 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -13,27 +13,6 @@ #define PARMAREA 0x10400 #define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE -/* - * Machine features detected in early.c - */ - -#define MACHINE_FLAG_VM BIT(0) -#define MACHINE_FLAG_KVM BIT(1) -#define MACHINE_FLAG_LPAR BIT(2) -#define MACHINE_FLAG_DIAG9C BIT(3) -#define MACHINE_FLAG_ESOP BIT(4) -#define MACHINE_FLAG_IDTE BIT(5) -#define MACHINE_FLAG_EDAT1 BIT(7) -#define MACHINE_FLAG_EDAT2 BIT(8) -#define MACHINE_FLAG_TOPOLOGY BIT(10) -#define MACHINE_FLAG_TE BIT(11) -#define MACHINE_FLAG_TLB_LC BIT(12) -#define MACHINE_FLAG_VX BIT(13) -#define MACHINE_FLAG_TLB_GUEST BIT(14) -#define MACHINE_FLAG_NX BIT(15) -#define MACHINE_FLAG_GS BIT(16) -#define MACHINE_FLAG_SCC BIT(17) -#define MACHINE_FLAG_PCI_MIO BIT(18) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -71,31 +50,12 @@ extern unsigned int zlib_dfltcc_support; #define ZLIB_DFLTCC_INFLATE_ONLY 3 #define ZLIB_DFLTCC_FULL_DEBUG 4 -extern int noexec_disabled; extern unsigned long ident_map_size; +extern unsigned long max_mappable; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; -#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) -#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) -#define MACHINE_IS_LPAR (S390_lowcore.machine_flags & MACHINE_FLAG_LPAR) - -#define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) -#define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) -#define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE) -#define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1) -#define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2) -#define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) -#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) -#define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) -#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) -#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) -#define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) -#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) -#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) -#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) - /* * Console mode. Override with conmode= */ @@ -115,13 +75,7 @@ extern unsigned int console_irq; #define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) #define SET_CONSOLE_HVC do { console_mode = 5; } while (0) -#ifdef CONFIG_PFAULT -extern int pfault_init(void); -extern void pfault_fini(void); -#else /* CONFIG_PFAULT */ -#define pfault_init() ({-1;}) -#define pfault_fini() do { } while (0) -#endif /* CONFIG_PFAULT */ +void register_early_console(void); #ifdef CONFIG_VMCP void vmcp_cma_reserve(void); @@ -131,34 +85,17 @@ static inline void vmcp_cma_reserve(void) { } void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -void cmma_init(void); -void cmma_init_nodat(void); - extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); -extern unsigned long __kaslr_offset; -static inline unsigned long kaslr_offset(void) -{ - return __kaslr_offset; -} - -extern int is_full_image; - -struct initrd_data { - unsigned long start; - unsigned long size; -}; -extern struct initrd_data initrd_data; - struct oldmem_data { unsigned long start; unsigned long size; }; extern struct oldmem_data oldmem_data; -static inline u32 gen_lpswe(unsigned long addr) +static __always_inline u32 gen_lpswe(unsigned long addr) { BUILD_BUG_ON(addr > 0xfff); return 0xb2b20000 | addr; diff --git a/arch/s390/include/asm/shmparam.h b/arch/s390/include/asm/shmparam.h deleted file mode 100644 index e75d45649c54..000000000000 --- a/arch/s390/include/asm/shmparam.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/shmparam.h" - */ -#ifndef _ASM_S390_SHMPARAM_H -#define _ASM_S390_SHMPARAM_H - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _ASM_S390_SHMPARAM_H */ diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index edee63da08e7..472943b77066 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -38,6 +38,8 @@ #ifndef __ASSEMBLY__ +#include <asm/asm.h> + static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, u32 *status) { @@ -46,13 +48,12 @@ static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, asm volatile( " sigp %[r1],%[addr],0(%[order])\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=&d" (cc), [r1] "+&d" (r1.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [r1] "+d" (r1.pair) : [addr] "d" (addr), [order] "a" (order) - : "cc"); + : CC_CLOBBER); *status = r1.even; - return cc; + return CC_TRANSFORM(cc); } static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7f5d4763357b..03f4d01664f8 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -7,11 +7,30 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H -#include <asm/sigp.h> -#include <asm/lowcore.h> #include <asm/processor.h> +#include <asm/lowcore.h> +#include <asm/machine.h> +#include <asm/sigp.h> + +static __always_inline unsigned int raw_smp_processor_id(void) +{ + unsigned long lc_cpu_nr; + unsigned int cpu; + + BUILD_BUG_ON(sizeof_field(struct lowcore, cpu_nr) != sizeof(cpu)); + lc_cpu_nr = offsetof(struct lowcore, cpu_nr); + asm_inline( + ALTERNATIVE(" ly %[cpu],%[offzero](%%r0)\n", + " ly %[cpu],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [cpu] "=d" (cpu) + : [offzero] "i" (lc_cpu_nr), + [offalt] "i" (lc_cpu_nr + LOWCORE_ALT_ADDRESS), + "m" (((struct lowcore *)0)->cpu_nr)); + return cpu; +} -#define raw_smp_processor_id() (S390_lowcore.cpu_nr) +#define arch_scale_cpu_capacity smp_cpu_get_capacity extern struct mutex smp_cpu_state_mutex; extern unsigned int smp_cpu_mt_shift; @@ -24,16 +43,19 @@ extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -extern void smp_call_online_cpu(void (*func)(void *), void *); extern void smp_call_ipl_cpu(void (*func)(void *), void *); extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); -extern void smp_save_dump_cpus(void); +extern void smp_save_dump_ipl_cpu(void); +extern void smp_save_dump_secondary_cpus(void); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); +extern void smp_cpu_set_capacity(int cpu, unsigned long val); +extern void smp_set_core_capacity(int cpu, unsigned long val); +extern unsigned long smp_cpu_get_capacity(int cpu); extern int smp_cpu_get_cpu_address(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); @@ -58,7 +80,7 @@ static inline void smp_cpus_done(unsigned int max_cpus) { } -extern int smp_rescan_cpus(void); +extern int smp_rescan_cpus(bool early); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h index fd17f25704bd..42d61296bbad 100644 --- a/arch/s390/include/asm/softirq_stack.h +++ b/arch/s390/include/asm/softirq_stack.h @@ -5,9 +5,10 @@ #include <asm/lowcore.h> #include <asm/stacktrace.h> +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK static inline void do_softirq_own_stack(void) { - call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq); + call_on_stack(0, get_lowcore()->async_stack, void, __do_softirq); } - +#endif #endif /* __ASM_S390_SOFTIRQ_STACK_H */ diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h index c549893602ea..668dfc5de538 100644 --- a/arch/s390/include/asm/sparsemem.h +++ b/arch/s390/include/asm/sparsemem.h @@ -2,7 +2,23 @@ #ifndef _ASM_S390_SPARSEMEM_H #define _ASM_S390_SPARSEMEM_H -#define SECTION_SIZE_BITS 28 +#define SECTION_SIZE_BITS 27 #define MAX_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS +#ifdef CONFIG_NUMA + +static inline int memory_add_physaddr_to_nid(u64 addr) +{ + return 0; +} +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid + +static inline int phys_to_target_node(u64 start) +{ + return 0; +} +#define phys_to_target_node phys_to_target_node + +#endif /* CONFIG_NUMA */ + #endif /* _ASM_S390_SPARSEMEM_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 37127cd7749e..f9935db9fd76 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -16,7 +16,23 @@ #include <asm/processor.h> #include <asm/alternative.h> -#define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval) +static __always_inline unsigned int spinlock_lockval(void) +{ + unsigned long lc_lockval; + unsigned int lockval; + + BUILD_BUG_ON(sizeof_field(struct lowcore, spinlock_lockval) != sizeof(lockval)); + lc_lockval = offsetof(struct lowcore, spinlock_lockval); + asm_inline( + ALTERNATIVE(" ly %[lockval],%[offzero](%%r0)\n", + " ly %[lockval],%[offalt](%%r0)\n", + ALT_FEATURE(MFEATURE_LOWCORE)) + : [lockval] "=d" (lockval) + : [offzero] "i" (lc_lockval), + [offalt] "i" (lc_lockval + LOWCORE_ALT_ADDRESS), + "m" (((struct lowcore *)0)->spinlock_lockval)); + return lockval; +} extern int spin_retry; @@ -57,8 +73,10 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lp) static inline int arch_spin_trylock_once(arch_spinlock_t *lp) { + int old = 0; + barrier(); - return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); + return likely(arch_try_cmpxchg(&lp->lock, &old, spinlock_lockval())); } static inline void arch_spin_lock(arch_spinlock_t *lp) @@ -79,10 +97,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) typecheck(int, lp->lock); kcsan_release(); asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ - " sth %1,%0\n" - : "=R" (((unsigned short *) &lp->lock)[1]) - : "d" (0) : "cc", "memory"); + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */ + " mvhhi %[lock],0\n" + : [lock] "=Q" (((unsigned short *)&lp->lock)[1]) + : + : "memory"); } /* @@ -118,7 +137,9 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) static inline void arch_write_lock(arch_rwlock_t *rw) { - if (!__atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000)) + int old = 0; + + if (!arch_try_cmpxchg(&rw->cnts, &old, 0x30000)) arch_write_lock_wait(rw); } @@ -133,8 +154,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) int old; old = READ_ONCE(rw->cnts); - return (!(old & 0xffff0000) && - __atomic_cmpxchg_bool(&rw->cnts, old, old + 1)); + return (!(old & 0xffff0000) && arch_try_cmpxchg(&rw->cnts, &old, old + 1)); } static inline int arch_write_trylock(arch_rwlock_t *rw) @@ -142,7 +162,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) int old; old = READ_ONCE(rw->cnts); - return !old && __atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000); + return !old && arch_try_cmpxchg(&rw->cnts, &old, 0x30000); } #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index b69695e39957..3653ff57d6d9 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index b23c658dce77..1d5ca13dc90f 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -2,9 +2,27 @@ #ifndef _ASM_S390_STACKTRACE_H #define _ASM_S390_STACKTRACE_H +#include <linux/stacktrace.h> #include <linux/uaccess.h> #include <linux/ptrace.h> -#include <asm/switch_to.h> + +struct stack_frame_user { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned long empty2[4]; +}; + +struct stack_frame_vdso_wrapper { + struct stack_frame_user sf; + unsigned long return_address; +}; + +struct perf_callchain_entry_ctx; + +void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, + const struct pt_regs *regs, bool perf); enum stack_type { STACK_TYPE_UNKNOWN, @@ -46,6 +64,8 @@ struct stack_frame { unsigned long sie_savearea; unsigned long sie_reason; unsigned long sie_flags; + unsigned long sie_control_block_phys; + unsigned long sie_guest_asce; }; }; unsigned long gprs[10]; @@ -188,17 +208,53 @@ static __always_inline unsigned long get_stack_pointer(struct task_struct *task, (rettype)r2; \ }) -#define call_on_stack_noreturn(fn, stack) \ +/* + * Use call_nodat() to call a function with DAT disabled. + * Proper sign and zero extension of function arguments is done. + * Usage: + * + * rc = call_nodat(nr, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - fn is the function to be called, where fn is a physical address. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + * + * fn() is called with standard C function call ABI, with the exception + * that no useful stackframe or stackpointer is passed via register 15. + * Therefore the called function must not use r15 to access the stack. + */ +#define call_nodat(nr, rettype, fn, ...) \ ({ \ - void (*__fn)(void) = fn; \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = (fn); \ + /* aligned since psw_leave must not cross page boundary */ \ + psw_t __aligned(16) psw_leave; \ + psw_t psw_enter; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ + psw_enter.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; \ + psw_enter.addr = (unsigned long)__fn; \ asm volatile( \ - " la 15,0(%[_stack])\n" \ - " xc %[_bc](8,15),%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - ::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_stack] "a" (stack), [_fn] "X" (__fn)); \ - BUG(); \ + " epsw 0,1\n" \ + " risbg 1,0,0,31,32\n" \ + " larl 7,1f\n" \ + " stg 1,%[psw_leave]\n" \ + " stg 7,8+%[psw_leave]\n" \ + " la 7,%[psw_leave]\n" \ + " lra 7,0(7)\n" \ + " larl 1,0f\n" \ + " lra 14,0(1)\n" \ + " lpswe %[psw_enter]\n" \ + "0: lpswe 0(7)\n" \ + "1:\n" \ + : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \ + : [psw_enter] "Q" (psw_enter) \ + : "7", CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) #endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index 4d74d7e33340..827cb208de86 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -94,5 +94,6 @@ struct stp_stzi { int stp_sync_check(void); int stp_island_check(void); void stp_queue_work(void); +bool stp_enabled(void); #endif /* __S390_STP_H */ diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 3fae93ddb322..f8f68f4ef255 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -15,36 +15,33 @@ #define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ -#define __HAVE_ARCH_MEMSET16 /* arch function */ -#define __HAVE_ARCH_MEMSET32 /* arch function */ -#define __HAVE_ARCH_MEMSET64 /* arch function */ void *memcpy(void *dest, const void *src, size_t n); void *memset(void *s, int c, size_t n); void *memmove(void *dest, const void *src, size_t n); -#ifndef CONFIG_KASAN +#if !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) #define __HAVE_ARCH_MEMCHR /* inline & arch function */ #define __HAVE_ARCH_MEMCMP /* arch function */ #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ #define __HAVE_ARCH_STRCAT /* inline & arch function */ #define __HAVE_ARCH_STRCMP /* arch function */ -#define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ -#define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ +#define __HAVE_ARCH_MEMSET16 /* arch function */ +#define __HAVE_ARCH_MEMSET32 /* arch function */ +#define __HAVE_ARCH_MEMSET64 /* arch function */ /* Prototypes for non-inlined arch strings functions. */ int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); char *strncat(char *dest, const char *src, size_t n); -char *strncpy(char *dest, const char *src, size_t n); char *strstr(const char *s1, const char *s2); -#endif /* !CONFIG_KASAN */ +#endif /* !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) */ #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR @@ -55,18 +52,6 @@ char *strstr(const char *s1, const char *s2); #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) -extern void *__memcpy(void *dest, const void *src, size_t n); -extern void *__memset(void *s, int c, size_t n); -extern void *__memmove(void *dest, const void *src, size_t n); - -/* - * For files that are not instrumented (e.g. mm/slub.c) we - * should use not instrumented version of mem* functions. - */ - -#define memcpy(dst, src, len) __memcpy(dst, src, len) -#define memmove(dst, src, len) __memmove(dst, src, len) -#define memset(s, c, n) __memset(s, c, n) #define strlen(s) __strlen(s) #define __no_sanitize_prefix_strfunc(x) __##x @@ -79,24 +64,37 @@ extern void *__memmove(void *dest, const void *src, size_t n); #define __no_sanitize_prefix_strfunc(x) x #endif /* defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) */ +void *__memcpy(void *dest, const void *src, size_t n); +void *__memset(void *s, int c, size_t n); +void *__memmove(void *dest, const void *src, size_t n); void *__memset16(uint16_t *s, uint16_t v, size_t count); void *__memset32(uint32_t *s, uint32_t v, size_t count); void *__memset64(uint64_t *s, uint64_t v, size_t count); +#ifdef __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t count) { return __memset16(s, v, count * sizeof(v)); } +#endif +#ifdef __HAVE_ARCH_MEMSET32 static inline void *memset32(uint32_t *s, uint32_t v, size_t count) { return __memset32(s, v, count * sizeof(v)); } +#endif +#ifdef __HAVE_ARCH_MEMSET64 +#ifdef IN_BOOT_STRING_C +void *memset64(uint64_t *s, uint64_t v, size_t count); +#else static inline void *memset64(uint64_t *s, uint64_t v, size_t count) { return __memset64(s, v, count * sizeof(v)); } +#endif +#endif #if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY)) @@ -154,22 +152,6 @@ static inline char *strcat(char *dst, const char *src) } #endif -#ifdef __HAVE_ARCH_STRCPY -static inline char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - asm volatile( - " lghi 0,0\n" - "0: mvst %[dst],%[src]\n" - " jo 0b" - : [dst] "+&a" (dst), [src] "+&a" (src) - : - : "cc", "memory", "0"); - return ret; -} -#endif - #if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s) { @@ -207,7 +189,6 @@ static inline size_t strnlen(const char * s, size_t n) void *memchr(const void * s, int c, size_t n); void *memscan(void *s, int c, size_t n); char *strcat(char *dst, const char *src); -char *strcpy(char *dst, const char *src); size_t strlen(const char *s); size_t strnlen(const char * s, size_t n); #endif /* !IN_ARCH_STRING_C */ diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h deleted file mode 100644 index c61b2cc1a8a8..000000000000 --- a/arch/s390/include/asm/switch_to.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 1999, 2009 - * - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef __ASM_SWITCH_TO_H -#define __ASM_SWITCH_TO_H - -#include <linux/thread_info.h> -#include <asm/fpu/api.h> -#include <asm/ptrace.h> -#include <asm/guarded_storage.h> - -extern struct task_struct *__switch_to(void *, void *); -extern void update_cr_regs(struct task_struct *task); - -static inline void save_access_regs(unsigned int *acrs) -{ - typedef struct { int _[NUM_ACRS]; } acrstype; - - asm volatile("stam 0,15,%0" : "=Q" (*(acrstype *)acrs)); -} - -static inline void restore_access_regs(unsigned int *acrs) -{ - typedef struct { int _[NUM_ACRS]; } acrstype; - - asm volatile("lam 0,15,%0" : : "Q" (*(acrstype *)acrs)); -} - -#define switch_to(prev, next, last) do { \ - /* save_fpu_regs() sets the CIF_FPU flag, which enforces \ - * a restore of the floating point / vector registers as \ - * soon as the next task returns to user space \ - */ \ - save_fpu_regs(); \ - save_access_regs(&prev->thread.acrs[0]); \ - save_ri_cb(prev->thread.ri_cb); \ - save_gs_cb(prev->thread.gs_cb); \ - update_cr_regs(next); \ - restore_access_regs(&next->thread.acrs[0]); \ - restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ - restore_gs_cb(next->thread.gs_cb); \ - prev = __switch_to(prev, next); \ -} while (0) - -#endif /* __ASM_SWITCH_TO_H */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 27e3d804b311..bd4cb00ccd5e 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -24,6 +24,18 @@ static inline long syscall_get_nr(struct task_struct *task, (regs->int_code & 0xffff) : -1; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->int_code = (regs->int_code & ~0xffff) | (nr & 0xffff); +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -65,19 +77,26 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { unsigned long mask = -1UL; - unsigned int n = 6; #ifdef CONFIG_COMPAT if (test_tsk_thread_flag(task, TIF_31BIT)) mask = 0xffffffff; #endif - while (n-- > 0) - if (n > 0) - args[n] = regs->gprs[2 + n] & mask; + for (int i = 1; i < 6; i++) + args[i] = regs->gprs[2 + i] & mask; args[0] = regs->orig_gpr2 & mask; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_gpr2 = args[0]; + for (int n = 1; n < 6; n++) + regs->gprs[2 + n] = args[n]; +} + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index fde7e6b1df48..35c1d1b860d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -7,36 +7,13 @@ #ifndef _ASM_S390_SYSCALL_WRAPPER_H #define _ASM_S390_SYSCALL_WRAPPER_H -#define __SC_TYPE(t, a) t - -#define SYSCALL_PT_ARG6(regs, m, t1, t2, t3, t4, t5, t6)\ - SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5), \ - m(t6, (regs->gprs[7])) - -#define SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5) \ - SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4), \ - m(t5, (regs->gprs[6])) - -#define SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4) \ - SYSCALL_PT_ARG3(regs, m, t1, t2, t3), \ - m(t4, (regs->gprs[5])) - -#define SYSCALL_PT_ARG3(regs, m, t1, t2, t3) \ - SYSCALL_PT_ARG2(regs, m, t1, t2), \ - m(t3, (regs->gprs[4])) - -#define SYSCALL_PT_ARG2(regs, m, t1, t2) \ - SYSCALL_PT_ARG1(regs, m, t1), \ - m(t2, (regs->gprs[3])) - -#define SYSCALL_PT_ARG1(regs, m, t1) \ - m(t1, (regs->orig_gpr2)) - -#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) +/* Mapping of registers to parameters for syscalls */ +#define SC_S390_REGS_TO_ARGS(x, ...) \ + __MAP(x, __SC_ARGS \ + ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ + ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) #ifdef CONFIG_COMPAT -#define __SC_COMPAT_TYPE(t, a) \ - __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a #define __SC_COMPAT_CAST(t, a) \ ({ \ @@ -56,110 +33,108 @@ (t)__ReS; \ }) -#define __S390_SYS_STUBx(x, name, ...) \ - long __s390_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - long __s390_sys##name(struct pt_regs *regs) \ - { \ - long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ - __SC_COMPAT_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } - /* * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __s390x_sys_*() */ #define COMPAT_SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ long __s390_compat_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ + long __s390_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ + static inline long __do_sys_##sname(void); \ long __s390_sys_##sname(void) \ - __attribute__((alias(__stringify(__s390x_sys_##sname)))); \ - long __s390x_sys_##sname(void) + { \ + return __do_sys_##sname(); \ + } \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name); \ cond_syscall(__s390_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) - #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments"); \ long __s390_compat_sys##name(struct pt_regs *regs); \ - long __s390_compat_sys##name(struct pt_regs *regs) \ - __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __se_compat_sys##name(struct pt_regs *regs); \ - long __se_compat_sys##name(struct pt_regs *regs) \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ + { \ + return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ { \ - long ret = __do_compat_sys##name(SYSCALL_PT_ARGS(x, regs, __SC_DELOUSE, \ - __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ } \ - __diag_pop(); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ cond_syscall(__s390_compat_sys_##name) -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers) +#define __S390_SYS_STUBx(x, name, ...) \ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ + } #else /* CONFIG_COMPAT */ -#define __S390_SYS_STUBx(x, fullname, name, ...) - #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); +#define __S390_SYS_STUBx(x, fullname, name, ...) #endif /* CONFIG_COMPAT */ -#define __SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments"); \ - long __s390x_sys##name(struct pt_regs *regs) \ - __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __se_sys##name(struct pt_regs *regs); \ - __S390_SYS_STUBx(x, name, __VA_ARGS__) \ - long __se_sys##name(struct pt_regs *regs) \ - { \ - long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ - __SC_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#define __SYSCALL_DEFINEx(x, name, ...) \ + long __s390x_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__); \ + long __s390x_sys##name(struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \ + } \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) #endif /* _ASM_S390_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index ab1c6316055c..9088c5267f35 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -11,8 +11,34 @@ #ifndef __ASM_S390_SYSINFO_H #define __ASM_S390_SYSINFO_H -#include <asm/bitsperlong.h> #include <linux/uuid.h> +#include <asm/bitsperlong.h> +#include <asm/asm.h> + +/* + * stsi - store system information + * + * Returns the current configuration level if function code 0 was specified. + * Otherwise returns 0 on success or a negative value on error. + */ +static inline int stsi(void *sysinfo, int fc, int sel1, int sel2) +{ + int r0 = (fc << 28) | sel1; + int cc; + + asm volatile( + " lr %%r0,%[r0]\n" + " lr %%r1,%[r1]\n" + " stsi %[sysinfo]\n" + " lr %[r0],%%r0\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [r0] "+d" (r0), [sysinfo] "=Q" (*(char *)sysinfo) + : [r1] "d" (sel2) + : CC_CLOBBER_LIST("0", "1", "memory")); + if (cc == 3) + return -EOPNOTSUPP; + return fc ? 0 : (unsigned int)r0 >> 28; +} struct sysinfo_1_1_1 { unsigned char p:1; @@ -40,6 +66,10 @@ struct sysinfo_1_1_1 { unsigned int ncr; unsigned int npr; unsigned int ntr; + char reserved_3[4]; + char model_var_cap[16]; + unsigned int model_var_cap_rating; + unsigned int nvr; }; struct sysinfo_1_2_1 { diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h deleted file mode 100644 index 46fa3020b41e..000000000000 --- a/arch/s390/include/asm/termios.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ -#ifndef _S390_TERMIOS_H -#define _S390_TERMIOS_H - -#include <uapi/asm/termios.h> - - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2)) -#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2)) - -#include <asm-generic/termios-base.h> - -#endif /* _S390_TERMIOS_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index b2ffcb4fe000..391eb04d26d8 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -9,11 +9,12 @@ #define _ASM_THREAD_INFO_H #include <linux/bits.h> +#include <vdso/page.h> /* * General size of kernel stacks */ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) || defined(CONFIG_KMSAN) #define THREAD_SIZE_ORDER 4 #else #define THREAD_SIZE_ORDER 2 @@ -21,12 +22,9 @@ #define BOOT_STACK_SIZE (PAGE_SIZE << 2) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#ifndef __ASSEMBLY__ -#include <asm/lowcore.h> -#include <asm/page.h> +#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) -#define STACK_INIT_OFFSET \ - (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs)) +#ifndef __ASSEMBLY__ /* * low level task data that entry.S needs immediate access to @@ -38,6 +36,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ unsigned int cpu; /* current CPU */ + unsigned char sie; /* running in SIE context */ }; /* @@ -50,9 +49,6 @@ struct thread_info { struct task_struct; -void arch_release_task_struct(struct task_struct *tsk); -int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); - void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec @@ -61,46 +57,45 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers */ -/* _TIF_WORK bits */ #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_UPROBE 3 /* breakpointed or single-stepping */ -#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ +#define TIF_UPROBE 4 /* breakpointed or single-stepping */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_PGSTE 6 /* New mm's will use 4K page tables */ +#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */ +#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ - #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ #define TIF_SINGLE_STEP 19 /* This task is single stepped */ #define TIF_BLOCK_STEP 20 /* This task is block stepped */ #define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ - -/* _TIF_TRACE bits */ #define TIF_SYSCALL_TRACE 24 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ #define TIF_SECCOMP 26 /* secure computing */ #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) -#define _TIF_ISOLATE_BP BIT(TIF_ISOLATE_BP) +#define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) +#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) - #define _TIF_31BIT BIT(TIF_31BIT) +#define _TIF_MEMDIE BIT(TIF_MEMDIE) +#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) - +#define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) +#define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) #define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP BIT(TIF_SECCOMP) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index ce878e85b6e4..bed8d0b5a282 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -13,6 +13,8 @@ #include <linux/preempt.h> #include <linux/time64.h> #include <asm/lowcore.h> +#include <asm/machine.h> +#include <asm/asm.h> /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL @@ -44,11 +46,12 @@ static inline int set_tod_clock(__u64 time) int cc; asm volatile( - " sck %1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) : "Q" (time) : "cc"); - return cc; + " sck %[time]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [time] "Q" (time) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } static inline int store_tod_clock_ext_cc(union tod_clock *clk) @@ -56,14 +59,15 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk) int cc; asm volatile( - " stcke %1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc), "=Q" (*clk) : : "cc"); - return cc; + " stcke %[clk]\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [clk] "=Q" (*clk) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc); } -static inline void store_tod_clock_ext(union tod_clock *tod) +static __always_inline void store_tod_clock_ext(union tod_clock *tod) { asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); } @@ -93,6 +97,7 @@ extern unsigned char ptff_function_mask[16]; #define PTFF_QAF 0x00 /* query available functions */ #define PTFF_QTO 0x01 /* query tod offset */ #define PTFF_QSI 0x02 /* query steering information */ +#define PTFF_QPT 0x03 /* query physical clock */ #define PTFF_QUI 0x04 /* query UTC information */ #define PTFF_ATO 0x40 /* adjust tod offset */ #define PTFF_STO 0x41 /* set tod offset */ @@ -149,35 +154,34 @@ struct ptff_qui { " lgr 0,%[reg0]\n" \ " lgr 1,%[reg1]\n" \ " ptff\n" \ - " ipm %[rc]\n" \ - " srl %[rc],28\n" \ - : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \ + CC_IPM(rc) \ + : CC_OUT(rc, rc), "+m" (*(struct addrtype *)reg1) \ : [reg0] "d" (reg0), [reg1] "d" (reg1) \ - : "cc", "0", "1"); \ - rc; \ + : CC_CLOBBER_LIST("0", "1")); \ + CC_TRANSFORM(rc); \ }) static inline unsigned long local_tick_disable(void) { unsigned long old; - old = S390_lowcore.clock_comparator; - S390_lowcore.clock_comparator = clock_comparator_max; - set_clock_comparator(S390_lowcore.clock_comparator); + old = get_lowcore()->clock_comparator; + get_lowcore()->clock_comparator = clock_comparator_max; + set_clock_comparator(get_lowcore()->clock_comparator); return old; } static inline void local_tick_enable(unsigned long comp) { - S390_lowcore.clock_comparator = comp; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = comp; + set_clock_comparator(get_lowcore()->clock_comparator); } #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ typedef unsigned long cycles_t; -static inline unsigned long get_tod_clock(void) +static __always_inline unsigned long get_tod_clock(void) { union tod_clock clk; @@ -204,6 +208,11 @@ void init_cpu_timer(void); extern union tod_clock tod_clock_base; +static __always_inline unsigned long __get_tod_clock_monotonic(void) +{ + return get_tod_clock() - tod_clock_base.tod; +} + /** * get_clock_monotonic - returns current time in clock rate units * @@ -216,7 +225,7 @@ static inline unsigned long get_tod_clock_monotonic(void) unsigned long tod; preempt_disable_notrace(); - tod = get_tod_clock() - tod_clock_base.tod; + tod = __get_tod_clock_monotonic(); preempt_enable_notrace(); return tod; } @@ -240,11 +249,16 @@ static inline unsigned long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long tod_to_ns(unsigned long todval) +static __always_inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +static __always_inline u128 eitod_to_ns(u128 todval) +{ + return (todval * 125) >> 9; +} + /** * tod_after - compare two 64 bit TOD values * @a: first 64 bit TOD timestamp @@ -254,7 +268,7 @@ static inline unsigned long tod_to_ns(unsigned long todval) */ static inline int tod_after(unsigned long a, unsigned long b) { - if (MACHINE_HAS_SCC) + if (machine_has_scc()) return (long) a > (long) b; return a > b; } @@ -268,7 +282,7 @@ static inline int tod_after(unsigned long a, unsigned long b) */ static inline int tod_after_eq(unsigned long a, unsigned long b) { - if (MACHINE_HAS_SCC) + if (machine_has_scc()) return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3a5c8fb590e5..1e50f6f1ad9d 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,10 +22,11 @@ * Pages used for the page tables is a different story. FIXME: more */ -void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size); + struct page *page, bool delay_rmap, int page_size); +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -35,16 +36,36 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, #include <asm/tlbflush.h> #include <asm-generic/tlb.h> +#include <asm/gmap.h> /* * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page - * has already been freed, so just do free_page_and_swap_cache. + * has already been freed, so just do free_folio_and_swap_cache. + * + * s390 doesn't delay rmap removal. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) + struct page *page, bool delay_rmap, int page_size) +{ + VM_WARN_ON_ONCE(delay_rmap); + + free_folio_and_swap_cache(page_folio(page)); + return false; +} + +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap) { - free_page_and_swap_cache(page); + struct encoded_page *encoded_pages[] = { + encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT), + encode_nr_pages(nr_pages), + }; + + VM_WARN_ON_ONCE(delay_rmap); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); + + free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages)); return false; } @@ -64,12 +85,9 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - /* - * page_table_free_rcu takes care of the allocation bit masks - * of the 2K table fragments in the 4K page table page, - * then calls tlb_remove_table. - */ - page_table_free_rcu(tlb, (unsigned long *) pte, address); + if (mm_has_pgste(tlb->mm)) + gmap_unlink(tlb->mm, (unsigned long *)pte, address); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } /* @@ -84,12 +102,11 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } /* @@ -107,7 +124,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb_remove_table(tlb, p4d); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } /* @@ -122,11 +139,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, { if (mm_pud_folded(tlb->mm)) return; + __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_p4ds = 1; - tlb_remove_table(tlb, pud); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } - #endif /* _S390_TLB_H */ diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index a6e2cd89b609..75491baa2197 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -2,9 +2,11 @@ #ifndef _S390_TLBFLUSH_H #define _S390_TLBFLUSH_H +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/sched.h> #include <asm/processor.h> +#include <asm/machine.h> /* * Flush all TLB entries on the local CPU. @@ -22,7 +24,7 @@ static inline void __tlb_flush_idte(unsigned long asce) unsigned long opt; opt = IDTE_PTOA; - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc"); @@ -46,18 +48,13 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; - /* - * If the machine has IDTE we prefer to do a per mm flush - * on all cpus instead of doing a local flush if the mm - * only ran on the local cpu. - */ preempt_disable(); atomic_inc(&mm->context.flush_count); /* Reset TLB flush mask */ cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); - if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { + if (cpu_has_idte() && gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); @@ -71,7 +68,7 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) static inline void __tlb_flush_kernel(void) { - if (MACHINE_HAS_IDTE) + if (cpu_has_idte()) __tlb_flush_idte(init_mm.context.asce); else __tlb_flush_global(); diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 3a0ac0c7a9a3..44110847342a 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -61,12 +61,21 @@ static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return smp_get_base_cpu(cpu) == cpu; +} +#define topology_is_primary_thread topology_is_primary_thread + #define POLARIZATION_UNKNOWN (-1) #define POLARIZATION_HRZ (0) #define POLARIZATION_VL (1) #define POLARIZATION_VM (2) #define POLARIZATION_VH (3) +#define CPU_CAPACITY_HIGH SCHED_CAPACITY_SCALE +#define CPU_CAPACITY_LOW (SCHED_CAPACITY_SCALE >> 3) + #define SD_BOOK_INIT SD_CPU_INIT #ifdef CONFIG_NUMA diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h index 1ac538b8cbf5..f76e5fdff23a 100644 --- a/arch/s390/include/asm/tpi.h +++ b/arch/s390/include/asm/tpi.h @@ -19,6 +19,19 @@ struct tpi_info { u32 :12; } __packed __aligned(4); +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/trace/hiperdispatch.h b/arch/s390/include/asm/trace/hiperdispatch.h new file mode 100644 index 000000000000..46462ee645b0 --- /dev/null +++ b/arch/s390/include/asm/trace/hiperdispatch.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Tracepoint header for hiperdispatch + * + * Copyright IBM Corp. 2024 + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM s390 + +#if !defined(_TRACE_S390_HIPERDISPATCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_S390_HIPERDISPATCH_H + +#include <linux/tracepoint.h> + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE + +#define TRACE_INCLUDE_PATH asm/trace +#define TRACE_INCLUDE_FILE hiperdispatch + +TRACE_EVENT(s390_hd_work_fn, + TP_PROTO(int steal_time_percentage, + int entitled_core_count, + int highcap_core_count), + TP_ARGS(steal_time_percentage, + entitled_core_count, + highcap_core_count), + TP_STRUCT__entry(__field(int, steal_time_percentage) + __field(int, entitled_core_count) + __field(int, highcap_core_count)), + TP_fast_assign(__entry->steal_time_percentage = steal_time_percentage; + __entry->entitled_core_count = entitled_core_count; + __entry->highcap_core_count = highcap_core_count;), + TP_printk("steal: %d entitled_core_count: %d highcap_core_count: %d", + __entry->steal_time_percentage, + __entry->entitled_core_count, + __entry->highcap_core_count) +); + +TRACE_EVENT(s390_hd_rebuild_domains, + TP_PROTO(int current_highcap_core_count, + int new_highcap_core_count), + TP_ARGS(current_highcap_core_count, + new_highcap_core_count), + TP_STRUCT__entry(__field(int, current_highcap_core_count) + __field(int, new_highcap_core_count)), + TP_fast_assign(__entry->current_highcap_core_count = current_highcap_core_count; + __entry->new_highcap_core_count = new_highcap_core_count), + TP_printk("change highcap_core_count: %u -> %u", + __entry->current_highcap_core_count, + __entry->new_highcap_core_count) +); + +#endif /* _TRACE_S390_HIPERDISPATCH_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index c2c9995466e0..a43fc88c0050 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -13,25 +13,81 @@ /* * User space memory access functions */ +#include <linux/pgtable.h> #include <asm/asm-extable.h> #include <asm/processor.h> -#include <asm/ctl_reg.h> #include <asm/extable.h> #include <asm/facility.h> #include <asm-generic/access_ok.h> +#include <asm/asce.h> +#include <linux/instrumented.h> void debug_user_asce(int exit); -unsigned long __must_check -raw_copy_from_user(void *to, const void __user *from, unsigned long n); - -unsigned long __must_check -raw_copy_to_user(void __user *to, const void *from, unsigned long n); +#ifdef CONFIG_KMSAN +#define uaccess_kmsan_or_inline noinline __maybe_unused __no_sanitize_memory +#else +#define uaccess_kmsan_or_inline __always_inline +#endif -#ifndef CONFIG_KASAN #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -#endif + +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_from_user(void *to, const void __user *from, unsigned long size) +{ + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " lhi %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_FROM(0b, 0b) + EX_TABLE_UA_MVCOS_FROM(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char __user *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (likely(CC_TRANSFORM(cc) == 0)) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } +} + +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_to_user(void __user *to, const void *from, unsigned long size) +{ + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " llilh %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_TO(0b, 0b) + EX_TABLE_UA_MVCOS_TO(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (likely(CC_TRANSFORM(cc) == 0)) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } +} unsigned long __must_check _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); @@ -55,163 +111,113 @@ copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned lo return n; } -union oac { - unsigned int val; - struct { - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac1; - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac2; - }; -}; - int __noreturn __put_user_bad(void); -#define __put_user_asm(to, from, size) \ -({ \ - union oac __oac_spec = { \ - .oac1.as = PSW_BITS_AS_SECONDARY, \ - .oac1.a = 1, \ - }; \ - int __rc; \ - \ - asm volatile( \ - " lr 0,%[spec]\n" \ - "0: mvcos %[_to],%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ - EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ - : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [spec] "d" (__oac_spec.val) \ - : "cc", "0"); \ - __rc; \ -}) - -static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) -{ - int rc; - - switch (size) { - case 1: - rc = __put_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size); - break; - case 2: - rc = __put_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size); - break; - case 4: - rc = __put_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size); - break; - case 8: - rc = __put_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size); - break; - default: - __put_user_bad(); - break; - } - return rc; +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#define DEFINE_PUT_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__put_user_##type##_noinstr(unsigned type __user *to, \ + unsigned type *from, \ + unsigned long size) \ +{ \ + asm goto( \ + " llilh %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[Efault]) \ + EX_TABLE(1b, %l[Efault]) \ + : [to] "+Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ + : "cc", "0" \ + : Efault \ + ); \ + return 0; \ +Efault: \ + return -EFAULT; \ } -int __noreturn __get_user_bad(void); +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ -#define __get_user_asm(to, from, size) \ -({ \ - union oac __oac_spec = { \ - .oac2.as = PSW_BITS_AS_SECONDARY, \ - .oac2.a = 1, \ - }; \ - int __rc; \ +#define DEFINE_PUT_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__put_user_##type##_noinstr(unsigned type __user *to, \ + unsigned type *from, \ + unsigned long size) \ +{ \ + int rc; \ \ - asm volatile( \ - " lr 0,%[spec]\n" \ - "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ + asm_inline volatile( \ + " llilh %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: lhi %[rc],0\n" \ "2:\n" \ - EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ - EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ - : [rc] "=&d" (__rc), "=Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [spec] "d" (__oac_spec.val), [_to] "a" (to), \ - [_ksize] "K" (size) \ + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) \ + : [rc] "=d" (rc), [to] "+Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ : "cc", "0"); \ - __rc; \ -}) + return rc; \ +} -static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) -{ - int rc; +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - switch (size) { - case 1: - rc = __get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size); - break; - case 2: - rc = __get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size); - break; - case 4: - rc = __get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size); - break; - case 8: - rc = __get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size); - break; - default: - __get_user_bad(); - break; - } - return rc; +DEFINE_PUT_USER_NOINSTR(char); +DEFINE_PUT_USER_NOINSTR(short); +DEFINE_PUT_USER_NOINSTR(int); +DEFINE_PUT_USER_NOINSTR(long); + +#define DEFINE_PUT_USER(type) \ +static __always_inline int \ +__put_user_##type(unsigned type __user *to, unsigned type *from, \ + unsigned long size) \ +{ \ + int rc; \ + \ + rc = __put_user_##type##_noinstr(to, from, size); \ + instrument_put_user(*from, to, size); \ + return rc; \ } -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - */ +DEFINE_PUT_USER(char); +DEFINE_PUT_USER(short); +DEFINE_PUT_USER(int); +DEFINE_PUT_USER(long); + #define __put_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __x = (x); \ - int __pu_err = -EFAULT; \ + int __prc; \ \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: \ + __prc = __put_user_char((unsigned char __user *)(ptr), \ + (unsigned char *)&__x, \ + sizeof(*(ptr))); \ + break; \ case 2: \ + __prc = __put_user_short((unsigned short __user *)(ptr),\ + (unsigned short *)&__x, \ + sizeof(*(ptr))); \ + break; \ case 4: \ + __prc = __put_user_int((unsigned int __user *)(ptr), \ + (unsigned int *)&__x, \ + sizeof(*(ptr))); \ + break; \ case 8: \ - __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __prc = __put_user_long((unsigned long __user *)(ptr), \ + (unsigned long *)&__x, \ + sizeof(*(ptr))); \ break; \ default: \ - __put_user_bad(); \ + __prc = __put_user_bad(); \ break; \ } \ - __builtin_expect(__pu_err, 0); \ + __builtin_expect(__prc, 0); \ }) #define put_user(x, ptr) \ @@ -220,45 +226,129 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign __put_user(x, ptr); \ }) +int __noreturn __get_user_bad(void); + +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#define DEFINE_GET_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__get_user_##type##_noinstr(unsigned type *to, \ + const unsigned type __user *from, \ + unsigned long size) \ +{ \ + asm goto( \ + " lhi %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[Efault]) \ + EX_TABLE(1b, %l[Efault]) \ + : [to] "=Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ + : "cc", "0" \ + : Efault \ + ); \ + return 0; \ +Efault: \ + *to = 0; \ + return -EFAULT; \ +} + +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +#define DEFINE_GET_USER_NOINSTR(type) \ +static uaccess_kmsan_or_inline int \ +__get_user_##type##_noinstr(unsigned type *to, \ + const unsigned type __user *from, \ + unsigned long size) \ +{ \ + int rc; \ + \ + asm_inline volatile( \ + " lhi %%r0,%[spec]\n" \ + "0: mvcos %[to],%[from],%[size]\n" \ + "1: lhi %[rc],0\n" \ + "2:\n" \ + EX_TABLE_UA_FAULT(0b, 2b, %[rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[rc]) \ + : [rc] "=d" (rc), [to] "=Q" (*to) \ + : [size] "d" (size), [from] "Q" (*from), \ + [spec] "I" (0x81) \ + : "cc", "0"); \ + if (likely(!rc)) \ + return 0; \ + *to = 0; \ + return rc; \ +} + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +DEFINE_GET_USER_NOINSTR(char); +DEFINE_GET_USER_NOINSTR(short); +DEFINE_GET_USER_NOINSTR(int); +DEFINE_GET_USER_NOINSTR(long); + +#define DEFINE_GET_USER(type) \ +static __always_inline int \ +__get_user_##type(unsigned type *to, const unsigned type __user *from, \ + unsigned long size) \ +{ \ + int rc; \ + \ + rc = __get_user_##type##_noinstr(to, from, size); \ + instrument_get_user(*to); \ + return rc; \ +} + +DEFINE_GET_USER(char); +DEFINE_GET_USER(short); +DEFINE_GET_USER(int); +DEFINE_GET_USER(long); + #define __get_user(x, ptr) \ ({ \ - int __gu_err = -EFAULT; \ + const __user void *____guptr = (ptr); \ + int __grc; \ \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: { \ + const unsigned char __user *__guptr = ____guptr; \ unsigned char __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_char(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 2: { \ + const unsigned short __user *__guptr = ____guptr; \ unsigned short __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_short(&__x, __guptr, sizeof(*(ptr)));\ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 4: { \ + const unsigned int __user *__guptr = ____guptr; \ unsigned int __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_int(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ case 8: { \ + const unsigned long __user *__guptr = ____guptr; \ unsigned long __x; \ \ - __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + __grc = __get_user_long(&__x, __guptr, sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *)&__x; \ break; \ }; \ default: \ - __get_user_bad(); \ + __grc = __get_user_bad(); \ break; \ } \ - __builtin_expect(__gu_err, 0); \ + __builtin_expect(__grc, 0); \ }) #define get_user(x, ptr) \ @@ -274,121 +364,330 @@ long __must_check strncpy_from_user(char *dst, const char __user *src, long coun long __must_check strnlen_user(const char __user *src, long count); -/* - * Zero Userspace - */ -unsigned long __must_check __clear_user(void __user *to, unsigned long size); +static uaccess_kmsan_or_inline __must_check unsigned long +__clear_user(void __user *to, unsigned long size) +{ + unsigned long osize; + int cc; + + while (1) { + osize = size; + asm_inline volatile( + " llilh %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_TO(0b, 0b) + EX_TABLE_UA_MVCOS_TO(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) + : [spec] "I" (0x81), [from] "Q" (*(const char *)empty_zero_page) + : CC_CLOBBER_LIST("memory", "0")); + if (__builtin_constant_p(osize) && osize <= 4096) + return osize - size; + if (CC_TRANSFORM(cc) == 0) + return osize - size; + size -= 4096; + to += 4096; + } +} -static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) +static __always_inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { might_fault(); return __clear_user(to, n); } -int copy_to_user_real(void __user *dest, unsigned long src, unsigned long count); -void *s390_kernel_write(void *dst, const void *src, size_t size); +void *__s390_kernel_write(void *dst, const void *src, size_t size); -int __noreturn __put_kernel_bad(void); +static inline void *s390_kernel_write(void *dst, const void *src, size_t size) +{ + if (__is_defined(__DECOMPRESSOR)) + return memcpy(dst, src, size); + return __s390_kernel_write(dst, src, size); +} -#define __put_kernel_asm(val, to, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - "0: " insn " %[_val],%[_to]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ - EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ - : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \ - : [_val] "d" (val) \ - : "cc"); \ - __rc; \ -}) +void __noreturn __mvc_kernel_nofault_bad(void); -#define __put_kernel_nofault(dst, src, type, err_label) \ +#if defined(CONFIG_CC_HAS_ASM_GOTO_OUTPUT) && defined(CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS) + +#define __mvc_kernel_nofault(dst, src, type, err_label) \ do { \ - unsigned long __x = (unsigned long)(*((type *)(src))); \ - int __pk_err; \ - \ switch (sizeof(type)) { \ case 1: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ - break; \ case 2: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ - break; \ case 4: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ - break; \ case 8: \ - __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + asm goto( \ + "0: mvc %O[_dst](%[_len],%R[_dst]),%[_src]\n" \ + "1: nopr %%r7\n" \ + EX_TABLE(0b, %l[err_label]) \ + EX_TABLE(1b, %l[err_label]) \ + : [_dst] "=Q" (*(type *)dst) \ + : [_src] "Q" (*(type *)(src)), \ + [_len] "I" (sizeof(type)) \ + : \ + : err_label); \ break; \ default: \ - __pk_err = __put_kernel_bad(); \ + __mvc_kernel_nofault_bad(); \ break; \ } \ - if (unlikely(__pk_err)) \ - goto err_label; \ } while (0) -int __noreturn __get_kernel_bad(void); +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT) && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -#define __get_kernel_asm(val, from, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - "0: " insn " %[_val],%[_from]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \ - EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \ - : [rc] "=d" (__rc), [_val] "=d" (val) \ - : [_from] "Q" (*(from)) \ - : "cc"); \ - __rc; \ -}) - -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define __mvc_kernel_nofault(dst, src, type, err_label) \ do { \ - int __gk_err; \ + type *(__dst) = (type *)(dst); \ + int __rc; \ \ switch (sizeof(type)) { \ - case 1: { \ - unsigned char __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 2: { \ - unsigned short __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 4: { \ - unsigned int __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ - *((type *)(dst)) = (type)__x; \ - break; \ - }; \ - case 8: { \ - unsigned long __x; \ - \ - __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ - *((type *)(dst)) = (type)__x; \ + case 1: \ + case 2: \ + case 4: \ + case 8: \ + asm_inline volatile( \ + "0: mvc 0(%[_len],%[_dst]),%[_src]\n" \ + "1: lhi %[_rc],0\n" \ + "2:\n" \ + EX_TABLE_UA_FAULT(0b, 2b, %[_rc]) \ + EX_TABLE_UA_FAULT(1b, 2b, %[_rc]) \ + : [_rc] "=d" (__rc), \ + "=m" (*__dst) \ + : [_src] "Q" (*(type *)(src)), \ + [_dst] "a" (__dst), \ + [_len] "I" (sizeof(type))); \ + if (__rc) \ + goto err_label; \ break; \ - }; \ default: \ - __gk_err = __get_kernel_bad(); \ + __mvc_kernel_nofault_bad(); \ break; \ } \ - if (unlikely(__gk_err)) \ - goto err_label; \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ + +#define __get_kernel_nofault __mvc_kernel_nofault +#define __put_kernel_nofault __mvc_kernel_nofault + +void __cmpxchg_user_key_called_with_bad_pointer(void); + +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, + __uint128_t old, __uint128_t new, + unsigned long key, int size) +{ + bool sacf_flag; + int rc = 0; + + switch (size) { + case 1: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = ((unsigned int)old & 0xff) << shift; + _new = ((unsigned int)new & 0xff) << shift; + mask = ~(0xff << shift); + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + disable_sacf_uaccess(sacf_flag); + *(unsigned char *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 2: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = ((unsigned int)old & 0xffff) << shift; + _new = ((unsigned int)new & 0xffff) << shift; + mask = ~(0xffff << shift); + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + disable_sacf_uaccess(sacf_flag); + *(unsigned short *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 4: { + unsigned int prev = old; + + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cs %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" ((unsigned int)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + disable_sacf_uaccess(sacf_flag); + *(unsigned int *)uval = prev; + return rc; + } + case 8: { + unsigned long prev = old; + + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: csg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" ((unsigned long)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + disable_sacf_uaccess(sacf_flag); + *(unsigned long *)uval = prev; + return rc; + } + case 16: { + __uint128_t prev = old; + + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + disable_sacf_uaccess(sacf_flag); + *(__uint128_t *)uval = prev; + return rc; + } + } + __cmpxchg_user_key_called_with_bad_pointer(); + return rc; +} + +/** + * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys + * @ptr: User space address of value to compare to @old and exchange with + * @new. Must be aligned to sizeof(*@ptr). + * @uval: Address where the old value of *@ptr is written to. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written to *@uval. + * @new: New value to place at *@ptr. + * @key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on a user space target, honoring storage key protection. + * @key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * The caller must compare *@uval and @old to determine if values have been + * exchanged. In case of an exception *@uval is set to zero. + * + * Return: 0: cmpxchg executed + * -EFAULT: an exception happened when trying to access *@ptr + * -EAGAIN: maxed out number of retries (byte and short only) + */ +#define cmpxchg_user_key(ptr, uval, old, new, key) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(uval) __uval = (uval); \ + \ + BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ + might_fault(); \ + __chk_user_ptr(__ptr); \ + __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ + (old), (new), (key), sizeof(*(__ptr))); \ +}) + #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 4260bc5ce7f8..70fc671397da 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -35,6 +35,5 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -#define __ARCH_WANT_SYS_CLONE3 #endif /* _ASM_S390_UNISTD_H_ */ diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index 0bf06f1682d8..b8ecf04e3468 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -4,7 +4,7 @@ #include <linux/sched.h> #include <linux/ftrace.h> -#include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/llist.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> @@ -43,13 +43,15 @@ struct unwind_state { bool error; }; -/* Recover the return address modified by kretprobe and ftrace_graph. */ +/* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long ip) { - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); - if (is_kretprobe_trampoline(ip)) - ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur); + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(ip)) + ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur); +#endif return ip; } diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index cfea7b77a5b8..8018549a1ad2 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -2,7 +2,7 @@ /* * Ultravisor Interfaces * - * Copyright IBM Corp. 2019, 2022 + * Copyright IBM Corp. 2019, 2024 * * Author(s): * Vasily Gorbik <gor@linux.ibm.com> @@ -16,7 +16,7 @@ #include <linux/bug.h> #include <linux/sched.h> #include <asm/page.h> -#include <asm/gmap.h> +#include <asm/asm.h> #define UVC_CC_OK 0 #define UVC_CC_ERROR 1 @@ -28,12 +28,15 @@ #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_MORE_DATA 0x0100 #define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 +#define UVC_CMD_QUERY_KEYS 0x0002 #define UVC_CMD_INIT_UV 0x000f #define UVC_CMD_CREATE_SEC_CONF 0x0100 #define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 #define UVC_CMD_CREATE_SEC_CPU 0x0120 #define UVC_CMD_DESTROY_SEC_CPU 0x0121 #define UVC_CMD_CONV_TO_SEC_STOR 0x0200 @@ -50,9 +53,17 @@ #define UVC_CMD_SET_UNSHARE_ALL 0x0340 #define UVC_CMD_PIN_PAGE_SHARED 0x0341 #define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 +#define UVC_CMD_DUMP_INIT 0x0400 +#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401 +#define UVC_CMD_DUMP_CPU 0x0402 +#define UVC_CMD_DUMP_COMPLETE 0x0403 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 #define UVC_CMD_RETR_ATTEST 0x1020 +#define UVC_CMD_ADD_SECRET 0x1031 +#define UVC_CMD_LIST_SECRETS 0x1033 +#define UVC_CMD_LOCK_SECRETS 0x1034 +#define UVC_CMD_RETR_SECRET 0x1035 /* Bits in installed uv calls */ enum uv_cmds_inst { @@ -77,12 +88,24 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, + BIT_UVC_CMD_DUMP_INIT = 24, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, + BIT_UVC_CMD_DUMP_CPU = 26, + BIT_UVC_CMD_DUMP_COMPLETE = 27, BIT_UVC_CMD_RETR_ATTEST = 28, + BIT_UVC_CMD_ADD_SECRET = 29, + BIT_UVC_CMD_LIST_SECRETS = 30, + BIT_UVC_CMD_LOCK_SECRETS = 31, + BIT_UVC_CMD_RETR_SECRET = 33, + BIT_UVC_CMD_QUERY_KEYS = 34, }; enum uv_feat_ind { BIT_UV_FEAT_MISC = 0, BIT_UV_FEAT_AIV = 1, + BIT_UV_FEAT_AP = 4, + BIT_UV_FEAT_AP_INTR = 5, }; struct uv_cb_header { @@ -107,12 +130,42 @@ struct uv_cb_qui { u32 reserved70[3]; /* 0x0070 */ u32 max_num_sec_conf; /* 0x007c */ u64 max_guest_stor_addr; /* 0x0080 */ - u8 reserved88[158 - 136]; /* 0x0088 */ + u8 reserved88[0x9e - 0x88]; /* 0x0088 */ u16 max_guest_cpu_id; /* 0x009e */ u64 uv_feature_indications; /* 0x00a0 */ - u8 reserveda8[200 - 168]; /* 0x00a8 */ + u64 reserveda8; /* 0x00a8 */ + u64 supp_se_hdr_versions; /* 0x00b0 */ + u64 supp_se_hdr_pcf; /* 0x00b8 */ + u64 reservedc0; /* 0x00c0 */ + u64 conf_dump_storage_state_len; /* 0x00c8 */ + u64 conf_dump_finalize_len; /* 0x00d0 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u64 reservedf0; /* 0x00f0 */ + u64 supp_add_secret_req_ver; /* 0x00f8 */ + u64 supp_add_secret_pcf; /* 0x0100 */ + u64 supp_secret_types; /* 0x0108 */ + u16 max_assoc_secrets; /* 0x0110 */ + u16 max_retr_secrets; /* 0x0112 */ + u8 reserved114[0x120 - 0x114]; /* 0x0114 */ } __packed __aligned(8); +struct uv_key_hash { + u64 dword[4]; +} __packed __aligned(8); + +#define UVC_QUERY_KEYS_IDX_HK 0 +#define UVC_QUERY_KEYS_IDX_BACK_HK 1 + +/* Query Ultravisor Keys */ +struct uv_cb_query_keys { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08[3]; /* 0x0008 */ + struct uv_key_hash key_hashes[15]; /* 0x0020 */ +} __packed __aligned(8); +static_assert(sizeof(struct uv_cb_query_keys) == 0x200); + /* Initialize Ultravisor */ struct uv_cb_init { struct uv_cb_header header; @@ -129,7 +182,15 @@ struct uv_cb_cgc { u64 guest_handle; u64 conf_base_stor_origin; u64 conf_virt_stor_origin; - u64 reserved30; + u8 reserved30[6]; + union { + struct { + u16 : 14; + u16 ap_instr_intr : 1; + u16 ap_allow_instr : 1; + }; + u16 raw; + } flags; u64 guest_stor_origin; u64 guest_stor_len; u64 guest_sca; @@ -213,6 +274,14 @@ struct uv_cb_nodata { u64 reserved20[4]; } __packed __aligned(8); +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + /* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; @@ -240,18 +309,139 @@ struct uv_cb_attest { u64 reserved168[4]; /* 0x0168 */ } __packed __aligned(8); +struct uv_cb_dump_cpu { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 dump_area_origin; + u64 reserved28[5]; +} __packed __aligned(8); + +struct uv_cb_dump_stor_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 gaddr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_dump_complete { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 reserved30[5]; +} __packed __aligned(8); + +/* + * A common UV call struct for pv guests that contains a single address + * Examples: + * Add Secret + */ +struct uv_cb_guest_addr { + struct uv_cb_header header; + u64 reserved08[3]; + u64 addr; + u64 reserved28[4]; +} __packed __aligned(8); + +#define UVC_RC_RETR_SECR_BUF_SMALL 0x0109 +#define UVC_RC_RETR_SECR_STORE_EMPTY 0x010f +#define UVC_RC_RETR_SECR_INV_IDX 0x0110 +#define UVC_RC_RETR_SECR_INV_SECRET 0x0111 + +struct uv_cb_retr_secr { + struct uv_cb_header header; + u64 reserved08[2]; + u16 secret_idx; + u16 reserved1a; + u32 buf_size; + u64 buf_addr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_list_secrets { + struct uv_cb_header header; + u64 reserved08[2]; + u8 reserved18[6]; + u16 start_idx; + u64 list_addr; + u64 reserved28[4]; +} __packed __aligned(8); + +enum uv_secret_types { + UV_SECRET_INVAL = 0x0, + UV_SECRET_NULL = 0x1, + UV_SECRET_ASSOCIATION = 0x2, + UV_SECRET_PLAIN = 0x3, + UV_SECRET_AES_128 = 0x4, + UV_SECRET_AES_192 = 0x5, + UV_SECRET_AES_256 = 0x6, + UV_SECRET_AES_XTS_128 = 0x7, + UV_SECRET_AES_XTS_256 = 0x8, + UV_SECRET_HMAC_SHA_256 = 0x9, + UV_SECRET_HMAC_SHA_512 = 0xa, + /* 0x0b - 0x10 reserved */ + UV_SECRET_ECDSA_P256 = 0x11, + UV_SECRET_ECDSA_P384 = 0x12, + UV_SECRET_ECDSA_P521 = 0x13, + UV_SECRET_ECDSA_ED25519 = 0x14, + UV_SECRET_ECDSA_ED448 = 0x15, +}; + +/** + * uv_secret_list_item_hdr - UV secret metadata. + * @index: Index of the secret in the secret list. + * @type: Type of the secret. See `enum uv_secret_types`. + * @length: Length of the stored secret. + */ +struct uv_secret_list_item_hdr { + u16 index; + u16 type; + u32 length; +} __packed __aligned(8); + +#define UV_SECRET_ID_LEN 32 +/** + * uv_secret_list_item - UV secret entry. + * @hdr: The metadata of this secret. + * @id: The ID of this secret, not the secret itself. + */ +struct uv_secret_list_item { + struct uv_secret_list_item_hdr hdr; + u64 reserverd08; + u8 id[UV_SECRET_ID_LEN]; +} __packed __aligned(8); + +/** + * uv_secret_list - UV secret-metadata list. + * @num_secr_stored: Number of secrets stored in this list. + * @total_num_secrets: Number of secrets stored in the UV for this guest. + * @next_secret_idx: positive number if there are more secrets available or zero. + * @secrets: Up to 85 UV-secret metadata entries. + */ +struct uv_secret_list { + u16 num_secr_stored; + u16 total_num_secrets; + u16 next_secret_idx; + u16 reserved_06; + u64 reserved_08; + struct uv_secret_list_item secrets[85]; +} __packed __aligned(8); +static_assert(sizeof(struct uv_secret_list) == PAGE_SIZE); + static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; asm volatile( - " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=d" (cc) + " .insn rrf,0xb9a40000,%[r1],%[r2],0,0\n" + CC_IPM(cc) + : CC_OUT(cc, cc) : [r1] "a" (r1), [r2] "a" (r2) - : "memory", "cc"); - return cc; + : CC_CLOBBER_LIST("memory")); + return CC_TRANSFORM(cc); } static inline int uv_call(unsigned long r1, unsigned long r2) @@ -296,6 +486,48 @@ static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) return cc ? -EINVAL : 0; } +/** + * uv_list_secrets() - Do a List Secrets UVC. + * + * @buf: Buffer to write list into; size of one page. + * @start_idx: The smallest index that should be included in the list. + * For the fist invocation use 0. + * @rc: Pointer to store the return code or NULL. + * @rrc: Pointer to store the return reason code or NULL. + * + * This function calls the List Secrets UVC. The result is written into `buf`, + * that needs to be at least one page of writable memory. + * `buf` consists of: + * * %struct uv_secret_list_hdr + * * %struct uv_secret_list_item (multiple) + * + * For `start_idx` use _0_ for the first call. If there are more secrets available + * but could not fit into the page then `rc` is `UVC_RC_MORE_DATA`. + * In this case use `uv_secret_list_hdr.next_secret_idx` for `start_idx`. + * + * Context: might sleep. + * + * Return: The UVC condition code. + */ +static inline int uv_list_secrets(struct uv_secret_list *buf, u16 start_idx, + u16 *rc, u16 *rrc) +{ + struct uv_cb_list_secrets uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_LIST_SECRETS, + .start_idx = start_idx, + .list_addr = (u64)buf, + }; + int cc = uv_call_sched(0, (u64)&uvcb); + + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + + return cc; +} + struct uv_info { unsigned long inst_calls_list[4]; unsigned long uv_base_stor_len; @@ -307,11 +539,28 @@ struct uv_info { unsigned int max_num_sec_conf; unsigned short max_guest_cpu_id; unsigned long uv_feature_indications; + unsigned long supp_se_hdr_ver; + unsigned long supp_se_hdr_pcf; + unsigned long conf_dump_storage_state_len; + unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; + unsigned long supp_add_secret_req_ver; + unsigned long supp_add_secret_pcf; + unsigned long supp_secret_types; + unsigned short max_assoc_secrets; + unsigned short max_retr_secrets; }; extern struct uv_info uv_info; -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +static inline bool uv_has_feature(u8 feature_bit) +{ + if (feature_bit >= sizeof(uv_info.uv_feature_indications) * 8) + return false; + return test_bit_inv(feature_bit, &uv_info.uv_feature_indications); +} + extern int prot_virt_guest; static inline int is_prot_virt_guest(void) @@ -339,7 +588,10 @@ static inline int share(unsigned long addr, u16 cmd) if (!uv_call(0, (u64)&uvcb)) return 0; - return -EINVAL; + pr_err("%s UVC failed (rc: 0x%x, rrc: 0x%x), possible hypervisor bug.\n", + uvcb.header.cmd == UVC_CMD_SET_SHARED_ACCESS ? "Share" : "Unshare", + uvcb.header.rc, uvcb.header.rrc); + panic("System security cannot be guaranteed unless the system panics now.\n"); } /* @@ -363,13 +615,11 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -#else -#define is_prot_virt_guest() 0 -static inline int uv_set_shared(unsigned long addr) { return 0; } -static inline int uv_remove_shared(unsigned long addr) { return 0; } -#endif +int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret); +int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size); -#if IS_ENABLED(CONFIG_KVM) extern int prot_virt_host; static inline int is_prot_virt_host(void) @@ -377,31 +627,14 @@ static inline int is_prot_virt_host(void) return prot_virt_host; } -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int uv_destroy_owned_page(unsigned long paddr); +int uv_pin_shared(unsigned long paddr); +int uv_destroy_folio(struct folio *folio); +int uv_destroy_pte(pte_t pte); +int uv_convert_from_secure_pte(pte_t pte); +int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb); int uv_convert_from_secure(unsigned long paddr); -int uv_convert_owned_from_secure(unsigned long paddr); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); +int uv_convert_from_secure_folio(struct folio *folio); void setup_uv(void); -#else -#define is_prot_virt_host() 0 -static inline void setup_uv(void) {} - -static inline int uv_destroy_owned_page(unsigned long paddr) -{ - return 0; -} - -static inline int uv_convert_from_secure(unsigned long paddr) -{ - return 0; -} - -static inline int uv_convert_owned_from_secure(unsigned long paddr) -{ - return 0; -} -#endif #endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/include/asm/vdso-symbols.h b/arch/s390/include/asm/vdso-symbols.h new file mode 100644 index 000000000000..0df17574d788 --- /dev/null +++ b/arch/s390/include/asm/vdso-symbols.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_VDSO_SYMBOLS_H__ +#define __S390_VDSO_SYMBOLS_H__ + +#include <generated/vdso64-offsets.h> +#ifdef CONFIG_COMPAT +#include <generated/vdso32-offsets.h> +#endif + +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) +#ifdef CONFIG_COMPAT +#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) (-1UL) +#endif + +#endif /* __S390_VDSO_SYMBOLS_H__ */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 53165aa7813a..420a073fdde5 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -6,28 +6,11 @@ #ifndef __ASSEMBLY__ -#include <generated/vdso64-offsets.h> -#ifdef CONFIG_COMPAT -#include <generated/vdso32-offsets.h> -#endif - -#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) -#ifdef CONFIG_COMPAT -#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) -#else -#define VDSO32_SYMBOL(tsk, name) (-1UL) -#endif - -extern struct vdso_data *vdso_data; - int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ -/* Default link address for the vDSO */ -#define VDSO_LBASE 0 - -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #define VDSO_VERSION_STRING LINUX_2.6.29 diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h deleted file mode 100644 index 73ee89142666..000000000000 --- a/arch/s390/include/asm/vdso/data.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __S390_ASM_VDSO_DATA_H -#define __S390_ASM_VDSO_DATA_H - -#include <linux/types.h> -#include <vdso/datapage.h> - -struct arch_vdso_data { - __s64 tod_steering_delta; - __u64 tod_steering_end; -}; - -#endif /* __S390_ASM_VDSO_DATA_H */ diff --git a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..f8713ce39bb2 --- /dev/null +++ b/arch/s390/include/asm/vdso/getrandom.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include <vdso/datapage.h> +#include <asm/vdso/vsyscall.h> +#include <asm/syscall.h> +#include <asm/unistd.h> +#include <asm/page.h> + +/** + * getrandom_syscall - Invoke the getrandom() syscall. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) +{ + return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index db84942eb78f..fb4564308e9d 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -6,23 +6,15 @@ #define VDSO_HAS_CLOCK_GETRES 1 +#define VDSO_DELTA_NOMASK 1 + #include <asm/syscall.h> #include <asm/timex.h> #include <asm/unistd.h> #include <linux/compiler.h> -#define vdso_calc_delta __arch_vdso_calc_delta -static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) -{ - return (cycles - last) * mult; -} - -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} -static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd) { u64 adj, now; @@ -52,12 +44,4 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) return syscall2(__NR_clock_getres, (long)clkid, (long)ts); } -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif diff --git a/arch/s390/include/asm/vdso/time_data.h b/arch/s390/include/asm/vdso/time_data.h new file mode 100644 index 000000000000..8a08752422e6 --- /dev/null +++ b/arch/s390/include/asm/vdso/time_data.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_VDSO_TIME_DATA_H +#define __S390_ASM_VDSO_TIME_DATA_H + +#include <linux/types.h> + +struct arch_vdso_time_data { + __s64 tod_steering_delta; + __u64 tod_steering_end; +}; + +#endif /* __S390_ASM_VDSO_TIME_DATA_H */ diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h index 6c67c08cefdd..d346ebe51301 100644 --- a/arch/s390/include/asm/vdso/vsyscall.h +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -5,18 +5,8 @@ #ifndef __ASSEMBLY__ #include <linux/hrtimer.h> -#include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <asm/vdso.h> -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ - -static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __s390_get_k_vdso_data /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/s390/include/asm/vga.h b/arch/s390/include/asm/vga.h deleted file mode 100644 index 605dc46bac5e..000000000000 --- a/arch/s390/include/asm/vga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_VGA_H -#define _ASM_S390_VGA_H - -/* Avoid compile errors due to missing asm/vga.h */ - -#endif /* _ASM_S390_VGA_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index fe17e448c0c5..9d25fb35a042 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,20 +2,22 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H -#define __ARCH_HAS_VTIME_TASK_SWITCH - static inline void update_timer_sys(void) { - S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; - S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer; - S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; + struct lowcore *lc = get_lowcore(); + + lc->system_timer += lc->last_update_timer - lc->exit_timer; + lc->user_timer += lc->exit_timer - lc->sys_enter_timer; + lc->last_update_timer = lc->sys_enter_timer; } static inline void update_timer_mcck(void) { - S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; - S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer; - S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer; + struct lowcore *lc = get_lowcore(); + + lc->system_timer += lc->last_update_timer - lc->exit_timer; + lc->user_timer += lc->exit_timer - lc->mcck_enter_timer; + lc->last_update_timer = lc->mcck_enter_timer; } #endif /* _S390_VTIME_H */ diff --git a/arch/s390/include/asm/word-at-a-time.h b/arch/s390/include/asm/word-at-a-time.h new file mode 100644 index 000000000000..eaa19dee7699 --- /dev/null +++ b/arch/s390/include/asm/word-at-a-time.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +#include <linux/bitops.h> +#include <linux/wordpart.h> +#include <asm/asm-extable.h> +#include <asm/bitsperlong.h> + +struct word_at_a_time { + const unsigned long bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x7f) } + +static inline unsigned long prep_zero_mask(unsigned long val, unsigned long data, const struct word_at_a_time *c) +{ + return data; +} + +static inline unsigned long create_zero_mask(unsigned long data) +{ + return __fls(data); +} + +static inline unsigned long find_zero(unsigned long data) +{ + return (data ^ (BITS_PER_LONG - 1)) >> 3; +} + +static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +{ + unsigned long mask = (val & c->bits) + c->bits; + + *data = ~(mask | val | c->bits); + return *data; +} + +static inline unsigned long zero_bytemask(unsigned long data) +{ + return ~1UL << data; +} + +/* + * Load an unaligned word from kernel space. + * + * In the (very unlikely) case of the word being a page-crosser + * and the next page not being mapped, take the exception and + * return zeroes in the non-existing part. + */ +static inline unsigned long load_unaligned_zeropad(const void *addr) +{ + unsigned long data; + + asm_inline volatile( + "0: lg %[data],0(%[addr])\n" + "1: nopr %%r7\n" + EX_TABLE_ZEROPAD(0b, 1b, %[data], %[addr]) + EX_TABLE_ZEROPAD(1b, 1b, %[data], %[addr]) + : [data] "=d" (data) + : [addr] "a" (addr), "m" (*(unsigned long *)addr)); + return data; +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/s390/include/uapi/asm/cmb.h b/arch/s390/include/uapi/asm/cmb.h index ecbe94941403..115434ab98fb 100644 --- a/arch/s390/include/uapi/asm/cmb.h +++ b/arch/s390/include/uapi/asm/cmb.h @@ -31,7 +31,7 @@ struct cmbdata { __u64 size; __u64 elapsed_time; - /* basic and exended format: */ + /* basic and extended format: */ __u64 ssch_rsch_count; __u64 sample_count; __u64 device_connect_time; diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 9ec86fae9980..7c364b33c84d 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -24,7 +24,7 @@ /* * struct dasd_information2_t * represents any data about the device, which is visible to userspace. - * including foramt and featueres. + * including format and featueres. */ typedef struct dasd_information2_t { unsigned int devno; /* S/390 devno */ @@ -78,6 +78,7 @@ typedef struct dasd_information2_t { * 0x040: give access to raw eckd data * 0x080: enable discard support * 0x100: enable autodisable for IFCC errors (default) + * 0x200: enable requeue of all requests on autoquiesce */ #define DASD_FEATURE_READONLY 0x001 #define DASD_FEATURE_USEDIAG 0x002 @@ -88,6 +89,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_USERAW 0x040 #define DASD_FEATURE_DISCARD 0x080 #define DASD_FEATURE_PATH_AUTODISABLE 0x100 +#define DASD_FEATURE_REQUEUEQUIESCE 0x200 #define DASD_FEATURE_DEFAULT DASD_FEATURE_PATH_AUTODISABLE #define DASD_PARTN_BITS 2 @@ -183,6 +185,18 @@ typedef struct format_data_t { } format_data_t; /* + * struct dasd_copypair_swap_data_t + * represents all data necessary to issue a swap of the copy pair relation + */ +struct dasd_copypair_swap_data_t { + char primary[20]; /* BUSID of primary */ + char secondary[20]; /* BUSID of secondary */ + + /* Reserved for future updates. */ + __u8 reserved[64]; +}; + +/* * values to be used for format_data_t.intensity * 0/8: normal format * 1/9: also write record zero @@ -280,7 +294,7 @@ struct dasd_snid_ioctl_data { /******************************************************************************** * SECTION: Definition of IOCTLs * - * Here ist how the ioctl-nr should be used: + * Here is how the ioctl-nr should be used: * 0 - 31 DASD driver itself * 32 - 239 still open * 240 - 255 reserved for EMC @@ -326,6 +340,8 @@ struct dasd_snid_ioctl_data { #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) /* Release Allocated Space */ #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) +/* Swap copy pair relation */ +#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) /* Get Sense Path Group ID (SNID) data */ #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) diff --git a/arch/s390/include/uapi/asm/diag.h b/arch/s390/include/uapi/asm/diag.h new file mode 100644 index 000000000000..b7e6ccb4ff6e --- /dev/null +++ b/arch/s390/include/uapi/asm/diag.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Diag ioctls and its associated structures definitions. + * + * Copyright IBM Corp. 2024 + */ + +#ifndef __S390_UAPI_ASM_DIAG_H +#define __S390_UAPI_ASM_DIAG_H + +#include <linux/types.h> + +#define DIAG_MAGIC_STR 'D' + +struct diag324_pib { + __u64 address; + __u64 sequence; +}; + +struct diag310_memtop { + __u64 address; + __u64 nesting_lvl; +}; + +/* Diag ioctl definitions */ +#define DIAG324_GET_PIBBUF _IOWR(DIAG_MAGIC_STR, 0x77, struct diag324_pib) +#define DIAG324_GET_PIBLEN _IOR(DIAG_MAGIC_STR, 0x78, size_t) +#define DIAG310_GET_STRIDE _IOR(DIAG_MAGIC_STR, 0x79, size_t) +#define DIAG310_GET_MEMTOPLEN _IOWR(DIAG_MAGIC_STR, 0x7a, size_t) +#define DIAG310_GET_MEMTOPBUF _IOWR(DIAG_MAGIC_STR, 0x7b, struct diag310_memtop) + +#endif /* __S390_UAPI_ASM_DIAG_H */ diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h new file mode 100644 index 000000000000..c4bc1108af6a --- /dev/null +++ b/arch/s390/include/uapi/asm/fs3270.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_FS3270_H +#define __ASM_S390_UAPI_FS3270_H + +#include <linux/types.h> +#include <asm/ioctl.h> + +/* ioctls for fullscreen 3270 */ +#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */ +#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */ +#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */ +#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. */ +#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */ + +/* For TUBGETMOD */ +struct raw3270_iocb { + __u16 model; + __u16 line_cnt; + __u16 col_cnt; + __u16 pf_cnt; + __u16 re_cnt; + __u16 map; +}; + +#endif /* __ASM_S390_UAPI_FS3270_H */ diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index d1ecd5d722a0..2cd28af50dd4 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -27,6 +27,7 @@ enum ipl_pbt { IPL_PBT_FCP = 0, IPL_PBT_SCP_DATA = 1, IPL_PBT_CCW = 2, + IPL_PBT_ECKD = 3, IPL_PBT_NVME = 4, }; @@ -111,6 +112,34 @@ struct ipl_pb0_ccw { __u8 reserved5[8]; } __packed; +/* IPL Parameter Block 0 for ECKD */ +struct ipl_pb0_eckd { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u32 reserved2[78]; + __u8 opt; + __u8 reserved4[4]; + __u8 reserved5:5; + __u8 ssid:3; + __u16 devno; + __u32 reserved6[5]; + __u32 bootprog; + __u8 reserved7[12]; + struct { + __u16 cyl; + __u8 head; + __u8 record; + __u32 reserved; + } br_chr __packed; + __u32 scp_data_len; + __u8 reserved8[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_ECKD_OPT_IPL 0x10 +#define IPL_PB0_ECKD_OPT_DUMP 0x20 + #define IPL_PB0_CCW_VM_FLAG_NSS 0x80 #define IPL_PB0_CCW_VM_FLAG_VP 0x40 diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..60345dd2cba2 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -12,7 +12,320 @@ #include <linux/types.h> #define __KVM_S390 -#define __KVM_HAVE_GUEST_DEBUG + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + __u8 pad1[6]; /* ignored */ + __u64 old_addr; /* ignored if cmpxchg flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 +#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 + +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) + +/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ +#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) +#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; + +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. Must be 0 for now */ + __u32 reserved[3]; +}; + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) /* Device control API: s390-specific devices */ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 @@ -74,6 +387,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -155,7 +469,24 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kdsa[16]; /* with MSA9 */ __u8 sortl[32]; /* with STFLE.150 */ __u8 dfltcc[32]; /* with STFLE.151 */ - __u8 reserved[1728]; + __u8 pfcr[16]; /* with STFLE.201 */ + __u8 reserved[1712]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6 +#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST 7 + +#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64 +struct kvm_s390_vm_cpu_uv_feat { + union { + struct { + __u64 : 4; + __u64 ap : 1; /* bit 4 */ + __u64 ap_intr : 1; /* bit 5 */ + __u64 : 58; + }; + __u64 feat; + }; }; /* kvm attributes for crypto */ diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index 924b876f992c..ca42e941675d 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -2,7 +2,7 @@ /* * Userspace interface to the pkey device driver * - * Copyright IBM Corp. 2017, 2019 + * Copyright IBM Corp. 2017, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -26,33 +26,44 @@ #define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */ #define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */ #define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */ -#define MAXEP11AESKEYBLOBSIZE 320 /* max EP11 AES key blob size */ +#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */ /* Minimum size of a key blob */ #define MINKEYBLOBSIZE SECKEYBLOBSIZE /* defines for the type field within the pkey_protkey struct */ -#define PKEY_KEYTYPE_AES_128 1 -#define PKEY_KEYTYPE_AES_192 2 -#define PKEY_KEYTYPE_AES_256 3 -#define PKEY_KEYTYPE_ECC 4 +#define PKEY_KEYTYPE_AES_128 1 +#define PKEY_KEYTYPE_AES_192 2 +#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_ECC 4 +#define PKEY_KEYTYPE_ECC_P256 5 +#define PKEY_KEYTYPE_ECC_P384 6 +#define PKEY_KEYTYPE_ECC_P521 7 +#define PKEY_KEYTYPE_ECC_ED25519 8 +#define PKEY_KEYTYPE_ECC_ED448 9 +#define PKEY_KEYTYPE_AES_XTS_128 10 +#define PKEY_KEYTYPE_AES_XTS_256 11 +#define PKEY_KEYTYPE_HMAC_512 12 +#define PKEY_KEYTYPE_HMAC_1024 13 /* the newer ioctls use a pkey_key_type enum for type information */ enum pkey_key_type { - PKEY_TYPE_CCA_DATA = (__u32) 1, - PKEY_TYPE_CCA_CIPHER = (__u32) 2, - PKEY_TYPE_EP11 = (__u32) 3, - PKEY_TYPE_CCA_ECC = (__u32) 0x1f, - PKEY_TYPE_EP11_AES = (__u32) 6, - PKEY_TYPE_EP11_ECC = (__u32) 7, + PKEY_TYPE_CCA_DATA = (__u32)1, + PKEY_TYPE_CCA_CIPHER = (__u32)2, + PKEY_TYPE_EP11 = (__u32)3, + PKEY_TYPE_CCA_ECC = (__u32)0x1f, + PKEY_TYPE_EP11_AES = (__u32)6, + PKEY_TYPE_EP11_ECC = (__u32)7, + PKEY_TYPE_PROTKEY = (__u32)8, + PKEY_TYPE_UVSECRET = (__u32)9, }; /* the newer ioctls use a pkey_key_size enum for key size information */ enum pkey_key_size { - PKEY_SIZE_AES_128 = (__u32) 128, - PKEY_SIZE_AES_192 = (__u32) 192, - PKEY_SIZE_AES_256 = (__u32) 256, - PKEY_SIZE_UNKNOWN = (__u32) 0xFFFFFFFF, + PKEY_SIZE_AES_128 = (__u32)128, + PKEY_SIZE_AES_192 = (__u32)192, + PKEY_SIZE_AES_256 = (__u32)256, + PKEY_SIZE_UNKNOWN = (__u32)0xFFFFFFFF, }; /* some of the newer ioctls use these flags */ @@ -115,6 +126,7 @@ struct pkey_genseck { __u32 keytype; /* in: key type to generate */ struct pkey_seckey seckey; /* out: the secure key blob */ }; + #define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck) /* @@ -127,6 +139,7 @@ struct pkey_clr2seck { struct pkey_clrkey clrkey; /* in: the clear key value */ struct pkey_seckey seckey; /* out: the secure key blob */ }; + #define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck) /* @@ -138,6 +151,7 @@ struct pkey_sec2protk { struct pkey_seckey seckey; /* in: the secure key blob */ struct pkey_protkey protkey; /* out: the protected key */ }; + #define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk) /* @@ -148,6 +162,7 @@ struct pkey_clr2protk { struct pkey_clrkey clrkey; /* in: the clear key value */ struct pkey_protkey protkey; /* out: the protected key */ }; + #define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk) /* @@ -159,6 +174,7 @@ struct pkey_findcard { __u16 cardnr; /* out: card number */ __u16 domain; /* out: domain number */ }; + #define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard) /* @@ -168,6 +184,7 @@ struct pkey_skey2pkey { struct pkey_seckey seckey; /* in: the secure key blob */ struct pkey_protkey protkey; /* out: the protected key */ }; + #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) /* @@ -185,6 +202,7 @@ struct pkey_verifykey { __u16 keysize; /* out: key size in bits */ __u32 attributes; /* out: attribute bits */ }; + #define PKEY_VERIFYKEY _IOWR(PKEY_IOCTL_MAGIC, 0x07, struct pkey_verifykey) #define PKEY_VERIFY_ATTR_AES 0x00000001 /* key is an AES key */ #define PKEY_VERIFY_ATTR_OLD_MKVP 0x00000100 /* key has old MKVP value */ @@ -216,6 +234,7 @@ struct pkey_kblob2pkey { __u32 keylen; /* in: the key blob length */ struct pkey_protkey protkey; /* out: the protected key */ }; + #define PKEY_KBLOB2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x0A, struct pkey_kblob2pkey) /* @@ -248,6 +267,7 @@ struct pkey_genseck2 { __u32 keylen; /* in: available key blob buffer size */ /* out: actual key blob size */ }; + #define PKEY_GENSECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x11, struct pkey_genseck2) /* @@ -282,6 +302,7 @@ struct pkey_clr2seck2 { __u32 keylen; /* in: available key blob buffer size */ /* out: actual key blob size */ }; + #define PKEY_CLR2SECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x12, struct pkey_clr2seck2) /* @@ -319,6 +340,7 @@ struct pkey_verifykey2 { enum pkey_key_size size; /* out: the key size */ __u32 flags; /* out: additional key info flags */ }; + #define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2) /* @@ -341,6 +363,7 @@ struct pkey_kblob2pkey2 { __u32 apqn_entries; /* in: # of apqn target list entries */ struct pkey_protkey protkey; /* out: the protected key */ }; + #define PKEY_KBLOB2PROTK2 _IOWR(PKEY_IOCTL_MAGIC, 0x1A, struct pkey_kblob2pkey2) /* @@ -348,7 +371,7 @@ struct pkey_kblob2pkey2 { * Is able to find out which type of secure key is given (CCA AES secure * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private * key) and tries to find all matching crypto cards based on the MKVP and maybe - * other criterias (like CCA AES cipher keys need a CEX5C or higher, EP11 keys + * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of * APQNs is further filtered by the key's mkvp which needs to match to either * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters @@ -365,7 +388,7 @@ struct pkey_kblob2pkey2 { * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -377,6 +400,7 @@ struct pkey_apqns4key { __u32 apqn_entries; /* in: max # of apqn entries in the list */ /* out: # apqns stored into the list */ }; + #define PKEY_APQNS4K _IOWR(PKEY_IOCTL_MAGIC, 0x1B, struct pkey_apqns4key) /* @@ -403,7 +427,7 @@ struct pkey_apqns4key { * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -416,6 +440,7 @@ struct pkey_apqns4keytype { __u32 apqn_entries; /* in: max # of apqn entries in the list */ /* out: # apqns stored into the list */ }; + #define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype) /* @@ -442,6 +467,7 @@ struct pkey_kblob2pkey3 { __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */ __u8 __user *pkey; /* in: pkey blob buffer space ptr */ }; + #define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3) #endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index ad64d673b5e6..bb0826024bb9 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -8,6 +8,8 @@ #ifndef _UAPI_S390_PTRACE_H #define _UAPI_S390_PTRACE_H +#include <linux/const.h> + /* * Offsets in the user_regs_struct. They are used for the ptrace * system call and in entry.S @@ -166,6 +168,64 @@ #endif /* __s390x__ */ +#ifndef __s390x__ + +#define PSW_MASK_PER _AC(0x40000000, UL) +#define PSW_MASK_DAT _AC(0x04000000, UL) +#define PSW_MASK_IO _AC(0x02000000, UL) +#define PSW_MASK_EXT _AC(0x01000000, UL) +#define PSW_MASK_KEY _AC(0x00F00000, UL) +#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ +#define PSW_MASK_MCHECK _AC(0x00040000, UL) +#define PSW_MASK_WAIT _AC(0x00020000, UL) +#define PSW_MASK_PSTATE _AC(0x00010000, UL) +#define PSW_MASK_ASC _AC(0x0000C000, UL) +#define PSW_MASK_CC _AC(0x00003000, UL) +#define PSW_MASK_PM _AC(0x00000F00, UL) +#define PSW_MASK_RI _AC(0x00000000, UL) +#define PSW_MASK_EA _AC(0x00000000, UL) +#define PSW_MASK_BA _AC(0x00000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF00, UL) + +#define PSW_ADDR_AMODE _AC(0x80000000, UL) +#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW_ASC_ACCREG _AC(0x00004000, UL) +#define PSW_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW_ASC_HOME _AC(0x0000C000, UL) + +#else /* __s390x__ */ + +#define PSW_MASK_PER _AC(0x4000000000000000, UL) +#define PSW_MASK_DAT _AC(0x0400000000000000, UL) +#define PSW_MASK_IO _AC(0x0200000000000000, UL) +#define PSW_MASK_EXT _AC(0x0100000000000000, UL) +#define PSW_MASK_BASE _AC(0x0000000000000000, UL) +#define PSW_MASK_KEY _AC(0x00F0000000000000, UL) +#define PSW_MASK_MCHECK _AC(0x0004000000000000, UL) +#define PSW_MASK_WAIT _AC(0x0002000000000000, UL) +#define PSW_MASK_PSTATE _AC(0x0001000000000000, UL) +#define PSW_MASK_ASC _AC(0x0000C00000000000, UL) +#define PSW_MASK_CC _AC(0x0000300000000000, UL) +#define PSW_MASK_PM _AC(0x00000F0000000000, UL) +#define PSW_MASK_RI _AC(0x0000008000000000, UL) +#define PSW_MASK_EA _AC(0x0000000100000000, UL) +#define PSW_MASK_BA _AC(0x0000000080000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF0180000000, UL) + +#define PSW_ADDR_AMODE _AC(0x0000000000000000, UL) +#define PSW_ADDR_INSN _AC(0xFFFFFFFFFFFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x0000000000000000, UL) +#define PSW_ASC_ACCREG _AC(0x0000400000000000, UL) +#define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) +#define PSW_ASC_HOME _AC(0x0000C00000000000, UL) + +#endif /* __s390x__ */ + #define NUM_GPRS 16 #define NUM_FPRS 16 #define NUM_CRS 16 @@ -214,69 +274,6 @@ typedef struct { unsigned long addr; } __attribute__ ((aligned(8))) psw_t; -#ifndef __s390x__ - -#define PSW_MASK_PER 0x40000000UL -#define PSW_MASK_DAT 0x04000000UL -#define PSW_MASK_IO 0x02000000UL -#define PSW_MASK_EXT 0x01000000UL -#define PSW_MASK_KEY 0x00F00000UL -#define PSW_MASK_BASE 0x00080000UL /* always one */ -#define PSW_MASK_MCHECK 0x00040000UL -#define PSW_MASK_WAIT 0x00020000UL -#define PSW_MASK_PSTATE 0x00010000UL -#define PSW_MASK_ASC 0x0000C000UL -#define PSW_MASK_CC 0x00003000UL -#define PSW_MASK_PM 0x00000F00UL -#define PSW_MASK_RI 0x00000000UL -#define PSW_MASK_EA 0x00000000UL -#define PSW_MASK_BA 0x00000000UL - -#define PSW_MASK_USER 0x0000FF00UL - -#define PSW_ADDR_AMODE 0x80000000UL -#define PSW_ADDR_INSN 0x7FFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 20) - -#define PSW_ASC_PRIMARY 0x00000000UL -#define PSW_ASC_ACCREG 0x00004000UL -#define PSW_ASC_SECONDARY 0x00008000UL -#define PSW_ASC_HOME 0x0000C000UL - -#else /* __s390x__ */ - -#define PSW_MASK_PER 0x4000000000000000UL -#define PSW_MASK_DAT 0x0400000000000000UL -#define PSW_MASK_IO 0x0200000000000000UL -#define PSW_MASK_EXT 0x0100000000000000UL -#define PSW_MASK_BASE 0x0000000000000000UL -#define PSW_MASK_KEY 0x00F0000000000000UL -#define PSW_MASK_MCHECK 0x0004000000000000UL -#define PSW_MASK_WAIT 0x0002000000000000UL -#define PSW_MASK_PSTATE 0x0001000000000000UL -#define PSW_MASK_ASC 0x0000C00000000000UL -#define PSW_MASK_CC 0x0000300000000000UL -#define PSW_MASK_PM 0x00000F0000000000UL -#define PSW_MASK_RI 0x0000008000000000UL -#define PSW_MASK_EA 0x0000000100000000UL -#define PSW_MASK_BA 0x0000000080000000UL - -#define PSW_MASK_USER 0x0000FF0180000000UL - -#define PSW_ADDR_AMODE 0x0000000000000000UL -#define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 52) - -#define PSW_ASC_PRIMARY 0x0000000000000000UL -#define PSW_ASC_ACCREG 0x0000400000000000UL -#define PSW_ASC_SECONDARY 0x0000800000000000UL -#define PSW_ASC_HOME 0x0000C00000000000UL - -#endif /* __s390x__ */ - - /* * The s390_regs structure is used to define the elf_gregset_t. */ diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h new file mode 100644 index 000000000000..6676f102bd50 --- /dev/null +++ b/arch/s390/include/uapi/asm/raw3270.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_RAW3270_H +#define __ASM_S390_UAPI_RAW3270_H + +/* Local Channel Commands */ +#define TC_WRITE 0x01 /* Write */ +#define TC_RDBUF 0x02 /* Read Buffer */ +#define TC_EWRITE 0x05 /* Erase write */ +#define TC_READMOD 0x06 /* Read modified */ +#define TC_EWRITEA 0x0d /* Erase write alternate */ +#define TC_WRITESF 0x11 /* Write structured field */ + +/* Buffer Control Orders */ +#define TO_GE 0x08 /* Graphics Escape */ +#define TO_SF 0x1d /* Start field */ +#define TO_SBA 0x11 /* Set buffer address */ +#define TO_IC 0x13 /* Insert cursor */ +#define TO_PT 0x05 /* Program tab */ +#define TO_RA 0x3c /* Repeat to address */ +#define TO_SFE 0x29 /* Start field extended */ +#define TO_EUA 0x12 /* Erase unprotected to address */ +#define TO_MF 0x2c /* Modify field */ +#define TO_SA 0x28 /* Set attribute */ + +/* Field Attribute Bytes */ +#define TF_INPUT 0x40 /* Visible input */ +#define TF_INPUTN 0x4c /* Invisible input */ +#define TF_INMDT 0xc1 /* Visible, Set-MDT */ +#define TF_LOG 0x60 + +/* Character Attribute Bytes */ +#define TAT_RESET 0x00 +#define TAT_FIELD 0xc0 +#define TAT_EXTHI 0x41 +#define TAT_FGCOLOR 0x42 +#define TAT_CHARS 0x43 +#define TAT_BGCOLOR 0x45 +#define TAT_TRANS 0x46 + +/* Extended-Highlighting Bytes */ +#define TAX_RESET 0x00 +#define TAX_BLINK 0xf1 +#define TAX_REVER 0xf2 +#define TAX_UNDER 0xf4 + +/* Reset value */ +#define TAR_RESET 0x00 + +/* Color values */ +#define TAC_RESET 0x00 +#define TAC_BLUE 0xf1 +#define TAC_RED 0xf2 +#define TAC_PINK 0xf3 +#define TAC_GREEN 0xf4 +#define TAC_TURQ 0xf5 +#define TAC_YELLOW 0xf6 +#define TAC_WHITE 0xf7 +#define TAC_DEFAULT 0x00 + +/* Write Control Characters */ +#define TW_NONE 0x40 /* No particular action */ +#define TW_KR 0xc2 /* Keyboard restore */ +#define TW_PLUSALARM 0x04 /* Add this bit for alarm */ + +#define RAW3270_FIRSTMINOR 1 /* First minor number */ +#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */ + +#define AID_CLEAR 0x6d +#define AID_ENTER 0x7d +#define AID_PF3 0xf3 +#define AID_PF7 0xf7 +#define AID_PF8 0xf8 +#define AID_READ_PARTITION 0x88 + +#endif /* __ASM_S390_UAPI_RAW3270_H */ diff --git a/arch/s390/include/uapi/asm/statfs.h b/arch/s390/include/uapi/asm/statfs.h index 72604f7792c3..f85b50723dd3 100644 --- a/arch/s390/include/uapi/asm/statfs.h +++ b/arch/s390/include/uapi/asm/statfs.h @@ -30,7 +30,7 @@ struct statfs { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; struct statfs64 { @@ -45,7 +45,7 @@ struct statfs64 { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; #endif diff --git a/arch/s390/include/uapi/asm/termios.h b/arch/s390/include/uapi/asm/termios.h deleted file mode 100644 index 54223169c806..000000000000 --- a/arch/s390/include/uapi/asm/termios.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ - -#ifndef _UAPI_S390_TERMIOS_H -#define _UAPI_S390_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - - -#endif /* _UAPI_S390_TERMIOS_H */ diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index da034c606314..84457dbb26b4 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -12,15 +12,18 @@ #ifndef __ASSEMBLY__ -/* A address type so that arithmetic can be done on it & it can be upgraded to - 64 bit when necessary -*/ -typedef unsigned long addr_t; +typedef unsigned long addr_t; typedef __signed__ long saddr_t; typedef struct { - __u32 u[4]; -} __vector128; + union { + struct { + __u64 high; + __u64 low; + }; + __u32 u[4]; + }; +} __attribute__((packed, aligned(4))) __vector128; #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h index 10a5ac918e02..4947f26ad9fb 100644 --- a/arch/s390/include/uapi/asm/uvdevice.h +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * Copyright IBM Corp. 2022 + * Copyright IBM Corp. 2022, 2024 * Author(s): Steffen Eiden <seiden@linux.ibm.com> */ #ifndef __S390_ASM_UVDEVICE_H @@ -32,6 +32,33 @@ struct uvio_attest { __u16 reserved136; /* 0x0136 */ }; +/** + * uvio_uvdev_info - Information of supported functions + * @supp_uvio_cmds - supported IOCTLs by this device + * @supp_uv_cmds - supported UVCs corresponding to the IOCTL + * + * UVIO request to get information about supported request types by this + * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0 + * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the + * uvdevice and the Ultravisor support that call. + * + * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds` + * as there is no corresponding UV-call. + */ +struct uvio_uvdev_info { + /* + * If bit `n` is set, this device supports the IOCTL with nr `n`. + */ + __u64 supp_uvio_cmds; + /* + * If bit `n` is set, the Ultravisor(UV) supports the UV-call + * corresponding to the IOCTL with nr `n` in the calling context (host + * or guest). The value is only valid if the corresponding bit in + * @supp_uvio_cmds is set as well. + */ + __u64 supp_uv_cmds; +}; + /* * The following max values define an upper length for the IOCTL in/out buffers. * However, they do not represent the maximum the Ultravisor allows which is @@ -42,10 +69,38 @@ struct uvio_attest { #define UVIO_ATT_ARCB_MAX_LEN 0x100000 #define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 #define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 +#define UVIO_ADD_SECRET_MAX_LEN 0x100000 +#define UVIO_LIST_SECRETS_LEN 0x1000 +#define UVIO_RETR_SECRET_MAX_LEN 0x2000 #define UVIO_DEVICE_NAME "uv" #define UVIO_TYPE_UVC 'u' -#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb) +enum UVIO_IOCTL_NR { + UVIO_IOCTL_UVDEV_INFO_NR = 0x00, + UVIO_IOCTL_ATT_NR, + UVIO_IOCTL_ADD_SECRET_NR, + UVIO_IOCTL_LIST_SECRETS_NR, + UVIO_IOCTL_LOCK_SECRETS_NR, + UVIO_IOCTL_RETR_SECRET_NR, + /* must be the last entry */ + UVIO_IOCTL_NUM_IOCTLS +}; + +#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) +#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) +#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) +#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR) +#define UVIO_IOCTL_RETR_SECRET UVIO_IOCTL(UVIO_IOCTL_RETR_SECRET_NR) + +#define UVIO_SUPP_CALL(nr) (1ULL << (nr)) +#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) +#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) +#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR) +#define UVIO_SUPP_RETR_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_RETR_SECRET_NR) #endif /* __S390_ASM_UVDEVICE_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index d83713f67530..f4785abe1b9f 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -85,7 +85,8 @@ struct ica_rsa_modexpo_crt { struct CPRBX { __u16 cprb_len; /* CPRB length 220 */ __u8 cprb_ver_id; /* CPRB version id. 0x02 */ - __u8 _pad_000[3]; /* Alignment pad bytes */ + __u8 ctfm; /* Command Type Filtering Mask */ + __u8 pad_000[2]; /* Alignment pad bytes */ __u8 func_id[2]; /* function id 0x5432 */ __u8 cprb_flags[4]; /* Flags */ __u32 req_parml; /* request parameter buffer len */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 27d6b3c7aa06..ea5ed6654050 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -10,6 +10,9 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) # Do not trace early setup code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE) endif @@ -33,22 +36,24 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o +obj-y := head64.o traps.o time.o process.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o -obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o +obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o +obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o stacktrace.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o +obj-y += diag/ -extra-y += head64.o vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) +obj-$(CONFIG_SYSFS) += cpacf.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o +obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o @@ -56,26 +61,27 @@ obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_KPROBES) += kprobes_insn_page.o obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o - +obj-$(CONFIG_CERT_STORE) += cert_store.o obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o -obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..6252b7d115dd --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/pgtable.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> + +unsigned long __bootdata_preserved(__abs_lowcore); + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. + */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index e7bca29f9c34..90c0e6408992 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,75 +1,90 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <asm/text-patching.h> + +#ifndef pr_fmt +#define pr_fmt(fmt) "alt: " fmt +#endif + +#include <linux/uaccess.h> +#include <linux/printk.h> +#include <asm/nospec-branch.h> +#include <asm/abs_lowcore.h> #include <asm/alternative.h> #include <asm/facility.h> -#include <asm/nospec-branch.h> +#include <asm/sections.h> +#include <asm/machine.h> + +#ifndef a_debug +#define a_debug pr_debug +#endif + +#ifndef __kernel_va +#define __kernel_va(x) (void *)(x) +#endif + +unsigned long __bootdata_preserved(machine_features[1]); + +struct alt_debug { + unsigned long facilities[MAX_FACILITY_BIT / BITS_PER_LONG]; + unsigned long mfeatures[MAX_MFEATURE_BIT / BITS_PER_LONG]; + int spec; +}; -static int __initdata_or_module alt_instr_disabled; +static struct alt_debug __bootdata_preserved(alt_debug); -static int __init disable_alternative_instructions(char *str) +static void alternative_dump(u8 *old, u8 *new, unsigned int len, unsigned int type, unsigned int data) { - alt_instr_disabled = 1; - return 0; -} + char oinsn[33], ninsn[33]; + unsigned long kptr; + unsigned int pos; -early_param("noaltinstr", disable_alternative_instructions); + for (pos = 0; pos < len && 2 * pos < sizeof(oinsn) - 3; pos++) + hex_byte_pack(&oinsn[2 * pos], old[pos]); + oinsn[2 * pos] = 0; + for (pos = 0; pos < len && 2 * pos < sizeof(ninsn) - 3; pos++) + hex_byte_pack(&ninsn[2 * pos], new[pos]); + ninsn[2 * pos] = 0; + kptr = (unsigned long)__kernel_va(old); + a_debug("[%d/%3d] %016lx: %s -> %s\n", type, data, kptr, oinsn, ninsn); +} -static void __init_or_module __apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx) { + struct alt_debug *d; struct alt_instr *a; - u8 *instr, *replacement; + bool debug, replace; + u8 *old, *new; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. */ + d = &alt_debug; for (a = start; a < end; a++) { - instr = (u8 *)&a->instr_offset + a->instr_offset; - replacement = (u8 *)&a->repl_offset + a->repl_offset; - - if (!__test_facility(a->facility, alt_stfle_fac_list)) - continue; - - if (unlikely(a->instrlen % 2)) { - WARN_ONCE(1, "cpu alternatives instructions length is " - "odd, skipping patching\n"); + if (!(a->ctx & ctx)) continue; + switch (a->type) { + case ALT_TYPE_FACILITY: + replace = test_facility(a->data); + debug = __test_facility(a->data, d->facilities); + break; + case ALT_TYPE_FEATURE: + replace = test_machine_feature(a->data); + debug = __test_machine_feature(a->data, d->mfeatures); + break; + case ALT_TYPE_SPEC: + replace = nobp_enabled(); + debug = d->spec; + break; + default: + replace = false; + debug = false; } - - s390_kernel_write(instr, replacement, a->instrlen); + if (!replace) + continue; + old = (u8 *)&a->instr_offset + a->instr_offset; + new = (u8 *)&a->repl_offset + a->repl_offset; + if (debug) + alternative_dump(old, new, a->instrlen, a->type, a->data); + s390_kernel_write(old, new, a->instrlen); } } - -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) -{ - if (!alt_instr_disabled) - __apply_alternatives(start, end); -} - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -void __init apply_alternative_instructions(void) -{ - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static void do_sync_core(void *info) -{ - sync_core(); -} - -void text_poke_sync(void) -{ - on_each_cpu(do_sync_core, NULL, 1); -} - -void text_poke_sync_lock(void) -{ - cpus_read_lock(); - text_poke_sync(); - cpus_read_unlock(); -} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index d8ce965c0a97..95ecad9c7d7d 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -5,16 +5,14 @@ * and format the required data. */ -#define ASM_OFFSETS_C - #include <linux/kbuild.h> -#include <linux/kvm_host.h> #include <linux/sched.h> #include <linux/purgatory.h> #include <linux/pgtable.h> -#include <asm/idle.h> -#include <asm/gmap.h> +#include <linux/ftrace_regs.h> +#include <asm/kvm_host_types.h> #include <asm/stacktrace.h> +#include <asm/ptrace.h> int main(void) { @@ -28,6 +26,7 @@ int main(void) BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); + OFFSET(__TI_sie, task_struct, thread_info.sie); BLANK(); /* pt_regs offsets */ OFFSET(__PT_PSW, pt_regs, psw); @@ -49,8 +48,8 @@ int main(void) OFFSET(__PT_R14, pt_regs, gprs[14]); OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); + OFFSET(__PT_INT_CODE, pt_regs, int_code); OFFSET(__PT_FLAGS, pt_regs, flags); - OFFSET(__PT_CR1, pt_regs, cr1); OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); @@ -62,19 +61,22 @@ int main(void) OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); - /* idle data offsets */ - OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); - OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); - OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); + OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain); + DEFINE(STACK_FRAME_USER_OVERHEAD, sizeof(struct stack_frame_user)); + OFFSET(__SFVDSO_RETURN_ADDRESS, stack_frame_vdso_wrapper, return_address); + DEFINE(STACK_FRAME_VDSO_OVERHEAD, sizeof(struct stack_frame_vdso_wrapper)); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); - OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_int_code); OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); OFFSET(__LC_MON_CLASS_NR, lowcore, mon_class_num); OFFSET(__LC_PER_CODE, lowcore, per_code); @@ -109,10 +111,9 @@ int main(void) OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw); OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw); /* software defined lowcore locations 0x200 - 0xdff*/ - OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); - OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); + OFFSET(__LC_SAVE_AREA, lowcore, save_area); OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); - OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); + OFFSET(__LC_PCPU, lowcore, pcpu); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); @@ -121,8 +122,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); - OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); @@ -137,7 +136,6 @@ int main(void) OFFSET(__LC_USER_ASCE, lowcore, user_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); - OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); @@ -160,7 +158,6 @@ int main(void) OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); /* gmap/sie offsets */ - OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20); /* kexec_sha_region */ @@ -177,5 +174,9 @@ int main(void) DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); + OFFSET(__FTRACE_REGS_PT_REGS, __arch_ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs)); + + OFFSET(__PCPU_FLAGS, pcpu, flags); return 0; } diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index 7ee3651d00ab..4f2669030220 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -46,7 +46,7 @@ struct cache_info { #define CACHE_MAX_LEVEL 8 union cache_topology { struct cache_info ci[CACHE_MAX_LEVEL]; - unsigned long long raw; + unsigned long raw; }; static const char * const cache_type_string[] = { @@ -166,5 +166,6 @@ int populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, pvt, ctype, level, cpu); } } + this_cpu_ci->cpu_map_populated = true; return 0; } diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 000000000000..c217a5e64094 --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 2023 + * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/key-type.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#include <crypto/sha2.h> +#include <keys/user-type.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. */ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. */ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. */ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. */ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. */ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vce_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm_inline volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in + * VCE. Return -EINVAL if hashes don't match. + */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. */ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). + */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create key description in format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to key description or NULL if memory + * allocation failed. Memory should be freed by caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%05u:%010u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 5 + 1 + 10 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + snprintf(desc + name_len, len - name_len, ":%05u:%010u", + vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from VCE for key + * payload and key description. Link the key to "cert_store" keyring. + */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode2. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR. */ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. */ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for single-entry VCB, get VCB via DIAG320 subcode 2 call, + * extract VCE and create a key from its' certificate. + */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entries indices start with 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow to link more keys to certificate store keyring after all + * the VCEs were processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed. + * Return -ENOENT if subcodes 1 or 2 are not available. + */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + rc = -ENOMEM; + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. */ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index eee1ad3e1b29..5a86b9d1da71 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -24,11 +24,12 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> +#include <asm/vdso-symbols.h> +#include <asm/access-regs.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/lowcore.h> -#include <asm/switch_to.h> -#include <asm/vdso.h> +#include <asm/fpu.h> #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -55,7 +56,7 @@ typedef struct static void store_sigregs(void) { save_access_regs(current->thread.acrs); - save_fpu_regs(); + save_user_fpu_regs(); } /* Load registers after signal return */ @@ -78,7 +79,7 @@ static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.ufpu); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) return -EFAULT; return 0; @@ -98,10 +99,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) return -EINVAL; - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | @@ -116,7 +113,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + fpregs_load((_s390_fp_regs *)&user_sregs.fpregs, ¤t->thread.ufpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; @@ -137,13 +134,13 @@ static int save_sigregs_ext32(struct pt_regs *regs, return -EFAULT; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.ufpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, - current->thread.fpu.vxrs + __NUM_VXRS_LOW, + current->thread.ufpu.vxrs + __NUM_VXRS_LOW, sizeof(sregs_ext->vxrs_high))) return -EFAULT; } @@ -165,15 +162,15 @@ static int restore_sigregs_ext32(struct pt_regs *regs, *(__u32 *)®s->gprs[i] = gprs_high[i]; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW, &sregs_ext->vxrs_high, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.ufpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -187,7 +184,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) goto badframe; set_current_blocked(&set); - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs32(regs, &frame->sregs)) goto badframe; if (restore_sigregs_ext32(regs, &frame->sregs_ext)) @@ -210,7 +207,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) @@ -265,7 +262,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) frame_size -= sizeof(frame->sregs_ext.vxrs_low) + sizeof(frame->sregs_ext.vxrs_high); frame = get_sigframe(&ksig->ka, regs, frame_size); @@ -348,11 +345,12 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ uc_flags = UC_GPRS_HIGH; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { uc_flags |= UC_VXRS; - } else + } else { frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + } frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c new file mode 100644 index 000000000000..4b9b34f95d72 --- /dev/null +++ b/arch/s390/kernel/cpacf.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "cpacf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/sysfs.h> +#include <asm/cpacf.h> + +#define CPACF_QUERY(name, instruction) \ +static ssize_t name##_query_raw_read(struct file *fp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_mask_t mask; \ + \ + if (!cpacf_query(CPACF_##instruction, &mask)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &mask, sizeof(mask)); \ +} \ +static const BIN_ATTR_RO(name##_query_raw, sizeof(cpacf_mask_t)) + +CPACF_QUERY(km, KM); +CPACF_QUERY(kmc, KMC); +CPACF_QUERY(kimd, KIMD); +CPACF_QUERY(klmd, KLMD); +CPACF_QUERY(kmac, KMAC); +CPACF_QUERY(pckmo, PCKMO); +CPACF_QUERY(kmf, KMF); +CPACF_QUERY(kmctr, KMCTR); +CPACF_QUERY(kmo, KMO); +CPACF_QUERY(pcc, PCC); +CPACF_QUERY(prno, PRNO); +CPACF_QUERY(kma, KMA); +CPACF_QUERY(kdsa, KDSA); + +#define CPACF_QAI(name, instruction) \ +static ssize_t name##_query_auth_info_raw_read( \ + struct file *fp, struct kobject *kobj, \ + const struct bin_attribute *attr, char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_qai_t qai; \ + \ + if (!cpacf_qai(CPACF_##instruction, &qai)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &qai, \ + sizeof(qai)); \ +} \ +static const BIN_ATTR_RO(name##_query_auth_info_raw, sizeof(cpacf_qai_t)) + +CPACF_QAI(km, KM); +CPACF_QAI(kmc, KMC); +CPACF_QAI(kimd, KIMD); +CPACF_QAI(klmd, KLMD); +CPACF_QAI(kmac, KMAC); +CPACF_QAI(pckmo, PCKMO); +CPACF_QAI(kmf, KMF); +CPACF_QAI(kmctr, KMCTR); +CPACF_QAI(kmo, KMO); +CPACF_QAI(pcc, PCC); +CPACF_QAI(prno, PRNO); +CPACF_QAI(kma, KMA); +CPACF_QAI(kdsa, KDSA); + +static const struct bin_attribute *const cpacf_attrs[] = { + &bin_attr_km_query_raw, + &bin_attr_kmc_query_raw, + &bin_attr_kimd_query_raw, + &bin_attr_klmd_query_raw, + &bin_attr_kmac_query_raw, + &bin_attr_pckmo_query_raw, + &bin_attr_kmf_query_raw, + &bin_attr_kmctr_query_raw, + &bin_attr_kmo_query_raw, + &bin_attr_pcc_query_raw, + &bin_attr_prno_query_raw, + &bin_attr_kma_query_raw, + &bin_attr_kdsa_query_raw, + &bin_attr_km_query_auth_info_raw, + &bin_attr_kmc_query_auth_info_raw, + &bin_attr_kimd_query_auth_info_raw, + &bin_attr_klmd_query_auth_info_raw, + &bin_attr_kmac_query_auth_info_raw, + &bin_attr_pckmo_query_auth_info_raw, + &bin_attr_kmf_query_auth_info_raw, + &bin_attr_kmctr_query_auth_info_raw, + &bin_attr_kmo_query_auth_info_raw, + &bin_attr_pcc_query_auth_info_raw, + &bin_attr_prno_query_auth_info_raw, + &bin_attr_kma_query_auth_info_raw, + &bin_attr_kdsa_query_auth_info_raw, + NULL, +}; + +static const struct attribute_group cpacf_attr_grp = { + .name = "cpacf", + .bin_attrs_new = cpacf_attrs, +}; + +static int __init cpacf_init(void) +{ + struct device *cpu_root; + int rc = 0; + + cpu_root = bus_get_dev_root(&cpu_subsys); + if (cpu_root) { + rc = sysfs_create_group(&cpu_root->kobj, &cpacf_attr_grp); + put_device(cpu_root); + } + return rc; +} +device_initcall(cpacf_init); diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 72e106cfd8c7..2f4174b961de 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -16,10 +16,11 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/io.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> -#include <asm/io.h> +#include <asm/asm.h> static DEFINE_SPINLOCK(cpcmd_lock); static char cpcmd_buf[241]; @@ -45,12 +46,11 @@ static int diag8_response(int cmdlen, char *response, int *rlen) ry.odd = *rlen; asm volatile( " diag %[rx],%[ry],0x8\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=&d" (cc), [ry] "+&d" (ry.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [ry] "+d" (ry.pair) : [rx] "d" (rx.pair) - : "cc"); - if (cc) + : CC_CLOBBER); + if (CC_TRANSFORM(cc)) *rlen += ry.odd; else *rlen = ry.odd; diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 000000000000..76210f001028 --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2022 + */ + +#include <linux/cpufeature.h> +#include <linux/bug.h> +#include <asm/machine.h> +#include <asm/elf.h> + +enum { + TYPE_HWCAP, + TYPE_FACILITY, + TYPE_MACHINE, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, + [S390_CPU_FEATURE_D288] = {.type = TYPE_MACHINE, .num = MFEATURE_DIAG288}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + case TYPE_MACHINE: + return test_machine_feature(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 28124d0fa1d5..adb164223f8c 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -21,6 +21,8 @@ #include <asm/elf.h> #include <asm/ipl.h> #include <asm/sclp.h> +#include <asm/maccess.h> +#include <asm/fpu.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -45,7 +47,7 @@ struct save_area { u64 fprs[16]; u32 fpc; u32 prefix; - u64 todpreg; + u32 todpreg; u64 timer; u64 todcmp; u64 vxrs_low[16]; @@ -61,9 +63,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = memblock_alloc(sizeof(*sa), 8); - if (!sa) - panic("Failed to allocate save area\n"); + sa = memblock_alloc_or_panic(sizeof(*sa), 8); if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); @@ -109,43 +109,20 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) /* Copy lower halves of vector registers 0-15 */ for (i = 0; i < 16; i++) - memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + sa->vxrs_low[i] = vxrs[i].low; /* Copy vector registers 16-31 */ memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -/* - * Return physical address for virtual address - */ -static inline void *load_real_addr(void *addr) -{ - unsigned long real_addr; - - asm volatile( - " lra %0,0(%1)\n" - " jz 0f\n" - " la %0,0\n" - "0:" - : "=a" (real_addr) : "a" (addr) : "cc"); - return (void *)real_addr; -} - -/* - * Copy memory of the old, dumped system to a kernel space virtual address - */ -int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long len; - void *ra; - int rc; + size_t len, copied, res = 0; while (count) { if (!oldmem_data.start && src < sclp.hsa_size) { /* Copy from zfcp/nvme dump HSA area */ len = min(count, sclp.hsa_size - src); - rc = memcpy_hsa_kernel(dst, src, len); - if (rc) - return rc; + copied = memcpy_hsa_iter(iter, src, len); } else { /* Check for swapped kdump oldmem areas */ if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { @@ -157,56 +134,27 @@ int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) } else { len = count; } - if (is_vmalloc_or_module_addr(dst)) { - ra = load_real_addr(dst); - len = min(PAGE_SIZE - offset_in_page(ra), len); - } else { - ra = dst; - } - if (memcpy_real(ra, src, len)) - return -EFAULT; + copied = memcpy_real_iter(iter, src, len); } - dst += len; - src += len; - count -= len; + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - return 0; + return res; } -/* - * Copy memory of the old, dumped system to a user space virtual address - */ -static int copy_oldmem_user(void __user *dst, unsigned long src, size_t count) +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) { - unsigned long len; - int rc; + struct iov_iter iter; + struct kvec kvec; - while (count) { - if (!oldmem_data.start && src < sclp.hsa_size) { - /* Copy from zfcp/nvme dump HSA area */ - len = min(count, sclp.hsa_size - src); - rc = memcpy_hsa_user(dst, src, len); - if (rc) - return rc; - } else { - /* Check for swapped kdump oldmem areas */ - if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { - src -= oldmem_data.start; - len = min(count, oldmem_data.size - src); - } else if (oldmem_data.start && src < oldmem_data.size) { - len = min(count, oldmem_data.size - src); - src += oldmem_data.start; - } else { - len = count; - } - rc = copy_to_user_real(dst, src, count); - if (rc) - return rc; - } - dst += len; - src += len; - count -= len; - } + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; return 0; } @@ -217,26 +165,9 @@ ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, unsigned long offset) { unsigned long src; - int rc; - if (!(iter_is_iovec(iter) || iov_iter_is_kvec(iter))) - return -EINVAL; - /* Multi-segment iterators are not supported */ - if (iter->nr_segs > 1) - return -EINVAL; - if (!csize) - return 0; src = pfn_to_phys(pfn) + offset; - - /* XXX: pass the iov_iter down to a common function */ - if (iter_is_iovec(iter)) - rc = copy_oldmem_user(iter->iov->iov_base, src, csize); - else - rc = copy_oldmem_kernel(iter->kvec->iov_base, src, csize); - if (rc < 0) - return rc; - iov_iter_advance(iter, csize); - return csize; + return copy_oldmem_iter(iter, src, csize); } /* @@ -304,14 +235,16 @@ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, prot); } -static const char *nt_name(Elf64_Word type) +/* + * Return true only when in a kdump or stand-alone kdump environment. + * Note that /proc/vmcore might also be available in "standard zfcp/nvme dump" + * environments, where this function returns false; see dump_available(). + */ +bool is_kdump_kernel(void) { - const char *name = "LINUX"; - - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) - name = KEXEC_CORE_NOTE_NAME; - return name; + return oldmem_data.start; } +EXPORT_SYMBOL_GPL(is_kdump_kernel); /* * Initialize ELF note @@ -337,10 +270,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, return PTR_ADD(buf, len); } -static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) -{ - return nt_init_name(buf, type, desc, d_len, nt_name(type)); -} +#define nt_init(buf, type, desc) \ + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type) /* * Calculate the size of ELF note @@ -356,10 +287,7 @@ static size_t nt_size_name(int d_len, const char *name) return size; } -static inline size_t nt_size(Elf64_Word type, int d_len) -{ - return nt_size_name(d_len, nt_name(type)); -} +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type) /* * Fill ELF notes for one CPU with save area registers @@ -380,18 +308,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); /* Create ELF notes for the CPU */ - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { - ptr = nt_init(ptr, NT_S390_VXRS_HIGH, - &sa->vxrs_high, sizeof(sa->vxrs_high)); - ptr = nt_init(ptr, NT_S390_VXRS_LOW, - &sa->vxrs_low, sizeof(sa->vxrs_low)); + ptr = nt_init(ptr, PRSTATUS, nt_prstatus); + ptr = nt_init(ptr, PRFPREG, nt_fpregset); + ptr = nt_init(ptr, S390_TIMER, sa->timer); + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp); + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg); + ptr = nt_init(ptr, S390_CTRS, sa->ctrs); + ptr = nt_init(ptr, S390_PREFIX, sa->prefix); + if (cpu_has_vx()) { + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high); + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low); } return ptr; } @@ -404,16 +330,16 @@ static size_t get_cpu_elf_notes_size(void) struct save_area *sa = NULL; size_t size; - size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); - size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); - size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); - size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); - size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); - size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); - size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + size = nt_size(PRSTATUS, struct elf_prstatus); + size += nt_size(PRFPREG, elf_fpregset_t); + size += nt_size(S390_TIMER, sa->timer); + size += nt_size(S390_TODCMP, sa->todcmp); + size += nt_size(S390_TODPREG, sa->todpreg); + size += nt_size(S390_CTRS, sa->ctrs); + size += nt_size(S390_PREFIX, sa->prefix); + if (cpu_has_vx()) { + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high); + size += nt_size(S390_VXRS_LOW, sa->vxrs_low); } return size; @@ -428,8 +354,8 @@ static void *nt_prpsinfo(void *ptr) memset(&prpsinfo, 0, sizeof(prpsinfo)); prpsinfo.pr_sname = 'R'; - strcpy(prpsinfo.pr_fname, "vmlinux"); - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); + strscpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, PRPSINFO, prpsinfo); } /* @@ -518,7 +444,7 @@ static void *nt_final(void *ptr) /* * Initialize ELF header (new kernel) */ -static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +static void *ehdr_init(Elf64_Ehdr *ehdr, int phdr_count) { memset(ehdr, 0, sizeof(*ehdr)); memcpy(ehdr->e_ident, ELFMAG, SELFMAG); @@ -532,7 +458,8 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) ehdr->e_phoff = sizeof(Elf64_Ehdr); ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - ehdr->e_phnum = mem_chunk_cnt + 1; + /* Number of PT_LOAD program headers plus PT_NOTE program header */ + ehdr->e_phnum = phdr_count + 1; return ehdr + 1; } @@ -563,27 +490,77 @@ static int get_mem_chunk_cnt(void) return cnt; } +static void fill_ptload(Elf64_Phdr *phdr, unsigned long paddr, + unsigned long vaddr, unsigned long size) +{ + phdr->p_type = PT_LOAD; + phdr->p_vaddr = vaddr; + phdr->p_offset = paddr; + phdr->p_paddr = paddr; + phdr->p_filesz = size; + phdr->p_memsz = size; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm) { + unsigned long old_identity_base = 0; phys_addr_t start, end; u64 idx; + if (os_info_has_vm) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); for_each_physmem_range(idx, &oldmem_type, &start, &end) { - phdr->p_filesz = end - start; - phdr->p_type = PT_LOAD; - phdr->p_offset = start; - phdr->p_vaddr = start; - phdr->p_paddr = start; - phdr->p_memsz = end - start; - phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_align = PAGE_SIZE; + fill_ptload(phdr, start, old_identity_base + start, + end - start); phdr++; } } +static bool os_info_has_vm(void) +{ + return os_info_old_value(OS_INFO_KASLR_OFFSET); +} + +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +/* + * Fill PT_LOAD for a physical memory range owned by a device and detected by + * its device driver. + */ +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size) +{ + unsigned long old_identity_base = 0; + + if (os_info_has_vm()) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); + fill_ptload(phdr, paddr, old_identity_base + paddr, size); +} +#endif + +/* + * Prepare PT_LOAD type program header for kernel image region + */ +static void text_init(Elf64_Phdr *phdr) +{ + unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS); + unsigned long start = os_info_old_value(OS_INFO_IMAGE_START); + unsigned long end = os_info_old_value(OS_INFO_IMAGE_END); + + phdr->p_type = PT_LOAD; + phdr->p_vaddr = start; + phdr->p_filesz = end - start; + phdr->p_memsz = end - start; + phdr->p_offset = start_phys; + phdr->p_paddr = start_phys; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize notes (new kernel) */ @@ -609,7 +586,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) return ptr; } -static size_t get_elfcorehdr_size(int mem_chunk_cnt) +static size_t get_elfcorehdr_size(int phdr_count) { size_t size; @@ -617,7 +594,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) /* PT_NOTES */ size += sizeof(Elf64_Phdr); /* nt_prpsinfo */ - size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + size += nt_size(PRPSINFO, struct elf_prpsinfo); /* regsets */ size += get_cpu_cnt() * get_cpu_elf_notes_size(); /* nt_vmcoreinfo */ @@ -625,7 +602,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) /* nt_final */ size += sizeof(Elf64_Nhdr); /* PT_LOADS */ - size += mem_chunk_cnt * sizeof(Elf64_Phdr); + size += phdr_count * sizeof(Elf64_Phdr); return size; } @@ -635,10 +612,10 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) */ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { - Elf64_Phdr *phdr_notes, *phdr_loads; - int mem_chunk_cnt; + Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text; + int mem_chunk_cnt, phdr_text_cnt; + size_t alloc_size; void *ptr, *hdr; - u32 alloc_size; u64 hdr_off; /* If we are not in kdump or zfcp/nvme dump mode return */ @@ -656,12 +633,14 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) } mem_chunk_cnt = get_mem_chunk_cnt(); + phdr_text_cnt = os_info_has_vm() ? 1 : 0; - alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + alloc_size = get_elfcorehdr_size(mem_chunk_cnt + phdr_text_cnt); hdr = kzalloc(alloc_size, GFP_KERNEL); - /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + /* + * Without elfcorehdr /proc/vmcore cannot be created. Thus creating * a dump with this crash kernel will fail. Panic now to allow other * dump mechanisms to take over. */ @@ -669,18 +648,25 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) panic("s390 kdump allocating elfcorehdr failed"); /* Init elf header */ - ptr = ehdr_init(hdr, mem_chunk_cnt); + phdr_notes = ehdr_init(hdr, mem_chunk_cnt + phdr_text_cnt); /* Init program headers */ - phdr_notes = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_loads = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + if (phdr_text_cnt) { + phdr_text = phdr_notes + 1; + phdr_loads = phdr_text + 1; + } else { + phdr_loads = phdr_notes + 1; + } + ptr = PTR_ADD(phdr_loads, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init kernel text program header */ + if (phdr_text_cnt) + text_init(phdr_text); /* Init loads */ + loads_init(phdr_loads, phdr_text_cnt); + /* Finalize program headers */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, hdr_off); *addr = (unsigned long long) hdr; *size = (unsigned long long) hdr_off; BUG_ON(elfcorehdr_size > alloc_size); diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c new file mode 100644 index 000000000000..8cc26cf2c64a --- /dev/null +++ b/arch/s390/kernel/ctlreg.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/irqflags.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cache.h> +#include <asm/abs_lowcore.h> +#include <asm/ctlreg.h> + +/* + * ctl_lock guards access to global control register contents which + * are kept in the control register save area within absolute lowcore + * at physical address zero. + */ +static DEFINE_SPINLOCK(system_ctl_lock); + +void system_ctlreg_lock(void) + __acquires(&system_ctl_lock) +{ + spin_lock(&system_ctl_lock); +} + +void system_ctlreg_unlock(void) + __releases(&system_ctl_lock) +{ + spin_unlock(&system_ctl_lock); +} + +static bool system_ctlreg_area_init __ro_after_init; + +void __init system_ctlreg_init_save_area(struct lowcore *lc) +{ + struct lowcore *abs_lc; + + abs_lc = get_abs_lowcore(); + __local_ctl_store(0, 15, lc->cregs_save_area); + __local_ctl_store(0, 15, abs_lc->cregs_save_area); + put_abs_lowcore(abs_lc); + system_ctlreg_area_init = true; +} + +struct ctlreg_parms { + unsigned long andval; + unsigned long orval; + unsigned long val; + int request; + int cr; +}; + +static void ctlreg_callback(void *info) +{ + struct ctlreg_parms *pp = info; + struct ctlreg regs[16]; + + __local_ctl_store(0, 15, regs); + if (pp->request == CTLREG_LOAD) { + regs[pp->cr].val = pp->val; + } else { + regs[pp->cr].val &= pp->andval; + regs[pp->cr].val |= pp->orval; + } + __local_ctl_load(0, 15, regs); +} + +static void system_ctlreg_update(void *info) +{ + unsigned long flags; + + if (system_state == SYSTEM_BOOTING) { + /* + * For very early calls do not call on_each_cpu() + * since not everything might be setup. + */ + local_irq_save(flags); + ctlreg_callback(info); + local_irq_restore(flags); + } else { + on_each_cpu(ctlreg_callback, info, 1); + } +} + +void system_ctlreg_modify(unsigned int cr, unsigned long data, int request) +{ + struct ctlreg_parms pp = { .cr = cr, .request = request, }; + struct lowcore *abs_lc; + + switch (request) { + case CTLREG_SET_BIT: + pp.orval = 1UL << data; + pp.andval = -1UL; + break; + case CTLREG_CLEAR_BIT: + pp.orval = 0; + pp.andval = ~(1UL << data); + break; + case CTLREG_LOAD: + pp.val = data; + break; + } + if (system_ctlreg_area_init) { + system_ctlreg_lock(); + abs_lc = get_abs_lowcore(); + if (request == CTLREG_LOAD) { + abs_lc->cregs_save_area[cr].val = pp.val; + } else { + abs_lc->cregs_save_area[cr].val &= pp.andval; + abs_lc->cregs_save_area[cr].val |= pp.orval; + } + put_abs_lowcore(abs_lc); + system_ctlreg_update(&pp); + system_ctlreg_unlock(); + } else { + system_ctlreg_update(&pp); + } +} +EXPORT_SYMBOL(system_ctlreg_modify); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 4331c7e6e1c0..2a41be2f7925 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/math.h> #include <linux/minmax.h> #include <linux/debugfs.h> @@ -38,13 +39,13 @@ typedef struct file_private_info { loff_t offset; /* offset of last read in file */ - int act_area; /* number of last formated area */ + int act_area; /* number of last formatted area */ int act_page; /* act page in given area */ - int act_entry; /* last formated entry (offset */ + int act_entry; /* last formatted entry (offset */ /* relative to beginning of last */ - /* formated page) */ + /* formatted page) */ size_t act_entry_offset; /* up to this offset we copied */ - /* in last read the last formated */ + /* in last read the last formatted */ /* entry to userland */ char temp_buf[2048]; /* buffer for output */ debug_info_t *debug_info_org; /* original debug information */ @@ -60,10 +61,10 @@ typedef struct { * except of floats, and long long (32 bit) * */ - long args[0]; + long args[]; } debug_sprintf_entry_t; -/* internal function prototyes */ +/* internal function prototypes */ static int debug_init(void); static ssize_t debug_output(struct file *file, char __user *user_buf, @@ -77,12 +78,14 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, static void debug_info_get(debug_info_t *); static void debug_info_put(debug_info_t *); static int debug_prolog_level_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_prolog_pages_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); @@ -90,9 +93,8 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf); -static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event); + char *out_buf, size_t out_buf_size, + const char *in_buf); static void debug_areas_swap(debug_info_t *a, debug_info_t *b); static void debug_events_append(debug_info_t *dest, debug_info_t *src); @@ -139,7 +141,7 @@ struct debug_view debug_sprintf_view = { "sprintf", NULL, &debug_dflt_header_fn, - (debug_format_proc_t *)&debug_sprintf_format_fn, + &debug_sprintf_format_fn, NULL, NULL }; @@ -163,7 +165,6 @@ static const struct file_operations debug_file_ops = { .write = debug_input, .open = debug_open, .release = debug_close, - .llseek = no_llseek, }; static struct dentry *debug_debugfs_root_entry; @@ -250,7 +251,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); @@ -351,7 +352,10 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode) for (i = 0; i < in->nr_areas; i++) { for (j = 0; j < in->pages_per_area; j++) memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE); + rc->active_pages[i] = in->active_pages[i]; + rc->active_entries[i] = in->active_entries[i]; } + rc->active_area = in->active_area; out: spin_unlock_irqrestore(&in->lock, flags); return rc; @@ -381,7 +385,7 @@ static void debug_info_put(debug_info_t *db_info) /* * debug_format_entry: - * - format one debug entry and return size of formated data + * - format one debug entry and return size of formatted data */ static int debug_format_entry(file_private_info_t *p_info) { @@ -392,8 +396,10 @@ static int debug_format_entry(file_private_info_t *p_info) if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { /* print prolog */ - if (view->prolog_proc) - len += view->prolog_proc(id_snap, view, p_info->temp_buf); + if (view->prolog_proc) { + len += view->prolog_proc(id_snap, view, p_info->temp_buf, + sizeof(p_info->temp_buf)); + } goto out; } if (!id_snap->areas) /* this is true, if we have a prolog only view */ @@ -403,21 +409,31 @@ static int debug_format_entry(file_private_info_t *p_info) if (act_entry->clock == 0LL) goto out; /* empty entry */ - if (view->header_proc) + if (view->header_proc) { len += view->header_proc(id_snap, view, p_info->act_area, - act_entry, p_info->temp_buf + len); - if (view->format_proc) + act_entry, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len); + } + if (view->format_proc) { len += view->format_proc(id_snap, view, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len, DEBUG_DATA(act_entry)); + } out: return len; } -/* - * debug_next_entry: - * - goto next entry in p_info +/** + * debug_next_entry - Go to the next entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the next entry. If no further entry + * exists the current position is set to one after the end the return value + * indicates that no further entries exist. + * + * Return: True if there are more following entries, false otherwise */ -static inline int debug_next_entry(file_private_info_t *p_info) +static inline bool debug_next_entry(file_private_info_t *p_info) { debug_info_t *id; @@ -425,10 +441,10 @@ static inline int debug_next_entry(file_private_info_t *p_info) if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { p_info->act_entry = 0; p_info->act_page = 0; - goto out; + return true; } if (!id->areas) - return 1; + return false; p_info->act_entry += id->entry_size; /* switch to next page, if we reached the end of the page */ if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) { @@ -441,16 +457,93 @@ static inline int debug_next_entry(file_private_info_t *p_info) p_info->act_page = 0; } if (p_info->act_area >= id->nr_areas) - return 1; + return false; } -out: - return 0; + return true; +} + +/** + * debug_to_act_entry - Go to the currently active entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the currently active + * entry of @p_info->debug_info_snap + */ +static void debug_to_act_entry(file_private_info_t *p_info) +{ + debug_info_t *snap_id; + + snap_id = p_info->debug_info_snap; + p_info->act_area = snap_id->active_area; + p_info->act_page = snap_id->active_pages[snap_id->active_area]; + p_info->act_entry = snap_id->active_entries[snap_id->active_area]; +} + +/** + * debug_prev_entry - Go to the previous entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the previous entry. If no previous entry + * exists the current position is set left as DEBUG_PROLOG_ENTRY and the return value + * indicates that no previous entries exist. + * + * Return: True if there are more previous entries, false otherwise + */ + +static inline bool debug_prev_entry(file_private_info_t *p_info) +{ + debug_info_t *id; + + id = p_info->debug_info_snap; + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) + debug_to_act_entry(p_info); + if (!id->areas) + return false; + p_info->act_entry -= id->entry_size; + /* switch to prev page, if we reached the beginning of the page */ + if (p_info->act_entry < 0) { + /* end of previous page */ + p_info->act_entry = rounddown(PAGE_SIZE, id->entry_size) - id->entry_size; + p_info->act_page--; + if (p_info->act_page < 0) { + /* previous area */ + p_info->act_area--; + p_info->act_page = id->pages_per_area - 1; + } + if (p_info->act_area < 0) + p_info->act_area = (id->nr_areas - 1) % id->nr_areas; + } + /* check full circle */ + if (id->active_area == p_info->act_area && + id->active_pages[id->active_area] == p_info->act_page && + id->active_entries[id->active_area] == p_info->act_entry) + return false; + return true; +} + +/** + * debug_move_entry - Go to next entry in either the forward or backward direction + * @p_info: Private info that is manipulated + * @reverse: If true go to the next entry in reverse i.e. previous + * + * Sets the current position in @p_info to the next (@reverse == false) or + * previous (@reverse == true) entry. + * + * Return: True if there are further entries in that direction, + * false otherwise. + */ +static bool debug_move_entry(file_private_info_t *p_info, bool reverse) +{ + if (reverse) + return debug_prev_entry(p_info); + else + return debug_next_entry(p_info); } /* * debug_output: * - called for user read() - * - copies formated debug entries to the user buffer + * - copies formatted debug entries to the user buffer */ static ssize_t debug_output(struct file *file, /* file descriptor */ char __user *user_buf, /* user buffer */ @@ -486,7 +579,7 @@ static ssize_t debug_output(struct file *file, /* file descriptor */ } if (copy_size == formatted_line_residue) { entry_offset = 0; - if (debug_next_entry(p_info)) + if (!debug_next_entry(p_info)) goto out; } } @@ -521,15 +614,51 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf, return rc; /* number of input characters */ } +static file_private_info_t *debug_file_private_alloc(debug_info_t *debug_info, + struct debug_view *view) +{ + debug_info_t *debug_info_snapshot; + file_private_info_t *p_info; + + /* + * Make snapshot of current debug areas to get it consistent. + * To copy all the areas is only needed, if we have a view which + * formats the debug areas. + */ + if (!view->format_proc && !view->header_proc) + debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); + else + debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); + + if (!debug_info_snapshot) + return NULL; + p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + if (!p_info) { + debug_info_free(debug_info_snapshot); + return NULL; + } + p_info->offset = 0; + p_info->debug_info_snap = debug_info_snapshot; + p_info->debug_info_org = debug_info; + p_info->view = view; + p_info->act_area = 0; + p_info->act_page = 0; + p_info->act_entry = DEBUG_PROLOG_ENTRY; + p_info->act_entry_offset = 0; + debug_info_get(debug_info); + + return p_info; +} + /* * debug_open: * - called for user open() - * - copies formated output to private_data area of the file + * - copies formatted output to private_data area of the file * handle */ static int debug_open(struct inode *inode, struct file *file) { - debug_info_t *debug_info, *debug_info_snapshot; + debug_info_t *debug_info; file_private_info_t *p_info; int i, rc = 0; @@ -547,42 +676,26 @@ static int debug_open(struct inode *inode, struct file *file) goto out; found: - - /* Make snapshot of current debug areas to get it consistent. */ - /* To copy all the areas is only needed, if we have a view which */ - /* formats the debug areas. */ - - if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc) - debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); - else - debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); - - if (!debug_info_snapshot) { - rc = -ENOMEM; - goto out; - } - p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + p_info = debug_file_private_alloc(debug_info, debug_info->views[i]); if (!p_info) { - debug_info_free(debug_info_snapshot); rc = -ENOMEM; goto out; } - p_info->offset = 0; - p_info->debug_info_snap = debug_info_snapshot; - p_info->debug_info_org = debug_info; - p_info->view = debug_info->views[i]; - p_info->act_area = 0; - p_info->act_page = 0; - p_info->act_entry = DEBUG_PROLOG_ENTRY; - p_info->act_entry_offset = 0; file->private_data = p_info; - debug_info_get(debug_info); nonseekable_open(inode, file); out: mutex_unlock(&debug_mutex); return rc; } +static void debug_file_private_free(file_private_info_t *p_info) +{ + if (p_info->debug_info_snap) + debug_info_free(p_info->debug_info_snap); + debug_info_put(p_info->debug_info_org); + kfree(p_info); +} + /* * debug_close: * - called for user close() @@ -593,13 +706,59 @@ static int debug_close(struct inode *inode, struct file *file) file_private_info_t *p_info; p_info = (file_private_info_t *) file->private_data; - if (p_info->debug_info_snap) - debug_info_free(p_info->debug_info_snap); - debug_info_put(p_info->debug_info_org); - kfree(file->private_data); + debug_file_private_free(p_info); + file->private_data = NULL; return 0; /* success */ } +/** + * debug_dump - Get a textual representation of debug info, or as much as fits + * @id: Debug information to use + * @view: View with which to dump the debug information + * @buf: Buffer the textual debug data representation is written to + * @buf_size: Size of the buffer, including the trailing '\0' byte + * @reverse: Go backwards from the last written entry + * + * This function may be used whenever a textual representation of the debug + * information is required without using an s390dbf file. + * + * Note: It is the callers responsibility to supply a view that is compatible + * with the debug information data. + * + * Return: On success returns the number of bytes written to the buffer not + * including the trailing '\0' byte. If bug_size == 0 the function returns 0. + * On failure an error code less than 0 is returned. + */ +ssize_t debug_dump(debug_info_t *id, struct debug_view *view, + char *buf, size_t buf_size, bool reverse) +{ + file_private_info_t *p_info; + size_t size, offset = 0; + + /* Need space for '\0' byte */ + if (buf_size < 1) + return 0; + buf_size--; + + p_info = debug_file_private_alloc(id, view); + if (!p_info) + return -ENOMEM; + + /* There is always at least the DEBUG_PROLOG_ENTRY */ + do { + size = debug_format_entry(p_info); + size = min(size, buf_size - offset); + memcpy(buf + offset, p_info->temp_buf, size); + offset += size; + if (offset >= buf_size) + break; + } while (debug_move_entry(p_info, reverse)); + debug_file_private_free(p_info); + buf[offset] = '\0'; + + return offset; +} + /* Create debugfs entries and add to internal list. */ static void _debug_register(debug_info_t *id) { @@ -954,7 +1113,7 @@ static int debug_active = 1; * always allow read, allow write only if debug_stoppable is set or * if debug_active is already off */ -static int s390dbf_procactive(struct ctl_table *table, int write, +static int s390dbf_procactive(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) @@ -963,7 +1122,7 @@ static int s390dbf_procactive(struct ctl_table *table, int write, return 0; } -static struct ctl_table s390dbf_table[] = { +static const struct ctl_table s390dbf_table[] = { { .procname = "debug_stoppable", .data = &debug_stoppable, @@ -978,17 +1137,6 @@ static struct ctl_table s390dbf_table[] = { .mode = S_IRUGO | S_IWUSR, .proc_handler = s390dbf_procactive, }, - { } -}; - -static struct ctl_table s390dbf_dir_table[] = { - { - .procname = "s390dbf", - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = s390dbf_table, - }, - { } }; static struct ctl_table_header *s390dbf_sysctl_header; @@ -1304,9 +1452,9 @@ static inline int debug_get_uint(char *buf) */ static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { - return sprintf(out_buf, "%i\n", id->pages_per_area); + return scnprintf(out_buf, out_buf_size, "%i\n", id->pages_per_area); } /* @@ -1353,14 +1501,14 @@ out: * prints out actual debug level */ static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { int rc = 0; if (id->level == DEBUG_OFF_LEVEL) - rc = sprintf(out_buf, "-\n"); + rc = scnprintf(out_buf, out_buf_size, "-\n"); else - rc = sprintf(out_buf, "%i\n", id->level); + rc = scnprintf(out_buf, out_buf_size, "%i\n", id->level); return rc; } @@ -1477,22 +1625,24 @@ out: * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) + char *out_buf, size_t out_buf_size, const char *in_buf) { int i, rc = 0; - for (i = 0; i < id->buf_size; i++) - rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); - rc += sprintf(out_buf + rc, "| "); + for (i = 0; i < id->buf_size; i++) { + rc += scnprintf(out_buf + rc, out_buf_size - rc, + "%02x ", ((unsigned char *)in_buf)[i]); + } + rc += scnprintf(out_buf + rc, out_buf_size - rc, "| "); for (i = 0; i < id->buf_size; i++) { unsigned char c = in_buf[i]; if (isascii(c) && isprint(c)) - rc += sprintf(out_buf + rc, "%c", c); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "%c", c); else - rc += sprintf(out_buf + rc, "."); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "."); } - rc += sprintf(out_buf + rc, "\n"); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "\n"); return rc; } @@ -1500,7 +1650,8 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, * prints header for debug entry */ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) + int area, debug_entry_t *entry, char *out_buf, + size_t out_buf_size) { unsigned long sec, usec; unsigned long caller; @@ -1517,23 +1668,24 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, else except_str = "-"; caller = (unsigned long) entry->caller; - rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ", - area, sec, usec, level, except_str, - entry->cpu, (void *)caller); + rc += scnprintf(out_buf, out_buf_size, "%02i %011ld:%06lu %1u %1s %04u %px ", + area, sec, usec, level, except_str, + entry->cpu, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); /* - * prints debug data sprintf-formated: + * prints debug data sprintf-formatted: * debug_sprinf_event/exception calls must be used together with this view */ #define DEBUG_SPRINTF_MAX_ARGS 10 -static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event) +int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, size_t out_buf_size, const char *inbuf) { + debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; int num_longs, num_used_args = 0, i, rc = 0; int index[DEBUG_SPRINTF_MAX_ARGS]; @@ -1544,8 +1696,9 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, goto out; /* bufsize of entry too small */ if (num_longs == 1) { /* no args, we use only the string */ - strcpy(out_buf, curr_event->string); - rc = strlen(curr_event->string); + rc = strscpy(out_buf, curr_event->string, out_buf_size); + if (rc == -E2BIG) + rc = out_buf_size; goto out; } @@ -1557,15 +1710,17 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, for (i = 0; i < num_used_args; i++) index[i] = i; - rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], - curr_event->args[index[1]], curr_event->args[index[2]], - curr_event->args[index[3]], curr_event->args[index[4]], - curr_event->args[index[5]], curr_event->args[index[6]], - curr_event->args[index[7]], curr_event->args[index[8]], - curr_event->args[index[9]]); + rc = scnprintf(out_buf, out_buf_size, + curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); out: return rc; } +EXPORT_SYMBOL(debug_sprintf_format_fn); /* * debug_init: @@ -1573,7 +1728,7 @@ out: */ static int __init debug_init(void) { - s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table); mutex_lock(&debug_mutex); debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); initialized = 1; diff --git a/arch/s390/kernel/diag/Makefile b/arch/s390/kernel/diag/Makefile new file mode 100644 index 000000000000..956aee6c4090 --- /dev/null +++ b/arch/s390/kernel/diag/Makefile @@ -0,0 +1 @@ +obj-y := diag_misc.o diag324.o diag.o diag310.o diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag/diag.c index a778714e4d8b..56b862ba9be8 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag/diag.c @@ -11,11 +11,13 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/vmalloc.h> #include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/trace/diag.h> #include <asm/sections.h> -#include "entry.h" +#include <asm/asm.h> +#include "../entry.h" struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -35,6 +37,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, @@ -48,7 +51,11 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" }, [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, + [DIAG_STAT_X310] = { .code = 0x310, .name = "Memory Topology Information" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, + [DIAG_STAT_X324] = { .code = 0x324, .name = "Power Information Block" }, + [DIAG_STAT_X49C] = { .code = 0x49c, .name = "Warning-Track Interruption" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; @@ -57,12 +64,16 @@ struct diag_ops __amode31_ref diag_amode31_ops = { .diag26c = _diag26c_amode31, .diag14 = _diag14_amode31, .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, .diag308_reset = _diag308_reset_amode31 }; static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; + static int show_diag_stat(struct seq_file *m, void *v) { struct diag_stat *stat; @@ -140,20 +151,51 @@ void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) EXPORT_SYMBOL(diag_stat_inc_norecursion); /* + * Diagnose 0c: Pseudo Timer + */ +void diag0c(struct hypfs_diag0c_entry *data) +{ + diag_stat_inc(DIAG_STAT_X00C); + diag_amode31_ops.diag0c(virt_to_phys(data)); +} + +/* * Diagnose 14: Input spool file manipulation + * + * The subcode parameter determines the type of the first parameter rx. + * Currently used are the following 3 subcommands: + * 0x0: Read the Next Spool File Buffer (Data Record) + * 0x28: Position a Spool File to the Designated Record + * 0xfff: Retrieve Next File Descriptor + * + * For subcommands 0x0 and 0xfff, the value of the first parameter is + * a virtual address of a memory buffer and needs virtual to physical + * address translation. For other subcommands the rx parameter is not + * a virtual address. */ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); + switch (subcode) { + case 0x0: + case 0xfff: + rx = virt_to_phys((void *)rx); + break; + default: + /* Do nothing */ + break; + } return diag_amode31_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); +#define DIAG204_BUSY_RC 8 + static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) { union register_pair rp = { .even = *subcode, .odd = size }; - asm volatile( + asm_inline volatile( " diag %[addr],%[rp],0x204\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -162,12 +204,35 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad return rp.odd; } +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @area points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero. If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. + */ int diag204(unsigned long subcode, unsigned long size, void *addr) { + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -EINVAL; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -EINVAL; + } + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); - if (subcode) - return -1; + if (subcode == DIAG204_BUSY_RC) + return -EBUSY; + else if (subcode) + return -EOPNOTSUPP; return size; } EXPORT_SYMBOL(diag204); @@ -194,17 +259,41 @@ int diag210(struct diag210 *addr) } EXPORT_SYMBOL(diag210); +/* + * Diagnose 8C: Access 3270 Display Device Information + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + int diag224(void *ptr) { + unsigned long addr = __pa(ptr); int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" + asm_inline volatile("\n" + " diag %[type],%[addr],0x224\n" + "0: lhi %[rc],0\n" "1:\n" EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + : [rc] "+d" (rc) + , "=m" (*(struct { char buf[PAGE_SIZE]; } *)ptr) + : [type] "d" (0), [addr] "d" (addr)); return rc; } EXPORT_SYMBOL(diag224); @@ -215,6 +304,21 @@ EXPORT_SYMBOL(diag224); int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return diag_amode31_ops.diag26c(req, resp, subcode); + return diag_amode31_ops.diag26c(virt_to_phys(req), virt_to_phys(resp), subcode); } EXPORT_SYMBOL(diag26c); + +int diag49c(unsigned long subcode) +{ + int cc; + + diag_stat_inc(DIAG_STAT_X49C); + asm volatile( + " diag %[subcode],0,0x49c\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [subcode] "d" (subcode) + : CC_CLOBBER); + return CC_TRANSFORM(cc); +} +EXPORT_SYMBOL(diag49c); diff --git a/arch/s390/kernel/diag/diag310.c b/arch/s390/kernel/diag/diag310.c new file mode 100644 index 000000000000..d6a34454aa5a --- /dev/null +++ b/arch/s390/kernel/diag/diag310.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Request memory topology information via diag0x310. + * + * Copyright IBM Corp. 2025 + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <asm/diag.h> +#include <asm/sclp.h> +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +#define DIAG310_LEVELMIN 1 +#define DIAG310_LEVELMAX 6 + +enum diag310_sc { + DIAG310_SUBC_0 = 0, + DIAG310_SUBC_1 = 1, + DIAG310_SUBC_4 = 4, + DIAG310_SUBC_5 = 5 +}; + +enum diag310_retcode { + DIAG310_RET_SUCCESS = 0x0001, + DIAG310_RET_BUSY = 0x0101, + DIAG310_RET_OPNOTSUPP = 0x0102, + DIAG310_RET_SC4_INVAL = 0x0401, + DIAG310_RET_SC4_NODATA = 0x0402, + DIAG310_RET_SC5_INVAL = 0x0501, + DIAG310_RET_SC5_NODATA = 0x0502, + DIAG310_RET_SC5_ESIZE = 0x0503 +}; + +union diag310_response { + u64 response; + struct { + u64 result : 32; + u64 : 16; + u64 rc : 16; + }; +}; + +union diag310_req_subcode { + u64 subcode; + struct { + u64 : 48; + u64 st : 8; + u64 sc : 8; + }; +}; + +union diag310_req_size { + u64 size; + struct { + u64 page_count : 32; + u64 : 32; + }; +}; + +static inline unsigned long diag310(unsigned long subcode, unsigned long size, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, .odd = size }; + + diag_stat_inc(DIAG_STAT_X310); + asm volatile("diag %[rp],%[subcode],0x310\n" + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "memory"); + return rp.odd; +} + +static int diag310_result_to_errno(unsigned int result) +{ + switch (result) { + case DIAG310_RET_BUSY: + return -EBUSY; + case DIAG310_RET_OPNOTSUPP: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int diag310_get_subcode_mask(unsigned long *mask) +{ + union diag310_response res; + + res.response = diag310(DIAG310_SUBC_0, 0, NULL); + if (res.rc != DIAG310_RET_SUCCESS) + return diag310_result_to_errno(res.rc); + *mask = res.response; + return 0; +} + +static int diag310_get_memtop_stride(unsigned long *stride) +{ + union diag310_response res; + + res.response = diag310(DIAG310_SUBC_1, 0, NULL); + if (res.rc != DIAG310_RET_SUCCESS) + return diag310_result_to_errno(res.rc); + *stride = res.result; + return 0; +} + +static int diag310_get_memtop_size(unsigned long *pages, unsigned long level) +{ + union diag310_req_subcode req = { .sc = DIAG310_SUBC_4, .st = level }; + union diag310_response res; + + res.response = diag310(req.subcode, 0, NULL); + switch (res.rc) { + case DIAG310_RET_SUCCESS: + *pages = res.result; + return 0; + case DIAG310_RET_SC4_NODATA: + return -ENODATA; + case DIAG310_RET_SC4_INVAL: + return -EINVAL; + default: + return diag310_result_to_errno(res.rc); + } +} + +static int diag310_store_topology_map(void *buf, unsigned long pages, unsigned long level) +{ + union diag310_req_subcode req_sc = { .sc = DIAG310_SUBC_5, .st = level }; + union diag310_req_size req_size = { .page_count = pages }; + union diag310_response res; + + res.response = diag310(req_sc.subcode, req_size.size, buf); + switch (res.rc) { + case DIAG310_RET_SUCCESS: + return 0; + case DIAG310_RET_SC5_NODATA: + return -ENODATA; + case DIAG310_RET_SC5_ESIZE: + return -EOVERFLOW; + case DIAG310_RET_SC5_INVAL: + return -EINVAL; + default: + return diag310_result_to_errno(res.rc); + } +} + +static int diag310_check_features(void) +{ + static int features_available; + unsigned long mask; + int rc; + + if (READ_ONCE(features_available)) + return 0; + if (!sclp.has_diag310) + return -EOPNOTSUPP; + rc = diag310_get_subcode_mask(&mask); + if (rc) + return rc; + if (!test_bit_inv(DIAG310_SUBC_1, &mask)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG310_SUBC_4, &mask)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG310_SUBC_5, &mask)) + return -EOPNOTSUPP; + WRITE_ONCE(features_available, 1); + return 0; +} + +static int memtop_get_stride_len(unsigned long *res) +{ + static unsigned long memtop_stride; + unsigned long stride; + int rc; + + stride = READ_ONCE(memtop_stride); + if (!stride) { + rc = diag310_get_memtop_stride(&stride); + if (rc) + return rc; + WRITE_ONCE(memtop_stride, stride); + } + *res = stride; + return 0; +} + +static int memtop_get_page_count(unsigned long *res, unsigned long level) +{ + static unsigned long memtop_pages[DIAG310_LEVELMAX]; + unsigned long pages; + int rc; + + if (level > DIAG310_LEVELMAX || level < DIAG310_LEVELMIN) + return -EINVAL; + pages = READ_ONCE(memtop_pages[level - 1]); + if (!pages) { + rc = diag310_get_memtop_size(&pages, level); + if (rc) + return rc; + WRITE_ONCE(memtop_pages[level - 1], pages); + } + *res = pages; + return 0; +} + +long diag310_memtop_stride(unsigned long arg) +{ + size_t __user *argp = (void __user *)arg; + unsigned long stride; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + rc = memtop_get_stride_len(&stride); + if (rc) + return rc; + if (put_user(stride, argp)) + return -EFAULT; + return 0; +} + +long diag310_memtop_len(unsigned long arg) +{ + size_t __user *argp = (void __user *)arg; + unsigned long pages, level; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + if (get_user(level, argp)) + return -EFAULT; + rc = memtop_get_page_count(&pages, level); + if (rc) + return rc; + if (put_user(pages * PAGE_SIZE, argp)) + return -EFAULT; + return 0; +} + +long diag310_memtop_buf(unsigned long arg) +{ + struct diag310_memtop __user *udata = (struct diag310_memtop __user *)arg; + unsigned long level, pages, data_size; + u64 address; + void *buf; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + if (get_user(level, &udata->nesting_lvl)) + return -EFAULT; + if (get_user(address, &udata->address)) + return -EFAULT; + rc = memtop_get_page_count(&pages, level); + if (rc) + return rc; + data_size = pages * PAGE_SIZE; + buf = __vmalloc_node(data_size, PAGE_SIZE, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, + NUMA_NO_NODE, __builtin_return_address(0)); + if (!buf) + return -ENOMEM; + rc = diag310_store_topology_map(buf, pages, level); + if (rc) + goto out; + if (copy_to_user((void __user *)address, buf, data_size)) + rc = -EFAULT; +out: + vfree(buf); + return rc; +} diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c new file mode 100644 index 000000000000..7fa4c0b7eb6c --- /dev/null +++ b/arch/s390/kernel/diag/diag324.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Request power readings for resources in a computing environment via + * diag 0x324. diag 0x324 stores the power readings in the power information + * block (pib). + * + * Copyright IBM Corp. 2024 + */ + +#define pr_fmt(fmt) "diag324: " fmt +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/ioctl.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/ktime.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> + +#include <asm/diag.h> +#include <asm/sclp.h> +#include <asm/timex.h> +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +enum subcode { + DIAG324_SUBC_0 = 0, + DIAG324_SUBC_1 = 1, + DIAG324_SUBC_2 = 2, +}; + +enum retcode { + DIAG324_RET_SUCCESS = 0x0001, + DIAG324_RET_SUBC_NOTAVAIL = 0x0103, + DIAG324_RET_INSUFFICIENT_SIZE = 0x0104, + DIAG324_RET_READING_UNAVAILABLE = 0x0105, +}; + +union diag324_response { + u64 response; + struct { + u64 installed : 32; + u64 : 16; + u64 rc : 16; + } sc0; + struct { + u64 format : 16; + u64 : 16; + u64 pib_len : 16; + u64 rc : 16; + } sc1; + struct { + u64 : 48; + u64 rc : 16; + } sc2; +}; + +union diag324_request { + u64 request; + struct { + u64 : 32; + u64 allocated : 16; + u64 : 12; + u64 sc : 4; + } sc2; +}; + +struct pib { + u32 : 8; + u32 num : 8; + u32 len : 16; + u32 : 24; + u32 hlen : 8; + u64 : 64; + u64 intv; + u8 r[]; +} __packed; + +struct pibdata { + struct pib *pib; + ktime_t expire; + u64 sequence; + size_t len; + int rc; +}; + +static DEFINE_MUTEX(pibmutex); +static struct pibdata pibdata; + +#define PIBWORK_DELAY (5 * NSEC_PER_SEC) + +static void pibwork_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(pibwork, pibwork_handler); + +static unsigned long diag324(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr }; + + diag_stat_inc(DIAG_STAT_X324); + asm volatile("diag %[rp],%[subcode],0x324\n" + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "memory"); + return rp.odd; +} + +static void pibwork_handler(struct work_struct *work) +{ + struct pibdata *data = &pibdata; + ktime_t timedout; + + mutex_lock(&pibmutex); + timedout = ktime_add_ns(data->expire, PIBWORK_DELAY); + if (ktime_before(ktime_get(), timedout)) { + mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + goto out; + } + vfree(data->pib); + data->pib = NULL; +out: + mutex_unlock(&pibmutex); +} + +static void pib_update(struct pibdata *data) +{ + union diag324_request req = { .sc2.sc = DIAG324_SUBC_2, .sc2.allocated = data->len }; + union diag324_response res; + int rc; + + memset(data->pib, 0, data->len); + res.response = diag324(req.request, data->pib); + switch (res.sc2.rc) { + case DIAG324_RET_SUCCESS: + rc = 0; + break; + case DIAG324_RET_SUBC_NOTAVAIL: + rc = -ENOENT; + break; + case DIAG324_RET_INSUFFICIENT_SIZE: + rc = -EMSGSIZE; + break; + case DIAG324_RET_READING_UNAVAILABLE: + rc = -EBUSY; + break; + default: + rc = -EINVAL; + } + data->rc = rc; +} + +long diag324_pibbuf(unsigned long arg) +{ + struct diag324_pib __user *udata = (struct diag324_pib __user *)arg; + struct pibdata *data = &pibdata; + static bool first = true; + u64 address; + int rc; + + if (!data->len) + return -EOPNOTSUPP; + if (get_user(address, &udata->address)) + return -EFAULT; + mutex_lock(&pibmutex); + rc = -ENOMEM; + if (!data->pib) + data->pib = vmalloc(data->len); + if (!data->pib) + goto out; + if (first || ktime_after(ktime_get(), data->expire)) { + pib_update(data); + data->sequence++; + data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv)); + mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + first = false; + } + rc = data->rc; + if (rc != 0 && rc != -EBUSY) + goto out; + rc = copy_to_user((void __user *)address, data->pib, data->pib->len); + rc |= put_user(data->sequence, &udata->sequence); + if (rc) + rc = -EFAULT; +out: + mutex_unlock(&pibmutex); + return rc; +} + +long diag324_piblen(unsigned long arg) +{ + struct pibdata *data = &pibdata; + + if (!data->len) + return -EOPNOTSUPP; + if (put_user(data->len, (size_t __user *)arg)) + return -EFAULT; + return 0; +} + +static int __init diag324_init(void) +{ + union diag324_response res; + unsigned long installed; + + if (!sclp.has_diag324) + return -EOPNOTSUPP; + res.response = diag324(DIAG324_SUBC_0, NULL); + if (res.sc0.rc != DIAG324_RET_SUCCESS) + return -EOPNOTSUPP; + installed = res.response; + if (!test_bit_inv(DIAG324_SUBC_1, &installed)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG324_SUBC_2, &installed)) + return -EOPNOTSUPP; + res.response = diag324(DIAG324_SUBC_1, NULL); + if (res.sc1.rc != DIAG324_RET_SUCCESS) + return -EOPNOTSUPP; + pibdata.len = res.sc1.pib_len; + return 0; +} +device_initcall(diag324_init); diff --git a/arch/s390/kernel/diag/diag_ioctl.h b/arch/s390/kernel/diag/diag_ioctl.h new file mode 100644 index 000000000000..7080be946785 --- /dev/null +++ b/arch/s390/kernel/diag/diag_ioctl.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DIAG_IOCTL_H +#define _DIAG_IOCTL_H + +#include <linux/types.h> + +long diag324_pibbuf(unsigned long arg); +long diag324_piblen(unsigned long arg); + +long diag310_memtop_stride(unsigned long arg); +long diag310_memtop_len(unsigned long arg); +long diag310_memtop_buf(unsigned long arg); + +#endif /* _DIAG_IOCTL_H */ diff --git a/arch/s390/kernel/diag/diag_misc.c b/arch/s390/kernel/diag/diag_misc.c new file mode 100644 index 000000000000..efffe02ea02e --- /dev/null +++ b/arch/s390/kernel/diag/diag_misc.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide diagnose information via misc device /dev/diag. + * + * Copyright IBM Corp. 2024 + */ + +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ioctl.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/types.h> + +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + long rc; + + switch (cmd) { + case DIAG324_GET_PIBLEN: + rc = diag324_piblen(arg); + break; + case DIAG324_GET_PIBBUF: + rc = diag324_pibbuf(arg); + break; + case DIAG310_GET_STRIDE: + rc = diag310_memtop_stride(arg); + break; + case DIAG310_GET_MEMTOPLEN: + rc = diag310_memtop_len(arg); + break; + case DIAG310_GET_MEMTOPBUF: + rc = diag310_memtop_buf(arg); + break; + default: + rc = -ENOIOCTLCMD; + break; + } + return rc; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = diag_ioctl, +}; + +static struct miscdevice diagdev = { + .name = "diag", + .minor = MISC_DYNAMIC_MINOR, + .fops = &fops, + .mode = 0444, +}; + +static int diag_init(void) +{ + return misc_register(&diagdev); +} + +device_initcall(diag_init); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 90bbb4ea1d08..94eb8168ea44 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -24,8 +24,8 @@ #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/atomic.h> +#include <linux/io.h> #include <asm/dis.h> -#include <asm/io.h> #include <asm/cpcmd.h> #include <asm/lowcore.h> #include <asm/debug.h> @@ -122,6 +122,7 @@ enum { U8_32, /* 8 bit unsigned value starting at 32 */ U12_16, /* 12 bit unsigned value starting at 16 */ U16_16, /* 16 bit unsigned value starting at 16 */ + U16_20, /* 16 bit unsigned value starting at 20 */ U16_32, /* 16 bit unsigned value starting at 32 */ U32_16, /* 32 bit unsigned value starting at 16 */ VX_12, /* Vector index register starting at position 12 */ @@ -184,6 +185,7 @@ static const struct s390_operand operands[] = { [U8_32] = { 8, 32, 0 }, [U12_16] = { 12, 16, 0 }, [U16_16] = { 16, 16, 0 }, + [U16_20] = { 16, 20, 0 }, [U16_32] = { 16, 32, 0 }, [U32_16] = { 32, 16, 0 }, [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, @@ -257,7 +259,6 @@ static const unsigned char formats[][6] = { [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, - [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, @@ -300,14 +301,17 @@ static const unsigned char formats[][6] = { [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VV0UU] = { V_8, V_12, U8_28, U4_24, 0, 0 }, [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, - [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRI_VVV0UV] = { V_8, V_12, V_16, V_32, U8_24, 0 }, + [INSTR_VRR_0V0U] = { V_12, U16_20, 0, 0, 0, 0 }, [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_0VVU] = { V_12, V_16, U16_20, 0, 0, 0 }, [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 }, [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, @@ -455,21 +459,21 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) if (separator) ptr += sprintf(ptr, "%c", separator); if (operand->flags & OPERAND_GPR) - ptr += sprintf(ptr, "%%r%i", value); + ptr += sprintf(ptr, "%%r%u", value); else if (operand->flags & OPERAND_FPR) - ptr += sprintf(ptr, "%%f%i", value); + ptr += sprintf(ptr, "%%f%u", value); else if (operand->flags & OPERAND_AR) - ptr += sprintf(ptr, "%%a%i", value); + ptr += sprintf(ptr, "%%a%u", value); else if (operand->flags & OPERAND_CR) - ptr += sprintf(ptr, "%%c%i", value); + ptr += sprintf(ptr, "%%c%u", value); else if (operand->flags & OPERAND_VR) - ptr += sprintf(ptr, "%%v%i", value); + ptr += sprintf(ptr, "%%v%u", value); else if (operand->flags & OPERAND_PCREL) { void *pcrel = (void *)((int)value + addr); ptr += sprintf(ptr, "%px", pcrel); } else if (operand->flags & OPERAND_SIGNED) - ptr += sprintf(ptr, "%i", value); + ptr += sprintf(ptr, "%i", (int)value); else ptr += sprintf(ptr, "%u", value); if (operand->flags & OPERAND_DISP) @@ -516,7 +520,7 @@ void show_code(struct pt_regs *regs) if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } - /* Code snapshot useable ? */ + /* Code snapshot usable ? */ if ((regs->psw.addr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 1e3233eb510a..dd410962ecbe 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/debug.h> #include <asm/dis.h> @@ -41,60 +42,50 @@ const char *stack_type_name(enum stack_type type) EXPORT_SYMBOL_GPL(stack_type_name); static inline bool in_stack(unsigned long sp, struct stack_info *info, - enum stack_type type, unsigned long low, - unsigned long high) + enum stack_type type, unsigned long stack) { - if (sp < low || sp >= high) + if (sp < stack || sp >= stack + THREAD_SIZE) return false; info->type = type; - info->begin = low; - info->end = high; + info->begin = stack; + info->end = stack + THREAD_SIZE; return true; } static bool in_task_stack(unsigned long sp, struct task_struct *task, struct stack_info *info) { - unsigned long stack; + unsigned long stack = (unsigned long)task_stack_page(task); - stack = (unsigned long) task_stack_page(task); - return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); + return in_stack(sp, info, STACK_TYPE_TASK, stack); } static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->async_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.async_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->nodat_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.nodat_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_NODAT, stack); } static bool in_mcck_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->mcck_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.mcck_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_MCCK, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->restart_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.restart_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_RESTART, stack); } int get_stack_info(unsigned long sp, struct task_struct *task, @@ -152,7 +143,13 @@ void show_stack(struct task_struct *task, unsigned long *stack, static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->last_break, (void *)regs->last_break); + printk(" [<%016lx>] ", regs->last_break); + if (user_mode(regs)) { + print_vma_addr(KERN_CONT, regs->last_break); + pr_cont("\n"); + } else { + pr_cont("%pSR\n", (void *)regs->last_break); + } } void show_registers(struct pt_regs *regs) @@ -202,13 +199,8 @@ void __noreturn die(struct pt_regs *regs, const char *str) console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); - printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff, regs->int_code >> 17, ++die_counter); -#ifdef CONFIG_PREEMPT - pr_cont("PREEMPT "); -#elif defined(CONFIG_PREEMPT_RT) - pr_cont("PREEMPT_RT "); -#endif pr_cont("SMP "); if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 432c8c987256..54cf0923050f 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -7,6 +7,8 @@ #define KMSG_COMPONENT "setup" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/sched/debug.h> +#include <linux/cpufeature.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,8 +20,13 @@ #include <linux/uaccess.h> #include <linux/kernel.h> #include <asm/asm-extable.h> +#include <linux/memblock.h> +#include <asm/access-regs.h> +#include <asm/asm-offsets.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> +#include <asm/fpu.h> #include <asm/ipl.h> #include <asm/lowcore.h> #include <asm/processor.h> @@ -30,24 +37,36 @@ #include <asm/sclp.h> #include <asm/facility.h> #include <asm/boot_data.h> -#include <asm/switch_to.h> #include "entry.h" -int __bootdata(is_full_image); +#define __decompressor_handled_param(func, param) \ +static int __init ignore_decompressor_param_##func(char *s) \ +{ \ + return 0; \ +} \ +early_param(#param, ignore_decompressor_param_##func) + +#define decompressor_handled_param(param) __decompressor_handled_param(param, param) + +decompressor_handled_param(mem); +decompressor_handled_param(vmalloc); +decompressor_handled_param(dfltcc); +decompressor_handled_param(facilities); +decompressor_handled_param(nokaslr); +decompressor_handled_param(cmma); +decompressor_handled_param(relocate_lowcore); +decompressor_handled_param(bootdebug); +__decompressor_handled_param(debug_alternative, debug-alternative); +#if IS_ENABLED(CONFIG_KVM) +decompressor_handled_param(prot_virt); +#endif -static void __init reset_tod_clock(void) +static void __init kasan_early_init(void) { - union tod_clock clk; - - if (store_tod_clock_ext_cc(&clk) == 0) - return; - /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) - disabled_wait(); - - memset(&tod_clock_base, 0, sizeof(tod_clock_base)); - tod_clock_base.tod = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; +#ifdef CONFIG_KASAN + init_task.kasan_depth = 0; + pr_info("KernelAddressSanitizer initialized\n"); +#endif } /* @@ -68,26 +87,6 @@ static noinline __init void init_kernel_storage_key(void) static __initdata char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); -static noinline __init void detect_machine_type(void) -{ - struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; - - /* Check current-configuration-level */ - if (stsi(NULL, 0, 0, 0) <= 2) { - S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; - return; - } - /* Get virtual-machine cpu information. */ - if (stsi(vmms, 3, 2, 2) || !vmms->count) - return; - - /* Detect known hypervisors */ - if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; - else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) - S390_lowcore.machine_flags |= MACHINE_FLAG_VM; -} - /* Remove leading, trailing and double whitespace. */ static inline void strim_all(char *str) { @@ -128,9 +127,9 @@ static noinline __init void setup_arch_string(void) strim_all(hvstr); } else { sprintf(hvstr, "%s", - MACHINE_IS_LPAR ? "LPAR" : - MACHINE_IS_VM ? "z/VM" : - MACHINE_IS_KVM ? "KVM" : "unknown"); + machine_is_lpar() ? "LPAR" : + machine_is_vm() ? "z/VM" : + machine_is_kvm() ? "KVM" : "unknown"); } dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); } @@ -139,9 +138,8 @@ static __init void setup_topology(void) { int max_mnest; - if (!test_facility(11)) + if (!cpu_has_topology()) return; - S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; for (max_mnest = 6; max_mnest > 1; max_mnest--) { if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) break; @@ -149,103 +147,58 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -void __do_early_pgm_check(struct pt_regs *regs) +void __init __do_early_pgm_check(struct pt_regs *regs) { - if (!fixup_exception(regs)) - disabled_wait(); + struct lowcore *lc = get_lowcore(); + unsigned long ip; + + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = lc->trans_exc_code; + ip = __rewind_psw(regs->psw, regs->int_code >> 16); + + /* Monitor Event? Might be a warning */ + if ((regs->int_code & PGM_INT_CODE_MASK) == 0x40) { + if (report_bug(ip, regs) == BUG_TRAP_TYPE_WARN) + return; + } + if (fixup_exception(regs)) + return; + /* + * Unhandled exception - system cannot continue but try to get some + * helpful messages to the console. Use early_printk() to print + * some basic information in case it is too early for printk(). + */ + register_early_console(); + early_printk("PANIC: early exception %04x PSW: %016lx %016lx\n", + regs->int_code & 0xffff, regs->psw.mask, regs->psw.addr); + show_regs(regs); + disabled_wait(); } static noinline __init void setup_lowcore_early(void) { + struct lowcore *lc = get_lowcore(); psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - if (IS_ENABLED(CONFIG_KASAN)) - psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw = psw; - S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; -} - -static noinline __init void setup_facility_list(void) -{ - memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); - if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, alt_stfle_fac_list); -} - -static __init void detect_diag9c(void) -{ - unsigned int cpu_address; - int rc; - - cpu_address = stap(); - diag_stat_inc(DIAG_STAT_X09C); - asm volatile( - " diag %2,0,0x9c\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); - if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; -} - -static __init void detect_machine_facilities(void) -{ - if (test_facility(8)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; - __ctl_set_bit(0, 23); - } - if (test_facility(78)) - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; - if (test_facility(3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; - if (test_facility(50) && test_facility(73)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_TE; - __ctl_set_bit(0, 55); - } - if (test_facility(51)) - S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; - if (test_facility(129)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_VX; - __ctl_set_bit(0, 17); - } - if (test_facility(130) && !noexec_disabled) { - S390_lowcore.machine_flags |= MACHINE_FLAG_NX; - __ctl_set_bit(0, 20); - } - if (test_facility(133)) - S390_lowcore.machine_flags |= MACHINE_FLAG_GS; - if (test_facility(139) && (tod_clock_base.tod >> 63)) { - /* Enabled signed clock comparator comparisons */ - S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; - clock_comparator_max = -1ULL >> 1; - __ctl_set_bit(0, 53); - } - if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; - /* the control bit is set during PCI initialization */ - } + psw.mask = PSW_KERNEL_BITS; + lc->program_new_psw = psw; + lc->preempt_count = INIT_PREEMPT_COUNT; + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); } static inline void save_vector_registers(void) { #ifdef CONFIG_CRASH_DUMP - if (test_facility(129)) + if (cpu_has_vx()) save_vx_regs(boot_cpu_vector_save_area); #endif } -static inline void setup_control_registers(void) +static inline void setup_low_address_protection(void) { - unsigned long reg; - - __ctl_store(reg, 0, 0); - reg |= CR0_LOW_ADDRESS_PROTECTION; - reg |= CR0_EMERGENCY_SIGNAL_SUBMASK; - reg |= CR0_EXTERNAL_CALL_SUBMASK; - __ctl_load(reg, 0, 0); + system_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); } static inline void setup_access_registers(void) @@ -255,30 +208,11 @@ static inline void setup_access_registers(void) restore_access_regs(acrs); } -static int __init disable_vector_extension(char *str) -{ - S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; - __ctl_clear_bit(0, 17); - return 0; -} -early_param("novx", disable_vector_extension); - char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); -} - -static void __init check_image_bootable(void) -{ - if (is_full_image) - return; - - sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); - sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); - sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } static void __init sort_amode31_extable(void) @@ -288,24 +222,18 @@ static void __init sort_amode31_extable(void) void __init startup_init(void) { - sclp_early_adjust_va(); - reset_tod_clock(); - check_image_bootable(); + kasan_early_init(); time_early_init(); init_kernel_storage_key(); lockdep_off(); sort_amode31_extable(); setup_lowcore_early(); - setup_facility_list(); - detect_machine_type(); setup_arch_string(); setup_boot_command_line(); - detect_diag9c(); - detect_machine_facilities(); save_vector_registers(); setup_topology(); sclp_early_detect(); - setup_control_registers(); + setup_low_address_protection(); setup_access_registers(); lockdep_on(); } diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index d9d53f44008a..cefe020a3be3 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <linux/console.h> #include <linux/kernel.h> #include <linux/init.h> +#include <asm/setup.h> #include <asm/sclp.h> static void sclp_early_write(struct console *con, const char *s, unsigned int len) @@ -20,6 +21,16 @@ static struct console sclp_early_console = { .index = -1, }; +void __init register_early_console(void) +{ + if (early_console) + return; + if (!sclp.has_linemode && !sclp.has_vt220) + return; + early_console = &sclp_early_console; + register_console(early_console); +} + static int __init setup_early_printk(char *buf) { if (early_console) @@ -27,10 +38,7 @@ static int __init setup_early_printk(char *buf) /* Accept only "earlyprintk" and "earlyprintk=sclp" */ if (buf && !str_has_prefix(buf, "sclp")) return 0; - if (!sclp.has_linemode && !sclp.has_vt220) - return 0; - early_console = &sclp_early_console; - register_console(early_console); + register_early_console(); return 0; } early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S deleted file mode 100644 index f521c6da37b8..000000000000 --- a/arch/s390/kernel/earlypgm.S +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 2006, 2007 - * Author(s): Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -ENTRY(early_pgm_check_handler) - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) - la %r11,STACK_FRAME_OVERHEAD(%r15) - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - lgr %r2,%r11 - brasl %r14,__do_early_pgm_check - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - lpswe __LC_RETURN_PSW -ENDPROC(early_pgm_check_handler) diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c index 7f8246c9be08..0e51fa537262 100644 --- a/arch/s390/kernel/ebcdic.c +++ b/arch/s390/kernel/ebcdic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * ECBDIC -> ASCII, ASCII -> ECBDIC, + * EBCDIC -> ASCII, ASCII -> EBCDIC, * upper to lower case (EBCDIC) conversion tables. * * S390 version diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index d2a1f2f4f5b8..0f00f4b06d51 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -8,10 +8,11 @@ * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), */ +#include <linux/export.h> #include <linux/init.h> #include <linux/linkage.h> #include <asm/asm-extable.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/processor.h> #include <asm/cache.h> #include <asm/dwarf.h> @@ -23,62 +24,51 @@ #include <asm/page.h> #include <asm/sigp.h> #include <asm/irq.h> -#include <asm/vx-insn.h> +#include <asm/fpu-insn.h> #include <asm/setup.h> #include <asm/nmi.h> -#include <asm/export.h> #include <asm/nospec-insn.h> - -STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER -STACK_SIZE = 1 << STACK_SHIFT -STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE +#include <asm/lowcore.h> +#include <asm/machine.h> _LPP_OFFSET = __LC_LPP .macro STBEAR address - ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", ALT_FACILITY(193) .endm .macro LBEAR address - ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 - .endm - - .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193) .endm - .macro MBEAR reg - ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + .macro LPSWEY address, lpswe + ALTERNATIVE_2 "b \lpswe;nopr", \ + ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193), \ + __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ + ALT_FEATURE(MFEATURE_LOWCORE) .endm - .macro CHECK_STACK savearea -#ifdef CONFIG_CHECK_STACK - tml %r15,STACK_SIZE - CONFIG_STACK_GUARD - lghi %r14,\savearea - jz stack_overflow -#endif + .macro MBEAR reg, lowcore + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK(\lowcore)),\ + ALT_FACILITY(193) .endm - .macro CHECK_VMAP_STACK savearea,oklabel -#ifdef CONFIG_VMAP_STACK + .macro CHECK_VMAP_STACK savearea, lowcore, oklabel lgr %r14,%r15 - nill %r14,0x10000 - STACK_SIZE - oill %r14,STACK_INIT - clg %r14,__LC_KERNEL_STACK + nill %r14,0x10000 - THREAD_SIZE + oill %r14,STACK_INIT_OFFSET + clg %r14,__LC_KERNEL_STACK(\lowcore) je \oklabel - clg %r14,__LC_ASYNC_STACK + clg %r14,__LC_ASYNC_STACK(\lowcore) je \oklabel - clg %r14,__LC_MCCK_STACK + clg %r14,__LC_MCCK_STACK(\lowcore) je \oklabel - clg %r14,__LC_NODAT_STACK + clg %r14,__LC_NODAT_STACK(\lowcore) je \oklabel - clg %r14,__LC_RESTART_STACK + clg %r14,__LC_RESTART_STACK(\lowcore) je \oklabel - lghi %r14,\savearea - j stack_overflow -#else - j \oklabel -#endif + la %r14,\savearea(\lowcore) + j stack_invalid .endm /* @@ -104,151 +94,108 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_SPEC(82) .endm .macro BPON - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm .macro BPENTER tif_ptr,tif_mask ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ - "j .+12; nop; nop", 82 + "j .+12; nop; nop", ALT_SPEC(82) .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ - "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 - .endm - - /* - * The CHKSTG macro jumps to the provided label in case the - * machine check interruption code reports one of unrecoverable - * storage errors: - * - Storage error uncorrected - * - Storage key error uncorrected - * - Storage degradation with Failing-storage-address validity - */ - .macro CHKSTG errlabel - TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR) - jnz \errlabel - TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD - jz .Loklabel\@ - TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR - jnz \errlabel -.Loklabel\@: + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm #if IS_ENABLED(CONFIG_KVM) - /* - * The OUTSIDE macro jumps to the provided label in case the value - * in the provided register is outside of the provided range. The - * macro is useful for checking whether a PSW stored in a register - * pair points inside or outside of a block of instructions. - * @reg: register to check - * @start: start of the range - * @end: end of the range - * @outside_label: jump here if @reg is outside of [@start..@end) - */ - .macro OUTSIDE reg,start,end,outside_label - lgr %r14,\reg - larl %r13,\start - slgr %r14,%r13 -#ifdef CONFIG_AS_IS_LLVM - clgfrl %r14,.Lrange_size\@ -#else - clgfi %r14,\end - \start -#endif - jhe \outside_label -#ifdef CONFIG_AS_IS_LLVM - .section .rodata, "a" - .align 4 -.Lrange_size\@: - .long \end - \start - .previous -#endif - .endm - - .macro SIEEXIT - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + .macro SIEEXIT sie_control,lowcore + lg %r9,\sie_control # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE(\lowcore) # load primary asce + lg %r9,__LC_CURRENT(\lowcore) + mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit .endm #endif + .macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + brasl %r14,stackleak_erase_on_task_stack +#endif + .endm + GEN_BR_THUNK %r14 .section .kprobes.text, "ax" .Ldummy: /* - * This nop exists only in order to avoid that __bpon starts at - * the beginning of the kprobes text section. In that case we would - * have several symbols at the same address. E.g. objdump would take - * an arbitrary symbol name when disassembling this code. - * With the added nop in between the __bpon symbol is unique - * again. + * The following nop exists only in order to avoid that the next + * symbol starts at the beginning of the kprobes text section. + * In that case there would be several symbols at the same address. + * E.g. objdump would take an arbitrary symbol when disassembling + * the code. + * With the added nop in between this cannot happen. */ nop 0 -ENTRY(__bpon) - .globl __bpon - BPON - BR_EX %r14 -ENDPROC(__bpon) - /* - * Scheduler resume function, called by switch_to - * gpr2 = (task_struct *) prev - * gpr3 = (task_struct *) next + * Scheduler resume function, called by __switch_to + * gpr2 = (task_struct *)prev + * gpr3 = (task_struct *)next * Returns: * gpr2 = prev */ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to_asm) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lghi %r4,__TASK_stack lghi %r1,__TASK_thread - llill %r5,STACK_INIT + llill %r5,STACK_INIT_OFFSET stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next - stg %r3,__LC_CURRENT # store task struct of next - stg %r15,__LC_KERNEL_STACK # store end of kernel stack + GET_LC %r13 + stg %r3,__LC_CURRENT(%r13) # store task struct of next + stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + mvc __LC_CURRENT_PID(4,%r13),0(%r3) # store pid of next + ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 -ENDPROC(__switch_to) +SYM_FUNC_END(__switch_to_asm) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area + * %r5 guest asce */ -ENTRY(sie64a) +SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + GET_LC %r13 + lg %r14,__LC_CURRENT(%r13) + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 - mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 - lg %r14,__LC_GMAP # get gmap pointer - ltgr %r14,%r14 - jz .Lsie_gmap - lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce -.Lsie_gmap: + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 + mvi __TI_sie(%r14),1 + lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now tm __SIE_PROG20+3(%r14),3 # last exit... jnz .Lsie_skip - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsie_skip # exit if fp/vx regs changed - BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr + BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_entry: sie 0(%r14) # Let the next instruction be NOP to avoid triggering a machine check @@ -256,24 +203,15 @@ ENTRY(sie64a) nopr 7 .Lsie_leave: BPOFF - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce -.Lsie_done: -# some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There -# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program -# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -.Lrewind_pad6: - nopr 7 -.Lrewind_pad4: - nopr 7 -.Lrewind_pad2: - nopr 7 - .globl sie_exit -sie_exit: + GET_LC %r14 + lctlg %c1,%c1,__LC_USER_ASCE(%r14) # load primary asce + lg %r14,__LC_CURRENT(%r14) + mvi __TI_sie(%r14),0 +SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to @@ -284,17 +222,8 @@ sie_exit: lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers lg %r2,__SF_SIE_REASON(%r15) # return exit reason code BR_EX %r14 -.Lsie_fault: - lghi %r14,-EFAULT - stg %r14,__SF_SIE_REASON(%r15) # set exit reason code - j sie_exit - - EX_TABLE(.Lrewind_pad6,.Lsie_fault) - EX_TABLE(.Lrewind_pad4,.Lsie_fault) - EX_TABLE(.Lrewind_pad2,.Lsie_fault) - EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +SYM_FUNC_END(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -303,19 +232,17 @@ EXPORT_SYMBOL(sie_exit) * are entered with interrupts disabled. */ -ENTRY(system_call) - stpt __LC_SYS_ENTER_TIMER - stmg %r8,%r15,__LC_SAVE_AREA_SYNC +SYM_CODE_START(system_call) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF lghi %r14,0 .Lsysc_per: - STBEAR __LC_LAST_BREAK - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r12,__LC_CURRENT - lg %r15,__LC_KERNEL_STACK + STBEAR __LC_LAST_BREAK(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP # clear user controlled register to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -328,74 +255,71 @@ ENTRY(system_call) xgr %r10,%r10 xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC - MBEAR %r2 + mvc __PT_R8(64,%r2),__LC_SAVE_AREA(%r13) + MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + STACKLEAK_ERASE + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE -ENDPROC(system_call) +SYM_CODE_END(system_call) # # a new process exits the kernel with ret_from_fork # -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) lgr %r3,%r11 brasl %r14,__ret_from_fork - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + STACKLEAK_ERASE + GET_LC %r13 + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE -ENDPROC(ret_from_fork) +SYM_CODE_END(ret_from_fork) /* * Program check handler routine */ -ENTRY(pgm_check_handler) - stpt __LC_SYS_ENTER_TIMER +SYM_CODE_START(pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lg %r12,__LC_CURRENT - lghi %r10,0 - lmg %r8,%r9,__LC_PGM_OLD_PSW + lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) + xgr %r10,%r10 tmhh %r8,0x0001 # coming from user space? - jno .Lpgm_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE - j 3f # -> fault in user space -.Lpgm_skip_asce: + jo 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f - SIEEXIT + lg %r11,__LC_CURRENT(%r13) + tm __TI_sie(%r11),0xff + jz 1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 lghi %r10,_PIF_GUEST_FAULT #endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 2f # -> enabled, can't be a double fault - tm __LC_PGM_ILC+3,0x80 # check for per exception + tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -3: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lg %r15,__LC_KERNEL_STACK +2: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + # CHECK_VMAP_STACK branches to stack_invalid or 4f + CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f +3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) stg %r10,__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13) stmg %r8,%r9,__PT_PSW(%r11) - # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -404,15 +328,16 @@ ENTRY(pgm_check_handler) xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 + xgr %r12,%r12 lgr %r2,%r11 brasl %r14,__do_pgm_check tmhh %r8,0x0001 # returning to user space? jno .Lpgm_exit_kernel - lctlg %c1,%c1,__LC_USER_ASCE - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER + STACKLEAK_ERASE + BPON + stpt __LC_EXIT_TIMER(%r13) .Lpgm_exit_kernel: - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE @@ -421,39 +346,38 @@ ENTRY(pgm_check_handler) # single stepped system call # .Lpgm_svcper: - mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + mvc __LC_RETURN_PSW(8,%r13),__LC_SVC_NEW_PSW(%r13) larl %r14,.Lsysc_per - stg %r14,__LC_RETURN_PSW+8 + stg %r14,__LC_RETURN_PSW+8(%r13) lghi %r14,1 - LBEAR __LC_PGM_LAST_BREAK + LBEAR __LC_PGM_LAST_BREAK(%r13) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per -ENDPROC(pgm_check_handler) +SYM_CODE_END(pgm_check_handler) /* * Interrupt handler macro used for external and IO interrupts. */ .macro INT_HANDLER name,lc_old_psw,handler -ENTRY(\name) - stckf __LC_INT_CLOCK - stpt __LC_SYS_ENTER_TIMER - STBEAR __LC_LAST_BREAK +SYM_CODE_START(\name) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stckf __LC_INT_CLOCK(%r13) + stpt __LC_SYS_ENTER_TIMER(%r13) + STBEAR __LC_LAST_BREAK(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - lmg %r8,%r9,\lc_old_psw + lmg %r8,%r9,\lc_old_psw(%r13) tmhh %r8,0x0001 # interrupting from user ? jnz 1f #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - SIEEXIT + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff + jz 0f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) +0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f -1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK +1: lg %r15,__LC_KERNEL_STACK(%r13) 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -467,114 +391,85 @@ ENTRY(\name) xgr %r7,%r7 xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - MBEAR %r11 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + MBEAR %r11,%r13 stmg %r8,%r9,__PT_PSW(%r11) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + mvc __LC_RETURN_PSW(16,%r13),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? jno 2f - lctlg %c1,%c1,__LC_USER_ASCE - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER + STACKLEAK_ERASE + BPON + stpt __LC_EXIT_TIMER(%r13) 2: LBEAR __PT_LAST_BREAK(%r11) lmg %r0,%r15,__PT_R0(%r11) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE -ENDPROC(\name) +SYM_CODE_END(\name) .endm + .section .irqentry.text, "ax" + INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq -/* - * Load idle PSW. - */ -ENTRY(psw_idle) - stg %r14,(__SF_GPRS+8*8)(%r15) - stg %r3,__SF_EMPTY(%r15) - larl %r1,psw_idle_exit - stg %r1,__SF_EMPTY+8(%r15) - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) -.Lpsw_idle_stcctm: - oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT - BPON - stckf __CLOCK_IDLE_ENTER(%r2) - stpt __TIMER_IDLE_ENTER(%r2) - lpswe __SF_EMPTY(%r15) -.globl psw_idle_exit -psw_idle_exit: - BR_EX %r14 -ENDPROC(psw_idle) + .section .kprobes.text, "ax" /* * Machine check handler routines */ -ENTRY(mcck_int_handler) - stckf __LC_MCCK_CLOCK +SYM_CODE_START(mcck_int_handler) BPOFF - la %r1,4095 # validate r1 - spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs - lg %r12,__LC_CURRENT - lmg %r8,%r9,__LC_MCK_OLD_PSW - TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + GET_LC %r13 + lmg %r8,%r9,__LC_MCK_OLD_PSW(%r13) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic - la %r14,4095 - lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lghi %r14,__LC_CPU_TIMER_SAVE_AREA - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + lay %r14,__LC_CPU_TIMER_SAVE_AREA(%r13) + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYS_ENTER_TIMER - clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER(%r13) + clc 0(8,%r14),__LC_EXIT_TIMER(%r13) jl 1f - la %r14,__LC_EXIT_TIMER -1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + la %r14,__LC_EXIT_TIMER(%r13) +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER(%r13) jl 2f - la %r14,__LC_LAST_UPDATE_TIMER + la %r14,__LC_LAST_UPDATE_TIMER(%r13) 2: spt 0(%r14) - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) -3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) +3: TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? - jnz 6f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + jnz .Lmcck_user + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f - OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST - j 5f -4: CHKSTG .Lmcck_panic -5: larl %r14,.Lstosm_tmp - stosm 0(%r14),0x04 # turn dat on, keep irqs off - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - SIEEXIT - j .Lmcck_stack + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff + jz .Lmcck_user + # Need to compare the address instead of __TI_SIE flag. + # Otherwise there would be a race between setting the flag + # and entering SIE (or leaving and clearing the flag). This + # would cause machine checks targeted at the guest to be + # handled by the host. + larl %r14,.Lsie_entry + clgrjl %r9,%r14, 4f + larl %r14,.Lsie_leave + clgrjhe %r9,%r14, 4f + lg %r10,__LC_PCPU(%r13) + oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST +4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -6: CHKSTG .Lmcck_panic - larl %r14,.Lstosm_tmp - stosm 0(%r14),0x04 # turn dat on, keep irqs off - tmhh %r8,0x0001 # interrupting from user ? - jz .Lmcck_stack - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -.Lmcck_stack: - lg %r15,__LC_MCCK_STACK +.Lmcck_user: + lg %r15,__LC_MCCK_STACK(%r13) la %r11,STACK_FRAME_OVERHEAD(%r15) - stctg %c1,%c1,__PT_CR1(%r11) - lctlg %c1,%c1,__LC_KERNEL_ASCE xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lghi %r14,__LC_GPREGS_SAVE_AREA+64 - stmg %r0,%r7,__PT_R0(%r11) + lay %r14,__LC_GPREGS_SAVE_AREA(%r13) + mvc __PT_R0(128,%r11),0(%r14) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -584,30 +479,19 @@ ENTRY(mcck_int_handler) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 - mvc __PT_R8(64,%r11),0(%r14) stmg %r8,%r9,__PT_PSW(%r11) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - cghi %r2,0 - je .Lmcck_return - lg %r1,__LC_KERNEL_STACK # switch to kernel stack - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r2,%r11 - lgr %r15,%r1 - brasl %r14,s390_handle_mcck -.Lmcck_return: - lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW - tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ? jno 0f - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER -0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + BPON + stpt __LC_EXIT_TIMER(%r13) +0: ALTERNATIVE "brcl 0,0", __stringify(lay %r12,__LC_LAST_BREAK_SAVE_AREA(%r13)),\ + ALT_FACILITY(193) LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE @@ -621,10 +505,10 @@ ENTRY(mcck_int_handler) */ lhi %r5,0 lhi %r6,1 - larl %r7,.Lstop_lock + larl %r7,stop_lock cs %r5,%r6,0(%r7) # single CPU-stopper only jnz 4f - larl %r7,.Lthis_cpu + larl %r7,this_cpu stap 0(%r7) # this CPU address lh %r4,0(%r7) nilh %r4,0 @@ -640,26 +524,28 @@ ENTRY(mcck_int_handler) 3: sigp %r1,%r4,SIGP_STOP # stop this CPU brc SIGP_CC_BUSY,3b 4: j 4b -ENDPROC(mcck_int_handler) +SYM_CODE_END(mcck_int_handler) -ENTRY(restart_int_handler) - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 +SYM_CODE_START(restart_int_handler) + ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) stg %r15,__LC_SAVE_AREA_RESTART TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 jz 0f - la %r15,4095 - lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r15) -0: larl %r15,.Lstosm_tmp - stosm 0(%r15),0x04 # turn dat on, keep irqs off - lg %r15,__LC_RESTART_STACK + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA +0: larl %r15,daton_psw + lpswe 0(%r15) # turn dat on, keep irqs off +.Ldaton: + GET_LC %r15 + lg %r15,__LC_RESTART_STACK(%r15) xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART - mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW + GET_LC %r13 + mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART(%r13) + mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW(%r13) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) - lg %r1,__LC_RESTART_FN # load fn, parm & source cpu - lg %r2,__LC_RESTART_DATA - lgf %r3,__LC_RESTART_SOURCE + lg %r1,__LC_RESTART_FN(%r13) # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA(%r13) + lgf %r3,__LC_RESTART_SOURCE(%r13) ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu @@ -670,46 +556,70 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b -ENDPROC(restart_int_handler) +SYM_CODE_END(restart_int_handler) + + __INIT +SYM_CODE_START(early_pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(early_pgm_check_handler) + __FINIT .section .kprobes.text, "ax" -#if defined(CONFIG_CHECK_STACK) || defined(CONFIG_VMAP_STACK) /* - * The synchronous or the asynchronous stack overflowed. We are dead. + * The synchronous or the asynchronous stack pointer is invalid. We are dead. * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -ENTRY(stack_overflow) - lg %r15,__LC_NODAT_STACK # change to panic stack +SYM_CODE_START(stack_invalid) + GET_LC %r15 + lg %r15,__LC_NODAT_STACK(%r15) # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_R8(64,%r11),0(%r14) - stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + GET_LC %r2 + mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs - jg kernel_stack_overflow -ENDPROC(stack_overflow) -#endif + jg kernel_stack_invalid +SYM_CODE_END(stack_invalid) .section .data, "aw" - .align 4 -.Lstop_lock: .long 0 -.Lthis_cpu: .short 0 -.Lstosm_tmp: .byte 0 + .balign 4 +SYM_DATA_LOCAL(stop_lock, .long 0) +SYM_DATA_LOCAL(this_cpu, .short 0) + .balign 8 +SYM_DATA_START_LOCAL(daton_psw) + .quad PSW_KERNEL_BITS + .quad .Ldaton +SYM_DATA_END(daton_psw) + .section .rodata, "a" + .balign 8 #define SYSCALL(esame,emu) .quad __s390x_ ## esame - .globl sys_call_table -sys_call_table: -#include "asm/syscall_table.h" +SYM_DATA_START(sys_call_table) +#include <asm/syscall_table.h> +SYM_DATA_END(sys_call_table) #undef SYSCALL #ifdef CONFIG_COMPAT #define SYSCALL(esame,emu) .quad __s390_ ## emu - .globl sys_call_table_emu -sys_call_table_emu: -#include "asm/syscall_table.h" +SYM_DATA_START(sys_call_table_emu) +#include <asm/syscall_table.h> +SYM_DATA_END(sys_call_table_emu) #undef SYSCALL #endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 995ec7449feb..dd55cc6bbc28 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -19,6 +19,7 @@ void mcck_int_handler(void); void restart_int_handler(void); void early_pgm_check_handler(void); +struct task_struct *__switch_to_asm(struct task_struct *prev, struct task_struct *next); void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); void __do_pgm_check(struct pt_regs *regs); void __do_syscall(struct pt_regs *regs, int per_trap); @@ -30,19 +31,16 @@ void do_secure_storage_access(struct pt_regs *regs); void do_non_secure_storage_access(struct pt_regs *regs); void do_secure_storage_violation(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void kernel_stack_overflow(struct pt_regs * regs); +void kernel_stack_invalid(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs); -void __init init_IRQ(void); void do_io_irq(struct pt_regs *regs); void do_ext_irq(struct pt_regs *regs); void do_restart(void *arg); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); -void __init time_init(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; @@ -73,6 +71,5 @@ extern struct exception_table_entry _stop_amode31_ex_table[]; #define __amode31_data __section(".amode31.data") #define __amode31_ref __section(".amode31.refs") extern long _start_amode31_refs[], _end_amode31_refs[]; -extern unsigned long __amode31_base; #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c new file mode 100644 index 000000000000..f02127219a27 --- /dev/null +++ b/arch/s390/kernel/facility.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2023 + */ + +#include <asm/facility.h> + +unsigned int stfle_size(void) +{ + static unsigned int size; + unsigned int r; + u64 dummy; + + r = READ_ONCE(size); + if (!r) { + r = __stfle_asm(&dummy, 1) + 1; + WRITE_ONCE(size, r); + } + return r; +} +EXPORT_SYMBOL(stfle_size); diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index d864c9a325e2..6f2e87920288 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -8,258 +8,186 @@ #include <linux/kernel.h> #include <linux/cpu.h> #include <linux/sched.h> -#include <asm/fpu/types.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> -asm(".include \"asm/vx-insn.h\"\n"); - -void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +void __kernel_fpu_begin(struct kernel_fpu *state, int flags) { + __vector128 *vxrs = state->vxrs; + int mask; + /* * Limit the save to the FPU/vector registers already - * in use by the previous context + * in use by the previous context. */ - flags &= state->mask; - + flags &= state->hdr.mask; if (flags & KERNEL_FPC) - /* Save floating point control */ - asm volatile("stfpc %0" : "=Q" (state->fpc)); - - if (!MACHINE_HAS_VX) { - if (flags & KERNEL_VXR_V0V7) { - /* Save floating-point registers */ - asm volatile("std 0,%0" : "=Q" (state->fprs[0])); - asm volatile("std 1,%0" : "=Q" (state->fprs[1])); - asm volatile("std 2,%0" : "=Q" (state->fprs[2])); - asm volatile("std 3,%0" : "=Q" (state->fprs[3])); - asm volatile("std 4,%0" : "=Q" (state->fprs[4])); - asm volatile("std 5,%0" : "=Q" (state->fprs[5])); - asm volatile("std 6,%0" : "=Q" (state->fprs[6])); - asm volatile("std 7,%0" : "=Q" (state->fprs[7])); - asm volatile("std 8,%0" : "=Q" (state->fprs[8])); - asm volatile("std 9,%0" : "=Q" (state->fprs[9])); - asm volatile("std 10,%0" : "=Q" (state->fprs[10])); - asm volatile("std 11,%0" : "=Q" (state->fprs[11])); - asm volatile("std 12,%0" : "=Q" (state->fprs[12])); - asm volatile("std 13,%0" : "=Q" (state->fprs[13])); - asm volatile("std 14,%0" : "=Q" (state->fprs[14])); - asm volatile("std 15,%0" : "=Q" (state->fprs[15])); - } + fpu_stfpc(&state->hdr.fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + save_fp_regs_vx(vxrs); return; } - - /* Test and save vector registers */ - asm volatile ( - /* - * Test if any vector register must be saved and, if so, - * test if all register can be saved. - */ - " la 1,%[vxrs]\n" /* load save area */ - " tmll %[m],30\n" /* KERNEL_VXR */ - " jz 7f\n" /* no work -> done */ - " jo 5f\n" /* -> save V0..V31 */ - /* - * Test for special case KERNEL_FPU_MID only. In this - * case a vstm V8..V23 is the best instruction - */ - " chi %[m],12\n" /* KERNEL_VXR_MID */ - " jne 0f\n" /* -> save V8..V23 */ - " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */ - " j 7f\n" - /* Test and save the first half of 16 vector registers */ - "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ - " jz 3f\n" /* -> KERNEL_VXR_HIGH */ - " jo 2f\n" /* 11 -> save V0..V15 */ - " brc 2,1f\n" /* 10 -> save V8..V15 */ - " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */ - " j 3f\n" - "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */ - " j 3f\n" - "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ - /* Test and save the second half of 16 vector registers */ - "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ - " jz 7f\n" - " jo 6f\n" /* 11 -> save V16..V31 */ - " brc 2,4f\n" /* 10 -> save V24..V31 */ - " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */ - " j 7f\n" - "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */ - " j 7f\n" - "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ - "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */ - "7:" - : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) - : [m] "d" (flags) - : "1", "cc"); + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + vxrs += fpu_vstm(0, 15, vxrs); + vxrs += fpu_vstm(16, 31, vxrs); + return; + } + if (mask == KERNEL_VXR_MID) { + vxrs += fpu_vstm(8, 23, vxrs); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + vxrs += fpu_vstm(0, 15, vxrs); + else if (mask == KERNEL_VXR_V0V7) + vxrs += fpu_vstm(0, 7, vxrs); + else + vxrs += fpu_vstm(8, 15, vxrs); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + vxrs += fpu_vstm(16, 31, vxrs); + else if (mask == KERNEL_VXR_V16V23) + vxrs += fpu_vstm(16, 23, vxrs); + else + vxrs += fpu_vstm(24, 31, vxrs); + } } EXPORT_SYMBOL(__kernel_fpu_begin); -void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) +void __kernel_fpu_end(struct kernel_fpu *state, int flags) { + __vector128 *vxrs = state->vxrs; + int mask; + /* * Limit the restore to the FPU/vector registers of the - * previous context that have been overwritte by the - * current context + * previous context that have been overwritten by the + * current context. */ - flags &= state->mask; - + flags &= state->hdr.mask; if (flags & KERNEL_FPC) - /* Restore floating-point controls */ - asm volatile("lfpc %0" : : "Q" (state->fpc)); - - if (!MACHINE_HAS_VX) { - if (flags & KERNEL_VXR_V0V7) { - /* Restore floating-point registers */ - asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); - asm volatile("ld 1,%0" : : "Q" (state->fprs[1])); - asm volatile("ld 2,%0" : : "Q" (state->fprs[2])); - asm volatile("ld 3,%0" : : "Q" (state->fprs[3])); - asm volatile("ld 4,%0" : : "Q" (state->fprs[4])); - asm volatile("ld 5,%0" : : "Q" (state->fprs[5])); - asm volatile("ld 6,%0" : : "Q" (state->fprs[6])); - asm volatile("ld 7,%0" : : "Q" (state->fprs[7])); - asm volatile("ld 8,%0" : : "Q" (state->fprs[8])); - asm volatile("ld 9,%0" : : "Q" (state->fprs[9])); - asm volatile("ld 10,%0" : : "Q" (state->fprs[10])); - asm volatile("ld 11,%0" : : "Q" (state->fprs[11])); - asm volatile("ld 12,%0" : : "Q" (state->fprs[12])); - asm volatile("ld 13,%0" : : "Q" (state->fprs[13])); - asm volatile("ld 14,%0" : : "Q" (state->fprs[14])); - asm volatile("ld 15,%0" : : "Q" (state->fprs[15])); - } + fpu_lfpc(&state->hdr.fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + load_fp_regs_vx(vxrs); return; } - - /* Test and restore (load) vector registers */ - asm volatile ( - /* - * Test if any vector register must be loaded and, if so, - * test if all registers can be loaded at once. - */ - " la 1,%[vxrs]\n" /* load restore area */ - " tmll %[m],30\n" /* KERNEL_VXR */ - " jz 7f\n" /* no work -> done */ - " jo 5f\n" /* -> restore V0..V31 */ - /* - * Test for special case KERNEL_FPU_MID only. In this - * case a vlm V8..V23 is the best instruction - */ - " chi %[m],12\n" /* KERNEL_VXR_MID */ - " jne 0f\n" /* -> restore V8..V23 */ - " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */ - " j 7f\n" - /* Test and restore the first half of 16 vector registers */ - "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ - " jz 3f\n" /* -> KERNEL_VXR_HIGH */ - " jo 2f\n" /* 11 -> restore V0..V15 */ - " brc 2,1f\n" /* 10 -> restore V8..V15 */ - " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */ - " j 3f\n" - "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */ - " j 3f\n" - "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ - /* Test and restore the second half of 16 vector registers */ - "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ - " jz 7f\n" - " jo 6f\n" /* 11 -> restore V16..V31 */ - " brc 2,4f\n" /* 10 -> restore V24..V31 */ - " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */ - " j 7f\n" - "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */ - " j 7f\n" - "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ - "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */ - "7:" - : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) - : [m] "d" (flags) - : "1", "cc"); + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + vxrs += fpu_vlm(0, 15, vxrs); + vxrs += fpu_vlm(16, 31, vxrs); + return; + } + if (mask == KERNEL_VXR_MID) { + vxrs += fpu_vlm(8, 23, vxrs); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + vxrs += fpu_vlm(0, 15, vxrs); + else if (mask == KERNEL_VXR_V0V7) + vxrs += fpu_vlm(0, 7, vxrs); + else + vxrs += fpu_vlm(8, 15, vxrs); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + vxrs += fpu_vlm(16, 31, vxrs); + else if (mask == KERNEL_VXR_V16V23) + vxrs += fpu_vlm(16, 23, vxrs); + else + vxrs += fpu_vlm(24, 31, vxrs); + } } EXPORT_SYMBOL(__kernel_fpu_end); -void __load_fpu_regs(void) +void load_fpu_state(struct fpu *state, int flags) { - struct fpu *state = ¤t->thread.fpu; - unsigned long *regs = current->thread.fpu.regs; + __vector128 *vxrs = &state->vxrs[0]; + int mask; - asm volatile("lfpc %0" : : "Q" (state->fpc)); - if (likely(MACHINE_HAS_VX)) { - asm volatile("lgr 1,%0\n" - "VLM 0,15,0,1\n" - "VLM 16,31,256,1\n" - : - : "d" (regs) - : "1", "cc", "memory"); - } else { - asm volatile("ld 0,%0" : : "Q" (regs[0])); - asm volatile("ld 1,%0" : : "Q" (regs[1])); - asm volatile("ld 2,%0" : : "Q" (regs[2])); - asm volatile("ld 3,%0" : : "Q" (regs[3])); - asm volatile("ld 4,%0" : : "Q" (regs[4])); - asm volatile("ld 5,%0" : : "Q" (regs[5])); - asm volatile("ld 6,%0" : : "Q" (regs[6])); - asm volatile("ld 7,%0" : : "Q" (regs[7])); - asm volatile("ld 8,%0" : : "Q" (regs[8])); - asm volatile("ld 9,%0" : : "Q" (regs[9])); - asm volatile("ld 10,%0" : : "Q" (regs[10])); - asm volatile("ld 11,%0" : : "Q" (regs[11])); - asm volatile("ld 12,%0" : : "Q" (regs[12])); - asm volatile("ld 13,%0" : : "Q" (regs[13])); - asm volatile("ld 14,%0" : : "Q" (regs[14])); - asm volatile("ld 15,%0" : : "Q" (regs[15])); + if (flags & KERNEL_FPC) + fpu_lfpc_safe(&state->fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_V0V7) + load_fp_regs_vx(state->vxrs); + return; + } + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + fpu_vlm(0, 15, &vxrs[0]); + fpu_vlm(16, 31, &vxrs[16]); + return; + } + if (mask == KERNEL_VXR_MID) { + fpu_vlm(8, 23, &vxrs[8]); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + fpu_vlm(0, 15, &vxrs[0]); + else if (mask == KERNEL_VXR_V0V7) + fpu_vlm(0, 7, &vxrs[0]); + else + fpu_vlm(8, 15, &vxrs[8]); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + fpu_vlm(16, 31, &vxrs[16]); + else if (mask == KERNEL_VXR_V16V23) + fpu_vlm(16, 23, &vxrs[16]); + else + fpu_vlm(24, 31, &vxrs[24]); } - clear_cpu_flag(CIF_FPU); -} -EXPORT_SYMBOL(__load_fpu_regs); - -void load_fpu_regs(void) -{ - raw_local_irq_disable(); - __load_fpu_regs(); - raw_local_irq_enable(); } -EXPORT_SYMBOL(load_fpu_regs); -void save_fpu_regs(void) +void save_fpu_state(struct fpu *state, int flags) { - unsigned long flags, *regs; - struct fpu *state; - - local_irq_save(flags); + __vector128 *vxrs = &state->vxrs[0]; + int mask; - if (test_cpu_flag(CIF_FPU)) - goto out; - - state = ¤t->thread.fpu; - regs = current->thread.fpu.regs; - - asm volatile("stfpc %0" : "=Q" (state->fpc)); - if (likely(MACHINE_HAS_VX)) { - asm volatile("lgr 1,%0\n" - "VSTM 0,15,0,1\n" - "VSTM 16,31,256,1\n" - : - : "d" (regs) - : "1", "cc", "memory"); - } else { - asm volatile("std 0,%0" : "=Q" (regs[0])); - asm volatile("std 1,%0" : "=Q" (regs[1])); - asm volatile("std 2,%0" : "=Q" (regs[2])); - asm volatile("std 3,%0" : "=Q" (regs[3])); - asm volatile("std 4,%0" : "=Q" (regs[4])); - asm volatile("std 5,%0" : "=Q" (regs[5])); - asm volatile("std 6,%0" : "=Q" (regs[6])); - asm volatile("std 7,%0" : "=Q" (regs[7])); - asm volatile("std 8,%0" : "=Q" (regs[8])); - asm volatile("std 9,%0" : "=Q" (regs[9])); - asm volatile("std 10,%0" : "=Q" (regs[10])); - asm volatile("std 11,%0" : "=Q" (regs[11])); - asm volatile("std 12,%0" : "=Q" (regs[12])); - asm volatile("std 13,%0" : "=Q" (regs[13])); - asm volatile("std 14,%0" : "=Q" (regs[14])); - asm volatile("std 15,%0" : "=Q" (regs[15])); + if (flags & KERNEL_FPC) + fpu_stfpc(&state->fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + save_fp_regs_vx(state->vxrs); + return; + } + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + fpu_vstm(0, 15, &vxrs[0]); + fpu_vstm(16, 31, &vxrs[16]); + return; + } + if (mask == KERNEL_VXR_MID) { + fpu_vstm(8, 23, &vxrs[8]); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + fpu_vstm(0, 15, &vxrs[0]); + else if (mask == KERNEL_VXR_V0V7) + fpu_vstm(0, 7, &vxrs[0]); + else + fpu_vstm(8, 15, &vxrs[8]); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + fpu_vstm(16, 31, &vxrs[16]); + else if (mask == KERNEL_VXR_V16V23) + fpu_vstm(16, 23, &vxrs[16]); + else + fpu_vstm(24, 31, &vxrs[24]); } - set_cpu_flag(CIF_FPU); -out: - local_irq_restore(flags); } -EXPORT_SYMBOL(save_fpu_regs); +EXPORT_SYMBOL(save_fpu_state); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 416b5a94353d..e94bb98f5231 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,15 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/moduleloader.h> #include <linux/hardirq.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/kmsan-checks.h> +#include <linux/cpufeature.h> #include <linux/kprobes.h> +#include <linux/execmem.h> #include <trace/syscall.h> #include <asm/asm-offsets.h> #include <asm/text-patching.h> @@ -49,30 +51,6 @@ struct ftrace_insn { s32 disp; } __packed; -asm( - " .align 16\n" - "ftrace_shared_hotpatch_trampoline_br:\n" - " lmg %r0,%r1,2(%r1)\n" - " br %r1\n" - "ftrace_shared_hotpatch_trampoline_br_end:\n" -); - -#ifdef CONFIG_EXPOLINE -asm( - " .align 16\n" - "ftrace_shared_hotpatch_trampoline_exrl:\n" - " lmg %r0,%r1,2(%r1)\n" - " exrl %r0,0f\n" - " j .\n" - "0: br %r1\n" - "ftrace_shared_hotpatch_trampoline_exrl_end:\n" -); -#endif /* CONFIG_EXPOLINE */ - -#ifdef CONFIG_MODULES -static char *ftrace_plt; -#endif /* CONFIG_MODULES */ - static const char *ftrace_shared_hotpatch_trampoline(const char **end) { const char *tstart, *tend; @@ -92,19 +70,20 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - return true; + return !cpu_has_seq_insn(); } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = __ftrace_hotpatch_trampolines_start; - static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static const struct ftrace_insn orig = { .opc = 0xc004, .disp = 0 }; static struct ftrace_hotpatch_trampoline *trampoline; struct ftrace_hotpatch_trampoline **next_trampoline; struct ftrace_hotpatch_trampoline *trampolines_end; struct ftrace_hotpatch_trampoline tmp; struct ftrace_insn *insn; + struct ftrace_insn old; const char *shared; s32 disp; @@ -118,7 +97,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) if (mod) { next_trampoline = &mod->arch.next_trampoline; trampolines_end = mod->arch.trampolines_end; - shared = ftrace_plt; } #endif @@ -126,8 +104,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return -ENOMEM; trampoline = (*next_trampoline)++; + if (copy_from_kernel_nofault(&old, (void *)rec->ip, sizeof(old))) + return -EFAULT; /* Check for the compiler-generated fentry nop (brcl 0, .). */ - if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + if (WARN_ON_ONCE(memcmp(&orig, &old, sizeof(old)))) return -EINVAL; /* Generate the trampoline. */ @@ -163,8 +143,35 @@ static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrac return trampoline; } -int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) +static inline struct ftrace_insn +ftrace_generate_branch_insn(unsigned long ip, unsigned long target) +{ + /* brasl r0,target or brcl 0,0 */ + return (struct ftrace_insn){ .opc = target ? 0xc005 : 0xc004, + .disp = target ? (target - ip) / 2 : 0 }; +} + +static int ftrace_patch_branch_insn(unsigned long ip, unsigned long old_target, + unsigned long target) +{ + struct ftrace_insn orig = ftrace_generate_branch_insn(ip, old_target); + struct ftrace_insn new = ftrace_generate_branch_insn(ip, target); + struct ftrace_insn old; + + if (!IS_ALIGNED(ip, 8)) + return -EINVAL; + if (copy_from_kernel_nofault(&old, (void *)ip, sizeof(old))) + return -EFAULT; + /* Verify that the to be replaced code matches what we expect. */ + if (memcmp(&orig, &old, sizeof(old))) + return -EINVAL; + s390_kernel_write((void *)ip, &new, sizeof(new)); + return 0; +} + +static int ftrace_modify_trampoline_call(struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; u64 old; @@ -180,6 +187,15 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, old_addr, addr); + else + return ftrace_modify_trampoline_call(rec, old_addr, addr); +} + static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { u16 old; @@ -198,11 +214,14 @@ static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - /* Expect brcl 0xf,... */ - return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); + /* Expect brcl 0xf,... for the !cpu_has_seq_insn() case */ + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, addr, 0); + else + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); } -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static int ftrace_make_trampoline_call(struct dyn_ftrace *rec, unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; @@ -214,6 +233,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, 0, addr); + else + return ftrace_make_trampoline_call(rec, addr); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { ftrace_func = func; @@ -234,75 +261,20 @@ void ftrace_arch_code_modify_post_process(void) text_poke_sync_lock(); } -#ifdef CONFIG_MODULES +#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __init ftrace_plt_init(void) +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { - const char *start, *end; - - ftrace_plt = module_alloc(PAGE_SIZE); - if (!ftrace_plt) - panic("cannot allocate ftrace plt\n"); - - start = ftrace_shared_hotpatch_trampoline(&end); - memcpy(ftrace_plt, start, end - start); - set_memory_ro((unsigned long)ftrace_plt, 1); - return 0; -} -device_initcall(ftrace_plt_init); - -#endif /* CONFIG_MODULES */ + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; + unsigned long sp = arch_ftrace_regs(fregs)->regs.gprs[15]; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* - * Hook the return address and push it in the stack of return addresses - * in current thread info. - */ -unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, - unsigned long ip) -{ if (unlikely(ftrace_graph_is_dead())) - goto out; + return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) - goto out; - ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(ra, ip, 0, (void *) sp)) - ra = (unsigned long) return_to_handler; -out: - return ra; -} -NOKPROBE_SYMBOL(prepare_ftrace_return); - -/* - * Patch the kernel code at ftrace_graph_caller location. The instruction - * there is branch relative on condition. To enable the ftrace graph code - * block, we simply patch the mask field of the instruction to zero and - * turn the instruction into a nop. - * To disable the ftrace graph code the mask field will be patched to - * all ones, which turns the instruction into an unconditional branch. - */ -int ftrace_enable_ftrace_graph_caller(void) -{ - int rc; - - /* Expect brc 0xf,... */ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - int rc; - - /* Expect brc 0x0,... */ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; + return; + if (!function_graph_enter_regs(*parent, ip, 0, (unsigned long *)sp, fregs)) + *parent = (unsigned long)&return_to_handler; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -316,10 +288,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + kmsan_unpoison_memory(fregs, ftrace_regs_size()); regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); if (!regs || unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h index 7f75a9616406..23337065f402 100644 --- a/arch/s390/kernel/ftrace.h +++ b/arch/s390/kernel/ftrace.h @@ -18,7 +18,5 @@ extern const char ftrace_shared_hotpatch_trampoline_br[]; extern const char ftrace_shared_hotpatch_trampoline_br_end[]; extern const char ftrace_shared_hotpatch_trampoline_exrl[]; extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; -extern const char ftrace_plt_template[]; -extern const char ftrace_plt_template_end[]; #endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index d14dd1c2e524..cf26d7a37425 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -4,6 +4,7 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -28,7 +29,7 @@ static int gs_enable(void) return -ENOMEM; gs_cb->gsd = 25; preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; preempt_enable(); @@ -42,7 +43,7 @@ static int gs_disable(void) preempt_disable(); kfree(current->thread.gs_cb); current->thread.gs_cb = NULL; - __ctl_clear_bit(2, 4); + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); preempt_enable(); } return 0; @@ -84,7 +85,7 @@ void gs_load_bc_cb(struct pt_regs *regs) if (gs_cb) { kfree(current->thread.gs_cb); current->thread.gs_bc_cb = NULL; - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; } @@ -109,7 +110,7 @@ static int gs_broadcast(void) SYSCALL_DEFINE2(s390_guarded_storage, int, command, struct gs_cb __user *, gs_cb) { - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -EOPNOTSUPP; switch (command) { case GS_ENABLE: diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index d7b8b6ad574d..7edb9ded199c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -10,32 +10,31 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <asm/lowcore.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page.h> #include <asm/ptrace.h> __HEAD -ENTRY(startup_continue) - larl %r1,tod_clock_base - mvc 0(16,%r1),__LC_BOOT_CLOCK +SYM_CODE_START(startup_continue) # # Setup stack # + GET_LC %r2 larl %r14,init_task - stg %r14,__LC_CURRENT - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE -#ifdef CONFIG_KASAN - brasl %r14,kasan_early_init -#endif + stg %r14,__LC_CURRENT(%r2) + larl %r15,init_thread_union+STACK_INIT_OFFSET + stg %r15,__LC_KERNEL_STACK(%r2) + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code # # We returned from start_kernel ?!? PANIK # basr %r13,0 - lpswe .Ldw-.(%r13) # load disabled wait psw + lpswe dw_psw-.(%r13) # load disabled wait psw +SYM_CODE_END(startup_continue) - .align 16 -.LPG1: -.Ldw: .quad 0x0002000180000000,0x0000000000000000 + .balign 16 +SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000) diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c new file mode 100644 index 000000000000..e7b66d046e8d --- /dev/null +++ b/arch/s390/kernel/hiperdispatch.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "hd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +/* + * Hiperdispatch: + * Dynamically calculates the optimum number of high capacity COREs + * by considering the state the system is in. When hiperdispatch decides + * that a capacity update is necessary, it schedules a topology update. + * During topology updates the CPU capacities are always re-adjusted. + * + * There is two places where CPU capacities are being accessed within + * hiperdispatch. + * -> hiperdispatch's reoccuring work function reads CPU capacities to + * determine high capacity CPU count. + * -> during a topology update hiperdispatch's adjustment function + * updates CPU capacities. + * These two can run on different CPUs in parallel which can cause + * hiperdispatch to make wrong decisions. This can potentially cause + * some overhead by leading to extra rebuild_sched_domains() calls + * for correction. Access to capacities within hiperdispatch has to be + * serialized to prevent the overhead. + * + * Hiperdispatch decision making revolves around steal time. + * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time + * crosses the threshold value hiperdispatch falls back to giving high + * capacities to entitled CPUs. When steal time drops below the + * threshold boundary, hiperdispatch utilizes all CPUs by giving all + * of them high capacity. + * + * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread + * performance. Comparing the throughput of; + * - single CORE, with N threads, running N tasks + * - N separate COREs running N tasks, + * using individual COREs for individual tasks yield better + * performance. This performance difference is roughly ~30% (can change + * between machine generations) + * + * Hiperdispatch tries to hint scheduler to use individual COREs for + * each task, as long as steal time on those COREs are less than 30%, + * therefore delaying the throughput loss caused by using SMP threads. + */ + +#include <linux/cpufeature.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/kernel_stat.h> +#include <linux/kstrtox.h> +#include <linux/ktime.h> +#include <linux/sysctl.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <asm/hiperdispatch.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/topology.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hiperdispatch.h> + +#define HD_DELAY_FACTOR (4) +#define HD_DELAY_INTERVAL (HZ / 4) +#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_AVG_WEIGHT 16 + +static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ +static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */ +static int hd_high_capacity_cores; /* Current CORE count with high capacity */ +static int hd_entitled_cores; /* Total vertical high and medium CORE count */ +static int hd_online_cores; /* Current online CORE count */ + +static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */ +static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */ +static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */ +static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */ + +static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD; +static unsigned int hd_delay_factor = HD_DELAY_FACTOR; +static int hd_enabled; + +static void hd_capacity_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); + +static int hd_set_hiperdispatch_mode(int enable) +{ + if (!cpu_has_topology()) + enable = 0; + if (hd_enabled == enable) + return 0; + hd_enabled = enable; + return 1; +} + +void hd_reset_state(void) +{ + cpumask_clear(&hd_vl_coremask); + cpumask_clear(&hd_vmvl_cpumask); + hd_entitled_cores = 0; + hd_online_cores = 0; +} + +void hd_add_core(int cpu) +{ + const struct cpumask *siblings; + int polarization; + + hd_online_cores++; + polarization = smp_cpu_get_polarization(cpu); + siblings = topology_sibling_cpumask(cpu); + switch (polarization) { + case POLARIZATION_VH: + hd_entitled_cores++; + break; + case POLARIZATION_VM: + hd_entitled_cores++; + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + case POLARIZATION_VL: + cpumask_set_cpu(cpu, &hd_vl_coremask); + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + } +} + +/* Serialize update and read operations of debug counters. */ +static DEFINE_MUTEX(hd_counter_mutex); + +static void hd_update_times(void) +{ + static ktime_t prev; + ktime_t now; + + /* + * Check if hiperdispatch is active, if not set the prev to 0. + * This way it is possible to differentiate the first update iteration after + * enabling hiperdispatch. + */ + if (hd_entitled_cores == 0 || hd_enabled == 0) { + prev = ktime_set(0, 0); + return; + } + now = ktime_get(); + if (ktime_after(prev, 0)) { + if (hd_high_capacity_cores == hd_online_cores) + hd_high_time += ktime_ms_delta(now, prev); + else + hd_low_time += ktime_ms_delta(now, prev); + } + prev = now; +} + +static void hd_update_capacities(void) +{ + int cpu, upscaling_cores; + unsigned long capacity; + + upscaling_cores = hd_high_capacity_cores - hd_entitled_cores; + capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW; + hd_high_capacity_cores = hd_entitled_cores; + for_each_cpu(cpu, &hd_vl_coremask) { + smp_set_core_capacity(cpu, capacity); + if (capacity != CPU_CAPACITY_HIGH) + continue; + hd_high_capacity_cores++; + upscaling_cores--; + if (upscaling_cores == 0) + capacity = CPU_CAPACITY_LOW; + } +} + +void hd_disable_hiperdispatch(void) +{ + cancel_delayed_work_sync(&hd_capacity_work); + hd_high_capacity_cores = hd_online_cores; + hd_previous_steal = 0; +} + +int hd_enable_hiperdispatch(void) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + mutex_unlock(&hd_counter_mutex); + if (hd_enabled == 0) + return 0; + if (hd_entitled_cores == 0) + return 0; + if (hd_online_cores <= hd_entitled_cores) + return 0; + mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + hd_update_capacities(); + return 1; +} + +static unsigned long hd_steal_avg(unsigned long new) +{ + static unsigned long steal; + + steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT; + return steal; +} + +static unsigned long hd_calculate_steal_percentage(void) +{ + unsigned long time_delta, steal_delta, steal, percentage; + static ktime_t prev; + int cpus, cpu; + ktime_t now; + + cpus = 0; + steal = 0; + percentage = 0; + for_each_cpu(cpu, &hd_vmvl_cpumask) { + steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + cpus++; + } + /* + * If there is no vertical medium and low CPUs steal time + * is 0 as vertical high CPUs shouldn't experience steal time. + */ + if (cpus == 0) + return percentage; + now = ktime_get(); + time_delta = ktime_to_ns(ktime_sub(now, prev)); + if (steal > hd_previous_steal && hd_previous_steal != 0) { + steal_delta = (steal - hd_previous_steal) * 100 / time_delta; + percentage = steal_delta / cpus; + } + hd_previous_steal = steal; + prev = now; + return percentage; +} + +static void hd_capacity_work_fn(struct work_struct *work) +{ + unsigned long steal_percentage, new_cores; + + mutex_lock(&smp_cpu_state_mutex); + /* + * If online cores are less or equal to entitled cores hiperdispatch + * does not need to make any adjustments, call a topology update to + * disable hiperdispatch. + * Normally this check is handled on topology update, but during cpu + * unhotplug, topology and cpu mask updates are done in reverse + * order, causing hd_enable_hiperdispatch() to get stale data. + */ + if (hd_online_cores <= hd_entitled_cores) { + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return; + } + steal_percentage = hd_steal_avg(hd_calculate_steal_percentage()); + if (steal_percentage < hd_steal_threshold) + new_cores = hd_online_cores; + else + new_cores = hd_entitled_cores; + if (hd_high_capacity_cores != new_cores) { + trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores); + hd_high_capacity_cores = new_cores; + atomic64_inc(&hd_adjustments); + topology_schedule_update(); + } + trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores); + mutex_unlock(&smp_cpu_state_mutex); + schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL); +} + +static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int hiperdispatch; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &hiperdispatch, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + hiperdispatch = hd_enabled; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + mutex_lock(&smp_cpu_state_mutex); + if (hd_set_hiperdispatch_mode(hiperdispatch)) + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return 0; +} + +static const struct ctl_table hiperdispatch_ctl_table[] = { + { + .procname = "hiperdispatch", + .mode = 0644, + .proc_handler = hiperdispatch_ctl_handler, + }, +}; + +static ssize_t hd_steal_threshold_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_steal_threshold); +} + +static ssize_t hd_steal_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (val > 100) + return -ERANGE; + hd_steal_threshold = val; + return count; +} + +static DEVICE_ATTR_RW(hd_steal_threshold); + +static ssize_t hd_delay_factor_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_delay_factor); +} + +static ssize_t hd_delay_factor_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (!val) + return -ERANGE; + hd_delay_factor = val; + return count; +} + +static DEVICE_ATTR_RW(hd_delay_factor); + +static struct attribute *hd_attrs[] = { + &dev_attr_hd_steal_threshold.attr, + &dev_attr_hd_delay_factor.attr, + NULL, +}; + +static const struct attribute_group hd_attr_group = { + .name = "hiperdispatch", + .attrs = hd_attrs, +}; + +static int hd_greedy_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_high_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n"); + +static int hd_conservative_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_low_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n"); + +static int hd_adjustment_count_get(void *unused, u64 *val) +{ + *val = atomic64_read(&hd_adjustments); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n"); + +static void __init hd_create_debugfs_counters(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir); + debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops); + debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops); + debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops); +} + +static void __init hd_create_attributes(void) +{ + struct device *dev; + + dev = bus_get_dev_root(&cpu_subsys); + if (!dev) + return; + if (sysfs_create_group(&dev->kobj, &hd_attr_group)) + pr_warn("Unable to create hiperdispatch attribute group\n"); + put_device(dev); +} + +static int __init hd_init(void) +{ + if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) { + hd_set_hiperdispatch_mode(1); + topology_schedule_update(); + } + if (!register_sysctl("s390", hiperdispatch_ctl_table)) + pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n"); + hd_create_debugfs_counters(); + hd_create_attributes(); + return 0; +} +late_initcall(hd_init); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4bf1ee293f2b..39cb8d0ae348 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -12,9 +12,9 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/cpu.h> -#include <linux/sched/cputime.h> #include <trace/events/power.h> #include <asm/cpu_mf.h> +#include <asm/cputime.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" @@ -24,117 +24,66 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); + unsigned long idle_time; u64 cycles_new[8]; int i; - clear_cpu_flag(CIF_ENABLED_WAIT); if (smp_cpu_mtid) { stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); for (i = 0; i < smp_cpu_mtid; i++) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle->clock_idle_exit = S390_lowcore.int_clock; - idle->timer_idle_exit = S390_lowcore.sys_enter_timer; + idle_time = lc->int_clock - idle->clock_idle_enter; + + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; - S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; - S390_lowcore.last_update_clock = idle->clock_idle_exit; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; - S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; - S390_lowcore.last_update_timer = idle->timer_idle_exit; + /* Account time spent with enabled wait psw loaded as idle time. */ + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + account_idle_time(cputime_to_nsecs(idle_time)); } -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long idle_time; unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); - - /* psw_idle() returns with interrupts disabled. */ - psw_idle(idle, psw_mask); - - /* Account time spent with enabled wait psw loaded as idle time. */ - raw_write_seqcount_begin(&idle->seqcount); - idle_time = idle->clock_idle_exit - idle->clock_idle_enter; - idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; - idle->idle_time += idle_time; - idle->idle_count++; - account_idle_time(cputime_to_nsecs(idle_time)); - raw_write_seqcount_end(&idle->seqcount); - raw_local_irq_enable(); + set_cpu_flag(CIF_ENABLED_WAIT); + if (smp_cpu_mtid) + stcctm(MT_DIAG, smp_cpu_mtid, (u64 *)&idle->mt_cycles_enter); + idle->clock_idle_enter = get_tod_clock_fast(); + idle->timer_idle_enter = get_cpu_timer(); + bpon(); + __load_psw_mask(psw_mask); } static ssize_t show_idle_count(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long idle_count; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_count = READ_ONCE(idle->idle_count); - if (READ_ONCE(idle->clock_idle_enter)) - idle_count++; - } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%lu\n", idle_count); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - unsigned long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_time = READ_ONCE(idle->idle_time); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - idle_time += in_idle; - return sprintf(buf, "%lu\n", idle_time >> 12); -} -DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); -u64 arch_cpu_idle_time(int cpu) -{ - struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long now, idle_enter, idle_exit, in_idle; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - return cputime_to_nsecs(in_idle); + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12); } +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); void arch_cpu_idle_enter(void) { @@ -144,7 +93,7 @@ void arch_cpu_idle_exit(void) { } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 1cc85b8ff42e..ff15f91affde 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/delay.h> +#include <linux/kstrtox.h> #include <linux/panic_notifier.h> #include <linux/reboot.h> #include <linux/ctype.h> @@ -19,7 +20,9 @@ #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <linux/vmalloc.h> #include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -29,6 +32,7 @@ #include <asm/sclp.h> #include <asm/checksum.h> #include <asm/debug.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> @@ -38,6 +42,8 @@ #define IPL_UNKNOWN_STR "unknown" #define IPL_CCW_STR "ccw" +#define IPL_ECKD_STR "eckd" +#define IPL_ECKD_DUMP_STR "eckd_dump" #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" #define IPL_NVME_STR "nvme" @@ -45,6 +51,7 @@ #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" +#define DUMP_ECKD_STR "eckd" #define DUMP_FCP_STR "fcp" #define DUMP_NVME_STR "nvme" #define DUMP_NONE_STR "none" @@ -91,6 +98,10 @@ static char *ipl_type_str(enum ipl_type type) switch (type) { case IPL_TYPE_CCW: return IPL_CCW_STR; + case IPL_TYPE_ECKD: + return IPL_ECKD_STR; + case IPL_TYPE_ECKD_DUMP: + return IPL_ECKD_DUMP_STR; case IPL_TYPE_FCP: return IPL_FCP_STR; case IPL_TYPE_FCP_DUMP: @@ -112,6 +123,7 @@ enum dump_type { DUMP_TYPE_CCW = 2, DUMP_TYPE_FCP = 4, DUMP_TYPE_NVME = 8, + DUMP_TYPE_ECKD = 16, }; static char *dump_type_str(enum dump_type type) @@ -121,6 +133,8 @@ static char *dump_type_str(enum dump_type type) return DUMP_NONE_STR; case DUMP_TYPE_CCW: return DUMP_CCW_STR; + case DUMP_TYPE_ECKD: + return DUMP_ECKD_STR; case DUMP_TYPE_FCP: return DUMP_FCP_STR; case DUMP_TYPE_NVME: @@ -146,6 +160,7 @@ static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; static struct ipl_parameter_block *reipl_block_fcp; static struct ipl_parameter_block *reipl_block_nvme; static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_eckd; static struct ipl_parameter_block *reipl_block_nss; static struct ipl_parameter_block *reipl_block_actual; @@ -154,20 +169,24 @@ static enum dump_type dump_type = DUMP_TYPE_NONE; static struct ipl_parameter_block *dump_block_fcp; static struct ipl_parameter_block *dump_block_nvme; static struct ipl_parameter_block *dump_block_ccw; +static struct ipl_parameter_block *dump_block_eckd; static struct sclp_ipl_info sclp_ipl_info; static bool reipl_nvme_clear; static bool reipl_fcp_clear; static bool reipl_ccw_clear; +static bool reipl_eckd_clear; -static inline int __diag308(unsigned long subcode, void *addr) +static unsigned long os_info_flags; + +static inline int __diag308(unsigned long subcode, unsigned long addr) { union register_pair r1; - r1.even = (unsigned long) addr; + r1.even = addr; r1.odd = 0; - asm volatile( + asm_inline volatile( " diag %[r1],%[subcode],0x308\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -180,7 +199,7 @@ static inline int __diag308(unsigned long subcode, void *addr) int diag308(unsigned long subcode, void *addr) { diag_stat_inc(DIAG_STAT_X308); - return __diag308(subcode, addr); + return __diag308(subcode, addr ? virt_to_phys(addr) : 0); } EXPORT_SYMBOL_GPL(diag308); @@ -191,7 +210,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ char *page) \ { \ - return scnprintf(page, PAGE_SIZE, _format, ##args); \ + return sysfs_emit(page, _format, ##args); \ } #define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ @@ -217,14 +236,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ _ipl_blk.ssid, _ipl_blk.devno); \ IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, (S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) \ #define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL) + __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL) #define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ @@ -239,7 +258,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -249,15 +268,74 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strncpy(_value, buf, sizeof(_value) - 1); \ + if (len >= sizeof(_value)) \ + return -E2BIG; \ + len = strscpy(_value, buf); \ + if ((ssize_t)len < 0) \ + return len; \ strim(_value); \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static ssize_t sys_##_prefix##_scp_data_show(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t size = _ipl_block.scp_data_len; \ + void *scp_data = _ipl_block.scp_data; \ + \ + return memory_read_from_buffer(buf, count, &off, \ + scp_data, size); \ +} + +#define IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static ssize_t sys_##_prefix##_scp_data_store(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t scpdata_len = count; \ + size_t padding; \ + \ + if (off) \ + return -EINVAL; \ + \ + memcpy(_ipl_block.scp_data, buf, count); \ + if (scpdata_len % 8) { \ + padding = 8 - (scpdata_len % 8); \ + memset(_ipl_block.scp_data + scpdata_len, \ + 0, padding); \ + scpdata_len += padding; \ + } \ + \ + _ipl_block_hdr.len = _ipl_bp_len + scpdata_len; \ + _ipl_block.len = _ipl_bp0_len + scpdata_len; \ + _ipl_block.scp_data_len = scpdata_len; \ + \ + return count; \ +} + +#define DEFINE_IPL_ATTR_SCP_DATA_RO(_prefix, _ipl_block, _size) \ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0444, sys_##_prefix##_scp_data_show, \ + NULL, _size) + +#define DEFINE_IPL_ATTR_SCP_DATA_RW(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len, _size)\ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0644, sys_##_prefix##_scp_data_show, \ + sys_##_prefix##_scp_data_store, _size) + /* * ipl section */ @@ -280,6 +358,11 @@ static __init enum ipl_type get_ipl_type(void) return IPL_TYPE_NVME_DUMP; else return IPL_TYPE_NVME; + case IPL_PBT_ECKD: + if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return IPL_TYPE_ECKD_DUMP; + else + return IPL_TYPE_ECKD; } return IPL_TYPE_UNKNOWN; } @@ -290,7 +373,7 @@ EXPORT_SYMBOL_GPL(ipl_info); static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", ipl_type_str(ipl_info.type)); + return sysfs_emit(page, "%s\n", ipl_type_str(ipl_info.type)); } static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); @@ -298,7 +381,7 @@ static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); static ssize_t ipl_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!ipl_secure_flag); + return sysfs_emit(page, "%i\n", !!ipl_secure_flag); } static struct kobj_attribute sys_ipl_secure_attr = @@ -307,7 +390,7 @@ static struct kobj_attribute sys_ipl_secure_attr = static ssize_t ipl_has_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!sclp.has_sipl); + return sysfs_emit(page, "%i\n", !!sclp.has_sipl); } static struct kobj_attribute sys_ipl_has_secure_attr = @@ -320,79 +403,69 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); - return sprintf(page, "%s\n", parm); + return sysfs_emit(page, "%s\n", parm); } static struct kobj_attribute sys_ipl_vm_parm_attr = - __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); + __ATTR(parm, 0444, ipl_vm_parm_show, NULL); static ssize_t sys_ipl_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, - ipl_block.ccw.devno); + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + return sysfs_emit(page, "0.0.%04x\n", ipl_block.fcp.devno); case IPL_TYPE_NVME: case IPL_TYPE_NVME_DUMP: - return sprintf(page, "%08ux\n", ipl_block.nvme.fid); + return sysfs_emit(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } } static struct kobj_attribute sys_ipl_device_attr = - __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); + __ATTR(device, 0444, sys_ipl_device_show, NULL); -static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) +static ssize_t sys_ipl_parameter_read(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { return memory_read_from_buffer(buf, count, &off, &ipl_block, ipl_block.hdr.len); } -static struct bin_attribute ipl_parameter_attr = - __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL, +static const struct bin_attribute sys_ipl_parameter_attr = + __BIN_ATTR(binary_parameter, 0444, sys_ipl_parameter_read, NULL, PAGE_SIZE); -static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.fcp.scp_data_len; - void *scp_data = &ipl_block.fcp.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.nvme.scp_data_len; - void *scp_data = &ipl_block.nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_fcp, ipl_block.fcp, PAGE_SIZE); -static struct bin_attribute ipl_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); +static const struct bin_attribute *const ipl_fcp_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_fcp_scp_data_attr, + NULL, +}; -static struct bin_attribute ipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_nvme, ipl_block.nvme, PAGE_SIZE); -static struct bin_attribute *ipl_fcp_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_scp_data_attr, +static const struct bin_attribute *const ipl_nvme_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_nvme_scp_data_attr, NULL, }; -static struct bin_attribute *ipl_nvme_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_nvme_scp_data_attr, +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_eckd, ipl_block.eckd, PAGE_SIZE); + +static const struct bin_attribute *const ipl_eckd_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_eckd_scp_data_attr, NULL, }; @@ -417,103 +490,180 @@ DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", (unsigned long long)ipl_block.nvme.br_lba); +/* ECKD ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, "%lld\n", + (unsigned long long)ipl_block.eckd.bootprog); + +#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + \ + if (!ipb->br_chr.cyl && \ + !ipb->br_chr.head && \ + !ipb->br_chr.record) \ + return sysfs_emit(buf, "auto\n"); \ + \ + return sysfs_emit(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ +} + +#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + unsigned long args[3] = { 0 }; \ + char *p, *p1, *tmp = NULL; \ + int i, rc; \ + \ + if (!strncmp(buf, "auto", 4)) \ + goto out; \ + \ + tmp = kstrdup(buf, GFP_KERNEL); \ + p = tmp; \ + for (i = 0; i < 3; i++) { \ + p1 = strsep(&p, ", "); \ + if (!p1) { \ + rc = -EINVAL; \ + goto err; \ + } \ + rc = kstrtoul(p1, 0, args + i); \ + if (rc) \ + goto err; \ + } \ + \ + rc = -EINVAL; \ + if (i != 3) \ + goto err; \ + \ + if ((args[0] || args[1]) && !args[2]) \ + goto err; \ + \ + if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \ + goto err; \ + \ +out: \ + ipb->br_chr.cyl = args[0]; \ + ipb->br_chr.head = args[1]; \ + ipb->br_chr.record = args[2]; \ + rc = len; \ +err: \ + kfree(tmp); \ + return rc; \ +} + +IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd); +static struct kobj_attribute sys_ipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL); + +IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd); + +static struct kobj_attribute sys_reipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store); + static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { char loadparm[LOADPARM_LEN + 1] = {}; if (!sclp_ipl_info.is_valid) - return sprintf(page, "#unknown#\n"); + return sysfs_emit(page, "#unknown#\n"); memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); strim(loadparm); - return sprintf(page, "%s\n", loadparm); + return sysfs_emit(page, "%s\n", loadparm); } static struct kobj_attribute sys_ipl_ccw_loadparm_attr = __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); static struct attribute *ipl_fcp_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_fcp_wwpn_attr.attr, &sys_ipl_fcp_lun_attr.attr, &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_fcp_attr_group = { +static const struct attribute_group ipl_fcp_attr_group = { .attrs = ipl_fcp_attrs, - .bin_attrs = ipl_fcp_bin_attrs, + .bin_attrs_new = ipl_fcp_bin_attrs, }; static struct attribute *ipl_nvme_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_nvme_fid_attr.attr, &sys_ipl_nvme_nsid_attr.attr, &sys_ipl_nvme_bootprog_attr.attr, &sys_ipl_nvme_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_nvme_attr_group = { +static const struct attribute_group ipl_nvme_attr_group = { .attrs = ipl_nvme_attrs, - .bin_attrs = ipl_nvme_bin_attrs, + .bin_attrs_new = ipl_nvme_bin_attrs, }; +static struct attribute *ipl_eckd_attrs[] = { + &sys_ipl_eckd_bootprog_attr.attr, + &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_device_attr.attr, + NULL, +}; + +static const struct attribute_group ipl_eckd_attr_group = { + .attrs = ipl_eckd_attrs, + .bin_attrs_new = ipl_eckd_bin_attrs, +}; /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; static struct attribute *ipl_ccw_attrs_lpar[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_ccw_attr_group_vm = { +static const struct attribute_group ipl_ccw_attr_group_vm = { .attrs = ipl_ccw_attrs_vm, }; -static struct attribute_group ipl_ccw_attr_group_lpar = { +static const struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* UNKNOWN ipl device attributes */ - -static struct attribute *ipl_unknown_attrs[] = { +static struct attribute *ipl_common_attrs[] = { &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_unknown_attr_group = { - .attrs = ipl_unknown_attrs, +static const struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, }; static struct kset *ipl_kset; static void __ipl_run(void *unused) { - __bpon(); diag308(DIAG308_LOAD_CLEAR, NULL); } @@ -531,15 +681,22 @@ static int __init ipl_init(void) rc = -ENOMEM; goto out; } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_vm); else rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_lpar); break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); @@ -549,8 +706,6 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); break; default: - rc = sysfs_create_group(&ipl_kset->kobj, - &ipl_unknown_attr_group); break; } out: @@ -577,7 +732,7 @@ static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; ipl_block_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); - return sprintf(page, "%s\n", vmparm); + return sysfs_emit(page, "%s\n", vmparm); } static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, @@ -641,54 +796,20 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show, - reipl_nss_vmparm_store); + __ATTR(parm, 0644, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); static struct kobj_attribute sys_reipl_ccw_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show, - reipl_ccw_vmparm_store); + __ATTR(parm, 0644, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); /* FCP reipl device attributes */ -static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_fcp->fcp.scp_data_len; - void *scp_data = reipl_block_fcp->fcp.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - - if (off) - return -EINVAL; - - memcpy(reipl_block_fcp->fcp.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->fcp.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } - - reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.scp_data_len = scpdata_len; +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_fcp, reipl_block_fcp->hdr, + reipl_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); - return count; -} -static struct bin_attribute sys_reipl_fcp_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read, - reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_fcp_bin_attrs[] = { +static const struct bin_attribute *const reipl_fcp_bin_attrs[] = { &sys_reipl_fcp_scp_data_attr, NULL, }; @@ -719,7 +840,7 @@ static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb, char buf[LOADPARM_LEN + 1]; reipl_get_ascii_loadparm(buf, ipb); - return sprintf(page, "%s\n", buf); + return sysfs_emit(page, "%s\n", buf); } static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, @@ -750,35 +871,39 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return len; } -/* FCP wrapper */ -static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_fcp, page); -} - -static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); -} - -static struct kobj_attribute sys_reipl_fcp_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, - reipl_fcp_loadparm_store); +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); static ssize_t reipl_fcp_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_fcp_clear); + return sysfs_emit(page, "%u\n", reipl_fcp_clear); } static ssize_t reipl_fcp_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_fcp_clear) < 0) + if (kstrtobool(buf, &reipl_fcp_clear) < 0) return -EINVAL; return len; } @@ -793,9 +918,9 @@ static struct attribute *reipl_fcp_attrs[] = { NULL, }; -static struct attribute_group reipl_fcp_attr_group = { +static const struct attribute_group reipl_fcp_attr_group = { .attrs = reipl_fcp_attrs, - .bin_attrs = reipl_fcp_bin_attrs, + .bin_attrs_new = reipl_fcp_bin_attrs, }; static struct kobj_attribute sys_reipl_fcp_clear_attr = @@ -803,46 +928,12 @@ static struct kobj_attribute sys_reipl_fcp_clear_attr = /* NVME reipl device attributes */ -static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_nvme->nvme.scp_data_len; - void *scp_data = reipl_block_nvme->nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - if (off) - return -EINVAL; - - memcpy(reipl_block_nvme->nvme.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_nvme->nvme.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } - - reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.scp_data_len = scpdata_len; - - return count; -} +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_nvme, reipl_block_nvme->hdr, + reipl_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); -static struct bin_attribute sys_reipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_nvme_scpdata_read, - reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_nvme_bin_attrs[] = { +static const struct bin_attribute *const reipl_nvme_bin_attrs[] = { &sys_reipl_nvme_scp_data_attr, NULL, }; @@ -856,24 +947,6 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); -/* nvme wrapper */ -static ssize_t reipl_nvme_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nvme, page); -} - -static ssize_t reipl_nvme_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nvme, buf, len); -} - -static struct kobj_attribute sys_reipl_nvme_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nvme_loadparm_show, - reipl_nvme_loadparm_store); - static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, &sys_reipl_nvme_nsid_attr.attr, @@ -883,22 +956,22 @@ static struct attribute *reipl_nvme_attrs[] = { NULL, }; -static struct attribute_group reipl_nvme_attr_group = { +static const struct attribute_group reipl_nvme_attr_group = { .attrs = reipl_nvme_attrs, - .bin_attrs = reipl_nvme_bin_attrs + .bin_attrs_new = reipl_nvme_bin_attrs }; static ssize_t reipl_nvme_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_nvme_clear); + return sysfs_emit(page, "%u\n", reipl_nvme_clear); } static ssize_t reipl_nvme_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_nvme_clear) < 0) + if (kstrtobool(buf, &reipl_nvme_clear) < 0) return -EINVAL; return len; } @@ -909,49 +982,17 @@ static struct kobj_attribute sys_reipl_nvme_clear_attr = /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); -/* NSS wrapper */ -static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nss, page); -} - -static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nss, buf, len); -} - -/* CCW wrapper */ -static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_ccw, page); -} - -static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); -} - -static struct kobj_attribute sys_reipl_ccw_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, - reipl_ccw_loadparm_store); - static ssize_t reipl_ccw_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_ccw_clear); + return sysfs_emit(page, "%u\n", reipl_ccw_clear); } static ssize_t reipl_ccw_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_ccw_clear) < 0) + if (kstrtobool(buf, &reipl_ccw_clear) < 0) return -EINVAL; return len; } @@ -984,6 +1025,52 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { .attrs = reipl_ccw_attrs_lpar, }; +/* ECKD reipl device attributes */ + +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_eckd, reipl_block_eckd->hdr, + reipl_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); + +static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { + &sys_reipl_eckd_scp_data_attr, + NULL, +}; + +DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->eckd.bootprog); + +static struct attribute *reipl_eckd_attrs[] = { + &sys_reipl_eckd_device_attr.attr, + &sys_reipl_eckd_bootprog_attr.attr, + &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, + NULL, +}; + +static const struct attribute_group reipl_eckd_attr_group = { + .attrs = reipl_eckd_attrs, + .bin_attrs_new = reipl_eckd_bin_attrs +}; + +static ssize_t reipl_eckd_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sysfs_emit(page, "%u\n", reipl_eckd_clear); +} + +static ssize_t reipl_eckd_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_eckd_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_eckd_clear_attr = + __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store); /* NSS reipl device attributes */ static void reipl_get_ascii_nss_name(char *dst, @@ -1000,7 +1087,7 @@ static ssize_t reipl_nss_name_show(struct kobject *kobj, char nss_name[NSS_NAME_SIZE + 1] = {}; reipl_get_ascii_nss_name(nss_name, reipl_block_nss); - return sprintf(page, "%s\n", nss_name); + return sysfs_emit(page, "%s\n", nss_name); } static ssize_t reipl_nss_name_store(struct kobject *kobj, @@ -1031,12 +1118,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_name_attr = - __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show, - reipl_nss_name_store); - -static struct kobj_attribute sys_reipl_nss_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show, - reipl_nss_loadparm_store); + __ATTR(name, 0644, reipl_nss_name_show, + reipl_nss_name_store); static struct attribute *reipl_nss_attrs[] = { &sys_reipl_nss_name_attr.attr, @@ -1052,8 +1135,8 @@ static struct attribute_group reipl_nss_attr_group = { void set_os_info_reipl_block(void) { - os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, - reipl_block_actual->hdr.len); + os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); } /* reipl type */ @@ -1067,6 +1150,9 @@ static int reipl_set_type(enum ipl_type type) case IPL_TYPE_CCW: reipl_block_actual = reipl_block_ccw; break; + case IPL_TYPE_ECKD: + reipl_block_actual = reipl_block_eckd; + break; case IPL_TYPE_FCP: reipl_block_actual = reipl_block_fcp; break; @@ -1086,7 +1172,7 @@ static int reipl_set_type(enum ipl_type type) static ssize_t reipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", ipl_type_str(reipl_type)); + return sysfs_emit(page, "%s\n", ipl_type_str(reipl_type)); } static ssize_t reipl_type_store(struct kobject *kobj, @@ -1097,6 +1183,8 @@ static ssize_t reipl_type_store(struct kobject *kobj, if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_ECKD); else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) rc = reipl_set_type(IPL_TYPE_FCP); else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) @@ -1112,6 +1200,7 @@ static struct kobj_attribute reipl_type_attr = static struct kset *reipl_kset; static struct kset *reipl_fcp_kset; static struct kset *reipl_nvme_kset; +static struct kset *reipl_eckd_kset; static void __reipl_run(void *unused) { @@ -1123,6 +1212,13 @@ static void __reipl_run(void *unused) else diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); break; + case IPL_TYPE_ECKD: + diag308(DIAG308_SET, reipl_block_eckd); + if (reipl_eckd_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); if (reipl_fcp_clear) @@ -1146,6 +1242,7 @@ static void __reipl_run(void *unused) break; case IPL_TYPE_FCP_DUMP: case IPL_TYPE_NVME_DUMP: + case IPL_TYPE_ECKD_DUMP: break; } disabled_wait(); @@ -1176,7 +1273,7 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ - if (MACHINE_IS_VM && ipl_block_valid && + if (machine_is_vm() && ipl_block_valid && (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; @@ -1190,7 +1287,7 @@ static int __init reipl_nss_init(void) { int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; reipl_block_nss = (void *) get_zeroed_page(GFP_KERNEL); @@ -1215,8 +1312,8 @@ static int __init reipl_ccw_init(void) return -ENOMEM; rc = sysfs_create_group(&reipl_kset->kobj, - MACHINE_IS_VM ? &reipl_ccw_attr_group_vm - : &reipl_ccw_attr_group_lpar); + machine_is_vm() ? &reipl_ccw_attr_group_vm + : &reipl_ccw_attr_group_lpar); if (rc) return rc; @@ -1343,6 +1440,58 @@ out1: return rc; } +static int __init reipl_eckd_init(void) +{ + int rc; + + if (!sclp.has_sipl_eckd) + return 0; + + reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!reipl_block_eckd) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL, + &reipl_kset->kobj); + if (!reipl_eckd_kset) { + free_page((unsigned long)reipl_block_eckd); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_eckd_kset->kobj, + &sys_reipl_eckd_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_eckd_clear = true; + } + + if (ipl_info.type == IPL_TYPE_ECKD) { + memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block)); + } else { + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD; + reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_ECKD; + return 0; + +out2: + sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); +out1: + kset_unregister(reipl_eckd_kset); + free_page((unsigned long)reipl_block_eckd); + return rc; +} + static int __init reipl_type_init(void) { enum ipl_type reipl_type = ipl_info.type; @@ -1364,6 +1513,9 @@ static int __init reipl_type_init(void) } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) { + memcpy(reipl_block_eckd, reipl_block, size); + reipl_type = IPL_TYPE_ECKD; } out: return reipl_set_type(reipl_type); @@ -1384,6 +1536,9 @@ static int __init reipl_init(void) rc = reipl_ccw_init(); if (rc) return rc; + rc = reipl_eckd_init(); + if (rc) + return rc; rc = reipl_fcp_init(); if (rc) return rc; @@ -1419,6 +1574,11 @@ DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, + dump_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, &sys_dump_fcp_wwpn_attr.attr, @@ -1428,9 +1588,15 @@ static struct attribute *dump_fcp_attrs[] = { NULL, }; -static struct attribute_group dump_fcp_attr_group = { +static const struct bin_attribute *const dump_fcp_bin_attrs[] = { + &sys_dump_fcp_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_fcp_attr_group = { .name = IPL_FCP_STR, .attrs = dump_fcp_attrs, + .bin_attrs_new = dump_fcp_bin_attrs, }; /* NVME dump device attributes */ @@ -1443,6 +1609,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, + dump_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_nvme_attrs[] = { &sys_dump_nvme_fid_attr.attr, &sys_dump_nvme_nsid_attr.attr, @@ -1451,9 +1622,49 @@ static struct attribute *dump_nvme_attrs[] = { NULL, }; -static struct attribute_group dump_nvme_attr_group = { +static const struct bin_attribute *const dump_nvme_bin_attrs[] = { + &sys_dump_nvme_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_nvme_attr_group = { .name = IPL_NVME_STR, .attrs = dump_nvme_attrs, + .bin_attrs_new = dump_nvme_bin_attrs, +}; + +/* ECKD dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->eckd.bootprog); + +IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); + +static struct kobj_attribute sys_dump_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); + +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_eckd, dump_block_eckd->hdr, + dump_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); + +static struct attribute *dump_eckd_attrs[] = { + &sys_dump_eckd_device_attr.attr, + &sys_dump_eckd_bootprog_attr.attr, + &sys_dump_eckd_br_chr_attr.attr, + NULL, +}; + +static const struct bin_attribute *const dump_eckd_bin_attrs[] = { + &sys_dump_eckd_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_eckd_attr_group = { + .name = IPL_ECKD_STR, + .attrs = dump_eckd_attrs, + .bin_attrs_new = dump_eckd_bin_attrs, }; /* CCW dump device attributes */ @@ -1482,7 +1693,7 @@ static int dump_set_type(enum dump_type type) static ssize_t dump_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", dump_type_str(dump_type)); + return sysfs_emit(page, "%s\n", dump_type_str(dump_type)); } static ssize_t dump_type_store(struct kobject *kobj, @@ -1495,6 +1706,8 @@ static ssize_t dump_type_store(struct kobject *kobj, rc = dump_set_type(DUMP_TYPE_NONE); else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_ECKD); else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) rc = dump_set_type(DUMP_TYPE_FCP); else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) @@ -1505,6 +1718,24 @@ static ssize_t dump_type_store(struct kobject *kobj, static struct kobj_attribute dump_type_attr = __ATTR(dump_type, 0644, dump_type_show, dump_type_store); +static ssize_t dump_area_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sysfs_emit(page, "%lu\n", sclp.hsa_size); +} + +static struct kobj_attribute dump_area_size_attr = __ATTR_RO(dump_area_size); + +static struct attribute *dump_attrs[] = { + &dump_type_attr.attr, + &dump_area_size_attr.attr, + NULL, +}; + +static struct attribute_group dump_attr_group = { + .attrs = dump_attrs, +}; + static struct kset *dump_kset; static void diag308_dump(void *dump_block) @@ -1523,6 +1754,9 @@ static void __dump_run(void *unused) case DUMP_TYPE_CCW: diag308_dump(dump_block_ccw); break; + case DUMP_TYPE_ECKD: + diag308_dump(dump_block_eckd); + break; case DUMP_TYPE_FCP: diag308_dump(dump_block_fcp); break; @@ -1601,13 +1835,36 @@ static int __init dump_nvme_init(void) } dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; - dump_block_nvme->fcp.pbt = IPL_PBT_NVME; - dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + dump_block_nvme->nvme.pbt = IPL_PBT_NVME; + dump_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_DUMP; dump_capabilities |= DUMP_TYPE_NVME; return 0; } +static int __init dump_eckd_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd) + return 0; /* LDIPL DUMP is not installed */ + dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!dump_block_eckd) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group); + if (rc) { + free_page((unsigned long)dump_block_eckd); + return rc; + } + dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + dump_block_eckd->eckd.pbt = IPL_PBT_ECKD; + dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_ECKD; + return 0; +} + static int __init dump_init(void) { int rc; @@ -1615,7 +1872,7 @@ static int __init dump_init(void) dump_kset = kset_create_and_add("dump", NULL, firmware_kobj); if (!dump_kset) return -ENOMEM; - rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr); + rc = sysfs_create_group(&dump_kset->kobj, &dump_attr_group); if (rc) { kset_unregister(dump_kset); return rc; @@ -1623,6 +1880,9 @@ static int __init dump_init(void) rc = dump_ccw_init(); if (rc) return rc; + rc = dump_eckd_init(); + if (rc) + return rc; rc = dump_fcp_init(); if (rc) return rc; @@ -1641,13 +1901,28 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { - unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; unsigned int csum; - csum = (__force unsigned int) - csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - put_abs_lowcore(ipib, ipib); - put_abs_lowcore(ipib_checksum, csum); + /* + * Set REIPL_CLEAR flag in os_info flags entry indicating + * 'clear' sysfs attribute has been set on the panicked system + * for specified reipl type. + * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN. + */ + if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) || + (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) || + (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) || + (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) || + reipl_type == IPL_TYPE_NSS || + reipl_type == IPL_TYPE_UNKNOWN) + os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; + os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); + csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0); + abs_lc = get_abs_lowcore(); + abs_lc->ipib = __pa(reipl_block_actual); + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc); dump_run(trigger); } @@ -1660,11 +1935,13 @@ static struct shutdown_action __refdata dump_reipl_action = { * vmcmd shutdown action: Trigger vm command on shutdown. */ -static char vmcmd_on_reboot[128]; -static char vmcmd_on_panic[128]; -static char vmcmd_on_halt[128]; -static char vmcmd_on_poff[128]; -static char vmcmd_on_restart[128]; +#define VMCMD_MAX_SIZE 240 + +static char vmcmd_on_reboot[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_panic[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_halt[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_poff[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_restart[VMCMD_MAX_SIZE + 1]; DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); @@ -1711,7 +1988,7 @@ static void vmcmd_run(struct shutdown_trigger *trigger) static int vmcmd_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -EOPNOTSUPP; vmcmd_kset = kset_create_and_add("vmcmd", NULL, firmware_kobj); if (!vmcmd_kset) @@ -1776,7 +2053,7 @@ static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR, static ssize_t on_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_reboot_trigger.action->name); + return sysfs_emit(page, "%s\n", on_reboot_trigger.action->name); } static ssize_t on_reboot_store(struct kobject *kobj, @@ -1802,7 +2079,7 @@ static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action}; static ssize_t on_panic_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_panic_trigger.action->name); + return sysfs_emit(page, "%s\n", on_panic_trigger.action->name); } static ssize_t on_panic_store(struct kobject *kobj, @@ -1828,7 +2105,7 @@ static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, static ssize_t on_restart_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_restart_trigger.action->name); + return sysfs_emit(page, "%s\n", on_restart_trigger.action->name); } static ssize_t on_restart_store(struct kobject *kobj, @@ -1854,7 +2131,7 @@ void do_restart(void *arg) tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, arg); + smp_call_ipl_cpu(__do_restart, arg); } /* on halt */ @@ -1864,7 +2141,7 @@ static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; static ssize_t on_halt_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_halt_trigger.action->name); + return sysfs_emit(page, "%s\n", on_halt_trigger.action->name); } static ssize_t on_halt_store(struct kobject *kobj, @@ -1890,7 +2167,7 @@ static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action}; static ssize_t on_poff_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_poff_trigger.action->name); + return sysfs_emit(page, "%s\n", on_poff_trigger.action->name); } static ssize_t on_poff_store(struct kobject *kobj, @@ -1972,26 +2249,28 @@ static int __init s390_ipl_init(void) __initcall(s390_ipl_init); -static void __init strncpy_skip_quote(char *dst, char *src, int n) +static void __init strscpy_skip_quote(char *dst, char *src, int n) { int sx, dx; - dx = 0; - for (sx = 0; src[sx] != 0; sx++) { + if (!n) + return; + for (sx = 0, dx = 0; src[sx]; sx++) { if (src[sx] == '"') continue; - dst[dx++] = src[sx]; - if (dx >= n) + dst[dx] = src[sx]; + if (dx + 1 == n) break; + dx++; } + dst[dx] = '\0'; } static int __init vmcmd_on_reboot_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_reboot, str, 127); - vmcmd_on_reboot[127] = 0; + strscpy_skip_quote(vmcmd_on_reboot, str, sizeof(vmcmd_on_reboot)); on_reboot_trigger.action = &vmcmd_action; return 1; } @@ -1999,10 +2278,9 @@ __setup("vmreboot=", vmcmd_on_reboot_setup); static int __init vmcmd_on_panic_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_panic, str, 127); - vmcmd_on_panic[127] = 0; + strscpy_skip_quote(vmcmd_on_panic, str, sizeof(vmcmd_on_panic)); on_panic_trigger.action = &vmcmd_action; return 1; } @@ -2010,10 +2288,9 @@ __setup("vmpanic=", vmcmd_on_panic_setup); static int __init vmcmd_on_halt_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_halt, str, 127); - vmcmd_on_halt[127] = 0; + strscpy_skip_quote(vmcmd_on_halt, str, sizeof(vmcmd_on_halt)); on_halt_trigger.action = &vmcmd_action; return 1; } @@ -2021,10 +2298,9 @@ __setup("vmhalt=", vmcmd_on_halt_setup); static int __init vmcmd_on_poff_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_poff, str, 127); - vmcmd_on_poff[127] = 0; + strscpy_skip_quote(vmcmd_on_poff, str, sizeof(vmcmd_on_poff)); on_poff_trigger.action = &vmcmd_action; return 1; } @@ -2052,6 +2328,11 @@ void __init setup_ipl(void) ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid; + ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno; + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; @@ -2078,7 +2359,7 @@ void s390_reset_system(void) set_prefix(0); /* Disable lowcore protection */ - __ctl_clear_bit(0, 28); + local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); diag_amode31_ops.diag308_reset(); } diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 45393919fe61..bdf9c7cb5685 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -9,6 +9,7 @@ */ #include <linux/kernel_stat.h> +#include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -25,10 +26,13 @@ #include <asm/irq_regs.h> #include <asm/cputime.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/stacktrace.h> #include <asm/softirq_stack.h> +#include <asm/vtime.h> +#include <asm/asm.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -75,13 +79,13 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQEXT_WTI, .name = "WTI", .desc = "[EXT] Warning Track"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, - {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, @@ -99,8 +103,8 @@ static const struct irq_class irqclass_sub_desc[] = { static void do_IRQ(struct pt_regs *regs, int irq) { - if (tod_after_eq(S390_lowcore.int_clock, - S390_lowcore.clock_comparator)) + if (tod_after_eq(get_lowcore()->int_clock, + get_lowcore()->clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); @@ -110,7 +114,7 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; + return ((get_lowcore()->async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) @@ -118,7 +122,7 @@ static void do_irq_async(struct pt_regs *regs, int irq) if (on_async_stack()) { do_IRQ(regs, irq); } else { - call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + call_on_stack(2, get_lowcore()->async_stack, void, do_IRQ, struct pt_regs *, regs, int, irq); } } @@ -127,36 +131,41 @@ static int irq_pending(struct pt_regs *regs) { int cc; - asm volatile("tpi 0\n" - "ipm %0" : "=d" (cc) : : "cc"); - return cc >> 28; + asm volatile( + " tpi 0\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc); } void noinstr do_io_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) + if (cpu_has_bear()) current->thread.last_break = regs->last_break; } - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); + set_cpu_flag(CIF_NOHZ_DELAY); do { - regs->tpi_info = S390_lowcore.tpi_info; - if (S390_lowcore.tpi_info.adapter_IO) + regs->tpi_info = get_lowcore()->tpi_info; + if (get_lowcore()->tpi_info.adapter_IO) do_irq_async(regs, THIN_INTERRUPT); else do_irq_async(regs, IO_INTERRUPT); - } while (MACHINE_IS_LPAR && irq_pending(regs)); + } while (machine_is_lpar() && irq_pending(regs)); irq_exit_rcu(); @@ -171,21 +180,21 @@ void noinstr do_ext_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) + if (cpu_has_bear()) current->thread.last_break = regs->last_break; } - regs->int_code = S390_lowcore.ext_int_code_addr; - regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = S390_lowcore.ext_params2; + regs->int_code = get_lowcore()->ext_int_code_addr; + regs->int_parm = get_lowcore()->ext_params; + regs->int_parm_long = get_lowcore()->ext_params2; - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -250,7 +259,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index < nr_irqs) { + if (index < irq_get_nr_irqs()) { show_msi_interrupt(p, index); goto out; } @@ -385,7 +394,7 @@ void irq_subclass_register(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); if (!irq_subclass_refcount[subclass]) - ctl_set_bit(0, subclass); + system_ctl_set_bit(0, subclass); irq_subclass_refcount[subclass]++; spin_unlock(&irq_subclass_lock); } @@ -396,7 +405,7 @@ void irq_subclass_unregister(enum irq_subclass subclass) spin_lock(&irq_subclass_lock); irq_subclass_refcount[subclass]--; if (!irq_subclass_refcount[subclass]) - ctl_clear_bit(0, subclass); + system_ctl_clear_bit(0, subclass); spin_unlock(&irq_subclass_lock); } EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9da6fa30c447..4d364de43799 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = phdr->p_memsz; data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index af23eff5774d..a32ce8bea745 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.bufsz = image->kernel_buf_len; buf.mem = 0; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 0032bdbe8e3f..c450120b4474 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -9,11 +9,11 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include <linux/moduleloader.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> #include <linux/stop_machine.h> +#include <linux/cpufeature.h> #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/extable.h> @@ -21,6 +21,8 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/ftrace.h> +#include <linux/execmem.h> +#include <asm/text-patching.h> #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> @@ -31,41 +33,17 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(s390_insn); - -static int insn_page_in_use; - void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; - __set_memory((unsigned long) page, 1, SET_MEMORY_RO | SET_MEMORY_X); + set_memory_rox((unsigned long)page, 1); return page; } -static void *alloc_s390_insn_page(void) -{ - if (xchg(&insn_page_in_use, 1) == 1) - return NULL; - return &kprobes_insn_page; -} - -static void free_s390_insn_page(void *page) -{ - xchg(&insn_page_in_use, 0); -} - -struct kprobe_insn_cache kprobe_s390_insn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), - .alloc = alloc_s390_insn_page, - .free = free_s390_insn_page, - .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), - .insn_size = MAX_INSN_SIZE, -}; - static void copy_instruction(struct kprobe *p) { kprobe_opcode_t insn[MAX_INSN_SIZE]; @@ -79,10 +57,10 @@ static void copy_instruction(struct kprobe *p) if (probe_is_insn_relative_long(&insn[0])) { /* * For pc-relative instructions in RIL-b or RIL-c format patch - * the RI2 displacement field. We have already made sure that - * the insn slot for the patched instruction is within the same - * 2GB area as the original instruction (either kernel image or - * module area). Therefore the new displacement will always fit. + * the RI2 displacement field. The insn slot for the to be + * patched instruction is within the same 4GB area like the + * original instruction. Therefore the new displacement will + * always fit. */ disp = *(s32 *)&insn[1]; addr = (u64)(unsigned long)p->addr; @@ -94,34 +72,6 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static int s390_get_insn_slot(struct kprobe *p) -{ - /* - * Get an insn slot that is within the same 2GB area like the original - * instruction. That way instructions with a 32bit signed displacement - * field can be patched and executed within the insn slot. - */ - p->ainsn.insn = NULL; - if (is_kernel((unsigned long)p->addr)) - p->ainsn.insn = get_s390_insn_slot(); - else if (is_module_addr(p->addr)) - p->ainsn.insn = get_insn_slot(); - return p->ainsn.insn ? 0 : -ENOMEM; -} -NOKPROBE_SYMBOL(s390_get_insn_slot); - -static void s390_free_insn_slot(struct kprobe *p) -{ - if (!p->ainsn.insn) - return; - if (is_kernel((unsigned long)p->addr)) - free_s390_insn_slot(p->ainsn.insn, 0); - else - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; -} -NOKPROBE_SYMBOL(s390_free_insn_slot); - /* Check if paddr is at an instruction boundary */ static bool can_probe(unsigned long paddr) { @@ -175,7 +125,8 @@ int arch_prepare_kprobe(struct kprobe *p) /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) return -EINVAL; - if (s390_get_insn_slot(p)) + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) return -ENOMEM; copy_instruction(p); return 0; @@ -203,7 +154,12 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (cpu_has_seq_insn()) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -211,13 +167,21 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (cpu_has_seq_insn()) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_disarm_kprobe); void arch_remove_kprobe(struct kprobe *p) { - s390_free_insn_slot(p); + if (!p->ainsn.insn) + return; + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; } NOKPROBE_SYMBOL(arch_remove_kprobe); @@ -225,20 +189,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb, struct pt_regs *regs, unsigned long ip) { - struct per_regs per_kprobe; + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } per_kprobe; /* Set up the PER control registers %cr9-%cr11 */ - per_kprobe.control = PER_EVENT_IFETCH; - per_kprobe.start = ip; - per_kprobe.end = ip; + per_kprobe.control.val = PER_EVENT_IFETCH; + per_kprobe.start.val = ip; + per_kprobe.end.val = ip; /* Save control regs and psw mask */ - __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_store(9, 11, kcb->kprobe_saved_ctl); kcb->kprobe_saved_imask = regs->psw.mask & (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); /* Set PER control regs, turns on single step for the given address */ - __ctl_load(per_kprobe, 9, 11); + __local_ctl_load(9, 11, per_kprobe.regs); regs->psw.mask |= PSW_MASK_PER; regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); regs->psw.addr = ip; @@ -250,7 +221,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb, unsigned long ip) { /* Restore control regs and psw mask, set new psw address */ - __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_load(9, 11, kcb->kprobe_saved_ctl); regs->psw.mask &= ~PSW_MASK_PER; regs->psw.mask |= kcb->kprobe_saved_imask; regs->psw.addr = ip; @@ -279,19 +250,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->prev_kprobe.kp = NULL; } NOKPROBE_SYMBOL(pop_kprobe); -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->gprs[14]; - ri->fp = (void *)regs->gprs[15]; - - /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long)&__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) { switch (kcb->kprobe_status) { @@ -372,26 +334,6 @@ static int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -void arch_kretprobe_fixup_return(struct pt_regs *regs, - kprobe_opcode_t *correct_ret_addr) -{ - /* Replace fake return address with real one. */ - regs->gprs[14] = (unsigned long)correct_ret_addr; -} -NOKPROBE_SYMBOL(arch_kretprobe_fixup_return); - -/* - * Called from __kretprobe_trampoline - */ -void trampoline_probe_handler(struct pt_regs *regs) -{ - kretprobe_trampoline_handler(regs, (void *)regs->gprs[15]); -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - -/* assembler function that handles the kretprobes must not be probed itself */ -NOKPROBE_SYMBOL(__kretprobe_trampoline); - /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -433,12 +375,11 @@ static int post_kprobe_handler(struct pt_regs *regs) if (!p) return 0; + resume_execution(p, regs); if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - - resume_execution(p, regs); pop_kprobe(kcb); preempt_enable_no_resched(); @@ -549,6 +490,12 @@ int __init arch_init_kprobes(void) return 0; } +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); +} + int arch_trampoline_kprobe(struct kprobe *p) { return 0; diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S deleted file mode 100644 index f6cb022ef8c8..000000000000 --- a/arch/s390/kernel/kprobes_insn_page.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <linux/linkage.h> - -/* - * insn_page is a special 4k aligned dummy function for kprobes. - * It will contain all kprobed instructions that are out-of-line executed. - * The page must be within the kernel image to guarantee that the - * out-of-line instructions are within 2GB distance of their original - * location. Using a dummy function ensures that the insn_page is within - * the text section of the kernel and mapped read-only/executable from - * the beginning on, thus avoiding to split large mappings if the page - * would be in the data section instead. - */ - .section .kprobes.text, "ax" - .align 4096 -ENTRY(kprobes_insn_page) - .rept 2048 - .word 0x07fe - .endr -ENDPROC(kprobes_insn_page) - .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 6652e54cf3db..6d1ffca5f798 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -166,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); + mod_timer(&lgr_timer, jiffies + secs_to_jiffies(LGR_TIMER_INTERVAL_SECS)); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index ab761c008f98..baeb3dcfc1c8 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -13,7 +13,12 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> +#include <linux/cpufeature.h> +#include <asm/guarded_storage.h> +#include <asm/machine.h> +#include <asm/pfault.h> #include <asm/cio.h> +#include <asm/fpu.h> #include <asm/setup.h> #include <asm/smp.h> #include <asm/ipl.h> @@ -21,15 +26,15 @@ #include <asm/elf.h> #include <asm/asm-offsets.h> #include <asm/cacheflush.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> -#include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long, - unsigned long); +typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long); +typedef int (*purgatory_t)(int); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -40,13 +45,16 @@ extern const unsigned long long relocate_kernel_len; * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image */ -static void __do_machine_kdump(void *image) +static void __do_machine_kdump(void *data) { - int (*start_kdump)(int); + struct kimage *image = data; + purgatory_t purgatory; unsigned long prefix; + purgatory = (purgatory_t)image->start; + /* store_status() saved the prefix register to lowcore */ - prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + prefix = (unsigned long)get_lowcore()->prefixreg_save_area; /* Now do the reset */ s390_reset_system(); @@ -56,14 +64,12 @@ static void __do_machine_kdump(void *image) * This need to be done *after* s390_reset_system set the * prefix register of this CPU to zero */ - memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), - (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); + memcpy(absolute_pointer(get_lowcore()->floating_pt_save_area), + phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); - __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); - start_kdump = (void *)((struct kimage *) image)->start; - start_kdump(1); + call_nodat(1, int, purgatory, int, 1); - /* Die if start_kdump returns */ + /* Die if kdump returns */ disabled_wait(); } @@ -87,16 +93,16 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (MACHINE_HAS_VX) + mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK); + if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); - if (MACHINE_HAS_GS) { - __ctl_store(cr2_old.val, 2, 2); + if (cpu_has_gs()) { + local_ctl_store(2, &cr2_old.reg); cr2_new = cr2_old; cr2_new.gse = 1; - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); - __ctl_load(cr2_old.val, 2, 2); + local_ctl_load(2, &cr2_old.reg); } /* * To create a good backchain for this CPU in the dump store_status @@ -110,18 +116,6 @@ static noinline void __machine_kdump(void *image) store_status(__do_machine_kdump, image); } -static unsigned long do_start_kdump(unsigned long addr) -{ - struct kimage *image = (struct kimage *) addr; - int (*start_kdump)(int) = (void *)image->start; - int rc; - - __arch_local_irq_stnsm(0xfb); /* disable DAT */ - rc = start_kdump(0); - __arch_local_irq_stosm(0x04); /* enable DAT */ - return rc; -} - #endif /* CONFIG_CRASH_DUMP */ /* @@ -130,12 +124,10 @@ static unsigned long do_start_kdump(unsigned long addr) static bool kdump_csum_valid(struct kimage *image) { #ifdef CONFIG_CRASH_DUMP + purgatory_t purgatory = (purgatory_t)image->start; int rc; - preempt_disable(); - rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump, - unsigned long, (unsigned long)image); - preempt_enable(); + rc = call_nodat(1, int, purgatory, int, 0); return rc == 0; #else return false; @@ -188,7 +180,7 @@ void arch_kexec_unprotect_crashkres(void) static int machine_kexec_prepare_kdump(void) { #ifdef CONFIG_CRASH_DUMP - if (MACHINE_IS_VM) + if (machine_is_vm()) diag10_range(PFN_DOWN(crashk_res.start), PFN_DOWN(crashk_res.end - crashk_res.start + 1)); return 0; @@ -209,7 +201,7 @@ int machine_kexec_prepare(struct kimage *image) return -EINVAL; /* Get the destination where the assembler code should be copied to.*/ - reboot_code_buffer = (void *) page_to_phys(image->control_code_page); + reboot_code_buffer = page_to_virt(image->control_code_page); /* Then copy it */ memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); @@ -220,17 +212,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - VMCOREINFO_SYMBOL(lowcore_ptr); - VMCOREINFO_SYMBOL(high_memory); - VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31); - vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - put_abs_lowcore(vmcore_info, paddr_vmcoreinfo_note()); -} - void machine_shutdown(void) { } @@ -245,19 +226,20 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { - unsigned long diag308_subcode; - relocate_kernel_t data_mover; + unsigned long data_mover, entry, diag308_subcode; struct kimage *image = data; - s390_reset_system(); - data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); - - __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ - /* Call the moving routine */ + data_mover = page_to_phys(image->control_code_page); + entry = virt_to_phys(&image->head); diag308_subcode = DIAG308_CLEAR_RESET; if (sclp.has_iplcc) diag308_subcode |= DIAG308_FLAG_EI; - (*data_mover)(&image->head, image->start, diag308_subcode); + s390_reset_system(); + + call_nodat(3, void, (relocate_kernel_t)data_mover, + unsigned long, entry, + unsigned long, image->start, + unsigned long, diag308_subcode); /* Die if kexec returns */ disabled_wait(); diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index fc6d5f58debe..c2bac14dd668 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -105,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image, if (ret) return ret; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { u64 crash_size; @@ -121,6 +122,7 @@ static int kexec_file_update_purgatory(struct kimage *image, sizeof(crash_size), false); } +#endif return ret; } @@ -134,8 +136,10 @@ static int kexec_file_add_purgatory(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_load_purgatory(image, &buf); if (ret) @@ -158,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->parm->initrd_start = data->memsz; @@ -187,10 +193,8 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; ncerts = 0; while (ptr < end) { @@ -202,7 +206,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, addr = data->memsz + data->report->size; addr += ncerts * sizeof(struct ipl_rb_certificate_entry); - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); while (ptr < end) { len = *(unsigned int *)ptr; ptr += sizeof(len); @@ -225,6 +229,11 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; +#ifdef CONFIG_CRASH_DUMP + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; +#endif + ret = kexec_add_buffer(&buf); out: return ret; @@ -267,10 +276,12 @@ void *kexec_file_add_components(struct kimage *image, memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { data.parm->oldmem_base = crashk_res.start; data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } +#endif if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 4786bfe02144..1fec370fecf4 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -9,15 +9,21 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> -#include <asm/export.h> +#include <asm/march.h> +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) -#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) -#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS) /* packed stack: allocate just enough for r14, r15 and backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -28,9 +34,14 @@ .section .kprobes.text, "ax" -ENTRY(ftrace_stub) +SYM_FUNC_START(ftrace_stub) BR_EX %r14 -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) + +SYM_CODE_START(ftrace_stub_direct_tramp) + lgr %r1, %r0 + BR_EX %r1 +SYM_CODE_END(ftrace_stub_direct_tramp) .macro ftrace_regs_entry, allregs=0 stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller @@ -48,23 +59,23 @@ ENDPROC(ftrace_stub) stg %r1,__SF_BACKCHAIN(%r15) stg %r0,(__SF_GPRS+8*8)(%r15) stg %r15,(__SF_GPRS+9*8)(%r15) - # allocate pt_regs and stack frame for ftrace_trace_function - aghi %r15,-STACK_FRAME_SIZE - stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) - xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) .if \allregs == 1 - stg %r14,(STACK_PTREGS_PSW)(%r15) - mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS .else - xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15) + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) .endif lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address aghi %r1,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) - stg %r0,(STACK_PTREGS_PSW+8)(%r15) - stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) .endm SYM_CODE_START(ftrace_regs_caller) @@ -78,7 +89,7 @@ SYM_CODE_START(ftrace_caller) SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_common) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op lgrl %r1,ftrace_func @@ -91,30 +102,19 @@ SYM_CODE_START(ftrace_common) lg %r1,0(%r1) #endif lgr %r3,%r14 - la %r5,STACK_PTREGS(%r15) + la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -# The j instruction gets runtime patched to a nop instruction. -# See ftrace_enable_ftrace_graph_caller. -SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) - j .Lftrace_graph_caller_end - lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_PTREGS_PSW+8)(%r15) - brasl %r14,prepare_ftrace_return - stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -.Lftrace_graph_caller_end: -#endif - lg %r0,(STACK_PTREGS_PSW+8)(%r15) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) +#ifdef MARCH_HAS_Z196_FEATURES + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) locgrz %r1,%r0 #else - lg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) ltgr %r1,%r1 jnz 0f lgr %r1,%r0 #endif -0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 SYM_CODE_END(ftrace_common) @@ -123,10 +123,14 @@ SYM_CODE_END(ftrace_common) SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD + # allocate ftrace_regs and stack frame for ftrace_return_to_handler + aghi %r15,-STACK_FRAME_SIZE_FREGS stg %r1,__SF_BACKCHAIN(%r15) + stg %r2,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + la %r2,STACK_FRAME_OVERHEAD(%r15) brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD + aghi %r15,STACK_FRAME_SIZE_FREGS lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 @@ -135,16 +139,31 @@ SYM_FUNC_END(return_to_handler) #endif #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_KPROBES - -SYM_FUNC_START(__kretprobe_trampoline) - +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br) + lmg %r0,%r1,2(%r1) + br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br) + +#ifdef CONFIG_EXPOLINE +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl) + lmg %r0,%r1,2(%r1) + exrl %r0,0f + j . +0: br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) +#endif /* CONFIG_EXPOLINE */ + +#ifdef CONFIG_RETHOOK + +SYM_CODE_START(arch_rethook_trampoline) stg %r14,(__SF_GPRS+8*8)(%r15) - lay %r15,-STACK_FRAME_SIZE(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) # store original stack pointer in backchain and pt_regs - lay %r7,STACK_FRAME_SIZE(%r15) + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) stg %r7,__SF_BACKCHAIN(%r15) stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) @@ -152,16 +171,15 @@ SYM_FUNC_START(__kretprobe_trampoline) epsw %r2,%r3 risbg %r3,%r2,0,31,32 stg %r3,STACK_PTREGS_PSW(%r15) - larl %r1,__kretprobe_trampoline + larl %r1,arch_rethook_trampoline stg %r1,STACK_PTREGS_PSW+8(%r15) lay %r2,STACK_PTREGS(%r15) - brasl %r14,trampoline_probe_handler + brasl %r14,arch_rethook_trampoline_callback mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) lpswe __SF_EMPTY(%r15) +SYM_CODE_END(arch_rethook_trampoline) -SYM_FUNC_END(__kretprobe_trampoline) - -#endif /* CONFIG_KPROBES */ +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 2d159b32885b..91e207b50394 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,11 +21,13 @@ #include <linux/moduleloader.h> #include <linux/bug.h> #include <linux/memory.h> +#include <linux/execmem.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> #include <asm/ftrace.lds.h> #include <asm/set_memory.h> +#include <asm/setup.h> #if 0 #define DEBUGP printk @@ -35,27 +37,10 @@ #define PLT_ENTRY_SIZE 22 -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - return p; -} - #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -126,6 +111,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, Elf_Rela *rela; char *strings; int nrela, i, j; + struct module_memory *mod_mem; /* Find symbol table and string table. */ symtab = NULL; @@ -173,14 +159,15 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, /* Increase core size by size of got & plt and set start offsets for got and plt. */ - me->core_layout.size = ALIGN(me->core_layout.size, 4); - me->arch.got_offset = me->core_layout.size; - me->core_layout.size += me->arch.got_size; - me->arch.plt_offset = me->core_layout.size; + mod_mem = &me->mem[MOD_TEXT]; + mod_mem->size = ALIGN(mod_mem->size, 4); + me->arch.got_offset = mod_mem->size; + mod_mem->size += me->arch.got_size; + me->arch.plt_offset = mod_mem->size; if (me->arch.plt_size) { if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) me->arch.plt_size += PLT_ENTRY_SIZE; - me->core_layout.size += me->arch.plt_size; + mod_mem->size += me->arch.plt_size; } return 0; } @@ -304,7 +291,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ if (info->got_initialized == 0) { - Elf_Addr *gotent = me->core_layout.base + + Elf_Addr *gotent = me->mem[MOD_TEXT].base + me->arch.got_offset + info->got_offset; @@ -329,7 +316,8 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, rc = apply_rela_bits(loc, val, 0, 64, 0, write); else if (r_type == R_390_GOTENT || r_type == R_390_GOTPLTENT) { - val += (Elf_Addr) me->core_layout.base - loc; + val += (Elf_Addr)me->mem[MOD_TEXT].base + + me->arch.got_offset - loc; rc = apply_rela_bits(loc, val, 1, 32, 1, write); } break; @@ -345,7 +333,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, char *plt_base; char *ip; - plt_base = me->core_layout.base + me->arch.plt_offset; + plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset; ip = plt_base + info->plt_offset; *(int *)insn = 0x0d10e310; /* basr 1,0 */ *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ @@ -375,7 +363,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, val - loc + 0xffffUL < 0x1ffffeUL) || (r_type == R_390_PLT32DBL && val - loc + 0xffffffffULL < 0x1fffffffeULL))) - val = (Elf_Addr) me->core_layout.base + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.plt_offset + info->plt_offset; val += rela->r_addend - loc; @@ -397,7 +385,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTOFF32: /* 32 bit offset to GOT. */ case R_390_GOTOFF64: /* 64 bit offset to GOT. */ val = val + rela->r_addend - - ((Elf_Addr) me->core_layout.base + me->arch.got_offset); + ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset); if (r_type == R_390_GOTOFF16) rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOTOFF32) @@ -407,7 +395,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, break; case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ - val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset + rela->r_addend - loc; if (r_type == R_390_GOTPC) rc = apply_rela_bits(loc, val, 1, 32, 0, write); @@ -486,10 +474,10 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM; - set_memory_ro((unsigned long)start, numpages); + set_memory_rox((unsigned long)start, numpages); end = start + size; me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start; @@ -515,7 +503,7 @@ int module_finalize(const Elf_Ehdr *hdr, !nospec_disable && me->arch.plt_size) { unsigned int *ij; - ij = me->core_layout.base + me->arch.plt_offset + + ij = me->mem[MOD_TEXT].base + me->arch.plt_offset + me->arch.plt_size - PLT_ENTRY_SIZE; ij[0] = 0xc6000000; /* exrl %r0,.+10 */ ij[1] = 0x0005a7f4; /* j . */ diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 53ed3884fe64..3da371c144eb 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -9,8 +9,10 @@ */ #include <linux/kernel_stat.h> +#include <linux/cpufeature.h> #include <linux/init.h> #include <linux/errno.h> +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <linux/log2.h> #include <linux/kprobes.h> @@ -18,20 +20,19 @@ #include <linux/time.h> #include <linux/module.h> #include <linux/sched/signal.h> - +#include <linux/kvm_host.h> #include <linux/export.h> #include <asm/lowcore.h> +#include <asm/ctlreg.h> +#include <asm/fpu.h> #include <asm/smp.h> #include <asm/stp.h> #include <asm/cputime.h> #include <asm/nmi.h> #include <asm/crw.h> -#include <asm/switch_to.h> -#include <asm/ctl_reg.h> #include <asm/asm-offsets.h> #include <asm/pai.h> - -#include <linux/kvm_host.h> +#include <asm/vtime.h> struct mcck_struct { unsigned int kill_task : 1; @@ -42,19 +43,10 @@ struct mcck_struct { }; static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static struct kmem_cache *mcesa_cache; -static unsigned long mcesa_origin_lc; static inline int nmi_needs_mcesa(void) { - return MACHINE_HAS_VX || MACHINE_HAS_GS; -} - -static inline unsigned long nmi_get_mcesa_size(void) -{ - if (MACHINE_HAS_GS) - return MCESA_MAX_SIZE; - return MCESA_MIN_SIZE; + return cpu_has_vx() || cpu_has_gs(); } /* @@ -63,47 +55,34 @@ static inline unsigned long nmi_get_mcesa_size(void) * structure. The structure is required for machine check happening * early in the boot process. */ -static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); void __init nmi_alloc_mcesa_early(u64 *mcesad) { if (!nmi_needs_mcesa()) return; *mcesad = __pa(&boot_mcesa); - if (MACHINE_HAS_GS) + if (cpu_has_gs()) *mcesad |= ilog2(MCESA_MAX_SIZE); } -static void __init nmi_alloc_cache(void) +int nmi_alloc_mcesa(u64 *mcesad) { unsigned long size; - - if (!nmi_needs_mcesa()) - return; - size = nmi_get_mcesa_size(); - if (size > MCESA_MIN_SIZE) - mcesa_origin_lc = ilog2(size); - /* create slab cache for the machine-check-extended-save-areas */ - mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); - if (!mcesa_cache) - panic("Couldn't create nmi save area cache"); -} - -int __ref nmi_alloc_mcesa(u64 *mcesad) -{ - unsigned long origin; + void *origin; *mcesad = 0; if (!nmi_needs_mcesa()) return 0; - if (!mcesa_cache) - nmi_alloc_cache(); - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + size = cpu_has_gs() ? MCESA_MAX_SIZE : MCESA_MIN_SIZE; + origin = kmalloc(size, GFP_KERNEL); if (!origin) return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - *mcesad = __pa(origin) | mcesa_origin_lc; + kmemleak_not_leak(origin); + *mcesad = __pa(origin); + if (cpu_has_gs()) + *mcesad |= ilog2(MCESA_MAX_SIZE); return 0; } @@ -111,12 +90,65 @@ void nmi_free_mcesa(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - kmem_cache_free(mcesa_cache, __va(*mcesad & MCESA_ORIGIN_MASK)); + kfree(__va(*mcesad & MCESA_ORIGIN_MASK)); +} + +static __always_inline char *nmi_puts(char *dest, const char *src) +{ + while (*src) + *dest++ = *src++; + *dest = 0; + return dest; +} + +static __always_inline char *u64_to_hex(char *dest, u64 val) +{ + int i, num; + + for (i = 1; i <= 16; i++) { + num = (val >> (64 - 4 * i)) & 0xf; + if (num >= 10) + *dest++ = 'A' + num - 10; + else + *dest++ = '0' + num; + } + *dest = 0; + return dest; } static notrace void s390_handle_damage(void) { + struct lowcore *lc = get_lowcore(); + union ctlreg0 cr0, cr0_new; + char message[100]; + psw_t psw_save; + char *ptr; + smp_emergency_stop(); + diag_amode31_ops.diag308_reset(); + ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); + u64_to_hex(ptr, lc->mcck_interruption_code); + + /* + * Disable low address protection and make machine check new PSW a + * disabled wait PSW. Any additional machine check cannot be handled. + */ + local_ctl_store(0, &cr0.reg); + cr0_new = cr0; + cr0_new.lap = 0; + local_ctl_load(0, &cr0_new.reg); + psw_save = lc->mcck_new_psw; + psw_bits(lc->mcck_new_psw).io = 0; + psw_bits(lc->mcck_new_psw).ext = 0; + psw_bits(lc->mcck_new_psw).wait = 1; + sclp_emergency_printk(message); + + /* + * Restore machine check new PSW and control register 0 to original + * values. This makes possible system dump analysis easier. + */ + lc->mcck_new_psw = psw_save; + local_ctl_load(0, &cr0.reg); disabled_wait(); while (1); } @@ -126,19 +158,20 @@ NOKPROBE_SYMBOL(s390_handle_damage); * Main machine check handler function. Will be called with interrupts disabled * and machine checks enabled. */ -void __s390_handle_mcck(void) +void s390_handle_mcck(void) { struct mcck_struct mcck; + unsigned long mflags; /* * Disable machine checks and get the current state of accumulated * machine checks. Afterwards delete the old state and enable machine * checks again. */ - local_mcck_disable(); + local_mcck_save(mflags); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); - local_mcck_enable(); + local_mcck_restore(mflags); if (mcck.channel_report) crw_handle_channel_report(); @@ -155,203 +188,80 @@ void __s390_handle_mcck(void) static int mchchk_wng_posted = 0; /* Use single cpu clear, as we cannot handle smp here. */ - __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT); if (xchg(&mchchk_wng_posted, 1) == 0) kill_cad_pid(SIGPWR, 1); } if (mcck.stp_queue) stp_queue_work(); if (mcck.kill_task) { - local_irq_enable(); printk(KERN_EMERG "mcck: Terminating task because of machine " "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - make_task_dead(SIGSEGV); + if (is_global_init(current)) + panic("mcck: Attempting to kill init!\n"); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID); } } -void noinstr s390_handle_mcck(struct pt_regs *regs) -{ - trace_hardirqs_off(); - pai_kernel_enter(regs); - __s390_handle_mcck(); - pai_kernel_exit(regs); - trace_hardirqs_on(); -} -/* - * returns 0 if all required registers are available - * returns 1 otherwise +/** + * nmi_registers_valid - verify if registers are valid + * @mci: machine check interruption code + * + * Inspect a machine check interruption code and verify if all required + * registers are valid. For some registers the corresponding validity bit is + * ignored and the registers are set to the expected value. + * Returns true if all registers are valid, otherwise false. */ -static int notrace s390_validate_registers(union mci mci, int umode) +static bool notrace nmi_registers_valid(union mci mci) { - struct mcesa *mcesa; - void *fpt_save_area; union ctlreg2 cr2; - int kill_task; - u64 zero; - - kill_task = 0; - zero = 0; - if (!mci.gr) { - /* - * General purpose registers couldn't be restored and have - * unknown contents. Stop system or terminate process. - */ - if (!umode) - s390_handle_damage(); - kill_task = 1; - } - if (!mci.fp) { - /* - * Floating point registers can't be restored. If the - * kernel currently uses floating point registers the - * system is stopped. If the process has its floating - * pointer registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - fpt_save_area = &S390_lowcore.floating_pt_save_area; - if (!mci.fc) { - /* - * Floating point control register can't be restored. - * If the kernel currently uses the floating pointer - * registers and needs the FPC register the system is - * stopped. If the process has its floating pointer - * registers loaded it is terminated. Otherwise the - * FPC is just validated. - */ - if (S390_lowcore.fpu_flags & KERNEL_FPC) - s390_handle_damage(); - asm volatile( - " lfpc %0\n" - : - : "Q" (zero)); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } else { - asm volatile( - " lfpc %0\n" - : - : "Q" (S390_lowcore.fpt_creg_save_area)); - } - - mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (!MACHINE_HAS_VX) { - /* Validate floating point registers */ - asm volatile( - " ld 0,0(%0)\n" - " ld 1,8(%0)\n" - " ld 2,16(%0)\n" - " ld 3,24(%0)\n" - " ld 4,32(%0)\n" - " ld 5,40(%0)\n" - " ld 6,48(%0)\n" - " ld 7,56(%0)\n" - " ld 8,64(%0)\n" - " ld 9,72(%0)\n" - " ld 10,80(%0)\n" - " ld 11,88(%0)\n" - " ld 12,96(%0)\n" - " ld 13,104(%0)\n" - " ld 14,112(%0)\n" - " ld 15,120(%0)\n" - : - : "a" (fpt_save_area) - : "memory"); - } else { - /* Validate vector registers */ - union ctlreg0 cr0; - - /* - * The vector validity must only be checked if not running a - * KVM guest. For KVM guests the machine check is forwarded by - * KVM and it is the responsibility of the guest to take - * appropriate actions. The host vector or FPU values have been - * saved by KVM and will be restored by KVM. - */ - if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) { - /* - * Vector registers can't be restored. If the kernel - * currently uses vector registers the system is - * stopped. If the process has its vector registers - * loaded it is terminated. Otherwise just validate - * the registers. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - cr0.val = S390_lowcore.cregs_save_area[0]; - cr0.afp = cr0.vx = 1; - __ctl_load(cr0.val, 0, 0); - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ - : - : "Q" (*(struct vx_array *)mcesa->vector_save_area) - : "1"); - __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); - } - /* Validate access registers */ - asm volatile( - " lam 0,15,0(%0)\n" - : - : "a" (&S390_lowcore.access_regs_save_area) - : "memory"); - if (!mci.ar) { - /* - * Access registers have unknown contents. - * Terminating task. - */ - kill_task = 1; - } - /* Validate guarded storage registers */ - cr2.val = S390_lowcore.cregs_save_area[2]; - if (cr2.gse) { - if (!mci.gs) { - /* - * 2 cases: - * - machine check in kernel or userspace - * - machine check while running SIE (KVM guest) - * For kernel or userspace the userspace values of - * guarded storage control can not be recreated, the - * process must be terminated. - * For SIE the guest values of guarded storage can not - * be recreated. This is either due to a bug or due to - * GS being disabled in the guest. The guest will be - * notified by KVM code and the guests machine check - * handling must take care of this. The host values - * are saved by KVM and are not affected. - */ - if (!test_cpu_flag(CIF_MCCK_GUEST)) - kill_task = 1; - } else { - load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); - } - } /* - * The getcpu vdso syscall reads CPU number from the programmable + * The getcpu vdso syscall reads the CPU number from the programmable * field of the TOD clock. Disregard the TOD programmable register - * validity bit and load the CPU number into the TOD programmable - * field unconditionally. + * validity bit and load the CPU number into the TOD programmable field + * unconditionally. */ set_tod_programmable_field(raw_smp_processor_id()); - /* Validate clock comparator register */ - set_clock_comparator(S390_lowcore.clock_comparator); - + /* + * Set the clock comparator register to the next expected value. + */ + set_clock_comparator(get_lowcore()->clock_comparator); + if (!mci.gr || !mci.fp || !mci.fc) + return false; + /* + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. + */ + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) + return false; + if (!mci.ar) + return false; + /* + * Two cases for guarded storage registers: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of guarded storage + * control can not be recreated, the process must be terminated. + * For SIE the guest values of guarded storage can not be recreated. + * This is either due to a bug or due to GS being disabled in the + * guest. The guest will be notified by KVM code and the guests machine + * check handling must take care of this. The host values are saved by + * KVM and are not affected. + */ + cr2.reg = get_lowcore()->cregs_save_area[2]; + if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST)) + return false; if (!mci.ms || !mci.pm || !mci.ia) - kill_task = 1; - - return kill_task; + return false; + return true; } -NOKPROBE_SYMBOL(s390_validate_registers); +NOKPROBE_SYMBOL(nmi_registers_valid); /* * Backup the guest's machine check info to its description block @@ -362,8 +272,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) struct sie_page *sie_page; /* r14 contains the sie block, which was set in sie64a */ - struct kvm_s390_sie_block *sie_block = - (struct kvm_s390_sie_block *) regs->gprs[14]; + struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]); if (sie_block == NULL) /* Something's seriously wrong, stop system. */ @@ -371,11 +280,10 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) sie_page = container_of(sie_block, struct sie_page, sie_block); mcck_backup = &sie_page->mcck_info; - mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + mcck_backup->mcic = get_lowcore()->mcck_interruption_code & ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); - mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; - mcck_backup->failing_storage_address - = S390_lowcore.failing_storage_address; + mcck_backup->ext_damage_code = get_lowcore()->external_damage_code; + mcck_backup->failing_storage_address = get_lowcore()->failing_storage_address; } NOKPROBE_SYMBOL(s390_backup_mcck_info); @@ -390,23 +298,25 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info); /* * machine check handler. */ -int notrace s390_do_machine_check(struct pt_regs *regs) +void notrace s390_do_machine_check(struct pt_regs *regs) { static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); static unsigned long long last_ipd; + struct lowcore *lc = get_lowcore(); struct mcck_struct *mcck; unsigned long long tmp; + irqentry_state_t irq_state; union mci mci; unsigned long mcck_dam_code; int mcck_pending = 0; - nmi_enter(); + irq_state = irqentry_nmi_enter(regs); if (user_mode(regs)) update_timer_mcck(); inc_irq_stat(NMI_NMI); - mci.val = S390_lowcore.mcck_interruption_code; + mci.val = lc->mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); /* @@ -449,7 +359,9 @@ int notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_validate_registers(mci, user_mode(regs))) { + if (!nmi_registers_valid(mci)) { + if (!user_mode(regs)) + s390_handle_damage(); /* * Couldn't restore all register contents for the * user space process -> mark task for termination. @@ -472,13 +384,27 @@ int notrace s390_do_machine_check(struct pt_regs *regs) } if (mci.ed && mci.ec) { /* External damage */ - if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + if (lc->external_damage_code & (1U << ED_STP_SYNC)) mcck->stp_queue |= stp_sync_check(); - if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + if (lc->external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); mcck_pending = 1; } - + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + /* Storage error uncorrected */ + if (mci.se) + s390_handle_damage(); + /* Storage key-error uncorrected */ + if (mci.ke) + s390_handle_damage(); + /* Storage degradation */ + if (mci.ds && mci.fa) + s390_handle_damage(); + } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; @@ -503,24 +429,18 @@ int notrace s390_do_machine_check(struct pt_regs *regs) } clear_cpu_flag(CIF_MCCK_GUEST); - if (user_mode(regs) && mcck_pending) { - nmi_exit(); - return 1; - } - if (mcck_pending) schedule_mcck_handler(); - nmi_exit(); - return 0; + irqentry_nmi_exit(regs, irq_state); } NOKPROBE_SYMBOL(s390_do_machine_check); static int __init machine_check_init(void) { - ctl_set_bit(14, 25); /* enable external damage MCH */ - ctl_set_bit(14, 27); /* enable system recovery MCH */ - ctl_set_bit(14, 24); /* enable warning MCH */ + system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT); return 0; } early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 717bbcc056e5..e11ec15960a1 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -4,6 +4,8 @@ #include <linux/cpu.h> #include <asm/nospec-branch.h> +int nobp = IS_ENABLED(CONFIG_KERNEL_NOBP); + static int __init nobp_setup_early(char *str) { bool enabled; @@ -14,14 +16,14 @@ static int __init nobp_setup_early(char *str) return rc; if (enabled && test_facility(82)) { /* - * The user explicitely requested nobp=1, enable it and + * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, alt_stfle_fac_list); + nobp = 1; if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } return 0; } @@ -29,7 +31,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; return 0; } early_param("nospec", nospec_setup_early); @@ -40,7 +42,7 @@ static int __init nospec_report(void) pr_info("Spectre V2 mitigation: etokens\n"); if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) + if (nobp_enabled()) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +68,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. */ nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } /* * If the kernel has not been compiled with expolines the @@ -86,7 +88,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; @@ -114,10 +116,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) type = BRASL_EXPOLINE; /* brasl instruction */ else continue; - thunk = instr + (*(int *)(instr + 2)) * 2; + thunk = instr + (long)(*(int *)(instr + 2)) * 2; if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ - br = thunk + (*(int *)(thunk + 2)) * 2; + br = thunk + (long)(*(int *)(thunk + 2)) * 2; else continue; if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index 52d4353188ad..5970dd3ee7c5 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -7,17 +7,17 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return sysfs_emit(buf, "Mitigation: __user pointer sanitization\n"); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { if (test_facility(156)) - return sprintf(buf, "Mitigation: etokens\n"); + return sysfs_emit(buf, "Mitigation: etokens\n"); if (nospec_uses_trampoline()) - return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) - return sprintf(buf, "Mitigation: limited branch prediction\n"); - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Mitigation: execute trampolines\n"); + if (nobp_enabled()) + return sysfs_emit(buf, "Mitigation: limited branch prediction\n"); + return sysfs_emit(buf, "Vulnerable\n"); } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 23ab9f02f278..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include <linux/node.h> #include <asm/numa.h> -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; @@ -24,12 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 1acc2e05d70f..c2a468986212 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -13,9 +13,13 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <asm/checksum.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> #include <asm/asm-offsets.h> +#include <asm/sections.h> +#include <asm/ipl.h> /* * OS info structure has to be page aligned @@ -28,7 +32,7 @@ static struct os_info os_info __page_aligned_data; u32 os_info_csum(struct os_info *os_info) { int size = sizeof(*os_info) - offsetof(struct os_info, version_major); - return (__force u32)csum_partial(&os_info->version_major, size, 0); + return (__force u32)cksm(&os_info->version_major, size, 0); } /* @@ -42,13 +46,24 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) } /* - * Add OS info entry and update checksum + * Add OS info data entry and update checksum */ -void os_info_entry_add(int nr, void *ptr, u64 size) +void os_info_entry_add_data(int nr, void *ptr, u64 size) { os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; - os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); + os_info.entry[nr].csum = (__force u32)cksm(ptr, size, 0); + os_info.csum = os_info_csum(&os_info); +} + +/* + * Add OS info value entry and update checksum + */ +void os_info_entry_add_val(int nr, u64 value) +{ + os_info.entry[nr].val = value; + os_info.entry[nr].size = 0; + os_info.entry[nr].csum = 0; os_info.csum = os_info_csum(&os_info); } @@ -57,13 +72,25 @@ void os_info_entry_add(int nr, void *ptr, u64 size) */ void __init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; + BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE); os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; + os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base); + os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset()); + os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys); + os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap); + os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START); + os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END); + os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext); + os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end); + os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext)); os_info.csum = os_info_csum(&os_info); - put_abs_lowcore(os_info, __pa(ptr)); + abs_lc = get_abs_lowcore(); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc); } #ifdef CONFIG_CRASH_DUMP @@ -95,7 +122,7 @@ static void os_info_old_alloc(int nr, int align) msg = "copy failed"; goto fail_free; } - csum = (__force u32)csum_partial(buf_align, size, 0); + csum = (__force u32)cksm(buf_align, size, 0); if (csum != os_info_old->entry[nr].csum) { msg = "checksum failed"; goto fail_free; @@ -122,7 +149,7 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!oldmem_data.start) + if (!oldmem_data.start && !is_ipl_type_dump()) goto fail; if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; @@ -154,7 +181,7 @@ fail: } /* - * Return pointer to os infor entry and its size + * Return pointer to os info entry and its size */ void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..6a262e198e35 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,7 +2,7 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2021 + * Copyright IBM Corp. 2012, 2023 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> * Thomas Richter <tmricht@linux.ibm.com> */ @@ -16,14 +16,310 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/miscdevice.h> +#include <linux/perf_event.h> -#include <asm/cpu_mcf.h> +#include <asm/cpu_mf.h> #include <asm/hwctrset.h> #include <asm/debug.h> +/* Perf PMU definitions for the counter facility */ +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ +#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + refcount_t refcnt; /* Reference count */ + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ static debug_info_t *cf_dbg; +/* + * The CPU Measurement query counter information instruction contains + * information which varies per machine generation, but is constant and + * does not change when running on a particular machine, such as counter + * first and second version number. This is needed to determine the size + * of counter sets. Extract this information at device driver initialization. + */ +static struct cpumf_ctr_info cpumf_ctr_info; + +struct cpu_cf_ptr { + struct cpu_cf_events *cpucf; +}; + +static struct cpu_cf_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct cpu_cf_ptr __percpu *cfptr; +} cpu_cf_root; + +/* + * Serialize event initialization and event removal. Both are called from + * user space in task context with perf_event_open() and close() + * system calls. + * + * This mutex serializes functions cpum_cf_alloc_cpu() called at event + * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu() + * called at event removal via call back function hw_perf_event_destroy() + * when the event is deleted. They are serialized to enforce correct + * bookkeeping of pointer and reference counts anchored by + * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the + * per CPU pointers stored in cpu_cf_root::cfptr. + */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Get pointer to per-cpu structure. + * + * Function get_cpu_cfhw() is called from + * - cfset_copy_all(): This function is protected by cpus_read_lock(), so + * CPU hot plug remove can not happen. Event removal requires a close() + * first. + * + * Function this_cpu_cfhw() is called from perf common code functions: + * - pmu_{en|dis}able(), pmu_{add|del}()and pmu_{start|stop}(): + * All functions execute with interrupts disabled on that particular CPU. + * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_copy_all(). + * + * Therefore it is safe to access the CPU specific pointer to the event. + */ +static struct cpu_cf_events *get_cpu_cfhw(int cpu) +{ + struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr; + + if (p) { + struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu); + + return q->cpucf; + } + return NULL; +} + +static struct cpu_cf_events *this_cpu_cfhw(void) +{ + return get_cpu_cfhw(smp_processor_id()); +} + +/* Disable counter sets on dedicated CPU */ +static void cpum_cf_reset_cpu(void *flags) +{ + lcctl(0); +} + +/* Free per CPU data when the last event is removed. */ +static void cpum_cf_free_root(void) +{ + if (!refcount_dec_and_test(&cpu_cf_root.refcnt)) + return; + free_percpu(cpu_cf_root.cfptr); + cpu_cf_root.cfptr = NULL; + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n", + __func__, refcount_read(&cpu_cf_root.refcnt), + !cpu_cf_root.cfptr); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int cpum_cf_alloc_root(void) +{ + int rc = 0; + + if (refcount_inc_not_zero(&cpu_cf_root.refcnt)) + return rc; + + /* The memory is already zeroed. */ + cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr); + if (cpu_cf_root.cfptr) { + refcount_set(&cpu_cf_root.refcnt, 1); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + } else { + rc = -ENOMEM; + } + + return rc; +} + +/* Free CPU counter data structure for a PMU */ +static void cpum_cf_free_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + + mutex_lock(&pmc_reserve_mutex); + /* + * When invoked via CPU hotplug handler, there might be no events + * installed or that particular CPU might not have an + * event installed. This anchor pointer can be NULL! + */ + if (!cpu_cf_root.cfptr) + goto out; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + /* + * Might be zero when called from CPU hotplug handler and no event + * installed on that CPU, but on different CPUs. + */ + if (!cpuhw) + goto out; + + if (refcount_dec_and_test(&cpuhw->refcnt)) { + kfree(cpuhw); + p->cpucf = NULL; + } + cpum_cf_free_root(); +out: + mutex_unlock(&pmc_reserve_mutex); +} + +/* Allocate CPU counter data structure for a PMU. Called under mutex lock. */ +static int cpum_cf_alloc_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + int rc; + + mutex_lock(&pmc_reserve_mutex); + rc = cpum_cf_alloc_root(); + if (rc) + goto unlock; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + + if (!cpuhw) { + cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + if (cpuhw) { + p->cpucf = cpuhw; + refcount_set(&cpuhw->refcnt, 1); + } else { + rc = -ENOMEM; + } + } else { + refcount_inc(&cpuhw->refcnt); + } + if (rc) { + /* + * Error in allocation of event, decrement anchor. Since + * cpu_cf_event in not created, its destroy() function is not + * invoked. Adjust the reference counter for the anchor. + */ + cpum_cf_free_root(); + } +unlock: + mutex_unlock(&pmc_reserve_mutex); + return rc; +} + +/* + * Create/delete per CPU data structures for /dev/hwctr interface and events + * created by perf_event_open(). + * If cpu is -1, track task on all available CPUs. This requires + * allocation of hardware data structures for all CPUs. This setup handles + * perf_event_open() with task context and /dev/hwctr interface. + * If cpu is non-zero install event on this CPU only. This setup handles + * perf_event_open() with CPU context. + */ +static int cpum_cf_alloc(int cpu) +{ + cpumask_var_t mask; + int rc; + + if (cpu == -1) { + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + for_each_online_cpu(cpu) { + rc = cpum_cf_alloc_cpu(cpu); + if (rc) { + for_each_cpu(cpu, mask) + cpum_cf_free_cpu(cpu); + break; + } + cpumask_set_cpu(cpu, mask); + } + free_cpumask_var(mask); + } else { + rc = cpum_cf_alloc_cpu(cpu); + } + return rc; +} + +static void cpum_cf_free(int cpu) +{ + if (cpu == -1) { + for_each_online_cpu(cpu) + cpum_cf_free_cpu(cpu); + } else { + cpum_cf_free_cpu(cpu); + } +} + #define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ /* interval in seconds */ @@ -96,11 +392,10 @@ struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ /* Create the trailer data at the end of a page. */ static void cfdiag_trailer(struct cf_trailer_entry *te) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct cpuid cpuid; - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; + te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */ + te->csvn = cpumf_ctr_info.csvn; get_cpu_id(&cpuid); /* Machine type */ te->mach_type = cpuid.machine; @@ -112,6 +407,63 @@ static void cfdiag_trailer(struct cf_trailer_entry *te) te->timestamp = get_tod_clock_fast(); } +/* + * The number of counters per counter set varies between machine generations, + * but is constant when running on a particular machine generation. + * Determine each counter set size at device driver initialization and + * retrieve it later. + */ +static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX]; +static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (cpumf_ctr_info.cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn == 1) + ctrset_size = 6; + else if (cpumf_ctr_info.cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 16; + else if (cpumf_ctr_info.csvn >= 6) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn == 1) + ctrset_size = 32; + else if (cpumf_ctr_info.csvn == 2) + ctrset_size = 48; + else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 128; + else if (cpumf_ctr_info.csvn >= 6 && cpumf_ctr_info.csvn <= 8) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + cpumf_ctr_setsizes[ctrset] = ctrset_size; +} + +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset) +{ + return cpumf_ctr_setsizes[ctrset]; +} + /* Read a counter set. The counter set number determines the counter set and * the CPUM-CF first and second version number determine the number of * available counters in each counter set. @@ -130,14 +482,13 @@ static void cfdiag_trailer(struct cf_trailer_entry *te) static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, size_t room, bool error_ok) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); size_t ctrset_size, need = 0; int rc = 3; /* Assume write failure */ ctrdata->def = CF_DIAG_CTRSET_DEF; ctrdata->set = ctrset; ctrdata->res1 = 0; - ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); + ctrset_size = cpum_cf_read_setsize(ctrset); if (ctrset_size) { /* Save data */ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); @@ -151,10 +502,6 @@ static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, need = 0; } - debug_sprintf_event(cf_dbg, 3, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", __func__, ctrset, ctrset_size, - cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); return need; } @@ -213,25 +560,31 @@ static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) struct cf_trailer_entry *trailer_start, *trailer_stop; struct cf_ctrset_entry *ctrstart, *ctrstop; size_t offset = 0; + int i; - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + /* Counter set not authorized */ + if (!(auth & cpumf_ctr_ctl[i])) + continue; + /* Counter set size zero was not saved */ + if (!cpum_cf_read_setsize(i)) + continue; + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { pr_err_once("cpum_cf_diag counter set compare error " "in set %i\n", ctrstart->set); return 0; } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; if (ctrstart->def == CF_DIAG_CTRSET_DEF) { cfdiag_diffctrset((u64 *)(ctrstart + 1), (u64 *)(ctrstop + 1), ctrstart->ctr); offset += ctrstart->ctr * sizeof(u64) + sizeof(*ctrstart); } - } while (ctrstart->def && auth); + } /* Save time_stamp from start of event in stop's trailer */ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); @@ -259,40 +612,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc, - enum cpumf_ctr_set set) +static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set) { - struct cpu_cf_events *cpuhw; - int err = 0; u16 mtdiag_ctl; - - cpuhw = &get_cpu_var(cpu_cf_events); + int err = 0; /* check required version for counter sets */ switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: - if (cpuhw->info.cfvn < 1) + if (cpumf_ctr_info.cfvn < 1) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: - if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && - hwc->config > 79) || - (cpuhw->info.csvn >= 6 && hwc->config > 83)) + if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 && + config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_EXT: - if (cpuhw->info.csvn < 1) + if (cpumf_ctr_info.csvn < 1) err = -EOPNOTSUPP; - if ((cpuhw->info.csvn == 1 && hwc->config > 159) || - (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 - && hwc->config > 255) || - (cpuhw->info.csvn >= 6 && hwc->config > 287)) + if ((cpumf_ctr_info.csvn == 1 && config > 159) || + (cpumf_ctr_info.csvn == 2 && config > 175) || + (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 && + config > 255) || + (cpumf_ctr_info.csvn >= 6 && config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: - if (cpuhw->info.csvn <= 3) + if (cpumf_ctr_info.csvn <= 3) err = -EOPNOTSUPP; /* * MT-diagnostic counters are read-only. The counter set @@ -307,35 +655,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc, * counter set is enabled and active. */ mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; - if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && - (cpuhw->info.enable_ctl & mtdiag_ctl) && - (cpuhw->info.act_ctl & mtdiag_ctl))) + if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) && + (cpumf_ctr_info.enable_ctl & mtdiag_ctl) && + (cpumf_ctr_info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MAX: err = -EOPNOTSUPP; } - put_cpu_var(cpu_cf_events); - return err; -} - -static int validate_ctr_auth(const struct hw_perf_event *hwc) -{ - struct cpu_cf_events *cpuhw; - int err = 0; - - cpuhw = &get_cpu_var(cpu_cf_events); - - /* Check authorization for cpu counter sets. - * If the particular CPU counter set is not authorized, - * return with -ENOENT in order to fall back to other - * PMUs that might suffice the event request. - */ - if (!(hwc->config_base & cpuhw->info.auth_ctl)) - err = -ENOENT; - - put_cpu_var(cpu_cf_events); return err; } @@ -346,20 +674,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) */ static void cpumf_pmu_enable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int err; - if (cpuhw->flags & PMU_F_ENABLED) + if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED)) return; err = lcctl(cpuhw->state | cpuhw->dev_state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags |= PMU_F_ENABLED; + if (err) + pr_err("Enabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags |= PMU_F_ENABLED; } /* @@ -369,40 +694,26 @@ static void cpumf_pmu_enable(struct pmu *pmu) */ static void cpumf_pmu_disable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); u64 inactive; + int err; - if (!(cpuhw->flags & PMU_F_ENABLED)) + if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED)) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); inactive |= cpuhw->dev_state; err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags &= ~PMU_F_ENABLED; + if (err) + pr_err("Disabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags &= ~PMU_F_ENABLED; } - -/* Number of perf events counting hardware events */ -static atomic_t num_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - /* Release the PMU if event is the last perf event */ static void hw_perf_event_destroy(struct perf_event *event) { - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - __kernel_cpumcf_end(); - mutex_unlock(&pmc_reserve_mutex); - } + cpum_cf_free(event->cpu); } /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ @@ -426,12 +737,10 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; -static void cpumf_hw_inuse(void) +static int is_userspace_event(u64 ev) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_inc_return(&num_events) == 1) - __kernel_cpumcf_begin(); - mutex_unlock(&pmc_reserve_mutex); + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; } static int __hw_perf_event_init(struct perf_event *event, unsigned int type) @@ -439,7 +748,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; enum cpumf_ctr_set set; - int err = 0; u64 ev; switch (type) { @@ -456,19 +764,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) if (is_sampling_event(event)) /* No sampling support */ return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if (!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - } else { /* Count user and kernel space */ - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + } else { + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; @@ -505,18 +820,22 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) } /* Initialize for using the CPU-measurement counter facility */ - cpumf_hw_inuse(); + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; - /* Finally, validate version and authorization of the counter set */ - err = validate_ctr_auth(hwc); - if (!err) - err = validate_ctr_version(hwc, set); - - return err; + /* + * Finally, validate version and authorization of the counter set. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might suffice the event request. + */ + if (!(hwc->config_base & cpumf_ctr_info.auth_ctl)) + return -ENOENT; + return validate_ctr_version(hwc->config, set); } -/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different +/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different * attribute::type values: * - PERF_TYPE_HARDWARE: * - pmu->type: @@ -539,18 +858,13 @@ static int cpumf_pmu_event_type(struct perf_event *event) static int cpumf_pmu_event_init(struct perf_event *event) { unsigned int type = event->attr.type; - int err; + int err = -ENOENT; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); - else - return -ENOENT; - - if (unlikely(err) && event->destroy) - event->destroy(event); return err; } @@ -560,8 +874,8 @@ static int hw_perf_event_reset(struct perf_event *event) u64 prev, new; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) { if (err != 3) @@ -573,7 +887,7 @@ static int hw_perf_event_reset(struct perf_event *event) */ new = 0; } - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); return err; } @@ -583,12 +897,12 @@ static void hw_perf_event_update(struct perf_event *event) u64 prev, new, delta; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) return; - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; /* overflow */ @@ -605,7 +919,7 @@ static void cpumf_pmu_read(struct perf_event *event) static void cpumf_pmu_start(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; int i; @@ -662,17 +976,10 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - raw.size = raw.frag.size; - data.raw = &raw; + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_dbg, 3, - "%s event %#llx sample_type %#llx raw %d ov %d\n", - __func__, event->hw.config, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); perf_event_update_userpage(event); return overflow; @@ -680,7 +987,7 @@ static int cfdiag_push_sample(struct perf_event *event, static void cpumf_pmu_stop(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; int i; @@ -707,8 +1014,7 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) false); if (cfdiag_diffctr(cpuhw, event->hw.config_base)) cfdiag_push_sample(event, cpuhw); - } else if (cpuhw->flags & PMU_F_RESERVED) { - /* Only update when PMU not hotplugged off */ + } else { hw_perf_event_update(event); } hwc->state |= PERF_HES_UPTODATE; @@ -717,7 +1023,7 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) static int cpumf_pmu_add(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); ctr_set_enable(&cpuhw->state, event->hw.config_base); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -730,7 +1036,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) static void cpumf_pmu_del(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -741,7 +1047,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * * When a new perf event has been added but not yet started, this can * clear enable control and resets all counters in a set. Therefore, - * cpumf_pmu_start() always has to reenable a counter set. + * cpumf_pmu_start() always has to re-enable a counter set. */ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) if (!atomic_read(&cpuhw->ctr_set[i])) @@ -762,31 +1068,172 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */ +/* + * Synchronize access to device /dev/hwc. This mutex protects against + * concurrent access to functions cfset_open() and cfset_release(). + * Same for CPU hotplug add and remove events triggering + * cpum_cf_online_cpu() and cpum_cf_offline_cpu(). + * It also serializes concurrent device ioctl access from multiple + * processes accessing /dev/hwc. + * + * The mutex protects concurrent access to the /dev/hwctr session management + * struct cfset_session and reference counting variable cfset_opencnt. + */ +static DEFINE_MUTEX(cfset_ctrset_mutex); + +/* + * CPU hotplug handles only /dev/hwctr device. + * For perf_event_open() the CPU hotplug handling is done on kernel common + * code: + * - CPU add: Nothing is done since a file descriptor can not be created + * and returned to the user. + * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and + * pmu_delete(). The event itself is removed when the file descriptor is + * closed. + */ +static int cfset_online_cpu(unsigned int cpu); + +static int cpum_cf_online_cpu(unsigned int cpu) +{ + int rc = 0; + + /* + * Ignore notification for perf_event_open(). + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + rc = cpum_cf_alloc_cpu(cpu); + if (!rc) + cfset_online_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return rc; +} + +static int cfset_offline_cpu(unsigned int cpu); + +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + /* + * During task exit processing of grouped perf events triggered by CPU + * hotplug processing, pmu_disable() is called as part of perf context + * removal process. Therefore do not trigger event removal now for + * perf_event_open() created events. Perf common code triggers event + * destruction when the event file descriptor is closed. + * + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + cfset_offline_cpu(cpu); + cpum_cf_free_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; + + inc_irq_stat(IRQEXT_CMC); + + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. + */ + cpuhw = this_cpu_cfhw(); + if (!cpuhw) + return; + + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpumf_ctr_info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); +} + static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; - if (!kernel_cpumcf_avail()) + /* Extract counter measurement facility information */ + if (!cpum_cf_avail() || qctri(&cpumf_ctr_info)) return -ENODEV; + /* Determine and store counter set sizes for later reference */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + cpum_cf_make_setsize(rc); + + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + /* Setup s390dbf facility */ cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); if (!cf_dbg) { pr_err("Registration of s390dbf(cpum_cf) failed\n"); - return -ENOMEM; + rc = -ENOMEM; + goto out1; } debug_register_view(cf_dbg, &debug_sprintf_view); cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); if (rc) { - debug_unregister_view(cf_dbg, &debug_sprintf_view); - debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; } else if (stccm_avail()) { /* Setup counter set device */ cfset_init(); } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); return rc; } @@ -795,19 +1242,11 @@ static int __init cpumf_pmu_init(void) * counter set via normal file operations. */ -static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */ -static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ unsigned int sets; /* Counter set bit mask */ atomic_t cpus_ack; /* # CPUs successfully executed func */ }; -static struct cfset_session { /* CPUs and counter set bit mask */ - struct list_head head; /* Head of list of active processes */ -} cfset_session = { - .head = LIST_HEAD_INIT(cfset_session.head) -}; - struct cfset_request { /* CPUs and counter set bit mask */ unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ @@ -869,11 +1308,11 @@ static void cfset_session_add(struct cfset_request *p) /* Stop all counter sets via ioctl interface */ static void cfset_ioctl_off(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int rc; - /* Check if any counter set used by /dev/hwc */ + /* Check if any counter set used by /dev/hwctr */ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) if ((p->sets & cpumf_ctr_ctl[rc])) { if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { @@ -890,14 +1329,12 @@ static void cfset_ioctl_off(void *parm) cpuhw->state, S390_HWCTR_DEVICE, rc); if (!cpuhw->dev_state) cpuhw->flags &= ~PMU_F_IN_USE; - debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", - __func__, rc, cpuhw->state, cpuhw->dev_state); } /* Start counter sets on particular CPU */ static void cfset_ioctl_on(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int rc; @@ -913,17 +1350,13 @@ static void cfset_ioctl_on(void *parm) else pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); - debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", - __func__, rc, cpuhw->state, cpuhw->dev_state); } static void cfset_release_cpu(void *p) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int rc; - debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", - __func__, cpuhw->state, cpuhw->dev_state); cpuhw->dev_state = 0; rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ if (rc) @@ -959,27 +1392,41 @@ static int cfset_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; } - if (!atomic_dec_return(&cfset_opencnt)) + if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */ on_each_cpu(cfset_release_cpu, NULL, 1); + cpum_cf_free(-1); + } mutex_unlock(&cfset_ctrset_mutex); - - hw_perf_event_destroy(NULL); return 0; } +/* + * Open via /dev/hwctr device. Allocate all per CPU resources on the first + * open of the device. The last close releases all per CPU resources. + * Parallel perf_event_open system calls also use per CPU resources. + * These invocations are handled via reference counting on the per CPU data + * structures. + */ static int cfset_open(struct inode *inode, struct file *file) { - if (!capable(CAP_SYS_ADMIN)) + int rc = 0; + + if (!perfmon_capable()) return -EPERM; + file->private_data = NULL; + mutex_lock(&cfset_ctrset_mutex); - if (atomic_inc_return(&cfset_opencnt) == 1) - cfset_session_init(); + if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */ + rc = cpum_cf_alloc(-1); + if (!rc) { + cfset_session_init(); + refcount_set(&cfset_opencnt, 1); + } + } mutex_unlock(&cfset_ctrset_mutex); - cpumf_hw_inuse(); - file->private_data = NULL; /* nonseekable_open() never fails */ - return nonseekable_open(inode, file); + return rc ?: nonseekable_open(inode, file); } static int cfset_all_start(struct cfset_request *req) @@ -998,13 +1445,11 @@ static int cfset_all_start(struct cfset_request *req) if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); rc = -EIO; - debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); } free_cpumask_var(mask); return rc; } - /* Return the maximum required space for all possible CPUs in case one * CPU will be onlined during the START, READ, STOP cycles. * To find out the size of the counter sets, any one CPU will do. They @@ -1012,34 +1457,32 @@ static int cfset_all_start(struct cfset_request *req) */ static size_t cfset_needspace(unsigned int sets) { - struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); size_t bytes = 0; int i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { if (!(sets & cpumf_ctr_ctl[i])) continue; - bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + bytes += cpum_cf_read_setsize(i) * sizeof(u64) + sizeof(((struct s390_ctrset_setdata *)0)->set) + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); } bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); - put_cpu_ptr(&cpu_cf_events); return bytes; } static int cfset_all_copy(unsigned long arg, cpumask_t *mask) { struct s390_ctrset_read __user *ctrset_read; - unsigned int cpu, cpus, rc; + unsigned int cpu, cpus, rc = 0; void __user *uptr; ctrset_read = (struct s390_ctrset_read __user *)arg; uptr = ctrset_read->data; for_each_cpu(cpu, mask) { - struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); + struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu); struct s390_ctrset_cpudata __user *ctrset_cpudata; ctrset_cpudata = uptr; @@ -1047,17 +1490,18 @@ static int cfset_all_copy(unsigned long arg, cpumask_t *mask) rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, cpuhw->used); - if (rc) - return -EFAULT; + if (rc) { + rc = -EFAULT; + goto out; + } uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; cond_resched(); } cpus = cpumask_weight(mask); if (put_user(cpus, &ctrset_read->no_cpus)) - return -EFAULT; - debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, - uptr - (void __user *)ctrset_read->data); - return 0; + rc = -EFAULT; +out: + return rc; } static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, @@ -1080,7 +1524,7 @@ static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, /* Read all counter sets. */ static void cfset_cpu_read(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int set, set_size; size_t space; @@ -1097,7 +1541,7 @@ static void cfset_cpu_read(void *parm) if (!(p->sets & cpumf_ctr_ctl[set])) continue; /* Counter set not in list */ - set_size = cpum_cf_ctrset_size(set, &cpuhw->info); + set_size = cpum_cf_read_setsize(set); space = sizeof(cpuhw->data) - cpuhw->used; space = cfset_cpuset_read(sp, set, set_size, space); if (space) { @@ -1105,8 +1549,6 @@ static void cfset_cpu_read(void *parm) cpuhw->sets += 1; } } - debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, - cpuhw->sets, cpuhw->used); } static int cfset_all_read(unsigned long arg, struct cfset_request *req) @@ -1128,14 +1570,10 @@ static int cfset_all_read(unsigned long arg, struct cfset_request *req) static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) { - struct s390_ctrset_read read; int ret = -ENODATA; - if (req && req->ctrset) { - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; + if (req && req->ctrset) ret = cfset_all_read(arg, req); - } return ret; } @@ -1204,8 +1642,6 @@ static long cfset_ioctl_start(unsigned long arg, struct file *file) if (!ret) { cfset_session_add(preq); file->private_data = preq; - debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n", - __func__, preq->ctrset, need, ret); } else { kfree(preq); } @@ -1255,24 +1691,23 @@ static const struct file_operations cfset_fops = { .release = cfset_release, .unlocked_ioctl = cfset_ioctl, .compat_ioctl = cfset_ioctl, - .llseek = no_llseek }; static struct miscdevice cfset_dev = { .name = S390_HWCTR_DEVICE, .minor = MISC_DYNAMIC_MINOR, .fops = &cfset_fops, + .mode = 0666, }; /* Hotplug add of a CPU. Scan through all active processes and add * that CPU to the list of CPUs supplied with ioctl(..., START, ...). */ -int cfset_online_cpu(unsigned int cpu) +static int cfset_online_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; struct cfset_request *rp; - mutex_lock(&cfset_ctrset_mutex); if (!list_empty(&cfset_session.head)) { list_for_each_entry(rp, &cfset_session.head, node) { p.sets = rp->ctrset; @@ -1280,19 +1715,18 @@ int cfset_online_cpu(unsigned int cpu) cpumask_set_cpu(cpu, &rp->mask); } } - mutex_unlock(&cfset_ctrset_mutex); return 0; } /* Hotplug remove of a CPU. Scan through all active processes and clear * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + * Adjust reference counts. */ -int cfset_offline_cpu(unsigned int cpu) +static int cfset_offline_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; struct cfset_request *rp; - mutex_lock(&cfset_ctrset_mutex); if (!list_empty(&cfset_session.head)) { list_for_each_entry(rp, &cfset_session.head, node) { p.sets = rp->ctrset; @@ -1300,28 +1734,22 @@ int cfset_offline_cpu(unsigned int cpu) cpumask_clear_cpu(cpu, &rp->mask); } } - mutex_unlock(&cfset_ctrset_mutex); return 0; } static void cfdiag_read(struct perf_event *event) { - debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, - event->attr.config, local64_read(&event->count)); } static int get_authctrsets(void) { - struct cpu_cf_events *cpuhw; unsigned long auth = 0; enum cpumf_ctr_set i; - cpuhw = &get_cpu_var(cpu_cf_events); for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i]) auth |= cpumf_ctr_ctl[i]; } - put_cpu_var(cpu_cf_events); return auth; } @@ -1355,8 +1783,6 @@ static int cfdiag_event_init2(struct perf_event *event) if (!event->hw.config_base) err = -EINVAL; - debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); return err; } @@ -1381,12 +1807,11 @@ static int cfdiag_event_init(struct perf_event *event) } /* Initialize for using the CPU-measurement counter facility */ - cpumf_hw_inuse(); + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; err = cfdiag_event_init2(event); - if (unlikely(err)) - event->destroy(event); out: return err; } @@ -1429,7 +1854,7 @@ static const struct attribute_group *cfdiag_attr_groups[] = { /* Performance monitoring unit for event CF_DIAG. Since this event * is also started and stopped via the perf_event_open() system call, use * the same event enable/disable call back functions. They do not - * have a pointer to the perf_event strcture as first parameter. + * have a pointer to the perf_event structure as first parameter. * * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. * Reuse them and distinguish the event (always first parameter) via @@ -1459,7 +1884,7 @@ static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) enum cpumf_ctr_set i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cpum_cf_ctrset_size(i, info); + size_t size = cpum_cf_read_setsize(i); if (size) max_size += size * sizeof(u64) + @@ -1493,16 +1918,12 @@ static void cfdiag_get_cpu_speed(void) static int cfset_init(void) { - struct cpumf_ctr_info info; size_t need; int rc; - if (qctri(&info)) - return -ENODEV; - cfdiag_get_cpu_speed(); /* Make sure the counter set data fits into predefined buffer. */ - need = cfdiag_maxsize(&info); + need = cfdiag_maxsize(&cpumf_ctr_info); if (need > sizeof(((struct cpu_cf_events *)0)->start)) { pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", need); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c deleted file mode 100644 index 8ee48672233f..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CPU-Measurement Counter Facility Support - Common Layer - * - * Copyright IBM Corp. 2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_common" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> - -/* Per-CPU event structure for the counter facility */ -DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { - .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), - }, - .alert = ATOMIC64_INIT(0), - .state = 0, - .dev_state = 0, - .flags = 0, - .used = 0, - .usedss = 0, - .sets = 0 -}; -/* Indicator whether the CPU-Measurement Counter Facility Support is ready */ -static bool cpum_cf_initalized; - -/* CPU-measurement alerts for the counter facility */ -static void cpumf_measurement_alert(struct ext_code ext_code, - unsigned int alert, unsigned long unused) -{ - struct cpu_cf_events *cpuhw; - - if (!(alert & CPU_MF_INT_CF_MASK)) - return; - - inc_irq_stat(IRQEXT_CMC); - cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Measurement alerts are shared and might happen when the PMU - * is not reserved. Ignore these alerts in this case. */ - if (!(cpuhw->flags & PMU_F_RESERVED)) - return; - - /* counter authorization change alert */ - if (alert & CPU_MF_INT_CF_CACA) - qctri(&cpuhw->info); - - /* loss of counter data alert */ - if (alert & CPU_MF_INT_CF_LCDA) - pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); - - /* loss of MT counter data alert */ - if (alert & CPU_MF_INT_CF_MTDA) - pr_warn("CPU[%i] MT counter data was lost\n", - smp_processor_id()); - - /* store alert for special handling by in-kernel users */ - atomic64_or(alert, &cpuhw->alert); -} - -#define PMC_INIT 0 -#define PMC_RELEASE 1 -static void cpum_cf_setup_cpu(void *flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - switch (*((int *) flags)) { - case PMC_INIT: - memset(&cpuhw->info, 0, sizeof(cpuhw->info)); - qctri(&cpuhw->info); - cpuhw->flags |= PMU_F_RESERVED; - break; - - case PMC_RELEASE: - cpuhw->flags &= ~PMU_F_RESERVED; - break; - } - - /* Disable CPU counter sets */ - lcctl(0); -} - -bool kernel_cpumcf_avail(void) -{ - return cpum_cf_initalized; -} -EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Initialize the CPU-measurement counter facility */ -int __kernel_cpumcf_begin(void) -{ - int flags = PMC_INIT; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} -EXPORT_SYMBOL(__kernel_cpumcf_begin); - -/* Obtain the CPU-measurement alerts for the counter facility */ -unsigned long kernel_cpumcf_alert(int clear) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - unsigned long alert; - - alert = atomic64_read(&cpuhw->alert); - if (clear) - atomic64_set(&cpuhw->alert, 0); - - return alert; -} -EXPORT_SYMBOL(kernel_cpumcf_alert); - -/* Release the CPU-measurement counter facility */ -void __kernel_cpumcf_end(void) -{ - int flags = PMC_RELEASE; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); -} -EXPORT_SYMBOL(__kernel_cpumcf_end); - -static int cpum_cf_setup(unsigned int cpu, int flags) -{ - local_irq_disable(); - cpum_cf_setup_cpu(&flags); - local_irq_enable(); - return 0; -} - -static int cpum_cf_online_cpu(unsigned int cpu) -{ - cpum_cf_setup(cpu, PMC_INIT); - return cfset_online_cpu(cpu); -} - -static int cpum_cf_offline_cpu(unsigned int cpu) -{ - cfset_offline_cpu(cpu); - return cpum_cf_setup(cpu, PMC_RELEASE); -} - -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6 || info->csvn == 7) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6 || info->csvn == 7) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - -static int __init cpum_cf_init(void) -{ - int rc; - - if (!cpum_cf_avail()) - return -ENODEV; - - /* clear bit 15 of cr0 to unauthorize problem-state to - * extract measurement counters */ - ctl_clear_bit(0, 48); - - /* register handler for measurement-alert interruptions */ - rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, - cpumf_measurement_alert); - if (rc) { - pr_err("Registering for CPU-measurement alerts " - "failed with rc=%i\n", rc); - return rc; - } - - rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, - "perf/s390/cf:online", - cpum_cf_online_cpu, cpum_cf_offline_cpu); - if (!rc) - cpum_cf_initalized = true; - - return rc; -} -early_initcall(cpum_cf_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 0d64aafd158f..7ace1f9e4ccf 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -237,7 +237,6 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); - CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); @@ -291,8 +290,8 @@ CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x0108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x0109); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); @@ -365,6 +364,83 @@ CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z17, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z17, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z17, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z17, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z17, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z17, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z17, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_SAMETHRD, 0x00ca); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_DIFFTHRD, 0x00cb); +CPUMF_EVENT_ATTR(cf_z17, INST_SAMETHRD, 0x00cc); +CPUMF_EVENT_ATTR(cf_z17, INST_DIFFTHRD, 0x00cd); +CPUMF_EVENT_ATTR(cf_z17, WRONG_BRANCH_PREDICTION, 0x00ce); +CPUMF_EVENT_ATTR(cf_z17, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z17, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z17, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z17, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z17, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z17, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z17, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_ONCHIP, 0x0110); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_OFFCHIP, 0x0111); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_DIFF, 0x0112); +CPUMF_EVENT_ATTR(cf_z17, NNPA_4K_PREFETCH, 0x0114); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPL_LOCK, 0x0115); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK, 0x0116); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO, 0x0117); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), @@ -414,7 +490,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_678_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -779,6 +855,87 @@ static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z17_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z17, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z17, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z17, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, CYCLES_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, CYCLES_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, INST_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, INST_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, WRONG_BRANCH_PREDICTION), + CPUMF_EVENT_PTR(cf_z17, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z17, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z17, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z17, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z17, SORTL), + CPUMF_EVENT_PTR(cf_z17, DFLT_CC), + CPUMF_EVENT_PTR(cf_z17, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z17, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_ONCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_OFFCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_DIFF), + CPUMF_EVENT_PTR(cf_z17, NNPA_4K_PREFETCH), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPL_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -855,16 +1012,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) } /* Determine version specific crypto set */ - switch (ci.csvn) { - case 1 ... 5: + csvn = none; + if (ci.csvn >= 1 && ci.csvn <= 5) csvn = cpumcf_svn_12345_pmu_event_attr; - break; - case 6 ... 7: - csvn = cpumcf_svn_67_pmu_event_attr; - break; - default: - csvn = none; - } + else if (ci.csvn >= 6) + csvn = cpumcf_svn_678_pmu_event_attr; /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); @@ -897,6 +1049,10 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x3932: model = cpumcf_z16_pmu_event_attr; break; + case 0x9175: + case 0x9176: + model = cpumcf_z17_pmu_event_attr; + break; default: model = none; break; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 332a49965130..91469401f2c9 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -22,6 +22,23 @@ #include <asm/irq.h> #include <asm/debug.h> #include <asm/timex.h> +#include <linux/io.h> + +/* Perf PMU definitions for the sampling facility */ +#define PERF_CPUM_SF_MAX_CTR 2 +#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ +#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ +#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ + +#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) +#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) +#define TEAR_REG(hwc) ((hwc)->last_tag) +#define SAMPL_RATE(hwc) ((hwc)->event_base) +#define SAMPL_FLAGS(hwc) ((hwc)->config_base) +#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) +#define SAMPL_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. @@ -42,7 +59,7 @@ #define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) static inline int require_table_link(const void *sdbt) { - return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; + return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; } /* Minimum and maximum sampling buffer sizes: @@ -99,15 +116,55 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); /* Debug feature */ static debug_info_t *sfdbg; +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return pointer to trailer entry of an sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + /* * sf_disable() - Switch off sampling facility */ -static int sf_disable(void) +static void sf_disable(void) { struct hws_lsctl_request_block sreq; memset(&sreq, 0, sizeof(sreq)); - return lsctl(&sreq); + lsctl(&sreq); } /* @@ -123,57 +180,44 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw) */ static void free_sampling_buffer(struct sf_buffer *sfb) { - unsigned long *sdbt, *curr; - - if (!sfb->sdbt) - return; + unsigned long *sdbt, *curr, *head; sdbt = sfb->sdbt; - curr = sdbt; - + if (!sdbt) + return; + sfb->sdbt = NULL; /* Free the SDBT after all SDBs are processed... */ - while (1) { - if (!*curr || !sdbt) - break; - - /* Process table-link entries */ + head = sdbt; + curr = sdbt; + do { if (is_link_entry(curr)) { + /* Process table-link entries */ curr = get_next_sdbt(curr); - if (sdbt) - free_page((unsigned long) sdbt); - - /* If the origin is reached, sampling buffer is freed */ - if (curr == sfb->sdbt) - break; - else - sdbt = curr; + free_page((unsigned long)sdbt); + sdbt = curr; } else { /* Process SDB pointer */ - if (*curr) { - free_page(*curr); - curr++; - } + free_page((unsigned long)phys_to_virt(*curr)); + curr++; } - } - - debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__, - (unsigned long)sfb->sdbt); + } while (curr != head); memset(sfb, 0, sizeof(*sfb)); } static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) { - unsigned long sdb, *trailer; + struct hws_trailer_entry *te; + unsigned long sdb; /* Allocate and initialize sample-data-block */ sdb = get_zeroed_page(gfp_flags); if (!sdb) return -ENOMEM; - trailer = trailer_entry_ptr(sdb); - *trailer = SDB_TE_ALERT_REQ_MASK; + te = trailer_entry_ptr(sdb); + te->header.a = 1; /* Link SDB into the sample-data-block-table */ - *sdbt = sdb; + *sdbt = virt_to_phys((void *)sdb); return 0; } @@ -212,10 +256,8 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, * the sampling buffer origin. */ if (sfb->sdbt != get_next_sdbt(tail)) { - debug_sprintf_event(sfdbg, 3, "%s: " - "sampling buffer is not linked: origin %#lx" - " tail %#lx\n", __func__, - (unsigned long)sfb->sdbt, + debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n", + __func__, (unsigned long)sfb->sdbt, (unsigned long)tail); return -EINVAL; } @@ -225,14 +267,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, for (i = 0; i < num_sdb; i++) { /* Allocate a new SDB-table if it is full. */ if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(gfp_flags); + new = (unsigned long *)get_zeroed_page(gfp_flags); if (!new) { rc = -ENOMEM; break; } sfb->num_sdbt++; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys((void *)new) + 1; tail_prev = tail; tail = new; } @@ -251,7 +293,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, */ if (tail_prev) { sfb->num_sdbt--; - free_page((unsigned long) new); + free_page((unsigned long)new); tail = tail_prev; } break; @@ -262,12 +304,9 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } /* Link sampling buffer to its origin */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; - debug_sprintf_event(sfdbg, 4, "%s: new buffer" - " settings: sdbt %lu sdb %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); return rc; } @@ -290,7 +329,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) return -EINVAL; /* Allocate the sample-data-block-table origin */ - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) return -ENOMEM; sfb->num_sdb = 0; @@ -300,19 +339,12 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) * realloc_sampling_buffer() invocation. */ sfb->tail = sfb->sdbt; - *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); - if (rc) { + if (rc) free_sampling_buffer(sfb); - debug_sprintf_event(sfdbg, 4, "%s: " - "realloc_sampling_buffer failed with rc %i\n", - __func__, rc); - } else - debug_sprintf_event(sfdbg, 4, - "%s: tear %#lx dear %#lx\n", __func__, - (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt); return rc; } @@ -324,8 +356,8 @@ static void sfb_set_limits(unsigned long min, unsigned long max) CPUM_SF_MAX_SDB = max; memset(&si, 0, sizeof(si)); - if (!qsi(&si)) - CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); + qsi(&si); + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); } static unsigned long sfb_max_limit(struct hw_perf_event *hwc) @@ -344,12 +376,6 @@ static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, return 0; } -static int sfb_has_pending_allocs(struct sf_buffer *sfb, - struct hw_perf_event *hwc) -{ - return sfb_pending_allocs(sfb, hwc) > 0; -} - static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) { /* Limit the number of SDBs to not exceed the maximum */ @@ -366,14 +392,13 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) static void deallocate_buffers(struct cpu_hw_sf *cpuhw) { - if (cpuhw->sfb.sdbt) + if (sf_buffer_available(cpuhw)) free_sampling_buffer(&cpuhw->sfb); } static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq; - size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -404,7 +429,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). */ - sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); @@ -420,12 +444,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) if (sf_buffer_available(cpuhw)) return 0; - debug_sprintf_event(sfdbg, 3, - "%s: rate %lu f %lu sdb %lu/%lu" - " sample_size %lu cpuhw %p\n", __func__, - SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), - sample_size, cpuhw); - return alloc_sampling_buffer(&cpuhw->sfb, sfb_pending_allocs(&cpuhw->sfb, hwc)); } @@ -482,8 +500,6 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, if (num) sfb_account_allocs(num, hwc); - debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n", - __func__, OVERFLOW_REG(hwc), ratio, num); OVERFLOW_REG(hwc) = 0; } @@ -501,13 +517,11 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, static void extend_sampling_buffer(struct sf_buffer *sfb, struct hw_perf_event *hwc) { - unsigned long num, num_old; - int rc; + unsigned long num; num = sfb_pending_allocs(sfb, hwc); if (!num) return; - num_old = sfb->num_sdb; /* Disable the sampling facility to reset any states and also * clear pending measurement alerts. @@ -519,62 +533,32 @@ static void extend_sampling_buffer(struct sf_buffer *sfb, * called by perf. Because this is a reallocation, it is fine if the * new SDB-request cannot be satisfied immediately. */ - rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); - if (rc) - debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n", - __func__, rc); - - if (sfb_has_pending_allocs(sfb, hwc)) - debug_sprintf_event(sfdbg, 5, "%s: " - "req %lu alloc %lu remaining %lu\n", - __func__, num, sfb->num_sdb - num_old, - sfb_pending_allocs(sfb, hwc)); + realloc_sampling_buffer(sfb, num, GFP_ATOMIC); } /* Number of perf events counting hardware events */ -static atomic_t num_events; +static refcount_t num_events; /* Used to avoid races in calling reserve/release_cpumf_hardware */ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_INIT 0 #define PMC_RELEASE 1 -#define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - int err; - struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - err = 0; - switch (*((int *) flags)) { + sf_disable(); + switch (*((int *)flags)) { case PMC_INIT: - memset(cpusf, 0, sizeof(*cpusf)); - err = qsi(&cpusf->qsi); - if (err) - break; - cpusf->flags |= PMU_F_RESERVED; - err = sf_disable(); - if (err) - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - debug_sprintf_event(sfdbg, 5, - "%s: initialized: cpuhw %p\n", __func__, - cpusf); + memset(cpuhw, 0, sizeof(*cpuhw)); + qsi(&cpuhw->qsi); + cpuhw->flags |= PMU_F_RESERVED; break; case PMC_RELEASE: - cpusf->flags &= ~PMU_F_RESERVED; - err = sf_disable(); - if (err) { - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - } else - deallocate_buffers(cpusf); - debug_sprintf_event(sfdbg, 5, - "%s: released: cpuhw %p\n", __func__, - cpusf); + cpuhw->flags &= ~PMU_F_RESERVED; + deallocate_buffers(cpuhw); break; } - if (err) - *((int *) flags) |= PMC_FAILURE; } static void release_pmc_hardware(void) @@ -585,27 +569,19 @@ static void release_pmc_hardware(void) on_each_cpu(setup_pmc_cpu, &flags, 1); } -static int reserve_pmc_hardware(void) +static void reserve_pmc_hardware(void) { int flags = PMC_INIT; on_each_cpu(setup_pmc_cpu, &flags, 1); - if (flags & PMC_FAILURE) { - release_pmc_hardware(); - return -ENODEV; - } irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; } static void hw_perf_event_destroy(struct perf_event *event) { /* Release PMC if this is the last perf event */ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); + if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -671,7 +647,8 @@ static void cpumsf_output_event_pid(struct perf_event *event, /* Protect callchain buffers, tasks */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); if (perf_output_begin(&handle, data, event, header.size)) goto out; @@ -708,9 +685,6 @@ static unsigned long getrate(bool freq, unsigned long sample, */ if (sample_rate_to_freq(si, rate) > sysctl_perf_event_sample_rate) { - debug_sprintf_event(sfdbg, 1, "%s: " - "Sampling rate exceeds maximum " - "perf sample rate\n", __func__); rate = 0; } } @@ -755,9 +729,6 @@ static int __hw_perf_event_init_rate(struct perf_event *event, attr->sample_period = rate; SAMPL_RATE(hwc) = rate; hw_init_period(hwc, SAMPL_RATE(hwc)); - debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n", - __func__, event->cpu, event->attr.sample_period, - event->attr.freq, SAMPLE_FREQ_MODE(hwc)); return 0; } @@ -767,23 +738,16 @@ static int __hw_perf_event_init(struct perf_event *event) struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - int cpu, err; + int cpu, err = 0; /* Reserve CPU-measurement sampling facility */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); + mutex_lock(&pmc_reserve_mutex); + if (!refcount_inc_not_zero(&num_events)) { + reserve_pmc_hardware(); + refcount_set(&num_events, 1); } event->destroy = hw_perf_event_destroy; - if (err) - goto out; - /* Access per-CPU sampling information (query sampling info) */ /* * The event->cpu value can be -1 to count on every CPU, for example, @@ -795,9 +759,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ memset(&si, 0, sizeof(si)); cpuhw = NULL; - if (event->cpu == -1) + if (event->cpu == -1) { qsi(&si); - else { + } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. */ @@ -834,21 +798,13 @@ static int __hw_perf_event_init(struct perf_event *event) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; } - /* Check and set other sampling flags */ - if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) - SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - err = __hw_perf_event_init_rate(event, &si); if (err) goto out; - /* Initialize sample data overflow accounting */ - hwc->extra_reg.reg = REG_OVERFLOW; - OVERFLOW_REG(hwc) = 0; - /* Use AUX buffer. No need to allocate it by ourself */ if (attr->config == PERF_EVENT_CPUM_SF_DIAG) - return 0; + goto out; /* Allocate the per-CPU sampling buffer using the CPU information * from the event. If the event is not pinned to a particular @@ -878,6 +834,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (is_default_overflow_handler(event)) event->overflow_handler = cpumsf_output_event_pid; out: + mutex_unlock(&pmc_reserve_mutex); return err; } @@ -919,10 +876,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) return -ENOENT; } - /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= 0 && !cpu_online(event->cpu)) - return -ENODEV; - /* Force reset of idle/hv excludes regardless of what the * user requested. */ @@ -932,9 +885,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) event->attr.exclude_idle = 0; err = __hw_perf_event_init(event); - if (unlikely(err)) - if (event->destroy) - event->destroy(event); return err; } @@ -944,10 +894,14 @@ static void cpumsf_pmu_enable(struct pmu *pmu) struct hw_perf_event *hwc; int err; - if (cpuhw->flags & PMU_F_ENABLED) - return; - - if (cpuhw->flags & PMU_F_ERR_MASK) + /* + * Event must be + * - added/started on this CPU (PMU_F_IN_USE set) + * - and CPU must be available (PMU_F_RESERVED set) + * - and not already enabled (PMU_F_ENABLED not set) + * - and not in error condition (PMU_F_ERR_MASK not set) + */ + if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED)) return; /* Check whether to extent the sampling buffer. @@ -961,40 +915,27 @@ static void cpumsf_pmu_enable(struct pmu *pmu) * facility, but it can be fully re-enabled using sampling controls that * have been saved in cpumsf_pmu_disable(). */ - if (cpuhw->event) { - hwc = &cpuhw->event->hw; - if (!(SAMPL_DIAG_MODE(hwc))) { - /* - * Account number of overflow-designated - * buffer extents - */ - sfb_account_overflows(cpuhw, hwc); - extend_sampling_buffer(&cpuhw->sfb, hwc); - } - /* Rate may be adjusted with ioctl() */ - cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); + hwc = &cpuhw->event->hw; + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(hwc); /* (Re)enable the PMU and sampling facility */ - cpuhw->flags |= PMU_F_ENABLED; - barrier(); - err = lsctl(&cpuhw->lsctl); if (err) { - cpuhw->flags &= ~PMU_F_ENABLED; - pr_err("Loading sampling controls failed: op %i err %i\n", - 1, err); + pr_err("Loading sampling controls failed: op 1 err %i\n", err); return; } /* Load current program parameter */ - lpp(&S390_lowcore.lpp); - - debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " - "interval %#lx tear %#lx dear %#lx\n", __func__, - cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, - cpuhw->lsctl.cd, cpuhw->lsctl.interval, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); + lpp(&get_lowcore()->lpp); + cpuhw->flags |= PMU_F_ENABLED; } static void cpumsf_pmu_disable(struct pmu *pmu) @@ -1017,31 +958,27 @@ static void cpumsf_pmu_disable(struct pmu *pmu) err = lsctl(&inactive); if (err) { - pr_err("Loading sampling controls failed: op %i err %i\n", - 2, err); + pr_err("Loading sampling controls failed: op 2 err %i\n", err); return; } - /* Save state of TEAR and DEAR register contents */ - err = qsi(&si); - if (!err) { - /* TEAR/DEAR values are valid only if the sampling facility is - * enabled. Note that cpumsf_pmu_disable() might be called even - * for a disabled sampling facility because cpumsf_pmu_enable() - * controls the enable/disable state. - */ - if (si.es) { - cpuhw->lsctl.tear = si.tear; - cpuhw->lsctl.dear = si.dear; - } - } else - debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n", - __func__, err); + /* + * Save state of TEAR and DEAR register contents. + * TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + qsi(&si); + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -1050,7 +987,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1133,12 +1070,9 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; - if (perf_event_overflow(event, &data, ®s)) { - overflow = 1; - event->pmu->stop(event, 0); - } + overflow = perf_event_overflow(event, &data, ®s); perf_event_update_userpage(event); out: return overflow; @@ -1175,9 +1109,9 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, struct hws_trailer_entry *te; struct hws_basic_entry *sample; - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_basic_entry *) *sdbt; - while ((unsigned long *) sample < (unsigned long *) te) { + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; + while ((unsigned long *)sample < (unsigned long *)te) { /* Check for an empty sample */ if (!sample->def || sample->LS) break; @@ -1202,11 +1136,6 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, /* Count discarded samples */ *overflow += 1; } else { - debug_sprintf_event(sfdbg, 4, - "%s: Found unknown" - " sampling data entry: te->f %i" - " basic.def %#4x (%p)\n", __func__, - te->f, sample->def, sample); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1217,7 +1146,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * that are not full. Stop processing if the first * invalid format was detected. */ - if (!te->f) + if (!te->header.f) break; } @@ -1235,71 +1164,62 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * The sampling buffer position are retrieved and saved in the TEAR_REG * register of the specified perf event. * - * Only full sample-data-blocks are processed. Specify the flash_all flag - * to also walk through partially filled sample-data-blocks. It is ignored - * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag - * enforces the processing of full sample-data-blocks only (trailer entries - * with the block-full-indicator bit set). + * Only full sample-data-blocks are processed. Specify the flush_all flag + * to also walk through partially filled sample-data-blocks. */ static void hw_perf_event_update(struct perf_event *event, int flush_all) { + unsigned long long event_overflow, sampl_overflow, num_sdb; struct hw_perf_event *hwc = &event->hw; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; - unsigned long *sdbt; - unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; + unsigned long *sdbt, sdb; int done; /* * AUX buffer is used when in diagnostic sampling mode. * No perf events/samples are created. */ - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(hwc)) return; - if (flush_all && SDB_FULL_BLOCKS(hwc)) - flush_all = 0; - - sdbt = (unsigned long *) TEAR_REG(hwc); + sdbt = (unsigned long *)TEAR_REG(hwc); done = event_overflow = sampl_overflow = num_sdb = 0; while (!done) { /* Get the trailer entry of the sample-data-block */ - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); /* Leave loop if no more work to do (block full indicator) */ - if (!te->f) { + if (!te->header.f) { done = 1; if (!flush_all) break; } /* Check the sample overflow count */ - if (te->overflow) + if (te->header.overflow) /* Account sample overflows and, if a particular limit * is reached, extend the sampling buffer. * For details, see sfb_account_overflows(). */ - sampl_overflow += te->overflow; - - /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx " - "overflow %llu timestamp %#llx\n", - __func__, (unsigned long)sdbt, te->overflow, - (te->f) ? trailer_timestamp(te) : 0ULL); + sampl_overflow += te->header.overflow; /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. */ - hw_collect_samples(event, sdbt, &event_overflow); + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); num_sdb++; /* Reset trailer (using compare-double-and-swap) */ + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te_flags |= SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - te->flags, te->overflow, - te_flags, 0ULL)); + new.val = prev.val; + new.f = 0; + new.a = 1; + new.overflow = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); /* Advance to next sample-data-block */ sdbt++; @@ -1307,7 +1227,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sdbt = get_next_sdbt(sdbt); /* Update event hardware registers */ - TEAR_REG(hwc) = (unsigned long) sdbt; + TEAR_REG(hwc) = (unsigned long)sdbt; /* Stop processing sample-data if all samples of the current * sample-data-block were flushed even if it was not full. @@ -1329,25 +1249,30 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * are dropped. * Slightly increase the interval to avoid hitting this limit. */ - if (event_overflow) { + if (event_overflow) SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); - debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", - __func__, - DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); - } +} + +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} - if (sampl_overflow || event_overflow) - debug_sprintf_event(sfdbg, 4, "%s: " - "overflows: sample %llu event %llu" - " total %llu num_sdb %llu\n", - __func__, sampl_overflow, event_overflow, - OVERFLOW_REG(hwc), num_sdb); +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); } -#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) -#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) -#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) -#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} /* * Get trailer entry by index of SDB. @@ -1357,9 +1282,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, { unsigned long sdb; - index = AUX_SDB_INDEX(aux, index); + index = aux_sdb_index(aux, index); sdb = aux->sdb_index[index]; - return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + return trailer_entry_ptr(sdb); } /* @@ -1381,10 +1306,10 @@ static void aux_output_end(struct perf_output_handle *handle) if (!aux) return; - range_scan = AUX_SDB_NUM_ALERT(aux); + range_scan = aux_sdb_num_alert(aux); for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + if (!te->header.f) break; } /* i is num of SDBs which are full */ @@ -1392,10 +1317,7 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags &= ~SDB_TE_ALERT_REQ_MASK; - - debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", - __func__, i, range_scan, aux->head); + te->header.a = 0; } /* @@ -1411,12 +1333,10 @@ static int aux_output_begin(struct perf_output_handle *handle, struct aux_buffer *aux, struct cpu_hw_sf *cpuhw) { - unsigned long range; - unsigned long i, range_scan, idx; - unsigned long head, base, offset; + unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; - if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + if (handle->head & ~PAGE_MASK) return -EINVAL; aux->head = handle->head >> PAGE_SHIFT; @@ -1428,18 +1348,14 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. */ - debug_sprintf_event(sfdbg, 6, - "%s: range %ld head %ld alert %ld empty %ld\n", - __func__, range, aux->head, aux->alert_mark, - aux->empty_mark); - if (range > AUX_SDB_NUM_EMPTY(aux)) { - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - te->flags &= ~(SDB_TE_BUFFER_FULL_MASK | - SDB_TE_ALERT_REQ_MASK); - te->overflow = 0; + te->header.f = 0; + te->header.a = 0; + te->header.overflow = 0; } /* Save the position of empty SDBs */ aux->empty_mark = aux->head + range - 1; @@ -1448,20 +1364,14 @@ static int aux_output_begin(struct perf_output_handle *handle, /* Set alert indicator */ aux->alert_mark = aux->head + range/2 - 1; te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + te->header.a = 1; /* Reset hardware buffer head */ - head = AUX_SDB_INDEX(aux, aux->head); + head = aux_sdb_index(aux, aux->head); base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; offset = head % CPUM_SF_SDB_PER_TABLE; - cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); - cpuhw->lsctl.dear = aux->sdb_index[head]; - - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " - "index %ld tear %#lx dear %#lx\n", __func__, - aux->head, aux->alert_mark, aux->empty_mark, - head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); return 0; } @@ -1475,14 +1385,15 @@ static int aux_output_begin(struct perf_output_handle *handle, static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; te = aux_sdb_trailer(aux, alert_index); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - *overflow = orig_overflow = te->overflow; - if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + new.val = prev.val; + *overflow = prev.overflow; + if (prev.f) { /* * SDB is already set by hardware. * Abort and try to set somewhere @@ -1490,10 +1401,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, */ return false; } - new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 1; + new.overflow = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); return true; } @@ -1522,14 +1432,12 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; - unsigned long i, range_scan, idx, idx_old; + union hws_trailer_header prev, new; + unsigned long i, range_scan, idx; + unsigned long long orig_overflow; struct hws_trailer_entry *te; - debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " - "empty %ld\n", __func__, range, aux->head, - aux->alert_mark, aux->empty_mark); - if (range <= AUX_SDB_NUM_EMPTY(aux)) + if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. * Just set alert indicator. Should check race with hardware @@ -1550,30 +1458,27 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * Start scanning from one SDB behind empty_mark. If the new alert * indicator fall into this range, set it. */ - range_scan = range - AUX_SDB_NUM_EMPTY(aux); - idx_old = idx = aux->empty_mark + 1; + range_scan = range - aux_sdb_num_empty(aux); + idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + new.val = prev.val; + orig_overflow = prev.overflow; + new.f = 0; + new.overflow = 0; if (idx == aux->alert_mark) - new_flags |= SDB_TE_ALERT_REQ_MASK; + new.a = 1; else - new_flags &= ~SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); *overflow += orig_overflow; } /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; - debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " - "empty %ld\n", __func__, range_scan, idx_old, - idx - 1, aux->empty_mark); return true; } @@ -1590,12 +1495,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long num_sdb; aux = perf_get_aux(handle); - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Inform user space new data arrived */ - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; - debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__, size >> PAGE_SHIFT); perf_aux_output_end(handle, size); @@ -1607,12 +1512,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", num_sdb); - debug_sprintf_event(sfdbg, 1, - "%s: AUX buffer used up\n", - __func__); break; } - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Update head and alert_mark to new position */ @@ -1632,23 +1534,11 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " "pages to overflow\n", aux->sfb.num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " - "overflow %lld\n", __func__, - aux->head, range, overflow); } else { - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "already full, try another\n", - __func__, - aux->head, aux->alert_mark); } } - - if (done) - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "empty %ld\n", __func__, aux->head, - aux->alert_mark, aux->empty_mark); } /* @@ -1670,15 +1560,13 @@ static void aux_buffer_free(void *data) kfree(aux->sdbt_index); kfree(aux->sdb_index); kfree(aux); - - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); } static void aux_sdb_init(unsigned long sdb) { struct hws_trailer_entry *te; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); /* Save clock base */ te->clock_base = 1; @@ -1741,7 +1629,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, /* Allocate the first SDBT */ sfb->num_sdbt = 0; - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; @@ -1753,23 +1641,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ for (i = 0; i < nr_pages; i++, tail++) { if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + new = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!new) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys(new) + 1; tail = new; } /* Tail is the entry in a SDBT */ - *tail = (unsigned long)pages[i]; + *tail = virt_to_phys(pages[i]); aux->sdb_index[i] = (unsigned long)pages[i]; aux_sdb_init((unsigned long)pages[i]); } sfb->num_sdb = nr_pages; /* Link the last entry in the SDBT to the first SDBT */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; /* @@ -1779,9 +1667,6 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); - return aux; no_sdbt: @@ -1802,7 +1687,7 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } -/* Check if the new sampling period/freqeuncy is appropriate. +/* Check if the new sampling period/frequency is appropriate. * * Return non-zero on error and zero on passed checks. */ @@ -1814,8 +1699,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) memset(&si, 0, sizeof(si)); if (event->cpu == -1) { - if (qsi(&si)) - return -ENODEV; + qsi(&si); } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. @@ -1825,7 +1709,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) si = cpuhw->qsi; } - do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + do_freq = !!SAMPL_FREQ_MODE(&event->hw); rate = getrate(do_freq, value, &si); if (!rate) return -EINVAL; @@ -1833,10 +1717,6 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) event->attr.sample_period = rate; SAMPL_RATE(&event->hw) = rate; hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); - debug_sprintf_event(sfdbg, 4, "%s:" - " cpu %d value %#llx period %#llx freq %d\n", - __func__, event->cpu, value, - event->attr.sample_period, do_freq); return 0; } @@ -1847,12 +1727,8 @@ static void cpumsf_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + if (!(event->hw.state & PERF_HES_STOPPED)) return; - - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - perf_pmu_disable(event->pmu); event->hw.state = 0; cpuhw->lsctl.cs = 1; @@ -1877,7 +1753,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) event->hw.state |= PERF_HES_STOPPED; if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event, 1); + /* CPU hotplug off removes SDBs. No samples to extract. */ + if (cpuhw->flags & PMU_F_RESERVED) + hw_perf_event_update(event, 1); event->hw.state |= PERF_HES_UPTODATE; } perf_pmu_enable(event->pmu); @@ -1887,15 +1765,14 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); struct aux_buffer *aux; - int err; + int err = 0; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; - if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) + if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw)) return -EINVAL; - err = 0; perf_pmu_disable(event->pmu); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -1909,9 +1786,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) cpuhw->lsctl.h = 1; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); if (!SAMPL_DIAG_MODE(&event->hw)) { - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; - TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); + cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt; + TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt; } /* Ensure sampling functions are in the disabled state. If disabled, @@ -2055,18 +1932,17 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Program alert request */ if (alert & CPU_MF_INT_SF_PRA) { - if (cpuhw->flags & PMU_F_IN_USE) + if (cpuhw->flags & PMU_F_IN_USE) { if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) hw_collect_aux(cpuhw); else hw_perf_event_update(cpuhw->event, 0); - else - WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); + } } /* Report measurement alerts only for non-PRA codes */ if (alert != CPU_MF_INT_SF_PRA) - debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__, alert); /* Sampling authorization change request */ @@ -2082,7 +1958,7 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Invalid sampling buffer entry */ if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) { - pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n", + pr_err("A sampling buffer entry is incorrect (alert=%#x)\n", alert); cpuhw->flags |= PMU_F_ERR_IBE; sf_disable(); @@ -2094,7 +1970,7 @@ static int cpusf_pmu_setup(unsigned int cpu, int flags) /* Ignore the notification if no events are scheduled on the PMU. * This might be racy... */ - if (!atomic_read(&num_events)) + if (!refcount_read(&num_events)) return 0; local_irq_disable(); @@ -2156,10 +2032,12 @@ static const struct kernel_param_ops param_ops_sfb_size = { .get = param_get_sfb_size, }; -#define RS_INIT_FAILURE_QSI 0x0001 -#define RS_INIT_FAILURE_BSDES 0x0002 -#define RS_INIT_FAILURE_ALRT 0x0003 -#define RS_INIT_FAILURE_PERF 0x0004 +enum { + RS_INIT_FAILURE_BSDES = 2, /* Bad basic sampling size */ + RS_INIT_FAILURE_ALRT = 3, /* IRQ registration failure */ + RS_INIT_FAILURE_PERF = 4 /* PMU registration failure */ +}; + static void __init pr_cpumsf_err(unsigned int reason) { pr_err("Sampling facility support for perf is not available: " @@ -2175,11 +2053,7 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; memset(&si, 0, sizeof(si)); - if (qsi(&si)) { - pr_cpumsf_err(RS_INIT_FAILURE_QSI); - return -ENODEV; - } - + qsi(&si); if (!si.as && !si.ad) return -ENODEV; diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index c27321cb0969..2b9611c4718e 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -15,7 +15,10 @@ #include <linux/export.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/compat.h> #include <linux/sysfs.h> +#include <asm/stacktrace.h> #include <asm/irq.h> #include <asm/cpu_mf.h> #include <asm/lowcore.h> @@ -54,7 +57,7 @@ static unsigned long instruction_pointer_guest(struct pt_regs *regs) return sie_block(regs)->gpsw.addr; } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return is_in_guest(regs) ? instruction_pointer_guest(regs) : instruction_pointer(regs); @@ -81,7 +84,7 @@ static unsigned long perf_misc_flags_sf(struct pt_regs *regs) return flags; } -unsigned long perf_misc_flags(struct pt_regs *regs) +unsigned long perf_arch_misc_flags(struct pt_regs *regs) { /* Check if the cpum_sf PMU has created the pt_regs structure. * In this case, perf misc flags can be easily extracted. Otherwise, @@ -212,6 +215,12 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + arch_stack_walk_user_common(NULL, NULL, entry, regs, true); +} + /* Perf definitions for PMU event attributes in sysfs */ ssize_t cpumf_events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) @@ -219,5 +228,5 @@ ssize_t cpumf_events_sysfs_show(struct device *dev, struct perf_pmu_events_attr *pmu_attr; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%04llx\n", pmu_attr->id); + return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id); } diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..63875270941b 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -16,8 +16,7 @@ #include <linux/export.h> #include <linux/io.h> #include <linux/perf_event.h> - -#include <asm/ctl_reg.h> +#include <asm/ctlreg.h> #include <asm/pai.h> #include <asm/debug.h> @@ -35,13 +34,49 @@ struct pai_userdata { struct paicrypt_map { unsigned long *page; /* Page for CPU to store counters */ struct pai_userdata *save; /* Page to store no-zero counters */ - unsigned int users; /* # of PAI crypto users */ - unsigned int sampler; /* # of PAI crypto samplers */ - unsigned int counter; /* # of PAI crypto counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ struct perf_event *event; /* Perf event for sampling */ + struct list_head syswide_list; /* List system-wide sampling events */ +}; + +struct paicrypt_mapptr { + struct paicrypt_map *mapptr; }; -static DEFINE_PER_CPU(struct paicrypt_map, paicrypt_map); +static struct paicrypt_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct paicrypt_mapptr __percpu *mapptr; +} paicrypt_root; + +/* Free per CPU data when the last event is removed. */ +static void paicrypt_root_free(void) +{ + if (refcount_dec_and_test(&paicrypt_root.refcnt)) { + free_percpu(paicrypt_root.mapptr); + paicrypt_root.mapptr = NULL; + } + debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paicrypt_root.refcnt)); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paicrypt_root_alloc(void) +{ + if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) { + /* The memory is already zeroed. */ + paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr); + if (!paicrypt_root.mapptr) + return -ENOMEM; + refcount_set(&paicrypt_root.refcnt, 1); + } + return 0; +} /* Release the PMU if event is the last perf event */ static DEFINE_MUTEX(pai_reserve_mutex); @@ -49,38 +84,51 @@ static DEFINE_MUTEX(pai_reserve_mutex); /* Adjust usage counters and remove allocated memory when all users are * gone. */ -static void paicrypt_event_destroy(struct perf_event *event) +static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu) { - struct paicrypt_map *cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); + struct paicrypt_map *cpump = mp->mapptr; - cpump->event = NULL; - static_branch_dec(&pai_key); mutex_lock(&pai_reserve_mutex); - if (event->attr.sample_period) - cpump->sampler -= 1; - else - cpump->counter -= 1; - debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d" - " sampler %d counter %d\n", __func__, - event->attr.config, event->cpu, cpump->sampler, - cpump->counter); - if (!cpump->counter && !cpump->sampler) { + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d " + "refcnt %u\n", __func__, event->attr.config, + event->cpu, cpump->active_events, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) { debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", __func__, (unsigned long)cpump->page, cpump->save); free_page((unsigned long)cpump->page); - cpump->page = NULL; kvfree(cpump->save); - cpump->save = NULL; + kfree(cpump); + mp->mapptr = NULL; } + paicrypt_root_free(); mutex_unlock(&pai_reserve_mutex); } -static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) +static void paicrypt_event_destroy(struct perf_event *event) +{ + int cpu; + + static_branch_dec(&pai_key); + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + paicrypt_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paicrypt_event_destroy_cpu(event, event->cpu); + } +} + +static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) { if (kernel) nr += PAI_CRYPTO_MAXCTR; - return cpump->page[nr]; + return page[nr]; } /* Read the counter values. Return value from location in CMP. For event @@ -88,18 +136,19 @@ static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) */ static u64 paicrypt_getdata(struct perf_event *event, bool kernel) { - struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; u64 sum = 0; int i; if (event->attr.config != PAI_CRYPTO_BASE) { - return paicrypt_getctr(cpump, + return paicrypt_getctr(cpump->page, event->attr.config - PAI_CRYPTO_BASE, kernel); } for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = paicrypt_getctr(cpump, i, kernel); + u64 val = paicrypt_getctr(cpump->page, i, kernel); if (!val) continue; @@ -120,66 +169,110 @@ static u64 paicrypt_getall(struct perf_event *event) return sum; } -/* Used to avoid races in checking concurrent access of counting and - * sampling for crypto events - * - * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is - * allowed and when this event is running, no counting event is allowed. - * Several counting events are allowed in parallel, but no sampling event - * is allowed while one (or more) counting events are running. - * +/* Check concurrent access of counting and sampling for crypto events. * This function is called in process context and it is save to block. * When the event initialization functions fails, no other call back will * be invoked. * * Allocate the memory for the event. */ -static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump) +static struct paicrypt_map *paicrypt_busy(struct perf_event *event, int cpu) { - unsigned int *use_ptr; - int rc = 0; + struct paicrypt_map *cpump = NULL; + struct paicrypt_mapptr *mp; + int rc; mutex_lock(&pai_reserve_mutex); - if (a->sample_period) { /* Sampling requested */ - use_ptr = &cpump->sampler; - if (cpump->counter || cpump->sampler) - rc = -EBUSY; /* ... sampling/counting active */ - } else { /* Counting requested */ - use_ptr = &cpump->counter; - if (cpump->sampler) - rc = -EBUSY; /* ... and sampling active */ - } + + /* Allocate root node */ + rc = paicrypt_root_alloc(); if (rc) goto unlock; + /* Allocate node for this event */ + mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paicrypt_map allocated? */ + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) { + rc = -ENOMEM; + goto free_root; + } + INIT_LIST_HEAD(&cpump->syswide_list); + } + /* Allocate memory for counter page and counter extraction. * Only the first counting event has to allocate a page. */ - if (cpump->page) + if (cpump->page) { + refcount_inc(&cpump->refcnt); goto unlock; + } rc = -ENOMEM; cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!cpump->page) - goto unlock; + goto free_paicrypt_map; cpump->save = kvmalloc_array(paicrypt_cnt + 1, sizeof(struct pai_userdata), GFP_KERNEL); if (!cpump->save) { free_page((unsigned long)cpump->page); cpump->page = NULL; - goto unlock; + goto free_paicrypt_map; } - rc = 0; -unlock: - /* If rc is non-zero, do not increment counter/sampler. */ - if (!rc) - *use_ptr += 1; - debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d" - " counter %d page %#lx save %p rc %d\n", __func__, - a->sample_period, cpump->sampler, cpump->counter, + /* Set mode and reference count */ + rc = 0; + refcount_set(&cpump->refcnt, 1); + mp->mapptr = cpump; + debug_sprintf_event(cfm_dbg, 5, "%s users %d refcnt %u page %#lx " + "save %p rc %d\n", __func__, cpump->active_events, + refcount_read(&cpump->refcnt), (unsigned long)cpump->page, cpump->save, rc); + goto unlock; + +free_paicrypt_map: + /* Undo memory allocation */ + kfree(cpump); + mp->mapptr = NULL; +free_root: + paicrypt_root_free(); +unlock: mutex_unlock(&pai_reserve_mutex); + return rc ? ERR_PTR(rc) : cpump; +} + +static int paicrypt_event_init_all(struct perf_event *event) +{ + struct paicrypt_map *cpump; + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + cpump = paicrypt_busy(event, cpu); + if (IS_ERR(cpump)) { + for_each_cpu(cpu, maskptr) + paicrypt_event_destroy_cpu(event, cpu); + kfree(maskptr); + rc = PTR_ERR(cpump); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in paicrypt_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: return rc; } @@ -188,7 +281,7 @@ static int paicrypt_event_init(struct perf_event *event) { struct perf_event_attr *a = &event->attr; struct paicrypt_map *cpump; - int rc; + int rc = 0; /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) @@ -197,25 +290,29 @@ static int paicrypt_event_init(struct perf_event *event) if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) return -EINVAL; - /* Allow only CPU wide operation, no process context for now. */ - if (event->hw.target || event->cpu == -1) - return -ENOENT; - /* Allow only CRYPTO_ALL for sampling. */ + /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; + /* Get a page to store last counter values for sampling */ + if (a->sample_period) { + PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); + if (!PAI_SAVE_AREA(event)) { + rc = -ENOMEM; + goto out; + } + } - cpump = per_cpu_ptr(&paicrypt_map, event->cpu); - rc = paicrypt_busy(a, cpump); - if (rc) - return rc; - - /* Event initialization sets last_tag to 0. When later on the events - * are deleted and re-added, do not reset the event count value to zero. - * Events are added, deleted and re-added when 2 or more events - * are active at the same time. - */ - event->hw.last_tag = 0; - cpump->event = event; + if (event->cpu >= 0) { + cpump = paicrypt_busy(event, event->cpu); + if (IS_ERR(cpump)) + rc = PTR_ERR(cpump); + } else { + rc = paicrypt_event_init_all(event); + } + if (rc) { + free_page(PAI_SAVE_AREA(event)); + goto out; + } event->destroy = paicrypt_event_destroy; if (a->sample_period) { @@ -230,7 +327,8 @@ static int paicrypt_event_init(struct perf_event *event) } static_branch_inc(&pai_key); - return 0; +out: + return rc; } static void paicrypt_read(struct perf_event *event) @@ -247,76 +345,102 @@ static void paicrypt_read(struct perf_event *event) static void paicrypt_start(struct perf_event *event, int flags) { + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; u64 sum; - if (!event->hw.last_tag) { - event->hw.last_tag = 1; - sum = paicrypt_getall(event); /* Get current value */ - local64_set(&event->count, 0); + if (!event->attr.sample_period) { /* Counting */ + sum = paicrypt_getall(event); /* Get current value */ local64_set(&event->hw.prev_count, sum); + } else { /* Sampling */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } } } static int paicrypt_add(struct perf_event *event, int flags) { - struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; unsigned long ccd; - if (cpump->users++ == 0) { + if (++cpump->active_events == 1) { ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; - WRITE_ONCE(S390_lowcore.ccd, ccd); - __ctl_set_bit(0, 50); + WRITE_ONCE(get_lowcore()->ccd, ccd); + local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); } - cpump->event = event; - if (flags & PERF_EF_START && !event->attr.sample_period) { - /* Only counting needs initial counter value */ + if (flags & PERF_EF_START) paicrypt_start(event, PERF_EF_RELOAD); - } event->hw.state = 0; - if (event->attr.sample_period) - perf_sched_cb_inc(event->pmu); return 0; } +static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *); static void paicrypt_stop(struct perf_event *event, int flags) { - paicrypt_read(event); + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ + paicrypt_read(event); + } else { /* Sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + perf_sched_cb_dec(event->pmu); + list_del(PAI_SWLIST(event)); + } else { + paicrypt_have_sample(event, cpump); + cpump->event = NULL; + } + } event->hw.state = PERF_HES_STOPPED; } static void paicrypt_del(struct perf_event *event, int flags) { - struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); - - if (event->attr.sample_period) - perf_sched_cb_dec(event->pmu); - if (!event->attr.sample_period) - /* Only counting needs to read counter */ - paicrypt_stop(event, PERF_EF_UPDATE); - if (cpump->users-- == 1) { - __ctl_clear_bit(0, 50); - WRITE_ONCE(S390_lowcore.ccd, 0); + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + paicrypt_stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + WRITE_ONCE(get_lowcore()->ccd, 0); } } -/* Create raw data and save it in buffer. Returns number of bytes copied. - * Saves only positive counter entries of the form +/* Create raw data and save it in buffer. Calculate the delta for each + * counter between this invocation and the last invocation. + * Returns number of bytes copied. + * Saves only entries with positive counter difference of the form * 2 bytes: Number of counter * 8 bytes: Value of counter */ -static size_t paicrypt_copy(struct pai_userdata *userdata, - struct paicrypt_map *cpump, - bool exclude_user, bool exclude_kernel) +static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page, + unsigned long *page_old, bool exclude_user, + bool exclude_kernel) { int i, outidx = 0; for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = 0; + u64 val = 0, val_old = 0; - if (!exclude_kernel) - val += paicrypt_getctr(cpump, i, true); - if (!exclude_user) - val += paicrypt_getctr(cpump, i, false); + if (!exclude_kernel) { + val += paicrypt_getctr(page, i, true); + val_old += paicrypt_getctr(page_old, i, true); + } + if (!exclude_user) { + val += paicrypt_getctr(page, i, false); + val_old += paicrypt_getctr(page_old, i, false); + } + if (val >= val_old) + val -= val_old; + else + val = (~0ULL - val_old) + val + 1; if (val) { userdata[outidx].num = i; userdata[outidx].value = val; @@ -326,24 +450,14 @@ static size_t paicrypt_copy(struct pai_userdata *userdata, return outidx * sizeof(struct pai_userdata); } -static int paicrypt_push_sample(void) +static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, + struct perf_event *event) { - struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); - struct perf_event *event = cpump->event; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; - size_t rawsize; int overflow; - if (!cpump->event) /* No event active */ - return 0; - rawsize = paicrypt_copy(cpump->save, cpump, - cpump->event->attr.exclude_user, - cpump->event->attr.exclude_kernel); - if (!rawsize) /* No incremented counters */ - return 0; - /* Setup perf sample */ memset(®s, 0, sizeof(regs)); memset(&raw, 0, sizeof(raw)); @@ -364,27 +478,54 @@ static int paicrypt_push_sample(void) if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - raw.size = raw.frag.size; - data.raw = &raw; + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); perf_event_update_userpage(event); - /* Clear lowcore page after read */ - memset(cpump->page, 0, PAGE_SIZE); + /* Save crypto counter lowcore page after reading event data. */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); return overflow; } +/* Check if there is data to be saved on schedule out of a task. */ +static void paicrypt_have_sample(struct perf_event *event, + struct paicrypt_map *cpump) +{ + size_t rawsize; + + if (!event) /* No event active */ + return; + rawsize = paicrypt_copy(cpump->save, cpump->page, + (unsigned long *)PAI_SAVE_AREA(event), + event->attr.exclude_user, + event->attr.exclude_kernel); + if (rawsize) /* No incremented counters */ + paicrypt_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paicrypt_have_samples(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paicrypt_have_sample(event, cpump); +} + /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event CRYPTO_ALL is allowed. */ -static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in) +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, clear values. + * results on schedule_out and if page was dirty, save old values. */ if (!sched_in) - paicrypt_push_sample(); + paicrypt_have_samples(); } /* Attribute definitions for paicrypt interface. As with other CPU @@ -428,7 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = { /* Performance monitoring unit for mapped counters */ static struct pmu paicrypt = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = paicrypt_event_init, .add = paicrypt_add, .del = paicrypt_del, @@ -598,6 +739,22 @@ static const char * const paicrypt_ctrnames[] = { [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", [155] = "IBM_RESERVED_155", [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", }; static void __init attr_event_free(struct attribute **attrs, int num) @@ -619,6 +776,12 @@ static int __init attr_event_init_one(struct attribute **attrs, int num) { struct perf_pmu_events_attr *pa; + /* Index larger than array_size, no counter name available */ + if (num >= ARRAY_SIZE(paicrypt_ctrnames)) { + attrs[num] = NULL; + return 0; + } + pa = kzalloc(sizeof(*pa), GFP_KERNEL); if (!pa) return -ENOMEM; @@ -639,14 +802,13 @@ static int __init attr_event_init(void) struct attribute **attrs; int ret, i; - attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), - GFP_KERNEL); + attrs = kmalloc_array(paicrypt_cnt + 2, sizeof(*attrs), GFP_KERNEL); if (!attrs) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + for (i = 0; i <= paicrypt_cnt; i++) { ret = attr_event_init_one(attrs, i); if (ret) { - attr_event_free(attrs, i - 1); + attr_event_free(attrs, i); return ret; } } @@ -667,8 +829,10 @@ static int __init paicrypt_init(void) paicrypt_cnt = ib.num_cc; if (paicrypt_cnt == 0) return 0; - if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) - paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) { + pr_err("Too many PMU pai_crypto counters %d\n", paicrypt_cnt); + return -E2BIG; + } rc = attr_event_init(); /* Export known PAI crypto events */ if (rc) { diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 000000000000..fd14d5ebccbc --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[488]; +} __packed; + +struct paiext_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Area to store non-zero counters */ + unsigned int active_events; /* # of PAI Extension users */ + refcount_t refcnt; + struct perf_event *event; /* Perf event for sampling */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ + struct list_head syswide_list; /* List system-wide sampling events */ +}; + +struct paiext_mapptr { + struct paiext_map *mapptr; +}; + +static struct paiext_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct paiext_mapptr __percpu *mapptr; +} paiext_root; + +/* Free per CPU data when the last event is removed. */ +static void paiext_root_free(void) +{ + if (refcount_dec_and_test(&paiext_root.refcnt)) { + free_percpu(paiext_root.mapptr); + paiext_root.mapptr = NULL; + } + debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paiext_root.refcnt)); +} + +/* On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paiext_root_alloc(void) +{ + if (!refcount_inc_not_zero(&paiext_root.refcnt)) { + /* The memory is already zeroed. */ + paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); + if (!paiext_root.mapptr) { + /* Returning without refcnt adjustment is ok. The + * error code is handled by paiext_alloc() which + * decrements refcnt when an event can not be + * created. + */ + return -ENOMEM; + } + refcount_set(&paiext_root.refcnt, 1); + } + return 0; +} + +/* Protects against concurrent increment of sampler and counter member + * increments at the same time and prohibits concurrent execution of + * counting and sampling events. + * Ensures that analytics counter block is deallocated only when the + * sampling and counting on that cpu is zero. + * For details see paiext_alloc(). + */ +static DEFINE_MUTEX(paiext_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void paiext_free(struct paiext_mapptr *mp) +{ + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Release the PMU if event is the last perf event */ +static void paiext_event_destroy_cpu(struct perf_event *event, int cpu) +{ + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu); + struct paiext_map *cpump = mp->mapptr; + + mutex_lock(&paiext_reserve_mutex); + if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ + paiext_free(mp); + paiext_root_free(); + mutex_unlock(&paiext_reserve_mutex); +} + +static void paiext_event_destroy(struct perf_event *event) +{ + int cpu; + + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + paiext_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paiext_event_destroy_cpu(event, event->cpu); + } + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__, + event->cpu); +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for pai_extension events. + * + * Only one instance of event pai_ext/NNPA_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paiext_alloc_cpu(struct perf_event *event, int cpu) +{ + struct paiext_mapptr *mp; + struct paiext_map *cpump; + int rc; + + mutex_lock(&paiext_reserve_mutex); + rc = paiext_root_alloc(); + if (rc) + goto unlock; + + mp = per_cpu_ptr(paiext_root.mapptr, cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paiext_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) + goto undo; + + /* Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary alignment. + * - a 1KB byte block and requires 1KB boundary alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * need adjustment. + */ + mp->mapptr = cpump; + cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + cpump->save = kvmalloc_array(paiext_cnt + 1, + sizeof(struct pai_userdata), + GFP_KERNEL); + if (!cpump->save || !cpump->area || !cpump->paiext_cb) { + paiext_free(mp); + goto undo; + } + INIT_LIST_HEAD(&cpump->syswide_list); + refcount_set(&cpump->refcnt, 1); + rc = 0; + } else { + refcount_inc(&cpump->refcnt); + } + +undo: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event in not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + paiext_root_free(); + } +unlock: + mutex_unlock(&paiext_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +static int paiext_alloc(struct perf_event *event) +{ + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + rc = paiext_alloc_cpu(event, cpu); + if (rc) { + for_each_cpu(cpu, maskptr) + paiext_event_destroy_cpu(event, cpu); + kfree(maskptr); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in paicrypt_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + +/* The PAI extension 1 control block supports up to 128 entries. Return + * the index within PAIE1_CB given the event number. Also validate event + * number. + */ +static int paiext_event_valid(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { + /* Offset NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + return 0; + } + return -EINVAL; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI extension event must be valid and in supported range */ + rc = paiext_event_valid(event); + if (rc) + return rc; + /* Allow only event NNPA_ALL for sampling. */ + if (a->sample_period && a->config != PAI_NNPA_BASE) + return -EINVAL; + /* Prohibit exclude_user event selection */ + if (a->exclude_user) + return -EINVAL; + /* Get a page to store last counter values for sampling */ + if (a->sample_period) { + PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); + if (!PAI_SAVE_AREA(event)) + return -ENOMEM; + } + + if (event->cpu >= 0) + rc = paiext_alloc_cpu(event, event->cpu); + else + rc = paiext_alloc(event); + if (rc) { + free_page(PAI_SAVE_AREA(event)); + return rc; + } + event->destroy = paiext_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which are the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + return 0; +} + +static u64 paiext_getctr(unsigned long *area, int nr) +{ + return area[nr]; +} + +/* Read the counter values. Return value from location in buffer. For event + * NNPA_ALL sum up all events. + */ +static u64 paiext_getdata(struct perf_event *event) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_NNPA_BASE) + return paiext_getctr(cpump->area, + event->attr.config - PAI_NNPA_BASE); + + for (i = 1; i <= paiext_cnt; i++) + sum += paiext_getctr(cpump->area, i); + + return sum; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return paiext_getdata(event); +} + +static void paiext_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paiext_getall(event); + local64_set(&event->hw.prev_count, new); + delta = new - prev; + local64_add(delta, &event->count); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + u64 sum; + + if (!event->attr.sample_period) { /* Counting */ + sum = paiext_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } else { /* Sampling */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, + PAIE1_CTRBLOCK_SZ); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } + } +} + +static int paiext_add(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (++cpump->active_events == 1) { + get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); + } + if (flags & PERF_EF_START) + paiext_start(event, PERF_EF_RELOAD); + event->hw.state = 0; + return 0; +} + +static void paiext_have_sample(struct perf_event *, struct paiext_map *); +static void paiext_stop(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ + paiext_read(event); + } else { /* Sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_del(PAI_SWLIST(event)); + perf_sched_cb_dec(event->pmu); + } else { + paiext_have_sample(event, cpump); + cpump->event = NULL; + } + } + event->hw.state = PERF_HES_STOPPED; +} + +static void paiext_del(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + paiext_stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + /* Disable CPU instruction lookup for PAIE1 control block */ + local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); + pcb->acc = 0; + get_lowcore()->aicd = 0; + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area, + unsigned long *area_old) +{ + int i, outidx = 0; + + for (i = 1; i <= paiext_cnt; i++) { + u64 val = paiext_getctr(area, i); + u64 val_old = paiext_getctr(area_old, i); + + if (val >= val_old) + val -= val_old; + else + val = (~0ULL - val_old) + val + 1; + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write sample when one or more counters values are nonzero. + * + * Note: The function paiext_sched_task() and paiext_push_sample() are not + * invoked after function paiext_del() has been called because of function + * perf_sched_cb_dec(). + * The function paiext_sched_task() and paiext_push_sample() are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paiext_sched_task() as call back + * to run at context switch time (see paiext_add()). + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed an + * sched_task() callback. That callback is not active after paiext_del() + * returns and has deleted the event on that CPU. + */ +static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, + struct perf_event *event) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = smp_processor_id(); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, event, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Save NNPA lowcore area after read in event */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, + PAIE1_CTRBLOCK_SZ); + return overflow; +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paiext_have_sample(struct perf_event *event, + struct paiext_map *cpump) +{ + size_t rawsize; + + if (!event) + return; + rawsize = paiext_copy(cpump->save, cpump->area, + (unsigned long *)PAI_SAVE_AREA(event)); + if (rawsize) /* Incremented counters */ + paiext_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paiext_have_samples(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paiext_have_sample(event, cpump); +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, save old values. + */ + if (!sched_in) + paiext_have_samples(); +} + +/* Attribute definitions for pai extension1 interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1800 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography + * counters. + * Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_hw_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + struct device_attribute *dap; + int i; + + for (i = 0; i < num; i++) { + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + /* Index larger than array_size, no counter name available */ + if (num >= ARRAY_SIZE(paiext_ctrnames)) { + attrs[num] = NULL; + return 0; + } + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_NNPA_BASE + num; + pa->attr.attr.name = paiext_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(paiext_cnt + 2, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i <= paiext_cnt; i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 6e9e5d5e927e..a6b058ee4a36 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -5,8 +5,7 @@ #include <linux/errno.h> #include <linux/bug.h> #include <asm/ptrace.h> -#include <asm/fpu/api.h> -#include <asm/fpu/types.h> +#include <asm/fpu.h> u64 perf_reg_value(struct pt_regs *regs, int idx) { @@ -20,8 +19,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) return 0; idx -= PERF_REG_S390_FP0; - fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) - : current->thread.fpu.fprs[idx]; + fp = *(freg_t *)(current->thread.ufpu.vxrs + idx); return fp.ui; } @@ -63,6 +61,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, */ regs_user->regs = task_pt_regs(current); if (user_mode(regs_user->regs)) - save_fpu_regs(); + save_user_fpu_regs(); regs_user->abi = perf_reg_abi(current); } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 89949b9f3cf8..9637aee43c40 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -30,16 +30,20 @@ #include <linux/export.h> #include <linux/init_task.h> #include <linux/entry-common.h> +#include <linux/io.h> +#include <asm/guarded_storage.h> +#include <asm/access-regs.h> +#include <asm/switch_to.h> #include <asm/cpu_mf.h> -#include <asm/io.h> #include <asm/processor.h> +#include <asm/ptrace.h> #include <asm/vtimer.h> #include <asm/exec.h> +#include <asm/fpu.h> #include <asm/irq.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/stacktrace.h> -#include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/unwind.h> #include "entry.h" @@ -67,10 +71,10 @@ void flush_thread(void) void arch_setup_new_exec(void) { - if (S390_lowcore.current_pid != current->pid) { - S390_lowcore.current_pid = current->pid; + if (get_lowcore()->current_pid != current->pid) { + get_lowcore()->current_pid = current->pid; if (test_facility(40)) - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); } } @@ -82,15 +86,22 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { + save_user_fpu_regs(); + + *dst = *src; + dst->thread.kfpu_flags = 0; + /* - * Save the floating-point or vector register state of the current - * task and set the CIF_FPU flag to lazy restore the FPU register - * state when returning to user space. + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. */ - save_fpu_regs(); + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; - memcpy(dst, src, arch_task_struct_size); - dst->thread.fpu.regs = dst->thread.fpu.fprs; return 0; } @@ -124,21 +135,19 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.last_break = 1; frame->sf.back_chain = 0; - frame->sf.gprs[5] = (unsigned long)frame + sizeof(struct stack_frame); - frame->sf.gprs[6] = (unsigned long)p; + frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs; + frame->sf.gprs[12 - 6] = (unsigned long)p; /* new return point is ret_from_fork */ - frame->sf.gprs[8] = (unsigned long)ret_from_fork; + frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork; /* fake return stack for resume(), don't go back to schedule */ - frame->sf.gprs[9] = (unsigned long)frame; + frame->sf.gprs[15 - 6] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - frame->childregs.psw.addr = - (unsigned long)__ret_from_fork; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.gprs[9] = (unsigned long)args->fn; frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; @@ -150,13 +159,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { @@ -178,8 +185,23 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) void execve_tail(void) { - current->thread.fpu.fpc = 0; - asm volatile("sfpc %0" : : "d" (0)); + current->thread.ufpu.fpc = 0; + fpu_sfpc(0); +} + +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) +{ + save_user_fpu_regs(); + save_kernel_fpu_regs(&prev->thread); + save_access_regs(&prev->thread.acrs[0]); + save_ri_cb(prev->thread.ri_cb); + save_gs_cb(prev->thread.gs_cb); + update_cr_regs(next); + restore_kernel_fpu_regs(&next->thread); + restore_access_regs(&next->thread.acrs[0]); + restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); + restore_gs_cb(next->thread.gs_cb); + return __switch_to_asm(prev, next); } unsigned long __get_wchan(struct task_struct *p) @@ -214,13 +236,13 @@ unsigned long __get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index aa0e0e7fc773..11f70c1e2797 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -18,7 +18,9 @@ #include <linux/mm_types.h> #include <linux/delay.h> #include <linux/cpu.h> - +#include <linux/smp.h> +#include <asm/text-patching.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/facility.h> #include <asm/elf.h> @@ -72,7 +74,7 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) this_cpu = smp_processor_id(); if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { __this_cpu_write(cpu_relax_retry, 0); - cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + cpu = cpumask_next_wrap(this_cpu, cpumask); if (cpu >= nr_cpu_ids) return; if (arch_vcpu_is_preempted(cpu)) @@ -80,6 +82,23 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) } } +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} + /* * cpu_init - initializes state that is per-CPU. */ @@ -96,15 +115,6 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, current); } -/* - * cpu_have_feature - Test CPU features on module initialization - */ -int cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} -EXPORT_SYMBOL(cpu_have_feature); - static void show_facilities(struct seq_file *m) { unsigned int bit; @@ -201,21 +211,18 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_DFP; /* huge page support */ - if (MACHINE_HAS_EDAT1) + if (cpu_has_edat1()) elf_hwcap |= HWCAP_HPAGE; /* 64-bit register support for 31-bit processes */ elf_hwcap |= HWCAP_HIGH_GPRS; /* transactional execution */ - if (MACHINE_HAS_TE) + if (machine_has_tx()) elf_hwcap |= HWCAP_TE; - /* - * Vector extension can be disabled with the "novx" parameter. - * Use MACHINE_HAS_VX instead of facility bit 129. - */ - if (MACHINE_HAS_VX) { + /* vector */ + if (test_facility(129)) { elf_hwcap |= HWCAP_VXRS; if (test_facility(134)) elf_hwcap |= HWCAP_VXRS_BCD; @@ -239,10 +246,10 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_NNPA; /* guarded storage */ - if (MACHINE_HAS_GS) + if (cpu_has_gs()) elf_hwcap |= HWCAP_GS; - if (MACHINE_HAS_PCI_MIO) + if (test_machine_feature(MFEATURE_PCI_MIO)) elf_hwcap |= HWCAP_PCI_MIO; /* virtualization support */ @@ -261,31 +268,35 @@ static int __init setup_elf_platform(void) add_device_randomness(&cpu_id, sizeof(cpu_id)); switch (cpu_id.machine) { default: /* Use "z10" as default. */ - strcpy(elf_platform, "z10"); + strscpy(elf_platform, "z10"); break; case 0x2817: case 0x2818: - strcpy(elf_platform, "z196"); + strscpy(elf_platform, "z196"); break; case 0x2827: case 0x2828: - strcpy(elf_platform, "zEC12"); + strscpy(elf_platform, "zEC12"); break; case 0x2964: case 0x2965: - strcpy(elf_platform, "z13"); + strscpy(elf_platform, "z13"); break; case 0x3906: case 0x3907: - strcpy(elf_platform, "z14"); + strscpy(elf_platform, "z14"); break; case 0x8561: case 0x8562: - strcpy(elf_platform, "z15"); + strscpy(elf_platform, "z15"); break; case 0x3931: case 0x3932: - strcpy(elf_platform, "z16"); + strscpy(elf_platform, "z16"); + break; + case 0x9175: + case 0x9176: + strscpy(elf_platform, "z17"); break; } return 0; @@ -374,21 +385,3 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -int s390_isolate_bp(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp); - -int s390_isolate_bp_guest(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP_GUEST); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp_guest); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 53e0209229f8..e1240f6b29fa 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,10 +7,10 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include "asm/ptrace.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -24,12 +24,17 @@ #include <linux/seccomp.h> #include <linux/compat.h> #include <trace/syscall.h> +#include <asm/guarded_storage.h> +#include <asm/access-regs.h> #include <asm/page.h> #include <linux/uaccess.h> #include <asm/unistd.h> -#include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/facility.h> +#include <asm/machine.h> +#include <asm/ptrace.h> +#include <asm/rwonce.h> +#include <asm/fpu.h> #include "entry.h" @@ -41,17 +46,24 @@ void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; - struct per_regs old, new; union ctlreg0 cr0_old, cr0_new; union ctlreg2 cr2_old, cr2_new; int cr0_changed, cr2_changed; - - __ctl_store(cr0_old.val, 0, 0); - __ctl_store(cr2_old.val, 2, 2); + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } old, new; + + local_ctl_store(0, &cr0_old.reg); + local_ctl_store(2, &cr2_old.reg); cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ - if (MACHINE_HAS_TE) { + if (machine_has_tx()) { /* Set or clear transaction execution TXC bit 8. */ cr0_new.tcx = 1; if (task->thread.per_flags & PER_FLAG_NO_TE) @@ -66,7 +78,7 @@ void update_cr_regs(struct task_struct *task) } } /* Take care of enable/disable of guarded storage. */ - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { cr2_new.gse = 0; if (task->thread.gs_cb) cr2_new.gse = 1; @@ -75,38 +87,38 @@ void update_cr_regs(struct task_struct *task) cr0_changed = cr0_new.val != cr0_old.val; cr2_changed = cr2_new.val != cr2_old.val; if (cr0_changed) - __ctl_load(cr0_new.val, 0, 0); + local_ctl_load(0, &cr0_new.reg); if (cr2_changed) - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); /* Copy user specified PER registers */ - new.control = thread->per_user.control; - new.start = thread->per_user.start; - new.end = thread->per_user.end; + new.control.val = thread->per_user.control; + new.start.val = thread->per_user.start; + new.end.val = thread->per_user.end; /* merge TIF_SINGLE_STEP into user specified PER registers. */ if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) - new.control |= PER_EVENT_BRANCH; + new.control.val |= PER_EVENT_BRANCH; else - new.control |= PER_EVENT_IFETCH; - new.control |= PER_CONTROL_SUSPENSION; - new.control |= PER_EVENT_TRANSACTION_END; + new.control.val |= PER_EVENT_IFETCH; + new.control.val |= PER_CONTROL_SUSPENSION; + new.control.val |= PER_EVENT_TRANSACTION_END; if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) - new.control |= PER_EVENT_IFETCH; - new.start = 0; - new.end = -1UL; + new.control.val |= PER_EVENT_IFETCH; + new.start.val = 0; + new.end.val = -1UL; } /* Take care of the PER enablement bit in the PSW. */ - if (!(new.control & PER_EVENT_MASK)) { + if (!(new.control.val & PER_EVENT_MASK)) { regs->psw.mask &= ~PSW_MASK_PER; return; } regs->psw.mask |= PSW_MASK_PER; - __ctl_store(old, 9, 11); + __local_ctl_store(9, 11, old.regs); if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) - __ctl_load(new, 9, 11); + __local_ctl_load(9, 11, new.regs); } void user_enable_single_step(struct task_struct *task) @@ -238,22 +250,15 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) /* * floating point control reg. is in the thread structure */ - tmp = child->thread.fpu.fpc; + tmp = child->thread.ufpu.fpc; tmp <<= BITS_PER_LONG - 32; } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ offset = addr - offsetof(struct user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) - tmp = *(addr_t *) - ((addr_t) child->thread.fpu.vxrs + 2*offset); - else - tmp = *(addr_t *) - ((addr_t) child->thread.fpu.fprs + offset); - + tmp = *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset); } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. @@ -385,24 +390,16 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) /* * floating point control reg. is in the thread structure */ - if ((unsigned int) data != 0 || - test_fp_ctl(data >> (BITS_PER_LONG - 32))) + if ((unsigned int)data != 0) return -EINVAL; - child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); + child->thread.ufpu.fpc = data >> (BITS_PER_LONG - 32); } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ offset = addr - offsetof(struct user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) - *(addr_t *)((addr_t) - child->thread.fpu.vxrs + 2*offset) = data; - else - *(addr_t *)((addr_t) - child->thread.fpu.fprs + offset) = data; - + *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = data; } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. @@ -474,22 +471,20 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned long __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags &= ~PER_FLAG_NO_TE; return 0; case PTRACE_DISABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags |= PER_FLAG_NO_TE; child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; return 0; case PTRACE_TE_ABORT_RAND: - if (!MACHINE_HAS_TE || (child->thread.per_flags & PER_FLAG_NO_TE)) + if (!machine_has_tx() || (child->thread.per_flags & PER_FLAG_NO_TE)) return -EIO; switch (data) { case 0UL: @@ -617,21 +612,14 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) /* * floating point control reg. is in the thread structure */ - tmp = child->thread.fpu.fpc; + tmp = child->thread.ufpu.fpc; } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) - tmp = *(__u32 *) - ((addr_t) child->thread.fpu.vxrs + 2*offset); - else - tmp = *(__u32 *) - ((addr_t) child->thread.fpu.fprs + offset); - + tmp = *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset); } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. @@ -743,23 +731,14 @@ static int __poke_user_compat(struct task_struct *child, /* * floating point control reg. is in the thread structure */ - if (test_fp_ctl(tmp)) - return -EINVAL; - child->thread.fpu.fpc = data; + child->thread.ufpu.fpc = data; } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) - *(__u32 *)((addr_t) - child->thread.fpu.vxrs + 2*offset) = tmp; - else - *(__u32 *)((addr_t) - child->thread.fpu.fprs + offset) = tmp; - + *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = tmp; } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. @@ -824,9 +803,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned int __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned int __user *)data); } return compat_ptrace_request(child, request, addr, data); } @@ -892,10 +869,10 @@ static int s390_fpregs_get(struct task_struct *target, _s390_fp_regs fp_regs; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); - fp_regs.fpc = target->thread.fpu.fpc; - fpregs_store(&fp_regs, &target->thread.fpu); + fp_regs.fpc = target->thread.ufpu.fpc; + fpregs_store(&fp_regs, &target->thread.ufpu); return membuf_write(&to, &fp_regs, sizeof(fp_regs)); } @@ -909,23 +886,17 @@ static int s390_fpregs_set(struct task_struct *target, freg_t fprs[__NUM_FPRS]; if (target == current) - save_fpu_regs(); - - if (MACHINE_HAS_VX) - convert_vx_to_fp(fprs, target->thread.fpu.vxrs); - else - memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); - - /* If setting FPC, must validate it first. */ + save_user_fpu_regs(); + convert_vx_to_fp(fprs, target->thread.ufpu.vxrs); if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { - u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; + u32 ufpc[2] = { target->thread.ufpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; - if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + if (ufpc[1] != 0) return -EINVAL; - target->thread.fpu.fpc = ufpc[0]; + target->thread.ufpu.fpc = ufpc[0]; } if (rc == 0 && count > 0) @@ -933,12 +904,7 @@ static int s390_fpregs_set(struct task_struct *target, fprs, offsetof(s390_fp_regs, fprs), -1); if (rc) return rc; - - if (MACHINE_HAS_VX) - convert_fp_to_vx(target->thread.fpu.vxrs, fprs); - else - memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); - + convert_fp_to_vx(target->thread.ufpu.vxrs, fprs); return rc; } @@ -985,12 +951,12 @@ static int s390_vxrs_low_get(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.ufpu.vxrs[i].low; return membuf_write(&to, vxrs, sizeof(vxrs)); } @@ -1002,18 +968,18 @@ static int s390_vxrs_low_set(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i, rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.ufpu.vxrs[i].low; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + target->thread.ufpu.vxrs[i].low = vxrs[i]; return rc; } @@ -1022,11 +988,11 @@ static int s390_vxrs_high_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); - return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW, + save_user_fpu_regs(); + return membuf_write(&to, target->thread.ufpu.vxrs + __NUM_VXRS_LOW, __NUM_VXRS_HIGH * sizeof(__vector128)); } @@ -1037,13 +1003,13 @@ static int s390_vxrs_high_set(struct task_struct *target, { int rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1); + target->thread.ufpu.vxrs + __NUM_VXRS_LOW, 0, -1); return rc; } @@ -1070,7 +1036,7 @@ static int s390_gs_cb_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1087,7 +1053,7 @@ static int s390_gs_cb_set(struct task_struct *target, struct gs_cb gs_cb = { }, *data = NULL; int rc; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!target->thread.gs_cb) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -1111,7 +1077,7 @@ static int s390_gs_cb_set(struct task_struct *target, target->thread.gs_cb = data; *target->thread.gs_cb = gs_cb; if (target == current) { - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); restore_gs_cb(target->thread.gs_cb); } preempt_enable(); @@ -1124,7 +1090,7 @@ static int s390_gs_bc_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1138,7 +1104,7 @@ static int s390_gs_bc_set(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -1558,13 +1524,6 @@ static const char *gpr_names[NUM_GPRS] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", }; -unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) -{ - if (offset >= NUM_GPRS) - return 0; - return regs->gprs[offset]; -} - int regs_query_register_offset(const char *name) { unsigned long offset; @@ -1584,29 +1543,3 @@ const char *regs_query_register_name(unsigned int offset) return NULL; return gpr_names[offset]; } - -static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) -{ - unsigned long ksp = kernel_stack_pointer(regs); - - return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); -} - -/** - * regs_get_kernel_stack_nth() - get Nth entry of the stack - * @regs:pt_regs which contains kernel stack pointer. - * @n:stack entry number. - * - * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specifined by @regs. If the @n th entry is NOT in the kernel stack, - * this returns 0. - */ -unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) -{ - unsigned long addr; - - addr = kernel_stack_pointer(regs) + n * sizeof(long); - if (!regs_within_kernel_stack(regs, addr)) - return 0; - return *(unsigned long *)addr; -} diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 4a22163962eb..69fcaf54d5ca 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -9,6 +9,7 @@ #include <asm/asm-offsets.h> #include <asm/nospec-insn.h> #include <asm/sigp.h> +#include <asm/lowcore.h> GEN_BR_THUNK %r9 @@ -19,21 +20,16 @@ # r2 = Function to be called after store status # r3 = Parameter for function # -ENTRY(store_status) - /* Save register one and load save area base */ - stg %r1,__LC_SAVE_AREA_RESTART +SYM_CODE_START(store_status) + STMG_LC %r0,%r15,__LC_GPREGS_SAVE_AREA /* General purpose registers */ - lghi %r1,__LC_GPREGS_SAVE_AREA - stmg %r0,%r15,0(%r1) - mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + GET_LC %r13 /* Control registers */ - lghi %r1,__LC_CREGS_SAVE_AREA - stctg %c0,%c15,0(%r1) + stctg %c0,%c15,__LC_CREGS_SAVE_AREA(%r13) /* Access registers */ - lghi %r1,__LC_AREGS_SAVE_AREA - stam %a0,%a15,0(%r1) + stamy %a0,%a15,__LC_AREGS_SAVE_AREA(%r13) /* Floating point registers */ - lghi %r1,__LC_FPREGS_SAVE_AREA + lay %r1,__LC_FPREGS_SAVE_AREA(%r13) std %f0, 0x00(%r1) std %f1, 0x08(%r1) std %f2, 0x10(%r1) @@ -51,21 +47,21 @@ ENTRY(store_status) std %f14,0x70(%r1) std %f15,0x78(%r1) /* Floating point control register */ - lghi %r1,__LC_FP_CREG_SAVE_AREA + lay %r1,__LC_FP_CREG_SAVE_AREA(%r13) stfpc 0(%r1) /* CPU timer */ - lghi %r1,__LC_CPU_TIMER_SAVE_AREA + lay %r1,__LC_CPU_TIMER_SAVE_AREA(%r13) stpt 0(%r1) /* Store prefix register */ - lghi %r1,__LC_PREFIX_SAVE_AREA + lay %r1,__LC_PREFIX_SAVE_AREA(%r13) stpx 0(%r1) /* Clock comparator - seven bytes */ - lghi %r1,__LC_CLOCK_COMP_SAVE_AREA - larl %r4,.Lclkcmp + larl %r4,clkcmp stckc 0(%r4) + lay %r1,__LC_CLOCK_COMP_SAVE_AREA(%r13) mvc 1(7,%r1),1(%r4) /* Program status word */ - lghi %r1,__LC_PSW_SAVE_AREA + lay %r1,__LC_PSW_SAVE_AREA(%r13) epsw %r4,%r5 st %r4,0(%r1) st %r5,4(%r1) @@ -73,9 +69,9 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 -ENDPROC(store_status) +SYM_CODE_END(store_status) .section .bss - .align 8 -.Lclkcmp: .quad 0x0000000000000000 + .balign 8 +SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000) .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index a9a1a6f45375..0ae297c82afd 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -26,53 +26,51 @@ */ .text -ENTRY(relocate_kernel) - basr %r13,0 # base address - .base: - lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 - lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 - lg %r5,0(%r2) # read another word for indirection page - aghi %r2,8 # increment pointer - tml %r5,0x1 # is it a destination page? - je .indir_check # NO, goto "indir_check" - lgr %r6,%r5 # r6 = r5 - nill %r6,0xf000 # mask it out and... - j .base # ...next iteration - .indir_check: - tml %r5,0x2 # is it a indirection page? - je .done_test # NO, goto "done_test" - nill %r5,0xf000 # YES, mask out, - lgr %r2,%r5 # move it into the right register, - j .base # and read next... - .done_test: - tml %r5,0x4 # is it the done indicator? - je .source_test # NO! Well, then it should be the source indicator... - j .done # ok, lets finish it here... - .source_test: - tml %r5,0x8 # it should be a source indicator... - je .base # NO, ignore it... - lgr %r8,%r5 # r8 = r5 - nill %r8,0xf000 # masking - 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 - jo 0b - j .base - .done: - lgr %r0,%r4 # subcode - cghi %r3,0 - je .diag - la %r4,load_psw-.base(%r13) # load psw-address into the register - o %r3,4(%r4) # or load address into psw - st %r3,4(%r4) - mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 - .diag: - diag %r0,%r0,0x308 -ENDPROC(relocate_kernel) +SYM_CODE_START(relocate_kernel) + basr %r13,0 # base address +.base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration +.indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... +.done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... +.source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking +0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base +.done: + lgr %r0,%r4 # subcode + cghi %r3,0 + je .diag + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 +.diag: + diag %r0,%r0,0x308 +SYM_CODE_END(relocate_kernel) - .align 8 - load_psw: - .long 0x00080000,0x80000000 - relocate_kernel_end: - .align 8 - .globl relocate_kernel_len - relocate_kernel_len: - .quad relocate_kernel_end - relocate_kernel + .balign 8 +SYM_DATA_START_LOCAL(load_psw) + .long 0x00080000,0x80000000 +SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end) + .balign 8 +SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel) diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 000000000000..af10e6bdd34e --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. */ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 000000000000..32f069eed3f3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ebad41afe355..f244c5560e7f 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -52,13 +52,15 @@ #include <linux/hugetlb.h> #include <linux/kmemleak.h> +#include <asm/archrandom.h> #include <asm/boot_data.h> +#include <asm/machine.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> #include <asm/mmu_context.h> #include <asm/cpcmd.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/nmi.h> #include <asm/irq.h> #include <asm/page.h> @@ -73,7 +75,8 @@ #include <asm/numa.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> -#include <asm/mem_detect.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> #include <asm/uv.h> #include <asm/asm-offsets.h> #include "entry.h" @@ -95,10 +98,10 @@ EXPORT_SYMBOL(console_irq); * relocated above 2 GB, because it has to use 31 bit addresses. * Such code and data is part of the .amode31 section. */ -unsigned long __amode31_ref __samode31 = (unsigned long)&_samode31; -unsigned long __amode31_ref __eamode31 = (unsigned long)&_eamode31; -unsigned long __amode31_ref __stext_amode31 = (unsigned long)&_stext_amode31; -unsigned long __amode31_ref __etext_amode31 = (unsigned long)&_etext_amode31; +char __amode31_ref *__samode31 = _samode31; +char __amode31_ref *__eamode31 = _eamode31; +char __amode31_ref *__stext_amode31 = _stext_amode31; +char __amode31_ref *__etext_amode31 = _etext_amode31; struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; @@ -143,39 +146,41 @@ static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31; static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; -int __bootdata(noexec_disabled); -unsigned long __bootdata(ident_map_size); -struct mem_detect_info __bootdata(mem_detect); -struct initrd_data __bootdata(initrd_data); +unsigned long __bootdata_preserved(max_mappable); +struct physmem_info __bootdata(physmem_info); -unsigned long __bootdata_preserved(__kaslr_offset); -unsigned long __bootdata(__amode31_base); +struct vm_layout __bootdata_preserved(vm_layout); +EXPORT_SYMBOL(vm_layout); +int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); EXPORT_SYMBOL(stfle_fac_list); -u64 __bootdata_preserved(alt_stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -unsigned long VMALLOC_START; +char __bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); + +unsigned long __bootdata_preserved(VMALLOC_START); EXPORT_SYMBOL(VMALLOC_START); -unsigned long VMALLOC_END; +unsigned long __bootdata_preserved(VMALLOC_END); EXPORT_SYMBOL(VMALLOC_END); -struct page *vmemmap; +struct page *__bootdata_preserved(vmemmap); EXPORT_SYMBOL(vmemmap); -unsigned long vmemmap_size; +unsigned long __bootdata_preserved(vmemmap_size); -unsigned long MODULES_VADDR; -unsigned long MODULES_END; +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); /* An array with a pointer to the lowcore of every CPU. */ struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); -DEFINE_STATIC_KEY_FALSE(cpu_has_bear); - /* * The Write Back bit position in the physaddr is given by the SLPC PCI. * Leaving the mask zero always uses write through which is safe @@ -245,7 +250,7 @@ static void __init conmode_default(void) char query_buffer[1024]; char *ptr; - if (MACHINE_IS_VM) { + if (machine_is_vm()) { cpcmd("QUERY CONSOLE", query_buffer, 1024, NULL); console_devno = simple_strtoul(query_buffer + 5, NULL, 16); ptr = strstr(query_buffer, "SUBCHANNEL ="); @@ -283,7 +288,7 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } - } else if (MACHINE_IS_KVM) { + } else if (machine_is_kvm()) { if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) SET_CONSOLE_VT220; else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) @@ -304,7 +309,7 @@ static void __init setup_zfcpdump(void) return; if (oldmem_data.start) return; - strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE); console_loglevel = 2; } #else @@ -359,63 +364,30 @@ void *restart_stack; unsigned long stack_alloc(void) { -#ifdef CONFIG_VMAP_STACK - void *ret; + void *stack; - ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, - NUMA_NO_NODE, __builtin_return_address(0)); - kmemleak_not_leak(ret); - return (unsigned long)ret; -#else - return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); -#endif + stack = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(stack); + return (unsigned long)stack; } void stack_free(unsigned long stack) { -#ifdef CONFIG_VMAP_STACK - vfree((void *) stack); -#else - free_pages(stack, THREAD_SIZE_ORDER); -#endif + vfree((void *)stack); } -int __init arch_early_irq_init(void) +static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!stack) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = stack + STACK_INIT_OFFSET; - return 0; + stack = (unsigned long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); + return stack; } -void __init arch_call_rest_init(void) +static void __init setup_lowcore(void) { - unsigned long stack; - - stack = stack_alloc(); - if (!stack) - panic("Couldn't allocate kernel stack"); - current->stack = (void *) stack; -#ifdef CONFIG_VMAP_STACK - current->stack_vm_area = (void *) stack; -#endif - set_task_stack_end_magic(current); - stack += STACK_INIT_OFFSET; - S390_lowcore.kernel_stack = stack; - call_on_stack_noreturn(rest_init, stack); -} - -static void __init setup_lowcore_dat_off(void) -{ - unsigned long int_psw_mask = PSW_KERNEL_BITS; - unsigned long mcck_stack; - struct lowcore *lc; - - if (IS_ENABLED(CONFIG_KASAN)) - int_psw_mask |= PSW_MASK_DAT; + struct lowcore *lc, *abs_lc; /* * Setup lowcore for boot cpu @@ -426,44 +398,40 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) restart_int_handler; - lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->pcpu = (unsigned long)per_cpu_ptr(&pcpu_devices, 0); + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; - lc->nodat_stack = ((unsigned long) &init_thread_union) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; - lc->machine_flags = S390_lowcore.machine_flags; - lc->preempt_count = S390_lowcore.preempt_count; + lc->preempt_count = get_lowcore()->preempt_count; nmi_alloc_mcesa_early(&lc->mcesad); - lc->sys_enter_timer = S390_lowcore.sys_enter_timer; - lc->exit_timer = S390_lowcore.exit_timer; - lc->user_timer = S390_lowcore.user_timer; - lc->system_timer = S390_lowcore.system_timer; - lc->steal_timer = S390_lowcore.steal_timer; - lc->last_update_timer = S390_lowcore.last_update_timer; - lc->last_update_clock = S390_lowcore.last_update_clock; - + lc->sys_enter_timer = get_lowcore()->sys_enter_timer; + lc->exit_timer = get_lowcore()->exit_timer; + lc->user_timer = get_lowcore()->user_timer; + lc->system_timer = get_lowcore()->system_timer; + lc->steal_timer = get_lowcore()->steal_timer; + lc->last_update_timer = get_lowcore()->last_update_timer; + lc->last_update_clock = get_lowcore()->last_update_clock; /* * Allocate the global restart stack which is the same for - * all CPUs in cast *one* of them does a PSW restart. + * all CPUs in case *one* of them does a PSW restart. */ - restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!restart_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - restart_stack += STACK_INIT_OFFSET; - + restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET); + lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->kernel_stack = get_lowcore()->kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. This is necessary if @@ -473,47 +441,31 @@ static void __init setup_lowcore_dat_off(void) lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; lc->restart_source = -1U; - - mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!mcck_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; - - /* Setup absolute zero lowcore */ - put_abs_lowcore(restart_stack, lc->restart_stack); - put_abs_lowcore(restart_fn, lc->restart_fn); - put_abs_lowcore(restart_data, lc->restart_data); - put_abs_lowcore(restart_source, lc->restart_source); - put_abs_lowcore(restart_psw, lc->restart_psw); - lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = get_lowcore()->kernel_asce; + lc->user_asce = get_lowcore()->user_asce; + + system_ctlreg_init_save_area(lc); + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - struct lowcore *lc = lowcore_ptr[0]; - int cr; - - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_store(S390_lowcore.cregs_save_area, 0, 15); - __ctl_set_bit(0, 28); - put_abs_lowcore(restart_flags, RESTART_FLAG_CTLREGS); - put_abs_lowcore(program_new_psw, lc->program_new_psw); - for (cr = 0; cr < ARRAY_SIZE(lc->cregs_save_area); cr++) - put_abs_lowcore(cregs_save_area[cr], lc->cregs_save_area[cr]); + if (abs_lowcore_map(0, lowcore_ptr[0], false)) + panic("Couldn't setup absolute lowcore"); } static struct resource code_resource = { @@ -544,25 +496,22 @@ static void __init setup_resources(void) int j; u64 i; - code_resource.start = (unsigned long) _text; - code_resource.end = (unsigned long) _etext - 1; - data_resource.start = (unsigned long) _etext; - data_resource.end = (unsigned long) _edata - 1; - bss_resource.start = (unsigned long) __bss_start; - bss_resource.end = (unsigned long) __bss_stop - 1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext) - 1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata) - 1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; res->start = start; /* * In memblock, end points to the first byte after the - * range while in resourses, end points to the last byte in + * range while in resources, end points to the last byte in * the range. */ res->end = end - 1; @@ -574,10 +523,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -606,7 +552,6 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - memblock_remove(ident_map_size, PHYS_ADDR_MAX - ident_map_size); max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } @@ -638,6 +583,18 @@ static struct notifier_block kdump_mem_nb = { #endif /* + * Reserve page tables created by decompressor + */ +static void __init reserve_pgtables(void) +{ + unsigned long start, end; + struct reserved_range *range; + + for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end) + memblock_reserve(start, end - start); +} + +/* * Reserve memory for kdump kernel to be loaded with kexec */ static void __init reserve_crashkernel(void) @@ -647,8 +604,8 @@ static void __init reserve_crashkernel(void) phys_addr_t low, high; int rc; - rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size, - &crash_base); + rc = parse_crashkernel(boot_command_line, ident_map_size, + &crash_size, &crash_base, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); @@ -693,7 +650,7 @@ static void __init reserve_crashkernel(void) return; } - if (!oldmem_data.start && MACHINE_IS_VM) + if (!oldmem_data.start && machine_is_vm()) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -711,13 +668,13 @@ static void __init reserve_crashkernel(void) */ static void __init reserve_initrd(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!initrd_data.start || !initrd_data.size) + unsigned long addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size)) return; - initrd_start = (unsigned long)__va(initrd_data.start); - initrd_end = initrd_start + initrd_data.size; - memblock_reserve(initrd_data.start, initrd_data.size); -#endif + initrd_start = (unsigned long)__va(addr); + initrd_end = initrd_start + size; + memblock_reserve(addr, size); } /* @@ -729,80 +686,64 @@ static void __init reserve_certificate_list(void) memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); } -static void __init reserve_mem_detect_info(void) +static void __init reserve_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_reserve(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) + memblock_reserve(addr, size); } -static void __init free_mem_detect_info(void) +static void __init free_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_phys_free(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) + memblock_phys_free(addr, size); } -static const char * __init get_mem_info_source(void) -{ - switch (mem_detect.info_source) { - case MEM_DETECT_SCLP_STOR_INFO: - return "sclp storage info"; - case MEM_DETECT_DIAG260: - return "diag260"; - case MEM_DETECT_SCLP_READ_INFO: - return "sclp read info"; - case MEM_DETECT_BIN_SEARCH: - return "binary search"; - } - return "none"; -} - -static void __init memblock_add_mem_detect_info(void) +static void __init memblock_add_physmem_info(void) { unsigned long start, end; int i; pr_debug("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + get_physmem_info_source(), physmem_info.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) { + for_each_physmem_usable_range(i, &start, &end) memblock_add(start, end - start); + for_each_physmem_online_range(i, &start, &end) memblock_physmem_add(start, end - start); - } memblock_set_bottom_up(false); memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); } /* - * Check for initrd being in usable memory + * Reserve memory used for lowcore. */ -static void __init check_initrd(void) +static void __init reserve_lowcore(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (initrd_data.start && initrd_data.size && - !memblock_is_region_memory(initrd_data.start, initrd_data.size)) { - pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_phys_free(initrd_data.start, initrd_data.size); - initrd_start = initrd_end = 0; + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if (absolute_pointer(__identity_base) < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); } -#endif } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for absolute lowcore/command line/kernel image. */ static void __init reserve_kernel(void) { memblock_reserve(0, STARTUP_NORMAL_OFFSET); memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); - memblock_reserve(__amode31_base, __eamode31 - __samode31); + memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31); memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); memblock_reserve(__pa(_stext), _end - _stext); } @@ -824,15 +765,15 @@ static void __init setup_memory(void) static void __init relocate_amode31_section(void) { unsigned long amode31_size = __eamode31 - __samode31; - long amode31_offset = __amode31_base - __samode31; - long *ptr; + long amode31_offset, *ptr; + amode31_offset = AMODE31_START - (unsigned long)__samode31; pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); /* Move original AMODE31 section to the new one */ - memmove((void *)__amode31_base, (void *)__samode31, amode31_size); + memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size); /* Zero out the old AMODE31 section to catch invalid accesses within it */ - memset((void *)__samode31, 0, amode31_size); + memset(__samode31, 0, amode31_size); /* Update all AMODE31 region references */ for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++) @@ -851,15 +792,15 @@ static void __init setup_cr(void) __ctl_duct[4] = (unsigned long)__ctl_duald; /* Update control registers CR2, CR5 and CR15 */ - __ctl_store(cr2.val, 2, 2); - __ctl_store(cr5.val, 5, 5); - __ctl_store(cr15.val, 15, 15); + local_ctl_store(2, &cr2.reg); + local_ctl_store(5, &cr5.reg); + local_ctl_store(15, &cr15.reg); cr2.ducto = (unsigned long)__ctl_duct >> 6; cr5.pasteo = (unsigned long)__ctl_duct >> 6; cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3; - __ctl_load(cr2.val, 2, 2); - __ctl_load(cr5.val, 5, 5); - __ctl_load(cr15.val, 15, 15); + system_ctl_load(2, &cr2.reg); + system_ctl_load(5, &cr5.reg); + system_ctl_load(15, &cr15.reg); } /* @@ -869,9 +810,7 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); @@ -881,22 +820,6 @@ static void __init setup_randomness(void) } /* - * Find the correct size for the task_struct. This depends on - * the size of the struct fpu at the end of the thread_struct - * which is embedded in the task_struct. - */ -static void __init setup_task_size(void) -{ - int task_size = sizeof(struct task_struct); - - if (!MACHINE_HAS_VX) { - task_size -= sizeof(__vector128) * __NUM_VXRS; - task_size += sizeof(freg_t) * __NUM_FPRS; - } - arch_task_struct_size = task_size; -} - -/* * Issue diagnose 318 to set the control program name and * version codes. */ @@ -928,7 +851,7 @@ static void __init log_component_list(void) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); - ptr = (void *) early_ipl_comp_list_addr; + ptr = __va(early_ipl_comp_list_addr); end = (void *) ptr + early_ipl_comp_list_size; pr_info("The IPL report contains the following components:\n"); while (ptr < end) { @@ -947,6 +870,23 @@ static void __init log_component_list(void) } /* + * Print avoiding interpretation of % in buf and taking bootdebug option + * into consideration. + */ +static void __init print_rb_entry(const char *buf) +{ + char fmt[] = KERN_SOH "0boot: %s"; + int level = printk_get_level(buf); + + buf = skip_timestamp(printk_skip_level(buf)); + if (level == KERN_DEBUG[1] && (!bootdebug || !bootdebug_filter_match(buf))) + return; + + fmt[1] = level; + printk(fmt, buf); +} + +/* * Setup function called from init/main.c just after the banner * was printed. */ @@ -956,15 +896,21 @@ void __init setup_arch(char **cmdline_p) /* * print what head.S has found out about the machine */ - if (MACHINE_IS_VM) + if (machine_is_vm()) pr_info("Linux is running as a z/VM " "guest operating system in 64-bit mode\n"); - else if (MACHINE_IS_KVM) + else if (machine_is_kvm()) pr_info("Linux is running under KVM in 64-bit mode\n"); - else if (MACHINE_IS_LPAR) + else if (machine_is_lpar()) pr_info("Linux is running natively in 64-bit mode\n"); else pr_info("Linux is running as a guest in 64-bit mode\n"); + /* Print decompressor messages if not already printed */ + if (!boot_earlyprintk) + boot_rb_foreach(print_rb_entry); + + if (machine_has_relocated_lowcore()) + pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); log_component_list(); @@ -988,21 +934,22 @@ void __init setup_arch(char **cmdline_p) os_info_init(); setup_ipl(); - setup_task_size(); setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ + reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); - reserve_mem_detect_info(); + reserve_physmem_info(); memblock_set_current_limit(ident_map_size); memblock_allow_resize(); /* Get information about *all* installed memory */ - memblock_add_mem_detect_info(); + memblock_add_physmem_info(); - free_mem_detect_info(); + free_physmem_info(); setup_memory_end(); memblock_dump_all(); setup_memory(); @@ -1012,33 +959,29 @@ void __init setup_arch(char **cmdline_p) setup_uv(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); - if (MACHINE_HAS_EDAT2) + if (cpu_has_edat2()) hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - check_initrd(); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); numa_setup(); smp_detect_cpus(); topology_init_early(); - - if (test_facility(193)) - static_branch_enable(&cpu_has_bear); - + setup_protection_map(); /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. */ paging_init(); @@ -1046,7 +989,9 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); @@ -1062,3 +1007,8 @@ void __init setup_arch(char **cmdline_p) /* Add system specific data to the random pool */ setup_randomness(); } + +void __init arch_cpu_finalize_init(void) +{ + sclp_init(); +} diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 38258f817048..e48013cd832c 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/rseq.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> @@ -29,9 +30,9 @@ #include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> +#include <asm/vdso-symbols.h> +#include <asm/access-regs.h> #include <asm/lowcore.h> -#include <asm/switch_to.h> -#include <asm/vdso.h> #include "entry.h" /* @@ -108,7 +109,7 @@ struct rt_sigframe static void store_sigregs(void) { save_access_regs(current->thread.acrs); - save_fpu_regs(); + save_user_fpu_regs(); } /* Load registers after signal return */ @@ -130,7 +131,7 @@ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - fpregs_store(&user_sregs.fpregs, ¤t->thread.fpu); + fpregs_store(&user_sregs.fpregs, ¤t->thread.ufpu); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) return -EFAULT; return 0; @@ -149,10 +150,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) return -EINVAL; - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); @@ -168,7 +165,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); + fpregs_load(&user_sregs.fpregs, ¤t->thread.ufpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; @@ -182,13 +179,13 @@ static int save_sigregs_ext(struct pt_regs *regs, int i; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.ufpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, - current->thread.fpu.vxrs + __NUM_VXRS_LOW, + current->thread.ufpu.vxrs + __NUM_VXRS_LOW, sizeof(sregs_ext->vxrs_high))) return -EFAULT; } @@ -202,15 +199,15 @@ static int restore_sigregs_ext(struct pt_regs *regs, int i; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW, &sregs_ext->vxrs_high, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.ufpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -225,7 +222,7 @@ SYSCALL_DEFINE0(sigreturn) if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) goto badframe; set_current_blocked(&set); - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs(regs, &frame->sregs)) goto badframe; if (restore_sigregs_ext(regs, &frame->sregs_ext)) @@ -249,7 +246,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) @@ -300,7 +297,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, * included in the signal frame on a 31-bit system. */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); frame = get_sigframe(ka, regs, frame_size); if (frame == (void __user *) -1UL) @@ -377,7 +374,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, * included in the signal frame on a 31-bit system. */ uc_flags = 0; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); uc_flags |= UC_VXRS; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 30c91d565933..81f12bb77f62 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -18,6 +18,7 @@ #define KMSG_COMPONENT "cpu" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/export.h> @@ -36,16 +37,20 @@ #include <linux/sched/task_stack.h> #include <linux/crash_dump.h> #include <linux/kprobes.h> +#include <asm/access-regs.h> #include <asm/asm-offsets.h> +#include <asm/machine.h> +#include <asm/ctlreg.h> +#include <asm/pfault.h> #include <asm/diag.h> -#include <asm/switch_to.h> #include <asm/facility.h> +#include <asm/fpu.h> #include <asm/ipl.h> #include <asm/setup.h> #include <asm/irq.h> #include <asm/tlbflush.h> #include <asm/vtimer.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/sclp.h> #include <asm/debug.h> #include <asm/os_info.h> @@ -55,6 +60,7 @@ #include <asm/stacktrace.h> #include <asm/topology.h> #include <asm/vdso.h> +#include <asm/maccess.h> #include "entry.h" enum { @@ -70,18 +76,15 @@ enum { CPU_STATE_CONFIGURED, }; -static DEFINE_PER_CPU(struct cpu *, cpu_device); - -struct pcpu { - unsigned long ec_mask; /* bit mask for ec_xxx functions */ - unsigned long ec_clk; /* sigp timestamp for ec_xxx */ - signed char state; /* physical cpu state */ - signed char polarization; /* physical polarization */ - u16 address; /* physical cpu address */ -}; - static u8 boot_core_type; -static struct pcpu pcpu_devices[NR_CPUS]; +DEFINE_PER_CPU(struct pcpu, pcpu_devices); +/* + * Pointer to the pcpu area of the boot CPU. This is required when a restart + * interrupt is triggered on an offline CPU. For that case accessing percpu + * data with the common primitives does not work, since the percpu offset is + * stored in a non existent lowcore. + */ +static struct pcpu *ipl_pcpu; unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -96,13 +99,6 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; static unsigned int smp_max_threads __initdata = -1U; cpumask_t cpu_setup_mask; -static int __init early_nosmt(char *s) -{ - smp_max_threads = 1; - return 0; -} -early_param("nosmt", early_nosmt); - static int __init early_smt(char *s) { get_option(&s, &smp_max_threads); @@ -112,7 +108,7 @@ early_param("smt", early_smt); /* * The smp_cpu_state_mutex must be held when changing the state or polarization - * member of a pcpu data structure within the pcpu_devices arreay. + * member of a pcpu data structure within the pcpu_devices array. */ DEFINE_MUTEX(smp_cpu_state_mutex); @@ -172,8 +168,8 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) int cpu; for_each_cpu(cpu, mask) - if (pcpu_devices[cpu].address == address) - return pcpu_devices + cpu; + if (per_cpu(pcpu_devices, cpu).address == address) + return &per_cpu(pcpu_devices, cpu); return NULL; } @@ -199,7 +195,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) mcck_stack = stack_alloc(); if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - memcpy(lc, &S390_lowcore, 512); + memcpy(lc, get_lowcore(), 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; @@ -212,10 +208,14 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->preempt_count = PREEMPT_DISABLED; if (nmi_alloc_mcesa(&lc->mcesad)) goto out; + if (abs_lowcore_map(cpu, lc, true)) + goto out_mcesa; lowcore_ptr[cpu] = lc; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; +out_mcesa: + nmi_free_mcesa(&lc->mcesad); out: stack_free(mcck_stack); stack_free(async_stack); @@ -224,19 +224,18 @@ out: return -ENOMEM; } -static void pcpu_free_lowcore(struct pcpu *pcpu) +static void pcpu_free_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; async_stack = lc->async_stack - STACK_INIT_OFFSET; mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[cpu] = NULL; + abs_lowcore_unmap(cpu); nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); stack_free(mcck_stack); @@ -246,37 +245,37 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { - struct lowcore *lc = lowcore_ptr[cpu]; + struct lowcore *lc, *abs_lc; + lc = lowcore_ptr[cpu]; cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->pcpu = (unsigned long)pcpu; lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; - lc->kernel_asce = S390_lowcore.kernel_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; lc->user_asce = s390_invalid_asce; - lc->machine_flags = S390_lowcore.machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; - __ctl_store(lc->cregs_save_area, 0, 15); - lc->cregs_save_area[1] = lc->kernel_asce; + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); + lc->cregs_save_area[1] = lc->user_asce; lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); arch_spin_lock_setup(cpu); } -static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +static void pcpu_attach_task(int cpu, struct task_struct *tsk) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; - lc->kernel_stack = (unsigned long) task_stack_page(tsk) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = (unsigned long) tsk; + lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; + lc->current_task = (unsigned long)tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; lc->user_timer = tsk->thread.user_timer; @@ -287,18 +286,16 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) lc->steal_timer = 0; } -static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; lc->restart_source = -1U; - pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); + pcpu_sigp_retry(per_cpu_ptr(&pcpu_devices, cpu), SIGP_RESTART, 0); } typedef void (pcpu_delegate_fn)(void *); @@ -311,20 +308,23 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void pcpu_delegate(struct pcpu *pcpu, +static void pcpu_delegate(struct pcpu *pcpu, int cpu, pcpu_delegate_fn *func, void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned int source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; + + lc = lowcore_ptr[cpu]; + source_cpu = stap(); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, pcpu_delegate_fn *, func, void *, data); } /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0); /* Restart func on the target cpu and stop the current cpu. */ if (lc) { lc->restart_stack = stack; @@ -332,12 +332,13 @@ static void pcpu_delegate(struct pcpu *pcpu, lc->restart_data = (unsigned long)data; lc->restart_source = source_cpu; } else { - put_abs_lowcore(restart_stack, stack); - put_abs_lowcore(restart_fn, (unsigned long)func); - put_abs_lowcore(restart_data, (unsigned long)data); - put_abs_lowcore(restart_source, source_cpu); + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc); } - __bpon(); asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" @@ -364,38 +365,22 @@ static int pcpu_set_smt(unsigned int mtid) smp_cpu_mt_shift = 0; while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) smp_cpu_mt_shift++; - pcpu_devices[0].address = stap(); + per_cpu(pcpu_devices, 0).address = stap(); } return cc; } /* - * Call function on an online CPU. - */ -void smp_call_online_cpu(void (*func)(void *), void *data) -{ - struct pcpu *pcpu; - - /* Use the current cpu if it is online. */ - pcpu = pcpu_find_address(cpu_online_mask, stap()); - if (!pcpu) - /* Use the first online cpu. */ - pcpu = pcpu_devices + cpumask_first(cpu_online_mask); - pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); -} - -/* * Call function on the ipl CPU. */ void smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; - if (pcpu_devices[0].address == stap()) - lc = &S390_lowcore; + if (ipl_pcpu->address == stap()) + lc = get_lowcore(); - pcpu_delegate(&pcpu_devices[0], func, data, - lc->nodat_stack); + pcpu_delegate(ipl_pcpu, 0, func, data, lc->nodat_stack); } int smp_find_processor_id(u16 address) @@ -403,21 +388,21 @@ int smp_find_processor_id(u16 address) int cpu; for_each_present_cpu(cpu) - if (pcpu_devices[cpu].address == address) + if (per_cpu(pcpu_devices, cpu).address == address) return cpu; return -1; } void schedule_mcck_handler(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_mcck_pending); } bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; - if (pcpu_running(pcpu_devices + cpu)) + if (pcpu_running(per_cpu_ptr(&pcpu_devices, cpu))) return false; return true; } @@ -425,11 +410,11 @@ EXPORT_SYMBOL(arch_vcpu_is_preempted); void notrace smp_yield_cpu(int cpu) { - if (!MACHINE_HAS_DIAG9C) + if (!machine_has_diag9c()) return; diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); + : : "d" (per_cpu(pcpu_devices, cpu).address)); } EXPORT_SYMBOL_GPL(smp_yield_cpu); @@ -450,7 +435,7 @@ void notrace smp_emergency_stop(void) end = get_tod_clock() + (1000000UL << 12); for_each_cpu(cpu, &cpumask) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, 0, NULL) == SIGP_CC_BUSY && @@ -459,7 +444,7 @@ void notrace smp_emergency_stop(void) } while (get_tod_clock() < end) { for_each_cpu(cpu, &cpumask) - if (pcpu_stopped(pcpu_devices + cpu)) + if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu))) cpumask_clear_cpu(cpu, &cpumask); if (cpumask_empty(&cpumask)) break; @@ -474,10 +459,11 @@ NOKPROBE_SYMBOL(smp_emergency_stop); */ void smp_send_stop(void) { + struct pcpu *pcpu; int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); @@ -489,8 +475,9 @@ void smp_send_stop(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); - while (!pcpu_stopped(pcpu_devices + cpu)) + pcpu = per_cpu_ptr(&pcpu_devices, cpu); + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu)) cpu_relax(); } } @@ -504,7 +491,7 @@ static void smp_handle_ext_call(void) unsigned long bits; /* handle bit signal external calls */ - bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + bits = this_cpu_xchg(pcpu_devices.ec_mask, 0); if (test_bit(ec_stop_cpu, &bits)) smp_stop_cpu(); if (test_bit(ec_schedule, &bits)) @@ -512,7 +499,7 @@ static void smp_handle_ext_call(void) if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); if (test_bit(ec_mcck_pending, &bits)) - __s390_handle_mcck(); + s390_handle_mcck(); if (test_bit(ec_irq_work, &bits)) irq_work_run(); } @@ -529,12 +516,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) int cpu; for_each_cpu(cpu, mask) - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } void arch_send_call_function_single_ipi(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } /* @@ -542,63 +529,18 @@ void arch_send_call_function_single_ipi(int cpu) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_schedule); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_schedule); } #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_irq_work); } #endif -/* - * parameter area for the set/clear control bit callbacks - */ -struct ec_creg_mask_parms { - unsigned long orval; - unsigned long andval; - int cr; -}; - -/* - * callback for setting/clearing control bits - */ -static void smp_ctl_bit_callback(void *info) -{ - struct ec_creg_mask_parms *pp = info; - unsigned long cregs[16]; - - __ctl_store(cregs, 0, 15); - cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; - __ctl_load(cregs, 0, 15); -} - -static DEFINE_SPINLOCK(ctl_lock); - -void smp_ctl_set_clear_bit(int cr, int bit, bool set) -{ - struct ec_creg_mask_parms parms = { .cr = cr, }; - u64 ctlreg; - - if (set) { - parms.orval = 1UL << bit; - parms.andval = -1UL; - } else { - parms.orval = 0; - parms.andval = ~(1UL << bit); - } - spin_lock(&ctl_lock); - get_abs_lowcore(ctlreg, cregs_save_area[cr]); - ctlreg = (ctlreg & parms.andval) | parms.orval; - put_abs_lowcore(cregs_save_area[cr], ctlreg); - spin_unlock(&ctl_lock); - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_set_clear_bit); - #ifdef CONFIG_CRASH_DUMP int smp_store_status(int cpu) @@ -607,16 +549,16 @@ int smp_store_status(int cpu) struct pcpu *pcpu; unsigned long pa; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); lc = lowcore_ptr[cpu]; pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + if (!cpu_has_vx() && !cpu_has_gs()) return 0; pa = lc->mcesad & MCESA_ORIGIN_MASK; - if (MACHINE_HAS_GS) + if (cpu_has_gs()) pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) @@ -626,7 +568,7 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. - * There are four cases: + * There are three cases: * 1) standard zfcp/nvme dump * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected @@ -639,46 +581,45 @@ int smp_store_status(int cpu) * with sigp stop-and-store-status. The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the * memory of the old system. - * 3) kdump and the old kernel did not store the CPU state, - * or stand-alone kdump for DASD - * condition: OLDMEM_BASE != NULL && !is_kdump_kernel() + * 3) kdump or stand-alone kdump for DASD + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == false * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The kexec code or the boot-loader * stored the registers of the boot CPU in the memory of the old system. - * 4) kdump and the old kernel stored the CPU state - * condition: OLDMEM_BASE != NULL && is_kdump_kernel() - * This case does not exist for s390 anymore, setup_arch explicitly - * deactivates the elfcorehdr= kernel parameter + * + * Note that the legacy kdump mode where the old kernel stored the CPU states + * does no longer exist: setup_arch() explicitly deactivates the elfcorehdr= + * kernel parameter. The is_kdump_kernel() implementation on s390 is independent + * of the elfcorehdr= parameter. */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, __vector128 *vxrs) +static bool dump_available(void) { - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(vxrs)); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, void *regs) +void __init smp_save_dump_ipl_cpu(void) { - if (is_boot_cpu) - copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(regs)); + struct save_area *sa; + void *regs; + + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc_or_panic(512, 8); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (cpu_has_vx()) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - bool is_boot_cpu; void *page; - if (!(oldmem_data.start || is_ipl_type_dump())) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -691,26 +632,18 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); - if (!sa) - panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp/nvme dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || oldmem_data.start) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + sa = save_area_alloc(false); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (cpu_has_vx()) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } memblock_free(page, PAGE_SIZE); diag_amode31_ops.diag308_reset(); @@ -720,17 +653,36 @@ void __init smp_save_dump_cpus(void) void smp_cpu_set_polarization(int cpu, int val) { - pcpu_devices[cpu].polarization = val; + per_cpu(pcpu_devices, cpu).polarization = val; } int smp_cpu_get_polarization(int cpu) { - return pcpu_devices[cpu].polarization; + return per_cpu(pcpu_devices, cpu).polarization; +} + +void smp_cpu_set_capacity(int cpu, unsigned long val) +{ + per_cpu(pcpu_devices, cpu).capacity = val; +} + +unsigned long smp_cpu_get_capacity(int cpu) +{ + return per_cpu(pcpu_devices, cpu).capacity; +} + +void smp_set_core_capacity(int cpu, unsigned long val) +{ + int i; + + cpu = smp_get_base_cpu(cpu); + for (i = cpu; (i <= cpu + smp_cpu_mtid) && (i < nr_cpu_ids); i++) + smp_cpu_set_capacity(i, val); } int smp_cpu_get_cpu_address(int cpu) { - return pcpu_devices[cpu].address; + return per_cpu(pcpu_devices, cpu).address; } static void __ref smp_get_core_info(struct sclp_core_info *info, int early) @@ -754,8 +706,6 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) } } -static int smp_add_present_cpu(int cpu); - static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, bool configured, bool early) { @@ -771,15 +721,16 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { if (pcpu_find_address(cpu_present_mask, address + i)) continue; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); pcpu->address = address + i; if (configured) pcpu->state = CPU_STATE_CONFIGURED; else pcpu->state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); set_cpu_present(cpu, true); - if (!early && smp_add_present_cpu(cpu) != 0) + if (!early && arch_register_cpu(cpu)) set_cpu_present(cpu, false); else nr++; @@ -806,7 +757,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) * that all SMT threads get subsequent logical CPU numbers. */ if (early) { - core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + core_id = per_cpu(pcpu_devices, 0).address >> smp_cpu_mt_shift; for (i = 0; i < info->configured; i++) { core = &info->core[i]; if (core->core_id == core_id) { @@ -831,10 +782,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { @@ -853,6 +801,7 @@ void __init smp_detect_cpus(void) mtid = boot_core_type ? sclp.mtid : sclp.mtid_cp; mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1; pcpu_set_smt(mtid); + cpu_smt_set_num_threads(smp_cpu_mtid + 1, smp_cpu_mtid + 1); /* Print number of CPUs */ c_cpus = s_cpus = 0; @@ -866,9 +815,6 @@ void __init smp_detect_cpus(void) s_cpus += smp_cpu_mtid + 1; } pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); - - /* Add CPUs present at boot */ - __smp_rescan_cpus(info, true); memblock_free(info, sizeof(*info)); } @@ -877,17 +823,18 @@ void __init smp_detect_cpus(void) */ static void smp_start_secondary(void *cpuvoid) { + struct lowcore *lc = get_lowcore(); int cpu = raw_smp_processor_id(); - S390_lowcore.last_update_clock = get_tod_clock(); - S390_lowcore.restart_stack = (unsigned long)restart_stack; - S390_lowcore.restart_fn = (unsigned long)do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1U; - S390_lowcore.restart_flags = 0; - restore_access_regs(S390_lowcore.access_regs_save_area); + lc->last_update_clock = get_tod_clock(); + lc->restart_stack = (unsigned long)restart_stack; + lc->restart_fn = (unsigned long)do_restart; + lc->restart_data = 0; + lc->restart_source = -1U; + lc->restart_flags = 0; + restore_access_regs(lc->access_regs_save_area); cpu_init(); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); vdso_getcpu_init(); @@ -908,7 +855,7 @@ static void smp_start_secondary(void *cpuvoid) /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); int rc; if (pcpu->state != CPU_STATE_CONFIGURED) @@ -920,12 +867,18 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) rc = pcpu_alloc_lowcore(pcpu, cpu); if (rc) return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. + */ + system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); - pcpu_attach_task(pcpu, tidle); - pcpu_start_fn(pcpu, smp_start_secondary, NULL); + pcpu_attach_task(cpu, tidle); + pcpu_start_fn(cpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); + system_ctlreg_unlock(); return 0; } @@ -940,7 +893,7 @@ early_param("possible_cpus", _setup_possible_cpus); int __cpu_disable(void) { - unsigned long cregs[16]; + struct ctlreg cregs[16]; int cpu; /* Handle possible pending IPIs */ @@ -952,11 +905,11 @@ int __cpu_disable(void) /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. */ - __ctl_store(cregs, 0, 15); - cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ - cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ - cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ - __ctl_load(cregs, 0, 15); + __local_ctl_store(0, 15, cregs); + cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */ + __local_ctl_load(0, 15, cregs); clear_cpu_flag(CIF_NOHZ_DELAY); return 0; } @@ -966,19 +919,19 @@ void __cpu_die(unsigned int cpu) struct pcpu *pcpu; /* Wait until target cpu is down */ - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); while (!pcpu_stopped(pcpu)) cpu_relax(); - pcpu_free_lowcore(pcpu); + pcpu_free_lowcore(pcpu, cpu); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); + pcpu->flags = 0; } void __noreturn cpu_die(void) { idle_task_exit(); - __bpon(); - pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + pcpu_sigp_retry(this_cpu_ptr(&pcpu_devices), SIGP_STOP, 0); for (;;) ; } @@ -997,30 +950,36 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); - /* request the 0x1202 external call external interrupt */ + system_ctl_set_bit(0, 14); if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); + system_ctl_set_bit(0, 13); + smp_rescan_cpus(true); } void __init smp_prepare_boot_cpu(void) { - struct pcpu *pcpu = pcpu_devices; + struct lowcore *lc = get_lowcore(); WARN_ON(!cpu_present(0) || !cpu_online(0)); - pcpu->state = CPU_STATE_CONFIGURED; - S390_lowcore.percpu_offset = __per_cpu_offset[0]; + lc->percpu_offset = __per_cpu_offset[0]; + ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0); + ipl_pcpu->state = CPU_STATE_CONFIGURED; + lc->pcpu = (unsigned long)ipl_pcpu; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(0, CPU_CAPACITY_HIGH); } void __init smp_setup_processor_id(void) { - pcpu_devices[0].address = stap(); - S390_lowcore.cpu_nr = 0; - S390_lowcore.spinlock_lockval = arch_spin_lockval(0); - S390_lowcore.spinlock_index = 0; + struct lowcore *lc = get_lowcore(); + + lc->cpu_nr = 0; + per_cpu(pcpu_devices, 0).address = stap(); + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; } /* @@ -1040,7 +999,7 @@ static ssize_t cpu_configure_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + count = sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -1060,15 +1019,13 @@ static ssize_t cpu_configure_store(struct device *dev, cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); rc = -EBUSY; - /* disallow configuration changes of online cpus and cpu 0 */ + /* disallow configuration changes of online cpus */ cpu = dev->id; cpu = smp_get_base_cpu(cpu); - if (cpu == 0) - goto out; for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); rc = 0; switch (val) { case 0: @@ -1080,7 +1037,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_STANDBY; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1095,7 +1052,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_CONFIGURED; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_CONFIGURED; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1114,7 +1071,7 @@ static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); + return sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address); } static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); @@ -1140,35 +1097,34 @@ static struct attribute_group cpu_online_attr_group = { static int smp_cpu_online(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - return sysfs_create_group(&s->kobj, &cpu_online_attr_group); + return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group); } static int smp_cpu_pre_down(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group); return 0; } -static int smp_add_present_cpu(int cpu) +bool arch_cpu_is_hotpluggable(int cpu) +{ + return !!cpu; +} + +int arch_register_cpu(int cpu) { - struct device *s; - struct cpu *c; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); int rc; - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - per_cpu(cpu_device, cpu) = c; - s = &c->dev; - c->hotpluggable = 1; + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); rc = register_cpu(c, cpu); if (rc) goto out; - rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + rc = sysfs_create_group(&c->dev.kobj, &cpu_common_attr_group); if (rc) goto out_cpu; rc = topology_cpu_init(c); @@ -1177,14 +1133,14 @@ static int smp_add_present_cpu(int cpu) return 0; out_topology: - sysfs_remove_group(&s->kobj, &cpu_common_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_common_attr_group); out_cpu: unregister_cpu(c); out: return rc; } -int __ref smp_rescan_cpus(void) +int __ref smp_rescan_cpus(bool early) { struct sclp_core_info *info; int nr; @@ -1193,7 +1149,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - nr = __smp_rescan_cpus(info, false); + nr = __smp_rescan_cpus(info, early); kfree(info); if (nr) topology_schedule_update(); @@ -1210,7 +1166,7 @@ static ssize_t __ref rescan_store(struct device *dev, rc = lock_device_hotplug_sysfs(); if (rc) return rc; - rc = smp_rescan_cpus(); + rc = smp_rescan_cpus(false); unlock_device_hotplug(); return rc ? rc : count; } @@ -1218,77 +1174,19 @@ static DEVICE_ATTR_WO(rescan); static int __init s390_smp_init(void) { - int cpu, rc = 0; + struct device *dev_root; + int rc; - rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); - if (rc) - return rc; - for_each_present_cpu(cpu) { - rc = smp_add_present_cpu(cpu); + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_rescan); + put_device(dev_root); if (rc) - goto out; + return rc; } - rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); rc = rc <= 0 ? rc : 0; -out: return rc; } subsys_initcall(s390_smp_init); - -static __always_inline void set_new_lowcore(struct lowcore *lc) -{ - union register_pair dst, src; - u32 pfx; - - src.even = (unsigned long) &S390_lowcore; - src.odd = sizeof(S390_lowcore); - dst.even = (unsigned long) lc; - dst.odd = sizeof(*lc); - pfx = __pa(lc); - - asm volatile( - " mvcl %[dst],%[src]\n" - " spx %[pfx]\n" - : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) - : [pfx] "Q" (pfx) - : "memory", "cc"); -} - -static int __init smp_reinit_ipl_cpu(void) -{ - unsigned long async_stack, nodat_stack, mcck_stack; - struct lowcore *lc, *lc_ipl; - unsigned long flags, cr0; - u64 mcesad; - - lc_ipl = lowcore_ptr[0]; - lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); - nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - async_stack = stack_alloc(); - mcck_stack = stack_alloc(); - if (!lc || !nodat_stack || !async_stack || !mcck_stack || nmi_alloc_mcesa(&mcesad)) - panic("Couldn't allocate memory"); - - local_irq_save(flags); - local_mcck_disable(); - set_new_lowcore(lc); - S390_lowcore.nodat_stack = nodat_stack + STACK_INIT_OFFSET; - S390_lowcore.async_stack = async_stack + STACK_INIT_OFFSET; - S390_lowcore.mcck_stack = mcck_stack + STACK_INIT_OFFSET; - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - S390_lowcore.mcesad = mcesad; - __ctl_load(cr0, 0, 0); - lowcore_ptr[0] = lc; - local_mcck_enable(); - local_irq_restore(flags); - - free_pages(lc_ipl->async_stack - STACK_INIT_OFFSET, THREAD_SIZE_ORDER); - memblock_free_late(__pa(lc_ipl->mcck_stack - STACK_INIT_OFFSET), THREAD_SIZE); - memblock_free_late(__pa(lc_ipl), sizeof(*lc_ipl)); - - return 0; -} -early_initcall(smp_reinit_ipl_cpu); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 7ee455e8e3d5..b153a395f46d 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -5,10 +5,15 @@ * Copyright IBM Corp. 2006 */ +#include <linux/perf_event.h> #include <linux/stacktrace.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <asm/asm-offsets.h> #include <asm/stacktrace.h> #include <asm/unwind.h> #include <asm/kprobes.h> +#include <asm/ptrace.h> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -40,12 +45,12 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; -#ifdef CONFIG_KPROBES +#ifdef CONFIG_RETHOOK /* - * Mark stacktraces with kretprobed functions on them + * Mark stacktraces with krethook functions on them * as unreliable. */ - if (state.ip == (unsigned long)__kretprobe_trampoline) + if (state.ip == (unsigned long)arch_rethook_trampoline) return -EINVAL; #endif @@ -58,3 +63,103 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; return 0; } + +static inline bool store_ip(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, bool perf, + unsigned long ip) +{ +#ifdef CONFIG_PERF_EVENTS + if (perf) { + if (perf_callchain_store(entry, ip)) + return false; + return true; + } +#endif + return consume_entry(cookie, ip); +} + +static inline bool ip_invalid(unsigned long ip) +{ + /* + * Perform some basic checks if an instruction address taken + * from unreliable source is invalid. + */ + if (ip & 1) + return true; + if (ip < mmap_min_addr) + return true; + if (ip >= current->mm->context.asce_limit) + return true; + return false; +} + +static inline bool ip_within_vdso(unsigned long ip) +{ + return in_range(ip, current->mm->context.vdso_base, vdso_text_size()); +} + +void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, + const struct pt_regs *regs, bool perf) +{ + struct stack_frame_vdso_wrapper __user *sf_vdso; + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + if (!current->mm) + return; + ip = instruction_pointer(regs); + if (!store_ip(consume_entry, cookie, entry, perf, ip)) + return; + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (1) { + if (__get_user(sp, &sf->back_chain)) + break; + /* + * VDSO entry code has a non-standard stack frame layout. + * See VDSO user wrapper code for details. + */ + if (!sp && ip_within_vdso(ip)) { + sf_vdso = (void __user *)sf; + if (__get_user(ip, &sf_vdso->return_address)) + break; + sp = (unsigned long)sf + STACK_FRAME_VDSO_OVERHEAD; + sf = (void __user *)sp; + if (__get_user(sp, &sf->back_chain)) + break; + } else { + sf = (void __user *)sp; + if (__get_user(ip, &sf->gprs[8])) + break; + } + /* Sanity check: ABI requires SP to be 8 byte aligned. */ + if (sp & 0x7) + break; + if (ip_invalid(ip)) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (!first) + break; + ip = regs->gprs[14]; + if (ip_invalid(ip)) + break; + } + if (!store_ip(consume_entry, cookie, entry, perf, ip)) + break; + first = false; + } + pagefault_enable(); +} + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + arch_stack_walk_user_common(consume_entry, cookie, NULL, regs, false); +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 4d141e2c132e..d40f0b983e74 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -17,6 +17,7 @@ #include <asm/ebcdic.h> #include <asm/facility.h> #include <asm/sthyi.h> +#include <asm/asm.h> #include "entry.h" #define DED_WEIGHT 0xffff @@ -300,32 +301,57 @@ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, return (struct diag204_x_part_block *)&block->cpus[i]; } -static void fill_diag(struct sthyi_sctns *sctns) +static void *diag204_get_data(bool diag204_allow_busy) { - int i, r, pages; - bool this_lpar; + unsigned long subcode; void *diag204_buf; + int pages, rc; + + subcode = DIAG204_SUBC_RSI; + subcode |= DIAG204_INFO_EXT; + pages = diag204(subcode, 0, NULL); + if (pages < 0) + return ERR_PTR(pages); + if (pages == 0) + return ERR_PTR(-ENODATA); + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + subcode = DIAG204_SUBC_STIB7; + subcode |= DIAG204_INFO_EXT; + if (diag204_has_bif() && diag204_allow_busy) + subcode |= DIAG204_BIF_BIT; + rc = diag204(subcode, pages, diag204_buf); + if (rc < 0) { + vfree(diag204_buf); + return ERR_PTR(rc); + } + return diag204_buf; +} + +static bool is_diag204_cached(struct sthyi_sctns *sctns) +{ + /* + * Check if validity bits are set when diag204 data + * is gathered. + */ + if (sctns->par.infpval1) + return true; + return false; +} + +static void fill_diag(struct sthyi_sctns *sctns, void *diag204_buf) +{ + int i; + bool this_lpar; void *diag224_buf = NULL; struct diag204_x_info_blk_hdr *ti_hdr; struct diag204_x_part_block *part_block; struct diag204_x_phys_block *phys_block; struct lpar_cpu_inf lpar_inf = {}; - /* Errors are handled through the validity bits in the response. */ - pages = diag204((unsigned long)DIAG204_SUBC_RSI | - (unsigned long)DIAG204_INFO_EXT, 0, NULL); - if (pages <= 0) - return; - - diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); - if (!diag204_buf) - return; - - r = diag204((unsigned long)DIAG204_SUBC_STIB7 | - (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); - if (r < 0) - goto out; - diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!diag224_buf || diag224(diag224_buf)) goto out; @@ -390,7 +416,6 @@ static void fill_diag(struct sthyi_sctns *sctns) out: free_page((unsigned long)diag224_buf); - vfree(diag204_buf); } static int sthyi(u64 vaddr, u64 *rc) @@ -401,30 +426,41 @@ static int sthyi(u64 vaddr, u64 *rc) asm volatile( ".insn rre,0xB2560000,%[r1],%[r2]\n" - "ipm %[cc]\n" - "srl %[cc],28\n" - : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [r2] "+&d" (r2.pair) : [r1] "d" (r1.pair) - : "memory", "cc"); + : CC_CLOBBER_LIST("memory")); *rc = r2.odd; - return cc; + return CC_TRANSFORM(cc); } static int fill_dst(void *dst, u64 *rc) { + void *diag204_buf; + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; /* * If the facility is on, we don't want to emulate the instruction. * We ask the hypervisor to provide the data. */ - if (test_facility(74)) + if (test_facility(74)) { + memset(dst, 0, PAGE_SIZE); return sthyi((u64)dst, rc); - + } + /* + * When emulating, if diag204 returns BUSY don't reset dst buffer + * and use cached data. + */ + *rc = 0; + diag204_buf = diag204_get_data(is_diag204_cached(sctns)); + if (IS_ERR(diag204_buf)) + return PTR_ERR(diag204_buf); + memset(dst, 0, PAGE_SIZE); fill_hdr(sctns); fill_stsi(sctns); - fill_diag(sctns); - *rc = 0; + fill_diag(sctns, diag204_buf); + vfree(diag204_buf); return 0; } @@ -443,11 +479,14 @@ static int sthyi_update_cache(u64 *rc) { int r; - memset(sthyi_cache.info, 0, PAGE_SIZE); r = fill_dst(sthyi_cache.info, rc); - if (r) - return r; - sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + if (r == 0) { + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + } else if (r == -EBUSY) { + /* mark as expired and return 0 to keep using cached data */ + sthyi_cache.end = jiffies - 1; + r = 0; + } return r; } @@ -459,9 +498,9 @@ static int sthyi_update_cache(u64 *rc) * * Fills the destination with system information returned by the STHYI * instruction. The data is generated by emulation or execution of STHYI, - * if available. The return value is the condition code that would be - * returned, the rc parameter is the return code which is passed in - * register R2 + 1. + * if available. The return value is either a negative error value or + * the condition code that would be returned, the rc parameter is the + * return code which is passed in register R2 + 1. */ int sthyi_fill(void *dst, u64 *rc) { diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index dc2355c623d6..4fee74553ca2 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -12,6 +12,7 @@ * platform. */ +#include <linux/cpufeature.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/mm.h> @@ -38,33 +39,6 @@ #include "entry.h" -/* - * Perform the mmap() system call. Linux for S/390 isn't able to handle more - * than 5 system call parameters, so this system call uses a memory block - * for parameter passing. - */ - -struct s390_mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) -{ - struct s390_mmap_arg_struct a; - int error = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -out: - return error; -} - #ifdef CONFIG_SYSVIPC /* * sys_ipc() is the de-multiplexer for the SysV IPC calls. @@ -108,25 +82,35 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -static void do_syscall(struct pt_regs *regs) +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { unsigned long nr; + add_random_kstack_offset(); + enter_from_user_mode(regs); + regs->psw = get_lowcore()->svc_old_psw; + regs->int_code = get_lowcore()->svc_int_code; + update_timer_sys(); + if (cpu_has_bear()) + current->thread.last_break = regs->last_break; + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + if (unlikely(per_trap)) + set_thread_flag(TIF_PER_TRAP); + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); nr = regs->int_code & 0xffff; - if (!nr) { + if (likely(!nr)) { nr = regs->gprs[1] & 0xffff; regs->int_code &= ~0xffffUL; regs->int_code |= nr; } - regs->gprs[2] = nr; - if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { regs->psw.addr = current->restart_block.arch_data; current->restart_block.arch_data = 1; } nr = syscall_enter_from_user_mode_work(regs, nr); - /* * In the s390 ptrace ABI, both the syscall number and the return value * use gpr2. However, userspace puts the syscall number either in the @@ -134,37 +118,11 @@ static void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ - if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) goto out; regs->gprs[2] = -ENOSYS; - if (likely(nr >= NR_syscalls)) - goto out; - do { + if (likely(nr < NR_syscalls)) regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); out: - syscall_exit_to_user_mode_work(regs); -} - -void noinstr __do_syscall(struct pt_regs *regs, int per_trap) -{ - add_random_kstack_offset(); - enter_from_user_mode(regs); - regs->psw = S390_lowcore.svc_old_psw; - regs->int_code = S390_lowcore.svc_int_code; - update_timer_sys(); - if (static_branch_likely(&cpu_has_bear)) - current->thread.last_break = regs->last_break; - - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - - if (per_trap) - set_thread_flag(TIF_PER_TRAP); - - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - exit_to_user_mode(); + syscall_exit_to_user_mode(regs); } diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index fb85e797946d..c5d958a09ff4 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -4,15 +4,15 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm uapi := $(gen)/uapi/asm -syscall := $(srctree)/$(src)/syscall.tbl -systbl := $(srctree)/$(src)/syscalltbl +syscall := $(src)/syscall.tbl +systbl := $(src)/syscalltbl gen-y := $(kapi)/syscall_table.h kapi-hdrs-y := $(kapi)/unistd_nr.h uapi-hdrs-y := $(uapi)/unistd_32.h uapi-hdrs-y += $(uapi)/unistd_64.h -targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) +targets += $(addprefix ../../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) PHONY += kapi uapi @@ -23,23 +23,26 @@ uapi: $(uapi-hdrs-y) # Create output directory if not already present $(shell mkdir -p $(uapi) $(kapi)) -filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$@" < $< > $@ -filechk_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< +quiet_cmd_sysnr = SYSNR $@ + cmd_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< > $@ -filechk_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< +quiet_cmd_syscalls = SYSTBL $@ + cmd_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< > $@ syshdr_abi_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) FORCE - $(call filechk,syshdr,$@) +$(uapi)/unistd_32.h: $(syscall) $(systbl) FORCE + $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) FORCE - $(call filechk,syshdr,$@) +$(uapi)/unistd_64.h: $(syscall) $(systbl) FORCE + $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) FORCE - $(call filechk,syscalls) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE + $(call if_changed,syscalls) sysnr_abi_unistd_nr := common,32,64 -$(kapi)/unistd_nr.h: $(syscall) FORCE - $(call filechk,sysnr) +$(kapi)/unistd_nr.h: $(syscall) $(systbl) FORCE + $(call if_changed,sysnr) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 799147658dee..a4569b96ef06 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -100,7 +100,7 @@ 106 common stat sys_newstat compat_sys_newstat 107 common lstat sys_newlstat compat_sys_newlstat 108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +110 common lookup_dcookie - - 111 common vhangup sys_vhangup sys_vhangup 112 common idle - - 114 common wait4 sys_wait4 compat_sys_wait4 @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive @@ -449,7 +449,24 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self -# 447 reserved for memfd_secret +447 common memfd_secret sys_memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue sys_futex_requeue +457 common statmount sys_statmount sys_statmount +458 common listmount sys_listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal sys_mseal +463 common setxattrat sys_setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index b5e364358ce4..1ea84e942bd4 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -5,6 +5,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com>, */ +#include <linux/cpufeature.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -15,54 +16,17 @@ #include <linux/export.h> #include <linux/slab.h> #include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> #include <asm/cpcmd.h> #include <asm/topology.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> +#include <asm/asm.h> int topology_max_mnest; -static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) -{ - int r0 = (fc << 28) | sel1; - int rc = 0; - - asm volatile( - " lr 0,%[r0]\n" - " lr 1,%[r1]\n" - " stsi 0(%[sysinfo])\n" - "0: jz 2f\n" - "1: lhi %[rc],%[retval]\n" - "2: lr %[r0],0\n" - EX_TABLE(0b, 1b) - : [r0] "+d" (r0), [rc] "+d" (rc) - : [r1] "d" (sel2), - [sysinfo] "a" (sysinfo), - [retval] "K" (-EOPNOTSUPP) - : "cc", "0", "1", "memory"); - *lvl = ((unsigned int) r0) >> 28; - return rc; -} - -/* - * stsi - store system information - * - * Returns the current configuration level if function code 0 was specified. - * Otherwise returns 0 on success or a negative value on error. - */ -int stsi(void *sysinfo, int fc, int sel1, int sel2) -{ - int lvl, rc; - - rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); - if (rc) - return rc; - return fc ? 0 : lvl; -} -EXPORT_SYMBOL(stsi); - #ifdef CONFIG_PROC_FS static bool convert_ext_name(unsigned char encoding, char *name, size_t len) @@ -81,10 +45,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len) static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { + bool has_var_cap; int i; if (stsi(info, 1, 1, 1)) return; + has_var_cap = !!info->model_var_cap[0]; EBCASC(info->manufacturer, sizeof(info->manufacturer)); EBCASC(info->type, sizeof(info->type)); EBCASC(info->model, sizeof(info->model)); @@ -93,6 +59,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) EBCASC(info->model_capacity, sizeof(info->model_capacity)); EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + if (has_var_cap) + EBCASC(info->model_var_cap, sizeof(info->model_var_cap)); seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); seq_printf(m, "Type: %-4.4s\n", info->type); if (info->lic) @@ -120,12 +88,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", info->model_temp_cap, info->model_temp_cap_rating); + if (has_var_cap && info->model_var_cap_rating) + seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n", + info->model_var_cap, + info->model_var_cap_rating); if (info->ncr) seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); if (info->npr) seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); if (info->ntr) seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (has_var_cap && info->nvr) + seq_printf(m, "Nominal Var. Rating: %08u\n", info->nvr); if (info->cai) { seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); @@ -144,7 +118,7 @@ static void stsi_15_1_x(struct seq_file *m, struct sysinfo_15_1_x *info) int i; seq_putc(m, '\n'); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; if (stsi(info, 15, 1, topology_max_mnest)) return; @@ -387,7 +361,7 @@ static void service_level_vm_print(struct seq_file *m, { char *query_buffer, *str; - query_buffer = kmalloc(1024, GFP_KERNEL | GFP_DMA); + query_buffer = kmalloc(1024, GFP_KERNEL); if (!query_buffer) return; cpcmd("QUERY CPLEVEL", query_buffer, 1024, NULL); @@ -405,7 +379,7 @@ static struct service_level service_level_vm = { static __init int create_proc_service_level(void) { proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops); - if (MACHINE_IS_VM) + if (machine_is_vm()) register_service_level(&service_level_vm); return 0; } @@ -416,9 +390,9 @@ subsys_initcall(create_proc_service_level); */ void s390_adjust_jiffies(void) { + DECLARE_KERNEL_FPU_ONSTACK16(fpu); struct sysinfo_1_2_2 *info; unsigned long capability; - struct kernel_fpu fpu; info = (void *) get_zeroed_page(GFP_KERNEL); if (!info) @@ -437,21 +411,14 @@ void s390_adjust_jiffies(void) * point division .. */ kernel_fpu_begin(&fpu, KERNEL_FPR); - asm volatile( - " sfpc %3\n" - " l %0,%1\n" - " tmlh %0,0xff80\n" - " jnz 0f\n" - " cefbr %%f2,%0\n" - " j 1f\n" - "0: le %%f2,%1\n" - "1: cefbr %%f0,%2\n" - " debr %%f0,%%f2\n" - " cgebr %0,5,%%f0\n" - : "=&d" (capability) - : "Q" (info->capability), "d" (10000000), "d" (0) - : "cc" - ); + fpu_sfpc(0); + if (info->capability & 0xff800000) + fpu_ldgr(2, info->capability); + else + fpu_cefbr(2, info->capability); + fpu_cefbr(0, 10000000); + fpu_debr(0, 2); + capability = fpu_cgebr(0, 5); kernel_fpu_end(&fpu, KERNEL_FPR); } else /* @@ -495,7 +462,6 @@ static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ .open = stsi_open_##fc##_##s1##_##s2, \ .release = stsi_release, \ .read = stsi_read, \ - .llseek = no_llseek, \ }; static int stsi_release(struct inode *inode, struct file *file) @@ -557,7 +523,7 @@ static __init int stsi_init_debugfs(void) sf = &stsi_file[i]; debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); } - if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && cpu_has_topology()) { char link_to[10]; sprintf(link_to, "15_1_%d", topology_mnest_limit()); diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S index 2c8b14cc5556..26f2981aa09e 100644 --- a/arch/s390/kernel/text_amode31.S +++ b/arch/s390/kernel/text_amode31.S @@ -18,8 +18,7 @@ * affects a few functions that are not performance-relevant. */ .macro BR_EX_AMODE31_r14 - larl %r1,0f - ex 0,0(%r1) + exrl 0,0f j . 0: br %r14 .endm @@ -27,7 +26,7 @@ /* * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode) */ -ENTRY(_diag14_amode31) +SYM_FUNC_START(_diag14_amode31) lgr %r1,%r2 lgr %r2,%r3 lgr %r3,%r4 @@ -42,12 +41,12 @@ ENTRY(_diag14_amode31) lgfr %r2,%r5 BR_EX_AMODE31_r14 EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault) -ENDPROC(_diag14_amode31) +SYM_FUNC_END(_diag14_amode31) /* * int _diag210_amode31(struct diag210 *addr) */ -ENTRY(_diag210_amode31) +SYM_FUNC_START(_diag210_amode31) lgr %r1,%r2 lhi %r2,-1 sam31 @@ -60,12 +59,25 @@ ENTRY(_diag210_amode31) lgfr %r2,%r2 BR_EX_AMODE31_r14 EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault) -ENDPROC(_diag210_amode31) +SYM_FUNC_END(_diag210_amode31) /* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +SYM_FUNC_START(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +SYM_FUNC_END(_diag8c_amode31) +/* * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) */ -ENTRY(_diag26c_amode31) +SYM_FUNC_START(_diag26c_amode31) lghi %r5,-EOPNOTSUPP sam31 diag %r2,%r4,0x26c @@ -74,42 +86,42 @@ ENTRY(_diag26c_amode31) lgfr %r2,%r5 BR_EX_AMODE31_r14 EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex) -ENDPROC(_diag26c_amode31) +SYM_FUNC_END(_diag26c_amode31) /* - * void _diag0c_amode31(struct hypfs_diag0c_entry *entry) + * void _diag0c_amode31(unsigned long rx) */ -ENTRY(_diag0c_amode31) +SYM_FUNC_START(_diag0c_amode31) sam31 diag %r2,%r2,0x0c sam64 BR_EX_AMODE31_r14 -ENDPROC(_diag0c_amode31) +SYM_FUNC_END(_diag0c_amode31) /* * void _diag308_reset_amode31(void) * * Calls diag 308 subcode 1 and continues execution */ -ENTRY(_diag308_reset_amode31) - larl %r4,.Lctlregs # Save control registers +SYM_FUNC_START(_diag308_reset_amode31) + larl %r4,ctlregs # Save control registers stctg %c0,%c15,0(%r4) lg %r2,0(%r4) # Disable lowcore protection nilh %r2,0xefff - larl %r4,.Lctlreg0 + larl %r4,ctlreg0 stg %r2,0(%r4) lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register + larl %r4,fpctl # Floating point control register stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register + larl %r4,prefix # Save prefix register stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 + larl %r4,prefix_zero # Set prefix register to 0 spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags + larl %r4,continue_psw # Save PSW flags epsw %r2,%r3 stm %r2,%r3,0(%r4) larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0 - larl %r3,.Lrestart_diag308_psw + larl %r3,restart_diag308_psw og %r4,0(%r3) # Save PSW lghi %r3,0 sturg %r4,%r3 # Use sturg, because of large pages @@ -121,39 +133,26 @@ ENTRY(_diag308_reset_amode31) lhi %r1,2 # Use mode 2 = ESAME (dump) sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers + larl %r4,ctlregs # Restore control registers lctlg %c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register + larl %r4,fpctl # Restore floating point ctl register lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register + larl %r4,prefix # Restore prefix register spx 0(%r4) - larl %r4,.Lcontinue_psw # Restore PSW flags + larl %r4,continue_psw # Restore PSW flags larl %r2,.Lcontinue stg %r2,8(%r4) lpswe 0(%r4) .Lcontinue: BR_EX_AMODE31_r14 -ENDPROC(_diag308_reset_amode31) +SYM_FUNC_END(_diag308_reset_amode31) .section .amode31.data,"aw",@progbits -.align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 - -.align 8 -.Lcontinue_psw: - .quad 0,0 - -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 + .balign 8 +SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000) +SYM_DATA_LOCAL(continue_psw, .quad 0,0) +SYM_DATA_LOCAL(ctlreg0, .quad 0) +SYM_DATA_LOCAL(ctlregs, .fill 16,8,0) +SYM_DATA_LOCAL(fpctl, .long 0) +SYM_DATA_LOCAL(prefix, .long 0) +SYM_DATA_LOCAL(prefix_zero, .long 0) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6b7b6d5e3632..fed17d407a44 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -36,7 +36,6 @@ #include <linux/profile.h> #include <linux/timex.h> #include <linux/notifier.h> -#include <linux/timekeeper_internal.h> #include <linux/clockchips.h> #include <linux/gfp.h> #include <linux/kprobes.h> @@ -55,10 +54,10 @@ #include <asm/cio.h> #include "entry.h" -union tod_clock tod_clock_base __section(".data"); +union tod_clock __bootdata_preserved(tod_clock_base); EXPORT_SYMBOL_GPL(tod_clock_base); -u64 clock_comparator_max = -1ULL; +u64 __bootdata_preserved(clock_comparator_max); EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -80,12 +79,10 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; - int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - for (cs = 0; cs < CS_BASES; cs++) - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -102,6 +99,11 @@ void __init time_early_init(void) ((long) qui.old_leap * 4096000000L); } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -126,7 +128,7 @@ void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = clock_comparator_max; + get_lowcore()->clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -134,8 +136,8 @@ void clock_comparator_work(void) static int s390_next_event(unsigned long delta, struct clock_event_device *evt) { - S390_lowcore.clock_comparator = get_tod_clock() + delta; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = get_tod_clock() + delta; + set_clock_comparator(get_lowcore()->clock_comparator); return 0; } @@ -148,8 +150,8 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = clock_comparator_max; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = clock_comparator_max; + set_clock_comparator(get_lowcore()->clock_comparator); cpu = smp_processor_id(); cd = &per_cpu(comparators, cpu); @@ -168,10 +170,10 @@ void init_cpu_timer(void) clockevents_register_device(cd); /* Enable clock comparator timer interrupt. */ - __ctl_set_bit(0,11); + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT); /* Always allow the timing alert external interrupt. */ - __ctl_set_bit(0, 4); + local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT); } static void clock_comparator_interrupt(struct ext_code ext_code, @@ -179,8 +181,8 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == clock_comparator_max) - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator == clock_comparator_max) + set_clock_comparator(get_lowcore()->clock_comparator); } static void stp_timing_alert(struct stp_irq_parm *); @@ -246,10 +248,11 @@ static struct clocksource clocksource_tod = { .rating = 400, .read = read_tod_clock, .mask = CLOCKSOURCE_MASK(64), - .mult = 1000, - .shift = 12, + .mult = 4096000, + .shift = 24, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .vdso_clock_mode = VDSO_CLOCKMODE_TOD, + .id = CSID_S390_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -368,7 +371,6 @@ static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; - int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -384,10 +386,8 @@ static void clock_sync_global(long delta) panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - for (cs = 0; cs < CS_BASES; cs++) { - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; - vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; - } + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_delta = tod_steering_delta; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) @@ -403,12 +403,12 @@ static void clock_sync_global(long delta) static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != clock_comparator_max) { - S390_lowcore.clock_comparator += delta; - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator != clock_comparator_max) { + get_lowcore()->clock_comparator += delta; + set_clock_comparator(get_lowcore()->clock_comparator); } /* Adjust the last_update_clock time-stamp. */ - S390_lowcore.last_update_clock += delta; + get_lowcore()->last_update_clock += delta; } /* Single threaded workqueue used for stp sync events */ @@ -463,6 +463,12 @@ static void __init stp_reset(void) } } +bool stp_enabled(void) +{ + return test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online; +} +EXPORT_SYMBOL(stp_enabled); + static void stp_timeout(struct timer_list *unused) { queue_work(time_sync_wq, &stp_work); @@ -651,12 +657,12 @@ static void stp_check_leap(void) if (ret < 0) pr_err("failed to set leap second flags\n"); /* arm Timer to clear leap second flags */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(14400)); } else { /* The day the leap second is scheduled for hasn't been reached. Retry * in one hour. */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(3600)); } } @@ -674,7 +680,7 @@ static void stp_work_fn(struct work_struct *work) if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); - del_timer_sync(&stp_timer); + timer_delete_sync(&stp_timer); goto out_unlock; } @@ -697,7 +703,7 @@ static void stp_work_fn(struct work_struct *work) if (!check_sync_clock()) /* - * There is a usable clock but the synchonization failed. + * There is a usable clock but the synchronization failed. * Retry after a second. */ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); @@ -711,7 +717,7 @@ out_unlock: /* * STP subsys sysfs interface functions */ -static struct bus_type stp_subsys = { +static const struct bus_type stp_subsys = { .name = "stp", .dev_name = "stp", }; @@ -724,8 +730,8 @@ static ssize_t ctn_id_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%016lx\n", - *(unsigned long *) stp_info.ctnid); + ret = sysfs_emit(buf, "%016lx\n", + *(unsigned long *)stp_info.ctnid); mutex_unlock(&stp_mutex); return ret; } @@ -740,7 +746,7 @@ static ssize_t ctn_type_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.ctn); + ret = sysfs_emit(buf, "%i\n", stp_info.ctn); mutex_unlock(&stp_mutex); return ret; } @@ -755,7 +761,7 @@ static ssize_t dst_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x2000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.dsto); mutex_unlock(&stp_mutex); return ret; } @@ -770,7 +776,7 @@ static ssize_t leap_seconds_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x8000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.leaps); mutex_unlock(&stp_mutex); return ret; } @@ -796,11 +802,11 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev, return ret; if (!stzi.lsoib.p) - return sprintf(buf, "0,0\n"); + return sysfs_emit(buf, "0,0\n"); - return sprintf(buf, "%lu,%d\n", - tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, - stzi.lsoib.nlso - stzi.lsoib.also); + return sysfs_emit(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); } static DEVICE_ATTR_RO(leap_seconds_scheduled); @@ -813,7 +819,7 @@ static ssize_t stratum_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.stratum); mutex_unlock(&stp_mutex); return ret; } @@ -828,7 +834,7 @@ static ssize_t time_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x0800)) - ret = sprintf(buf, "%i\n", (int) stp_info.tto); + ret = sysfs_emit(buf, "%i\n", (int)stp_info.tto); mutex_unlock(&stp_mutex); return ret; } @@ -843,7 +849,7 @@ static ssize_t time_zone_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x4000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.tzo); mutex_unlock(&stp_mutex); return ret; } @@ -858,7 +864,7 @@ static ssize_t timing_mode_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tmd); + ret = sysfs_emit(buf, "%i\n", stp_info.tmd); mutex_unlock(&stp_mutex); return ret; } @@ -873,7 +879,7 @@ static ssize_t timing_state_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tst); + ret = sysfs_emit(buf, "%i\n", stp_info.tst); mutex_unlock(&stp_mutex); return ret; } @@ -884,7 +890,7 @@ static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%i\n", stp_online); + return sysfs_emit(buf, "%i\n", stp_online); } static ssize_t online_store(struct device *dev, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index c6eecd4a5302..3df048e190b1 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -6,6 +6,7 @@ #define KMSG_COMPONENT "cpu" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/uaccess.h> @@ -24,7 +25,9 @@ #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/node.h> +#include <asm/hiperdispatch.h> #include <asm/sysinfo.h> +#include <asm/asm.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -47,6 +50,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; +static int cpu_management; static DECLARE_WORK(topology_work, topology_work_fn); @@ -95,7 +99,7 @@ out: static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) { static cpumask_t mask; - int i; + unsigned int max_cpu; cpumask_clear(&mask); if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) @@ -104,9 +108,10 @@ static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) if (topology_mode != TOPOLOGY_MODE_HW) goto out; cpu -= cpu % (smp_cpu_mtid + 1); - for (i = 0; i <= smp_cpu_mtid; i++) { - if (cpumask_test_cpu(cpu + i, &cpu_setup_mask)) - cpumask_set_cpu(cpu + i, &mask); + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + if (cpumask_test_cpu(cpu, &cpu_setup_mask)) + cpumask_set_cpu(cpu, &mask); } out: cpumask_copy(dst, &mask); @@ -123,25 +128,27 @@ static void add_cpus_to_mask(struct topology_core *tl_core, unsigned int core; for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { - unsigned int rcore; - int lcpu, i; + unsigned int max_cpu, rcore; + int cpu; rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; - lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); - if (lcpu < 0) + cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (cpu < 0) continue; - for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &cpu_topology[lcpu + i]; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + topo = &cpu_topology[cpu]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; topo->core_id = rcore; - topo->thread_id = lcpu + i; + topo->thread_id = cpu; topo->dedicated = tl_core->d; - cpumask_set_cpu(lcpu + i, &drawer->mask); - cpumask_set_cpu(lcpu + i, &book->mask); - cpumask_set_cpu(lcpu + i, &socket->mask); - smp_cpu_set_polarization(lcpu + i, tl_core->pp); + cpumask_set_cpu(cpu, &drawer->mask); + cpumask_set_cpu(cpu, &book->mask); + cpumask_set_cpu(cpu, &socket->mask); + smp_cpu_set_polarization(cpu, tl_core->pp); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); } } } @@ -219,22 +226,22 @@ static void topology_update_polarization_simple(void) static int ptf(unsigned long fc) { - int rc; + int cc; asm volatile( - " .insn rre,0xb9a20000,%1,%1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc) - : "d" (fc) : "cc"); - return rc; + " .insn rre,0xb9a20000,%[fc],%[fc]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [fc] "d" (fc) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } int topology_set_cpu_management(int fc) { int cpu, rc; - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return -EOPNOTSUPP; if (fc) rc = ptf(PTF_VERTICAL); @@ -268,6 +275,7 @@ void update_cpu_masks(void) topo->drawer_id = id; } } + hd_reset_state(); for_each_online_cpu(cpu) { topo = &cpu_topology[cpu]; pkg_first = cpumask_first(&topo->core_mask); @@ -276,8 +284,10 @@ void update_cpu_masks(void) for_each_cpu(sibling, &topo->core_mask) { topo_sibling = &cpu_topology[sibling]; smt_first = cpumask_first(&topo_sibling->thread_mask); - if (sibling == smt_first) + if (sibling == smt_first) { topo_package->booted_cores++; + hd_add_core(sibling); + } } } else { topo->booted_cores = topo_package->booted_cores; @@ -301,33 +311,33 @@ static void __arch_update_dedicated_flag(void *arg) static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - int rc = 0; + int rc, hd_status; + hd_status = 0; + rc = 0; mutex_lock(&smp_cpu_state_mutex); - if (MACHINE_HAS_TOPOLOGY) { + if (cpu_has_topology()) { rc = 1; store_topology(info); tl_to_masks(info); } update_cpu_masks(); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) topology_update_polarization_simple(); + if (cpu_management == 1) + hd_status = hd_enable_hiperdispatch(); mutex_unlock(&smp_cpu_state_mutex); + if (hd_status == 0) + hd_disable_hiperdispatch(); return rc; } int arch_update_cpu_topology(void) { - struct device *dev; - int cpu, rc; + int rc; rc = __arch_update_cpu_topology(); on_each_cpu(__arch_update_dedicated_flag, NULL, 0); - for_each_online_cpu(cpu) { - dev = get_cpu_device(cpu); - if (dev) - kobject_uevent(&dev->kobj, KOBJ_CHANGE); - } return rc; } @@ -362,12 +372,12 @@ static void set_topology_timer(void) if (atomic_add_unless(&topology_poll, -1, 0)) mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); + mod_timer(&topology_timer, jiffies + secs_to_jiffies(60)); } void topology_expect_change(void) { - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; /* This is racy, but it doesn't matter since it is just a heuristic. * Worst case is that we poll in a higher frequency for a bit longer. @@ -378,7 +388,24 @@ void topology_expect_change(void) set_topology_timer(); } -static int cpu_management; +static int set_polarization(int polarization) +{ + int rc = 0; + + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == polarization) + goto out; + rc = topology_set_cpu_management(polarization); + if (rc) + goto out; + cpu_management = polarization; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc; +} static ssize_t dispatching_show(struct device *dev, struct device_attribute *attr, @@ -387,7 +414,7 @@ static ssize_t dispatching_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", cpu_management); + count = sysfs_emit(buf, "%d\n", cpu_management); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -404,19 +431,7 @@ static ssize_t dispatching_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - rc = 0; - cpus_read_lock(); - mutex_lock(&smp_cpu_state_mutex); - if (cpu_management == val) - goto out; - rc = topology_set_cpu_management(val); - if (rc) - goto out; - cpu_management = val; - topology_expect_change(); -out: - mutex_unlock(&smp_cpu_state_mutex); - cpus_read_unlock(); + rc = set_polarization(val); return rc ? rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -430,19 +445,19 @@ static ssize_t cpu_polarization_show(struct device *dev, mutex_lock(&smp_cpu_state_mutex); switch (smp_cpu_get_polarization(cpu)) { case POLARIZATION_HRZ: - count = sprintf(buf, "horizontal\n"); + count = sysfs_emit(buf, "horizontal\n"); break; case POLARIZATION_VL: - count = sprintf(buf, "vertical:low\n"); + count = sysfs_emit(buf, "vertical:low\n"); break; case POLARIZATION_VM: - count = sprintf(buf, "vertical:medium\n"); + count = sysfs_emit(buf, "vertical:medium\n"); break; case POLARIZATION_VH: - count = sprintf(buf, "vertical:high\n"); + count = sysfs_emit(buf, "vertical:high\n"); break; default: - count = sprintf(buf, "unknown\n"); + count = sysfs_emit(buf, "unknown\n"); break; } mutex_unlock(&smp_cpu_state_mutex); @@ -466,7 +481,7 @@ static ssize_t cpu_dedicated_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + count = sysfs_emit(buf, "%d\n", topology_cpu_dedicated(cpu)); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -486,7 +501,7 @@ int topology_cpu_init(struct cpu *cpu) int rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); - if (rc || !MACHINE_HAS_TOPOLOGY) + if (rc || !cpu_has_topology()) return rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); if (rc) @@ -520,7 +535,7 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -534,33 +549,38 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } +static int __init detect_polarization(union topology_entry *tle) +{ + struct topology_core *tl_core; + + while (tle->nl) + tle = next_tle(tle); + tl_core = (struct topology_core *)tle; + return tl_core->pp != POLARIZATION_HRZ; +} + void __init topology_init_early(void) { struct sysinfo_15_1_x *info; set_sched_topology(s390_topology); if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) topology_mode = TOPOLOGY_MODE_HW; else topology_mode = TOPOLOGY_MODE_SINGLE; } - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); + cpu_management = detect_polarization(info->tle); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", info->mag[0], info->mag[1], info->mag[2], info->mag[3], info->mag[4], info->mag[5], info->mnest); @@ -577,7 +597,7 @@ static inline int topology_get_mode(int enabled) { if (!enabled) return TOPOLOGY_MODE_SINGLE; - return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; + return cpu_has_topology() ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; } static inline int topology_is_enabled(void) @@ -598,7 +618,7 @@ static int __init topology_setup(char *str) } early_param("topology", topology_setup); -static int topology_ctl_handler(struct ctl_table *ctl, int write, +static int topology_ctl_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); @@ -628,33 +648,58 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write, return rc; } -static struct ctl_table topology_ctl_table[] = { +static int polarization_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int polarization; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &polarization, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + polarization = cpu_management; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + return set_polarization(polarization); +} + +static const struct ctl_table topology_ctl_table[] = { { .procname = "topology", .mode = 0644, .proc_handler = topology_ctl_handler, }, - { }, -}; - -static struct ctl_table topology_dir_table[] = { { - .procname = "s390", - .maxlen = 0, - .mode = 0555, - .child = topology_ctl_table, + .procname = "polarization", + .mode = 0644, + .proc_handler = polarization_ctl_handler, }, - { }, }; static int __init topology_init(void) { + struct device *dev_root; + int rc = 0; + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) set_topology_timer(); else topology_update_polarization_simple(); - register_sysctl_table(topology_dir_table); - return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY_VERTICAL)) + set_polarization(1); + register_sysctl("s390", topology_ctl_table); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_dispatching); + put_device(dev_root); + } + return rc; } device_initcall(topology_init); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 1d2aa448d103..19687dab32f7 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -3,18 +3,13 @@ * S390 version * Copyright IBM Corp. 1999, 2000 * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), * * Derived from "arch/i386/kernel/traps.c" * Copyright (C) 1991, 1992 Linus Torvalds */ -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. - */ -#include "asm/irqflags.h" -#include "asm/ptrace.h" +#include <linux/cpufeature.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/randomize_kstack.h> @@ -27,9 +22,13 @@ #include <linux/uaccess.h> #include <linux/cpu.h> #include <linux/entry-common.h> +#include <linux/kmsan.h> #include <asm/asm-extable.h> -#include <asm/fpu/api.h> +#include <asm/irqflags.h> +#include <asm/ptrace.h> #include <asm/vtime.h> +#include <asm/fpu.h> +#include <asm/fault.h> #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -40,29 +39,30 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) address = current->thread.trap_tdb.data[3]; else address = regs->psw.addr; - return (void __user *) (address - (regs->int_code >> 16)); + return (void __user *)(address - (regs->int_code >> 16)); } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return 1; } +#endif void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { if (user_mode(regs)) { force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); - } else { + } else { if (!fixup_exception(regs)) die(regs, str); - } + } } static void do_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { - if (notify_die(DIE_TRAP, str, regs, 0, - regs->int_code, si_signo) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, str, regs, 0, regs->int_code, si_signo) == NOTIFY_STOP) return; do_report_trap(regs, si_signo, si_code, str); } @@ -74,8 +74,7 @@ void do_per_trap(struct pt_regs *regs) return; if (!current->ptrace) return; - force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __force __user *) current->thread.per_event.address); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __force __user *)current->thread.per_event.address); } NOKPROBE_SYMBOL(do_per_trap); @@ -94,36 +93,25 @@ static void name(struct pt_regs *regs) \ do_trap(regs, signr, sicode, str); \ } -DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, - "addressing exception") -DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, - "execute exception") -DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, - "fixpoint divide exception") -DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, - "fixpoint overflow exception") -DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, - "HFP overflow exception") -DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, - "HFP underflow exception") -DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, - "HFP significance exception") -DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, - "HFP divide exception") -DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, - "HFP square root exception") -DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, - "operand exception") -DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, - "privileged operation") -DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, - "special operation exception") -DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, - "transaction constraint exception") +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, "addressing exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, "fixpoint divide exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, "execute exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, "HFP divide exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, "HFP overflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, "HFP significance exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, "HFP square root exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, "HFP underflow exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, "operand exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, "fixpoint overflow exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, "special operation exception") +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, "transaction constraint exception") static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) { int si_code = 0; + /* FPC[2] is Data Exception Code */ if ((fpc & 0x00000300) == 0) { /* bits 6 and 7 of DXC are 0 iff IEEE exception */ @@ -149,36 +137,35 @@ static void translation_specification_exception(struct pt_regs *regs) static void illegal_op(struct pt_regs *regs) { - __u8 opcode[6]; - __u16 __user *location; int is_uprobe_insn = 0; + u16 __user *location; int signal = 0; + u16 opcode; location = get_trap_ip(regs); - if (user_mode(regs)) { - if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) + if (get_user(opcode, location)) return; - if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { + if (opcode == S390_BREAKPOINT_U16) { if (current->ptrace) force_sig_fault(SIGTRAP, TRAP_BRKPT, location); else signal = SIGILL; #ifdef CONFIG_UPROBES - } else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) { + } else if (opcode == UPROBE_SWBP_INSN) { is_uprobe_insn = 1; #endif - } else + } else { signal = SIGILL; + } } /* - * We got either an illegal op in kernel mode, or user space trapped + * This is either an illegal op in kernel mode, or user space trapped * on a uprobes illegal instruction. See if kprobes or uprobes picks * it up. If not, SIGILL. */ if (is_uprobe_insn || !user_mode(regs)) { - if (notify_die(DIE_BPT, "bpt", regs, 0, - 3, SIGTRAP) != NOTIFY_STOP) + if (notify_die(DIE_BPT, "bpt", regs, 0, 3, SIGTRAP) != NOTIFY_STOP) signal = SIGILL; } if (signal) @@ -186,21 +173,13 @@ static void illegal_op(struct pt_regs *regs) } NOKPROBE_SYMBOL(illegal_op); -DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, - "specification exception"); - static void vector_exception(struct pt_regs *regs) { int si_code, vic; - if (!MACHINE_HAS_VX) { - do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); - return; - } - /* get vector interrupt code from fpc */ - save_fpu_regs(); - vic = (current->thread.fpu.fpc & 0xf00) >> 8; + save_user_fpu_regs(); + vic = (current->thread.ufpu.fpc & 0xf00) >> 8; switch (vic) { case 1: /* invalid vector operation */ si_code = FPE_FLTINV; @@ -225,9 +204,9 @@ static void vector_exception(struct pt_regs *regs) static void data_exception(struct pt_regs *regs) { - save_fpu_regs(); - if (current->thread.fpu.fpc & FPC_DXC_MASK) - do_fp_trap(regs, current->thread.fpu.fpc); + save_user_fpu_regs(); + if (current->thread.ufpu.fpc & FPC_DXC_MASK) + do_fp_trap(regs, current->thread.ufpu.fpc); else do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } @@ -245,7 +224,6 @@ static void monitor_event_exception(struct pt_regs *regs) { if (user_mode(regs)) return; - switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { case BUG_TRAP_TYPE_NONE: fixup_exception(regs); @@ -258,15 +236,20 @@ static void monitor_event_exception(struct pt_regs *regs) } } -void kernel_stack_overflow(struct pt_regs *regs) +void kernel_stack_invalid(struct pt_regs *regs) { + /* + * Normally regs are unpoisoned by the generic entry code, but + * kernel_stack_overflow() is a rare case that is called bypassing it. + */ + kmsan_unpoison_entry_regs(regs); bust_spinlocks(1); - printk("Kernel stack overflow.\n"); + pr_emerg("Kernel stack pointer invalid\n"); show_regs(regs); bust_spinlocks(0); - panic("Corrupt kernel stack, can't continue."); + panic("Invalid kernel stack pointer, cannot continue"); } -NOKPROBE_SYMBOL(kernel_stack_overflow); +NOKPROBE_SYMBOL(kernel_stack_invalid); static void __init test_monitor_call(void) { @@ -274,18 +257,30 @@ static void __init test_monitor_call(void) if (!IS_ENABLED(CONFIG_BUG)) return; - asm volatile( + asm_inline volatile( " mc 0,0\n" - "0: xgr %0,%0\n" + "0: lhi %[val],0\n" "1:\n" - EX_TABLE(0b,1b) - : "+d" (val)); + EX_TABLE(0b, 1b) + : [val] "+d" (val)); if (!val) panic("Monitor call doesn't work!\n"); } void __init trap_init(void) { + struct lowcore *lc = get_lowcore(); + unsigned long flags; + struct ctlreg cr0; + + local_irq_save(flags); + cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + psw_bits(lc->external_new_psw).mcheck = 1; + psw_bits(lc->program_new_psw).mcheck = 1; + psw_bits(lc->svc_new_psw).mcheck = 1; + psw_bits(lc->io_new_psw).mcheck = 1; + local_ctl_load(0, &cr0); + local_irq_restore(flags); local_mcck_enable(); test_monitor_call(); } @@ -294,36 +289,47 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned int trapnr; + struct lowcore *lc = get_lowcore(); irqentry_state_t state; + unsigned int trapnr; + union teid teid; - regs->int_code = S390_lowcore.pgm_int_code; - regs->int_parm_long = S390_lowcore.trans_exc_code; - + teid.val = lc->trans_exc_code; + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = teid.val; + /* + * In case of a guest fault, short-circuit the fault handler and return. + * This way the sie64a() function will return 0; fault address and + * other relevant bits are saved in current->thread.gmap_teid, and + * the fault number in current->thread.gmap_int_code. KVM will be + * able to use this information to handle the fault. + */ + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) { + current->thread.gmap_teid.val = regs->int_parm_long; + current->thread.gmap_int_code = regs->int_code & 0xffff; + return; + } state = irqentry_enter(regs); - if (user_mode(regs)) { update_timer_sys(); - if (!static_branch_likely(&cpu_has_bear)) { + if (!cpu_has_bear()) { if (regs->last_break < 4096) regs->last_break = 1; } current->thread.last_break = regs->last_break; } - - if (S390_lowcore.pgm_code & 0x0200) { + if (lc->pgm_code & 0x0200) { /* transaction abort */ - current->thread.trap_tdb = S390_lowcore.pgm_tdb; + current->thread.trap_tdb = lc->pgm_tdb; } - - if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (lc->pgm_code & PGM_INT_CODE_PER) { if (user_mode(regs)) { struct per_event *ev = ¤t->thread.per_event; set_thread_flag(TIF_PER_TRAP); - ev->address = S390_lowcore.per_address; - ev->cause = S390_lowcore.per_code_combined; - ev->paid = S390_lowcore.per_access_id; + ev->address = lc->per_address; + ev->cause = lc->per_code_combined; + ev->paid = lc->per_access_id; } else { /* PER event in kernel is kprobes */ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); @@ -331,11 +337,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) goto out; } } - if (!irqs_disabled_flags(regs->psw.mask)) trace_hardirqs_on(); __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); - trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); @@ -387,8 +391,8 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { [0x3b] = do_dat_exception, [0x3c] = default_trap_handler, [0x3d] = do_secure_storage_access, - [0x3e] = do_non_secure_storage_access, - [0x3f] = do_secure_storage_violation, + [0x3e] = default_trap_handler, + [0x3f] = default_trap_handler, [0x40] = monitor_event_exception, [0x41 ... 0x7f] = default_trap_handler, }; @@ -399,5 +403,3 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { __stringify(default_trap_handler)) COND_TRAP(do_secure_storage_access); -COND_TRAP(do_non_secure_storage_access); -COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 0ece156fdd7c..cd44be2b6ce8 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -49,6 +49,8 @@ static inline bool is_final_pt_regs(struct unwind_state *state, READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; @@ -118,6 +120,8 @@ out_stop: } EXPORT_SYMBOL_GPL(unwind_next_frame); +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame) { diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index b88345ef8bd9..5b0633ea8d93 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -12,7 +12,6 @@ #include <linux/kdebug.h> #include <linux/sched/task_stack.h> -#include <asm/switch_to.h> #include <asm/facility.h> #include <asm/kprobes.h> #include <asm/dis.h> diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index a5425075dd25..b99478e84da4 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -2,7 +2,7 @@ /* * Common Ultravisor functions and initialization * - * Copyright IBM Corp. 2019, 2020 + * Copyright IBM Corp. 2019, 2024 */ #define KMSG_COMPONENT "prot_virt" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,21 +14,29 @@ #include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/pagewalk.h> +#include <linux/backing-dev.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); -#endif +EXPORT_SYMBOL(prot_virt_guest); +/* + * uv_info contains both host and guest information but it's currently only + * expected to be used within modules if it's the KVM module or for + * any PV guest module. + * + * The kernel itself will write these values once in uv_query_info() + * and then make some of them readable via a sysfs interface. + */ struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); -#if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); -EXPORT_SYMBOL(uv_info); static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len) { @@ -80,7 +88,7 @@ fail: * Requests the Ultravisor to pin the page in the shared state. This will * cause an intercept when the guest attempts to unshare the pinned page. */ -static int uv_pin_shared(unsigned long paddr) +int uv_pin_shared(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_PIN_PAGE_SHARED, @@ -92,6 +100,7 @@ static int uv_pin_shared(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_pin_shared); /* * Requests the Ultravisor to destroy a guest page and make it @@ -100,7 +109,7 @@ static int uv_pin_shared(unsigned long paddr) * * @paddr: Absolute host address of page to be destroyed */ -static int uv_destroy_page(unsigned long paddr) +static int uv_destroy(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_DESTR_SEC_STOR, @@ -121,20 +130,33 @@ static int uv_destroy_page(unsigned long paddr) } /* - * The caller must already hold a reference to the page + * The caller must already hold a reference to the folio */ -int uv_destroy_owned_page(unsigned long paddr) +int uv_destroy_folio(struct folio *folio) { - struct page *page = phys_to_page(paddr); int rc; - get_page(page); - rc = uv_destroy_page(paddr); + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_destroy(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &page->flags); - put_page(page); + clear_bit(PG_arch_1, &folio->flags); + folio_put(folio); return rc; } +EXPORT_SYMBOL(uv_destroy_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. + */ +int uv_destroy_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_destroy_folio(pfn_folio(pte_pfn(pte))); +} /* * Requests the Ultravisor to encrypt a guest page and make it @@ -154,65 +176,119 @@ int uv_convert_from_secure(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure); /* - * The caller must already hold a reference to the page + * The caller must already hold a reference to the folio. */ -int uv_convert_owned_from_secure(unsigned long paddr) +int uv_convert_from_secure_folio(struct folio *folio) { - struct page *page = phys_to_page(paddr); int rc; - get_page(page); - rc = uv_convert_from_secure(paddr); + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &page->flags); - put_page(page); + clear_bit(PG_arch_1, &folio->flags); + folio_put(folio); return rc; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. + */ +int uv_convert_from_secure_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); +} + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} /* - * Calculate the expected ref_count for a page that would otherwise have no + * Calculate the expected ref_count for a folio that would otherwise have no * further pins. This was cribbed from similar functions in other places in * the kernel, but with some slight modifications. We know that a secure - * page can not be a huge page for example. + * folio can not be a large folio, for example. */ -static int expected_page_refs(struct page *page) +static int expected_folio_refs(struct folio *folio) { int res; - res = page_mapcount(page); - if (PageSwapCache(page)) { + res = folio_mapcount(folio); + if (folio_test_swapcache(folio)) { res++; - } else if (page_mapping(page)) { + } else if (folio_mapping(folio)) { res++; - if (page_has_private(page)) + if (folio->private) res++; } return res; } -static int make_secure_pte(pte_t *ptep, unsigned long addr, - struct page *exp_page, struct uv_cb_header *uvcb) +/** + * __make_folio_secure() - make a folio secure + * @folio: the folio to make secure + * @uvcb: the uvcb that describes the UVC to be used + * + * The folio @folio will be made secure if possible, @uvcb will be passed + * as-is to the UVC. + * + * Return: 0 on success; + * -EBUSY if the folio is in writeback or has too many references; + * -EAGAIN if the UVC needs to be attempted again; + * -ENXIO if the address is not mapped; + * -EINVAL if the UVC failed for other reasons. + * + * Context: The caller must hold exactly one extra reference on the folio + * (it's the same logic as split_folio()), and the folio must be + * locked. + */ +static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { - pte_t entry = READ_ONCE(*ptep); - struct page *page; int expected, cc = 0; - if (!pte_present(entry)) - return -ENXIO; - if (pte_val(entry) & _PAGE_INVALID) - return -ENXIO; - - page = pte_page(entry); - if (page != exp_page) - return -ENXIO; - if (PageWriteback(page)) - return -EAGAIN; - expected = expected_page_refs(page); - if (!page_ref_freeze(page, expected)) + if (folio_test_writeback(folio)) return -EBUSY; - set_bit(PG_arch_1, &page->flags); + expected = expected_folio_refs(folio) + 1; + if (!folio_ref_freeze(folio, expected)) + return -EBUSY; + set_bit(PG_arch_1, &folio->flags); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -222,9 +298,9 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, * -EAGAIN and we let the callers deal with it. */ cc = __uv_call(0, (u64)uvcb); - page_ref_unfreeze(page, expected); + folio_ref_unfreeze(folio, expected); /* - * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * Return -ENXIO if the folio was not mapped, -EINVAL for other errors. * If busy or partially completed, return -EAGAIN. */ if (cc == UVC_CC_OK) @@ -234,164 +310,255 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } -/* - * Requests the Ultravisor to make a page accessible to a guest. - * If it's brought in the first time, it will be cleared. If - * it has been exported before, it will be decrypted and integrity - * checked. - */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) { - struct vm_area_struct *vma; - bool local_drain = false; - spinlock_t *ptelock; - unsigned long uaddr; - struct page *page; - pte_t *ptep; int rc; -again: - rc = -EFAULT; - mmap_read_lock(gmap->mm); + if (!folio_trylock(folio)) + return -EAGAIN; + if (should_export_before_import(uvcb, mm)) + uv_convert_from_secure(folio_to_phys(folio)); + rc = __make_folio_secure(folio, uvcb); + folio_unlock(folio); + + return rc; +} + +/** + * s390_wiggle_split_folio() - try to drain extra references to a folio and + * split the folio if it is large. + * @mm: the mm containing the folio to work on + * @folio: the folio + * + * Context: Must be called while holding an extra reference to the folio; + * the mm lock should not be held. + * Return: 0 if the operation was successful; + * -EAGAIN if splitting the large folio was not successful, + * but another attempt can be made; + * -EINVAL in case of other folio splitting errors. See split_folio(). + */ +static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) +{ + int rc, tried_splits; + + lockdep_assert_not_held(&mm->mmap_lock); + folio_wait_writeback(folio); + lru_add_drain_all(); + + if (!folio_test_large(folio)) + return 0; + + for (tried_splits = 0; tried_splits < 2; tried_splits++) { + struct address_space *mapping; + loff_t lstart, lend; + struct inode *inode; + + folio_lock(folio); + rc = split_folio(folio); + if (rc != -EBUSY) { + folio_unlock(folio); + return rc; + } - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. If - * userspace is playing dirty tricky with mapping huge pages later - * on this will result in a segmentation fault. - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = -ENXIO; - page = follow_page(vma, uaddr, FOLL_WRITE); - if (IS_ERR_OR_NULL(page)) - goto out; - - lock_page(page); - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - rc = make_secure_pte(ptep, uaddr, page, uvcb); - pte_unmap_unlock(ptep, ptelock); - unlock_page(page); -out: - mmap_read_unlock(gmap->mm); - - if (rc == -EAGAIN) { - /* - * If we are here because the UVC returned busy or partial - * completion, this is just a useless check, but it is safe. - */ - wait_on_page_writeback(page); - } else if (rc == -EBUSY) { /* - * If we have tried a local drain and the page refcount - * still does not match our expected safe value, try with a - * system wide drain. This is needed if the pagevecs holding - * the page are on a different CPU. + * Splitting with -EBUSY can fail for various reasons, but we + * have to handle one case explicitly for now: some mappings + * don't allow for splitting dirty folios; writeback will + * mark them clean again, including marking all page table + * entries mapping the folio read-only, to catch future write + * attempts. + * + * While the system should be writing back dirty folios in the + * background, we obtained this folio by looking up a writable + * page table entry. On these problematic mappings, writable + * page table entries imply dirty folios, preventing the + * split in the first place. + * + * To prevent a livelock when trigger writeback manually and + * letting the caller look up the folio again in the page + * table (turning it dirty), immediately try to split again. + * + * This is only a problem for some mappings (e.g., XFS); + * mappings that do not support writeback (e.g., shmem) do not + * apply. */ - if (local_drain) { - lru_add_drain_all(); - /* We give up here, and let the caller try again */ - return -EAGAIN; + if (!folio_test_dirty(folio) || folio_test_anon(folio) || + !folio->mapping || !mapping_can_writeback(folio->mapping)) { + folio_unlock(folio); + break; } + /* - * We are here if the page refcount does not match the - * expected safe value. The main culprits are usually - * pagevecs. With lru_add_drain() we drain the pagevecs - * on the local CPU so that hopefully the refcount will - * reach the expected safe value. + * Ideally, we'd only trigger writeback on this exact folio. But + * there is no easy way to do that, so we'll stabilize the + * mapping while we still hold the folio lock, so we can drop + * the folio lock to trigger writeback on the range currently + * covered by the folio instead. */ - lru_add_drain(); - local_drain = true; - /* And now we try again immediately after draining */ - goto again; - } else if (rc == -ENXIO) { - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) - return -EFAULT; - return -EAGAIN; + mapping = folio->mapping; + lstart = folio_pos(folio); + lend = lstart + folio_size(folio) - 1; + inode = igrab(mapping->host); + folio_unlock(folio); + + if (unlikely(!inode)) + break; + + filemap_write_and_wait_range(mapping, lstart, lend); + iput(mapping->host); } - return rc; + return -EAGAIN; } -EXPORT_SYMBOL_GPL(gmap_make_secure); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) { - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; + struct vm_area_struct *vma; + struct folio_walk fw; + struct folio *folio; + int rc; + + mmap_read_lock(mm); + vma = vma_lookup(mm, hva); + if (!vma) { + mmap_read_unlock(mm); + return -EFAULT; + } + folio = folio_walk_start(&fw, vma, hva, 0); + if (!folio) { + mmap_read_unlock(mm); + return -ENXIO; + } + + folio_get(folio); + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. + * If userspace plays dirty tricks and decides to map huge pages at a + * later point in time, it will receive a segmentation fault or + * KVM_RUN will return -EFAULT. + */ + if (folio_test_hugetlb(folio)) + rc = -EFAULT; + else if (folio_test_large(folio)) + rc = -E2BIG; + else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID)) + rc = -ENXIO; + else + rc = make_folio_secure(mm, folio, uvcb); + folio_walk_end(&fw, vma); + mmap_read_unlock(mm); + + if (rc == -E2BIG || rc == -EBUSY) { + rc = s390_wiggle_split_folio(mm, folio); + if (!rc) + rc = -EAGAIN; + } + folio_put(folio); - return gmap_make_secure(gmap, gaddr, &uvcb); + return rc; } -EXPORT_SYMBOL_GPL(gmap_convert_to_secure); +EXPORT_SYMBOL_GPL(make_hva_secure); /* - * To be called with the page locked or with an extra reference! This will - * prevent gmap_make_secure from touching the page concurrently. Having 2 - * parallel make_page_accessible is fine, as the UV calls will become a - * no-op if the page is already exported. + * To be called with the folio locked or with an extra reference! This will + * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. + * Having 2 parallel arch_make_folio_accessible is fine, as the UV calls will + * become a no-op if the folio is already exported. */ -int arch_make_page_accessible(struct page *page) +int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* Hugepage cannot be protected, so nothing to do */ - if (PageHuge(page)) + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) return 0; /* - * PG_arch_1 is used in 3 places: - * 1. for kernel page tables during early boot - * 2. for storage keys of huge pages and KVM - * 3. As an indication that this page might be secure. This can + * PG_arch_1 is used in 2 places: + * 1. for storage keys of hugetlb folios and KVM + * 2. As an indication that this small folio might be secure. This can * overindicate, e.g. we set the bit before calling * convert_to_secure. - * As secure pages are never huge, all 3 variants can co-exists. + * As secure pages are never large folios, both variants can co-exists. */ - if (!test_bit(PG_arch_1, &page->flags)) + if (!test_bit(PG_arch_1, &folio->flags)) return 0; - rc = uv_pin_shared(page_to_phys(page)); + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); return 0; } - rc = uv_convert_from_secure(page_to_phys(page)); + rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); return 0; } return rc; } -EXPORT_SYMBOL_GPL(arch_make_page_accessible); - -#endif +EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) static ssize_t uv_query_facilities(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n", - uv_info.inst_calls_list[0], - uv_info.inst_calls_list[1], - uv_info.inst_calls_list[2], - uv_info.inst_calls_list[3]); + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); } static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + static ssize_t uv_query_feature_indications(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -402,69 +569,208 @@ static struct kobj_attribute uv_query_feature_indications_attr = __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_guest_cpu_id + 1); + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); } static struct kobj_attribute uv_query_max_guest_cpus_attr = __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); static ssize_t uv_query_max_guest_vms(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_num_sec_conf); + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); } static struct kobj_attribute uv_query_max_guest_vms_attr = __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); static ssize_t uv_query_max_guest_addr(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.max_sec_stor_addr); + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); } static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", + uv_info.max_assoc_secrets + uv_info.max_retr_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + +static ssize_t uv_query_max_retr_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_retr_secrets); +} + +static struct kobj_attribute uv_query_max_retr_secrets_attr = + __ATTR(max_retr_secrets, 0444, uv_query_max_retr_secrets, NULL); + +static ssize_t uv_query_max_assoc_secrets(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_assoc_secrets); +} + +static struct kobj_attribute uv_query_max_assoc_secrets_attr = + __ATTR(max_assoc_secrets, 0444, uv_query_max_assoc_secrets, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, + &uv_query_max_assoc_secrets_attr.attr, + &uv_query_max_retr_secrets_attr.attr, NULL, }; +static inline struct uv_cb_query_keys uv_query_keys(void) +{ + struct uv_cb_query_keys uvcb = { + .header.cmd = UVC_CMD_QUERY_KEYS, + .header.len = sizeof(uvcb) + }; + + uv_call(0, (uint64_t)&uvcb); + return uvcb; +} + +static inline ssize_t emit_hash(struct uv_key_hash *hash, char *buf, int at) +{ + return sysfs_emit_at(buf, at, "%016llx%016llx%016llx%016llx\n", + hash->dword[0], hash->dword[1], hash->dword[2], hash->dword[3]); +} + +static ssize_t uv_keys_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_host_key_attr = + __ATTR(host_key, 0444, uv_keys_host_key, NULL); + +static ssize_t uv_keys_backup_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_BACK_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_backup_host_key_attr = + __ATTR(backup_host_key, 0444, uv_keys_backup_host_key, NULL); + +static ssize_t uv_keys_all(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + ssize_t len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(uvcb.key_hashes); i++) + len += emit_hash(uvcb.key_hashes + i, buf, len); + + return len; +} + +static struct kobj_attribute uv_keys_all_attr = + __ATTR(all, 0444, uv_keys_all, NULL); + static struct attribute_group uv_query_attr_group = { .attrs = uv_query_attrs, }; +static struct attribute *uv_keys_attrs[] = { + &uv_keys_host_key_attr.attr, + &uv_keys_backup_host_key_attr.attr, + &uv_keys_all_attr.attr, + NULL, +}; + +static struct attribute_group uv_keys_attr_group = { + .attrs = uv_keys_attrs, +}; + static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - int val = 0; - -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST - val = prot_virt_guest; -#endif - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_guest); } static ssize_t uv_is_prot_virt_host(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - int val = 0; - -#if IS_ENABLED(CONFIG_KVM) - val = prot_virt_host; -#endif - - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_host); } static struct kobj_attribute uv_prot_virt_guest = @@ -480,9 +786,27 @@ static const struct attribute *uv_prot_virt_attrs[] = { }; static struct kset *uv_query_kset; +static struct kset *uv_keys_kset; static struct kobject *uv_kobj; -static int __init uv_info_init(void) +static int __init uv_sysfs_dir_init(const struct attribute_group *grp, + struct kset **uv_dir_kset, const char *name) +{ + struct kset *kset; + int rc; + + kset = kset_create_and_add(name, NULL, uv_kobj); + if (!kset) + return -ENOMEM; + *uv_dir_kset = kset; + + rc = sysfs_create_group(&kset->kobj, grp); + if (rc) + kset_unregister(kset); + return rc; +} + +static int __init uv_sysfs_init(void) { int rc = -ENOMEM; @@ -497,17 +821,16 @@ static int __init uv_info_init(void) if (rc) goto out_kobj; - uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); - if (!uv_query_kset) { - rc = -ENOMEM; + rc = uv_sysfs_dir_init(&uv_query_attr_group, &uv_query_kset, "query"); + if (rc) goto out_ind_files; - } - rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); - if (!rc) - return 0; + /* Get installed key hashes if available, ignore any errors */ + if (test_bit_inv(BIT_UVC_CMD_QUERY_KEYS, uv_info.inst_calls_list)) + uv_sysfs_dir_init(&uv_keys_attr_group, &uv_keys_kset, "keys"); + + return 0; - kset_unregister(uv_query_kset); out_ind_files: sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); out_kobj: @@ -515,5 +838,110 @@ out_kobj: kobject_put(uv_kobj); return rc; } -device_initcall(uv_info_init); -#endif +device_initcall(uv_sysfs_init); + +/* + * Locate a secret in the list by its id. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. + * + * Search for a secret with the given secret_id in the Ultravisor secret store. + * + * Context: might sleep. + */ +static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN], + const struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 i; + + for (i = 0; i < list->total_num_secrets; i++) { + if (memcmp(secret_id, list->secrets[i].id, UV_SECRET_ID_LEN) == 0) { + *secret = list->secrets[i].hdr; + return 0; + } + } + return -ENOENT; +} + +/* + * Do the actual search for `uv_get_secret_metadata`. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. + * + * Context: might sleep. + */ +int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 start_idx = 0; + u16 list_rc; + int ret; + + do { + uv_list_secrets(list, start_idx, &list_rc, NULL); + if (list_rc != UVC_RC_EXECUTED && list_rc != UVC_RC_MORE_DATA) { + if (list_rc == UVC_RC_INV_CMD) + return -ENODEV; + else + return -EIO; + } + ret = find_secret_in_page(secret_id, list, secret); + if (ret == 0) + return ret; + start_idx = list->next_secret_idx; + } while (list_rc == UVC_RC_MORE_DATA && start_idx < list->next_secret_idx); + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(uv_find_secret); + +/** + * uv_retrieve_secret() - get the secret value for the secret index. + * @secret_idx: Secret index for which the secret should be retrieved. + * @buf: Buffer to store retrieved secret. + * @buf_size: Size of the buffer. The correct buffer size is reported as part of + * the result from `uv_get_secret_metadata`. + * + * Calls the Retrieve Secret UVC and translates the UV return code into an errno. + * + * Context: might sleep. + * + * Return: + * * %0 - Entry found; buffer contains a valid secret. + * * %ENOENT: - No entry found or secret at the index is non-retrievable. + * * %ENODEV: - Not supported: UV not available or command not available. + * * %EINVAL: - Buffer too small for content. + * * %EIO: - Other unexpected UV error. + */ +int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size) +{ + struct uv_cb_retr_secr uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_RETR_SECRET, + .secret_idx = secret_idx, + .buf_addr = (u64)buf, + .buf_size = buf_size, + }; + + uv_call_sched(0, (u64)&uvcb); + + switch (uvcb.header.rc) { + case UVC_RC_EXECUTED: + return 0; + case UVC_RC_INV_CMD: + return -ENODEV; + case UVC_RC_RETR_SECR_STORE_EMPTY: + case UVC_RC_RETR_SECR_INV_SECRET: + case UVC_RC_RETR_SECR_INV_IDX: + return -ENOENT; + case UVC_RC_RETR_SECR_BUF_SMALL: + return -EINVAL; + default: + return -EIO; + } +} +EXPORT_SYMBOL_GPL(uv_retrieve_secret); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 5075cde77b29..430feb1a5013 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -12,126 +12,20 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/time_namespace.h> #include <linux/random.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> +#include <asm/vdso/vsyscall.h> +#include <asm/alternative.h> #include <asm/vdso.h> extern char vdso64_start[], vdso64_end[]; extern char vdso32_start[], vdso32_end[]; -static struct vm_special_mapping vvar_mapping; - -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; - -struct vdso_data *vdso_data = vdso_data_store.data; - -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - WARN(1, "vvar_page accessed remotely"); - return NULL; -} - -/* - * The VVAR page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will be re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long size = vma->vm_end - vma->vm_start; - - if (!vma_is_special_mapping(vma, &vvar_mapping)) - continue; - zap_page_range(vma, vma->vm_start, size); - break; - } - mmap_read_unlock(mm); - return 0; -} -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - if (timens_page) { - /* - * Fault in VVAR page too, since it will be accessed - * to get clock data anyway. - */ - addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - pfn = page_to_pfn(timens_page); - } - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - return vmf_insert_pfn(vma, vmf->address, pfn); -} - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { @@ -139,11 +33,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping vdso64_mapping = { .name = "[vdso]", .mremap = vdso_mremap, @@ -169,7 +58,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) struct vm_area_struct *vma; int rc; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -184,17 +73,14 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + vma = vdso_install_vvar_mapping(mm, vvar_start); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|VM_SEALED_SYSMAP| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_mapping); if (IS_ERR(vma)) { @@ -226,7 +112,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) end -= len; if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; @@ -234,17 +120,22 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) return addr; } -unsigned long vdso_size(void) +unsigned long vdso_text_size(void) { - unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; + unsigned long size; if (is_compat_task()) - size += vdso32_end - vdso32_start; + size = vdso32_end - vdso32_start; else - size += vdso64_end - vdso64_start; + size = vdso64_end - vdso64_start; return PAGE_ALIGN(size); } +unsigned long vdso_size(void) +{ + return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE; +} + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { unsigned long addr = VDSO_BASE; @@ -269,8 +160,25 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) return pagelist; } +static void vdso_apply_alternatives(void) +{ + const struct elf64_shdr *alt, *shdr; + struct alt_instr *start, *end; + const struct elf64_hdr *hdr; + + hdr = (struct elf64_hdr *)vdso64_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + start = (void *)hdr + alt->sh_offset; + end = (void *)hdr + alt->sh_offset + alt->sh_size; + apply_alternatives(start, end); +} + static int __init vdso_init(void) { + vdso_apply_alternatives(); vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); if (IS_ENABLED(CONFIG_COMPAT)) vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index 245bddfe9bc0..1e4ddd1a683f 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -1,11 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # List of files in the vdso -KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT - -include $(srctree)/lib/vdso/Makefile +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include obj-vdso32 = vdso_user_wrapper-32.o note-32.o # Build rules @@ -20,9 +17,12 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) KBUILD_AFLAGS_32 += -m31 -s KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin +KBUILD_CFLAGS_32 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin -fasynchronous-unwind-tables -LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \ +LDFLAGS_vdso32.so.dbg += -shared -soname=linux-vdso32.so.1 \ --hash-style=both --build-id=sha1 -melf_s390 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) @@ -32,17 +32,14 @@ obj-y += vdso32_wrapper.o targets += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - # Force dependency (incbin is bad) $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so -$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE - $(call if_changed,ld) +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +$(obj)/vdso32.so.dbg: $(obj)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,vdso_and_check) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -58,18 +55,8 @@ quiet_cmd_vdso32as = VDSO32A $@ quiet_cmd_vdso32cc = VDSO32C $@ cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso32.so: $(obj)/vdso32.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso32.so - # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index edf5ff1debe1..9630d58c2080 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -6,18 +6,16 @@ #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") OUTPUT_ARCH(s390:31-bit) -ENTRY(_start) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S index 3f42f27f978c..2e645003fdaf 100644 --- a/arch/s390/kernel/vdso32/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> #include <asm/unistd.h> #include <asm/dwarf.h> .macro vdso_syscall func,syscall .globl __kernel_compat_\func .type __kernel_compat_\func,@function - .align 8 + __ALIGN __kernel_compat_\func: CFI_STARTPROC svc \syscall diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 9e2b95a222a9..d8f0df742809 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,17 +1,19 @@ # SPDX-License-Identifier: GPL-2.0 # List of files in the vdso -KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT - -include $(srctree)/lib/vdso/Makefile -obj-vdso64 = vdso_user_wrapper.o note.o -obj-cvdso64 = vdso64_generic.o getcpu.o -VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include +obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + # Build rules targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg @@ -22,11 +24,15 @@ KBUILD_AFLAGS += -DBUILD_VDSO KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 -s +KBUILD_AFLAGS_64 += -m64 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin -ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ +KBUILD_CFLAGS_64 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables +ldflags-y := -shared -soname=linux-vdso64.so.1 \ --hash-style=both --build-id=sha1 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) @@ -36,18 +42,15 @@ obj-y += vdso64_wrapper.o targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - # Force dependency (incbin is bad) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE - $(call if_changed,ld) +$(obj)/vdso64.so.dbg: $(obj)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE + $(call if_changed,vdso_and_check) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -67,18 +70,8 @@ quiet_cmd_vdso64as = VDSO64A $@ quiet_cmd_vdso64cc = VDSO64C $@ cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso64.so: $(obj)/vdso64.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso64.so - # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h index 34c7a2312f9d..9e5397e7b590 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso64/vdso.h @@ -10,5 +10,6 @@ int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unuse int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); #endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 4461ea151e49..e4f6551ae898 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -4,20 +4,19 @@ * library */ +#include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) -ENTRY(_start) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -43,6 +42,10 @@ SECTIONS .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } + . = ALIGN(8); + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + .dynamic : { *(.dynamic) } :text :dynamic .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr @@ -141,6 +144,7 @@ VERSION __kernel_restart_syscall; __kernel_rt_sigreturn; __kernel_sigreturn; + __kernel_getrandom; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index 97f0c0a669a5..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -1,12 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> #include <asm/vdso.h> #include <asm/unistd.h> #include <asm/asm-offsets.h> #include <asm/dwarf.h> #include <asm/ptrace.h> -#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) - /* * Older glibc version called vdso without allocating a stackframe. This wrapper * is just used to allocate a stackframe. See @@ -14,23 +13,23 @@ * for details. */ .macro vdso_func func - .globl __kernel_\func - .type __kernel_\func,@function - .align 8 -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC - aghi %r15,-WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - stg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,-STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD) + CFI_VAL_OFFSET 15,-STACK_FRAME_USER_OVERHEAD + stg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_REL_OFFSET 14,__SFVDSO_RETURN_ADDRESS + xc __SFUSER_BACKCHAIN(8,%r15),__SFUSER_BACKCHAIN(%r15) brasl %r14,__s390_vdso_\func - lg %r14,STACK_FRAME_OVERHEAD(%r15) - aghi %r15,WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + lg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_RESTORE 14 + aghi %r15,STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET STACK_FRAME_USER_OVERHEAD CFI_RESTORE 15 br %r14 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_func gettimeofday @@ -39,16 +38,13 @@ vdso_func clock_gettime vdso_func getcpu .macro vdso_syscall func,syscall - .globl __kernel_\func - .type __kernel_\func,@function - .align 8 -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC svc \syscall /* Make sure we notice when a syscall returns, which shouldn't happen */ .word 0 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_syscall restart_syscall,__NR_restart_syscall diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S new file mode 100644 index 000000000000..09c034c2f853 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/stringify.h> +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/dwarf.h> +#include <asm/fpu-insn.h> + +#define STATE0 %v0 +#define STATE1 %v1 +#define STATE2 %v2 +#define STATE3 %v3 +#define COPY0 %v4 +#define COPY1 %v5 +#define COPY2 %v6 +#define COPY3 %v7 +#define BEPERM %v19 +#define TMP0 %v20 +#define TMP1 %v21 +#define TMP2 %v22 +#define TMP3 %v23 + + .section .rodata + + .balign 32 +SYM_DATA_START_LOCAL(chacha20_constants) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_END(chacha20_constants) + + .text +/* + * s390 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Does not spill to the stack. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + CFI_STARTPROC + larl %r1,chacha20_constants + + /* COPY0 = "expand 32-byte k" */ + VL COPY0,0,,%r1 + + /* BEPERM = byte selectors for VPERM */ + ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148) + + /* COPY1,COPY2 = key */ + VLM COPY1,COPY2,0,%r3 + + /* COPY3 = counter || zero nonce */ + lg %r3,0(%r4) + VZERO COPY3 + VLVGG COPY3,%r3,0 + + lghi %r1,0 +.Lblock: + VLR STATE0,COPY0 + VLR STATE1,COPY1 + VLR STATE2,COPY2 + VLR STATE3,COPY3 + + lghi %r0,10 +.Ldoubleround: + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ + VSLDB STATE1,STATE1,STATE1,4 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ + VSLDB STATE3,STATE3,STATE3,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ + VSLDB STATE1,STATE1,STATE1,12 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ + VSLDB STATE3,STATE3,STATE3,4 + brctg %r0,.Ldoubleround + + /* OUTPUT0 = STATE0 + COPY0 */ + VAF STATE0,STATE0,COPY0 + /* OUTPUT1 = STATE1 + COPY1 */ + VAF STATE1,STATE1,COPY1 + /* OUTPUT2 = STATE2 + COPY2 */ + VAF STATE2,STATE2,COPY2 + /* OUTPUT3 = STATE3 + COPY3 */ + VAF STATE3,STATE3,COPY3 + + ALTERNATIVE \ + __stringify( \ + /* Convert STATE to little endian and store to OUTPUT */\ + VPERM TMP0,STATE0,STATE0,BEPERM; \ + VPERM TMP1,STATE1,STATE1,BEPERM; \ + VPERM TMP2,STATE2,STATE2,BEPERM; \ + VPERM TMP3,STATE3,STATE3,BEPERM; \ + VSTM TMP0,TMP3,0,%r2), \ + __stringify( \ + /* 32 bit wise little endian store to OUTPUT */ \ + VSTBRF STATE0,0,,%r2; \ + VSTBRF STATE1,16,,%r2; \ + VSTBRF STATE2,32,,%r2; \ + VSTBRF STATE3,48,,%r2; \ + brcl 0,0), \ + ALT_FACILITY(148) + + /* ++COPY3.COUNTER */ + /* alsih %r3,1 */ + .insn rilu,0xcc0a00000000,%r3,1 + alcr %r3,%r1 + VLVGG COPY3,%r3,0 + + /* OUTPUT += 64, --NBLOCKS */ + aghi %r2,64 + brctg %r5,.Lblock + + /* COUNTER = COPY3.COUNTER */ + stg %r3,0(%r4) + + /* Zero out potentially sensitive regs */ + VZERO STATE0 + VZERO STATE1 + VZERO STATE2 + VZERO STATE3 + VZERO COPY1 + VZERO COPY2 + + /* Early exit if TMP0-TMP3 have not been used */ + ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148) + + VZERO TMP0 + VZERO TMP1 + VZERO TMP2 + VZERO TMP3 + + br %r14 + CFI_ENDPROC +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/s390/kernel/vdso64/vgetrandom.c b/arch/s390/kernel/vdso64/vgetrandom.c new file mode 100644 index 000000000000..b5268b507fb5 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/facility.h> +#include <uapi/asm-generic/errno.h> +#include "vdso.h" + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (test_facility(129)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c new file mode 100644 index 000000000000..cc8933e04ff7 --- /dev/null +++ b/arch/s390/kernel/vmcore_info.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/vmcore_info.h> +#include <linux/mm.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> +#include <asm/setup.h> + +void arch_crash_save_vmcoreinfo(void) +{ + struct lowcore *abs_lc; + + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); +} diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 2e526f11b91e..ff1ddba96352 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -14,9 +14,13 @@ #define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \ *(.bss..invalid_pg_dir) +#define RO_EXCEPTION_TABLE_ALIGN 16 + /* Handle ro_after_init data on our own. */ #define RO_AFTER_INIT_DATA +#define RUNTIME_DISCARD_EXIT + #define EMITS_PT_NOTE #include <asm-generic/vmlinux.lds.h> @@ -35,14 +39,13 @@ PHDRS { SECTIONS { - . = 0x100000; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT @@ -65,11 +68,22 @@ SECTIONS *(.data..ro_after_init) JUMP_TABLE_DATA } :data - EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; + .data.rel.ro : { + *(.data.rel.ro .data.rel.ro.*) + } + .got : { + __got_start = .; + *(.got) + __got_end = .; + } + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) + .data.rel : { + *(.data.rel*) + } BOOT_DATA_PRESERVED . = ALIGN(8); @@ -79,6 +93,7 @@ SECTIONS _end_amode31_refs = .; } + . = ALIGN(PAGE_SIZE); _edata = .; /* End of data section */ /* will be freed after init */ @@ -131,6 +146,7 @@ SECTIONS /* * Table with the patch locations to undo expolines */ + . = ALIGN(4); .nospec_call_table : { __nospec_call_start = . ; *(.s390_indirect*) @@ -167,31 +183,23 @@ SECTIONS .amode31.data : { *(.amode31.data) } - . = ALIGN(PAGE_SIZE); + . = _samode31 + AMODE31_SIZE; _eamode31 = .; /* early.c uses stsi, which requires page aligned data. */ . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - PERCPU_SECTION(0x100) + RUNTIME_CONST_VARIABLES - .dynsym ALIGN(8) : { - __dynsym_start = .; - *(.dynsym) - __dynsym_end = .; - } - .rela.dyn ALIGN(8) : { - __rela_dyn_start = .; - *(.rela*) - __rela_dyn_end = .; - } + PERCPU_SECTION(0x100) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + . = ALIGN(PAGE_SIZE); _end = . ; /* @@ -199,7 +207,6 @@ SECTIONS * it should match struct vmlinux_info */ .vmlinux.info 0 (INFO) : { - QUAD(_stext) /* default_lma */ QUAD(startup_continue) /* entry */ QUAD(__bss_start - _stext) /* image_size */ QUAD(__bss_stop - __bss_start) /* bss_size */ @@ -208,10 +215,21 @@ SECTIONS QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ QUAD(__boot_data_preserved_end - __boot_data_preserved_start) /* bootdata_preserved_size */ - QUAD(__dynsym_start) /* dynsym_start */ - QUAD(__rela_dyn_start) /* rela_dyn_start */ - QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(__got_start) /* got_start */ + QUAD(__got_end) /* got_end */ QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) + QUAD(__alt_instructions) + QUAD(__alt_instructions_end) +#ifdef CONFIG_KASAN + QUAD(kasan_early_shadow_page) + QUAD(kasan_early_shadow_pte) + QUAD(kasan_early_shadow_pmd) + QUAD(kasan_early_shadow_pud) + QUAD(kasan_early_shadow_p4d) +#endif } :NONE /* Debugging sections. */ @@ -219,9 +237,32 @@ SECTIONS DWARF_DEBUG ELF_DETAILS + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the three reserved double words. + */ + .got.plt : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") + /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) + *(.interp) } } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 9436f3053b88..234a0ba30510 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -7,13 +7,13 @@ */ #include <linux/kernel_stat.h> -#include <linux/sched/cputime.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/types.h> #include <linux/time.h> #include <asm/alternative.h> +#include <asm/cputime.h> #include <asm/vtimer.h> #include <asm/vtime.h> #include <asm/cpu_mf.h> @@ -33,24 +33,17 @@ static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_jiffies); -static inline u64 get_vtimer(void) -{ - u64 timer; - - asm volatile("stpt %0" : "=Q" (timer)); - return timer; -} - static inline void set_vtimer(u64 expires) { + struct lowcore *lc = get_lowcore(); u64 timer; asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. afterwards */ : "=Q" (timer) : "Q" (expires)); - S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; - S390_lowcore.last_update_timer = expires; + lc->system_timer += lc->last_update_timer - timer; + lc->last_update_timer = expires; } static inline int virt_timer_forward(u64 elapsed) @@ -125,22 +118,23 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime, static int do_account_vtime(struct task_struct *tsk) { u64 timer, clock, user, guest, system, hardirq, softirq; + struct lowcore *lc = get_lowcore(); - timer = S390_lowcore.last_update_timer; - clock = S390_lowcore.last_update_clock; + timer = lc->last_update_timer; + clock = lc->last_update_clock; asm volatile( " stpt %0\n" /* Store current cpu timer value */ " stckf %1" /* Store current tod clock value */ - : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock) + : "=Q" (lc->last_update_timer), + "=Q" (lc->last_update_clock) : : "cc"); - clock = S390_lowcore.last_update_clock - clock; - timer -= S390_lowcore.last_update_timer; + clock = lc->last_update_clock - clock; + timer -= lc->last_update_timer; if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; + lc->hardirq_timer += timer; else - S390_lowcore.system_timer += timer; + lc->system_timer += timer; /* Update MT utilization calculation */ if (smp_cpu_mtid && @@ -149,16 +143,16 @@ static int do_account_vtime(struct task_struct *tsk) /* Calculate cputime delta */ user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(S390_lowcore.user_timer)); + READ_ONCE(lc->user_timer)); guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(S390_lowcore.guest_timer)); + READ_ONCE(lc->guest_timer)); system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(S390_lowcore.system_timer)); + READ_ONCE(lc->system_timer)); hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(S390_lowcore.hardirq_timer)); + READ_ONCE(lc->hardirq_timer)); softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(S390_lowcore.softirq_timer)); - S390_lowcore.steal_timer += + READ_ONCE(lc->softirq_timer)); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ @@ -184,17 +178,19 @@ static int do_account_vtime(struct task_struct *tsk) void vtime_task_switch(struct task_struct *prev) { + struct lowcore *lc = get_lowcore(); + do_account_vtime(prev); - prev->thread.user_timer = S390_lowcore.user_timer; - prev->thread.guest_timer = S390_lowcore.guest_timer; - prev->thread.system_timer = S390_lowcore.system_timer; - prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; - prev->thread.softirq_timer = S390_lowcore.softirq_timer; - S390_lowcore.user_timer = current->thread.user_timer; - S390_lowcore.guest_timer = current->thread.guest_timer; - S390_lowcore.system_timer = current->thread.system_timer; - S390_lowcore.hardirq_timer = current->thread.hardirq_timer; - S390_lowcore.softirq_timer = current->thread.softirq_timer; + prev->thread.user_timer = lc->user_timer; + prev->thread.guest_timer = lc->guest_timer; + prev->thread.system_timer = lc->system_timer; + prev->thread.hardirq_timer = lc->hardirq_timer; + prev->thread.softirq_timer = lc->softirq_timer; + lc->user_timer = current->thread.user_timer; + lc->guest_timer = current->thread.guest_timer; + lc->system_timer = current->thread.system_timer; + lc->hardirq_timer = current->thread.hardirq_timer; + lc->softirq_timer = current->thread.softirq_timer; } /* @@ -204,28 +200,29 @@ void vtime_task_switch(struct task_struct *prev) */ void vtime_flush(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 steal, avg_steal; if (do_account_vtime(tsk)) virt_timer_expire(); - steal = S390_lowcore.steal_timer; - avg_steal = S390_lowcore.avg_steal_timer / 2; + steal = lc->steal_timer; + avg_steal = lc->avg_steal_timer; if ((s64) steal > 0) { - S390_lowcore.steal_timer = 0; + lc->steal_timer = 0; account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } - S390_lowcore.avg_steal_timer = avg_steal; + lc->avg_steal_timer = avg_steal / 2; } static u64 vtime_delta(void) { - u64 timer = S390_lowcore.last_update_timer; - - S390_lowcore.last_update_timer = get_vtimer(); + struct lowcore *lc = get_lowcore(); + u64 timer = lc->last_update_timer; - return timer - S390_lowcore.last_update_timer; + lc->last_update_timer = get_cpu_timer(); + return timer - lc->last_update_timer; } /* @@ -234,12 +231,13 @@ static u64 vtime_delta(void) */ void vtime_account_kernel(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 delta = vtime_delta(); if (tsk->flags & PF_VCPU) - S390_lowcore.guest_timer += delta; + lc->guest_timer += delta; else - S390_lowcore.system_timer += delta; + lc->system_timer += delta; virt_timer_forward(delta); } @@ -249,7 +247,7 @@ void vtime_account_softirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.softirq_timer += delta; + get_lowcore()->softirq_timer += delta; virt_timer_forward(delta); } @@ -258,7 +256,7 @@ void vtime_account_hardirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.hardirq_timer += delta; + get_lowcore()->hardirq_timer += delta; virt_timer_forward(delta); } diff --git a/arch/s390/kernel/wti.c b/arch/s390/kernel/wti.c new file mode 100644 index 000000000000..949fdbf0e8b6 --- /dev/null +++ b/arch/s390/kernel/wti.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for warning track interruption + * + * Copyright IBM Corp. 2023 + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/smpboot.h> +#include <linux/irq.h> +#include <uapi/linux/sched/types.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/sclp.h> + +#define WTI_DBF_LEN 64 + +struct wti_debug { + unsigned long missed; + unsigned long addr; + pid_t pid; +}; + +struct wti_state { + /* debug data for s390dbf */ + struct wti_debug dbg; + /* + * Represents the real-time thread responsible to + * acknowledge the warning-track interrupt and trigger + * preliminary and postliminary precautions. + */ + struct task_struct *thread; + /* + * If pending is true, the real-time thread must be scheduled. + * If not, a wake up of that thread will remain a noop. + */ + bool pending; +}; + +static DEFINE_PER_CPU(struct wti_state, wti_state); + +static debug_info_t *wti_dbg; + +/* + * During a warning-track grace period, interrupts are disabled + * to prevent delays of the warning-track acknowledgment. + * + * Once the CPU is physically dispatched again, interrupts are + * re-enabled. + */ + +static void wti_irq_disable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* disable all I/O interrupts */ + cr6.val &= ~0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void wti_irq_enable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* enable all I/O interrupts */ + cr6.val |= 0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void store_debug_data(struct wti_state *st) +{ + struct pt_regs *regs = get_irq_regs(); + + st->dbg.pid = current->pid; + st->dbg.addr = 0; + if (!user_mode(regs)) + st->dbg.addr = regs->psw.addr; +} + +static void wti_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct wti_state *st = this_cpu_ptr(&wti_state); + + inc_irq_stat(IRQEXT_WTI); + wti_irq_disable(); + store_debug_data(st); + st->pending = true; + wake_up_process(st->thread); +} + +static int wti_pending(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + return st->pending; +} + +static void wti_dbf_grace_period(struct wti_state *st) +{ + struct wti_debug *wdi = &st->dbg; + char buf[WTI_DBF_LEN]; + + if (wdi->addr) + snprintf(buf, sizeof(buf), "%d %pS", wdi->pid, (void *)wdi->addr); + else + snprintf(buf, sizeof(buf), "%d <user>", wdi->pid); + debug_text_event(wti_dbg, 2, buf); + wdi->missed++; +} + +static int wti_show(struct seq_file *seq, void *v) +{ + struct wti_state *st; + int cpu; + + cpus_read_lock(); + seq_puts(seq, " "); + for_each_online_cpu(cpu) + seq_printf(seq, "CPU%-8d", cpu); + seq_putc(seq, '\n'); + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + seq_printf(seq, " %10lu", st->dbg.missed); + } + seq_putc(seq, '\n'); + cpus_read_unlock(); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(wti); + +static void wti_thread_fn(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + st->pending = false; + /* + * Yield CPU voluntarily to the hypervisor. Control + * resumes when hypervisor decides to dispatch CPU + * to this LPAR again. + */ + if (diag49c(DIAG49C_SUBC_ACK)) + wti_dbf_grace_period(st); + wti_irq_enable(); +} + +static struct smp_hotplug_thread wti_threads = { + .store = &wti_state.thread, + .thread_should_run = wti_pending, + .thread_fn = wti_thread_fn, + .thread_comm = "cpuwti/%u", + .selfparking = false, +}; + +static int __init wti_init(void) +{ + struct sched_param wti_sched_param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct dentry *wti_dir; + struct wti_state *st; + int cpu, rc; + + rc = -EOPNOTSUPP; + if (!sclp.has_wti) + goto out; + rc = smpboot_register_percpu_thread(&wti_threads); + if (WARN_ON(rc)) + goto out; + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + sched_setscheduler(st->thread, SCHED_FIFO, &wti_sched_param); + } + rc = register_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); + if (rc) { + pr_warn("Couldn't request external interrupt 0x1007\n"); + goto out_thread; + } + irq_subclass_register(IRQ_SUBCLASS_WARNING_TRACK); + rc = diag49c(DIAG49C_SUBC_REG); + if (rc) { + pr_warn("Failed to register warning track interrupt through DIAG 49C\n"); + rc = -EOPNOTSUPP; + goto out_subclass; + } + wti_dir = debugfs_create_dir("wti", arch_debugfs_dir); + debugfs_create_file("stat", 0400, wti_dir, NULL, &wti_fops); + wti_dbg = debug_register("wti", 1, 1, WTI_DBF_LEN); + if (!wti_dbg) { + rc = -ENOMEM; + goto out_debug_register; + } + rc = debug_register_view(wti_dbg, &debug_hex_ascii_view); + if (rc) + goto out_debug_register; + goto out; +out_debug_register: + debug_unregister(wti_dbg); +out_subclass: + irq_subclass_unregister(IRQ_SUBCLASS_WARNING_TRACK); + unregister_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); +out_thread: + smpboot_unregister_percpu_thread(&wti_threads); +out: + return rc; +} +late_initcall(wti_init); diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 2e84d3922f7c..cae908d64550 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -19,21 +19,17 @@ if VIRTUALIZATION config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select KVM_COMMON select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL - select SRCU select KVM_VFIO - select INTERVAL_TREE + select MMU_NOTIFIER help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 26f4a74e5ce4..9a723c48b05a 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,6 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 807fa9da1e72..53233dec8cad 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -11,12 +11,30 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslot_iter iter; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned long start, end; + + slots = kvm_vcpu_memslots(vcpu); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + slot = iter.slot; + start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn)); + end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages)); + gmap_helper_discard(vcpu->kvm->mm, start, end); + } +} + static int diag_release_pages(struct kvm_vcpu *vcpu) { unsigned long start, end; @@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); + mmap_read_lock(vcpu->kvm->mm); /* * We checked for start >= end above, so lets check for the * fast path (no prefix swap page involved) */ if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { - gmap_discard(vcpu->arch.gmap, start, end); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end)); } else { /* * This is slow path. gmap_discard will check for start @@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) * prefix and let gmap_discard make some of these calls * NOPs. */ - gmap_discard(vcpu->arch.gmap, start, prefix); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix)); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + do_discard_gfn_range(vcpu, 0, 1); if (end > prefix + PAGE_SIZE) - gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); - gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); + do_discard_gfn_range(vcpu, 1, 2); + do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end)); } + mmap_read_unlock(vcpu->kvm->mm); return 0; } @@ -77,7 +97,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) vcpu->stat.instruction_diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); + rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm)); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) @@ -102,7 +122,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.pfault_token = parm.token_addr; @@ -166,6 +186,7 @@ static int diag9c_forwarding_overrun(void) static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; + int tcpu_cpu; int tid; tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; @@ -181,14 +202,15 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) goto no_yield; /* target guest VCPU already running */ - if (READ_ONCE(tcpu->cpu) >= 0) { + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) goto no_yield; /* target host CPU already running */ - if (!vcpu_is_preempted(tcpu->cpu)) + if (!vcpu_is_preempted(tcpu_cpu)) goto no_yield; - smp_yield_cpu(tcpu->cpu); + smp_yield_cpu(tcpu_cpu); VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: yield forwarded", tid); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 227ed0009354..21c2e61fece4 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -11,169 +11,14 @@ #include <linux/err.h> #include <linux/pgtable.h> #include <linux/bitfield.h> - +#include <asm/access-regs.h> +#include <asm/fault.h> #include <asm/gmap.h> +#include <asm/dat-bits.h> #include "kvm-s390.h" #include "gaccess.h" -#include <asm/switch_to.h> - -union asce { - unsigned long val; - struct { - unsigned long origin : 52; /* Region- or Segment-Table Origin */ - unsigned long : 2; - unsigned long g : 1; /* Subspace Group Control */ - unsigned long p : 1; /* Private Space Control */ - unsigned long s : 1; /* Storage-Alteration-Event Control */ - unsigned long x : 1; /* Space-Switch-Event Control */ - unsigned long r : 1; /* Real-Space Control */ - unsigned long : 1; - unsigned long dt : 2; /* Designation-Type Control */ - unsigned long tl : 2; /* Region- or Segment-Table Length */ - }; -}; - -enum { - ASCE_TYPE_SEGMENT = 0, - ASCE_TYPE_REGION3 = 1, - ASCE_TYPE_REGION2 = 2, - ASCE_TYPE_REGION1 = 3 -}; -union region1_table_entry { - unsigned long val; - struct { - unsigned long rto: 52;/* Region-Table Origin */ - unsigned long : 2; - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 1; - unsigned long tf : 2; /* Region-Second-Table Offset */ - unsigned long i : 1; /* Region-Invalid Bit */ - unsigned long : 1; - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long tl : 2; /* Region-Second-Table Length */ - }; -}; - -union region2_table_entry { - unsigned long val; - struct { - unsigned long rto: 52;/* Region-Table Origin */ - unsigned long : 2; - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 1; - unsigned long tf : 2; /* Region-Third-Table Offset */ - unsigned long i : 1; /* Region-Invalid Bit */ - unsigned long : 1; - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long tl : 2; /* Region-Third-Table Length */ - }; -}; - -struct region3_table_entry_fc0 { - unsigned long sto: 52;/* Segment-Table Origin */ - unsigned long : 1; - unsigned long fc : 1; /* Format-Control */ - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 1; - unsigned long tf : 2; /* Segment-Table Offset */ - unsigned long i : 1; /* Region-Invalid Bit */ - unsigned long cr : 1; /* Common-Region Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long tl : 2; /* Segment-Table Length */ -}; - -struct region3_table_entry_fc1 { - unsigned long rfaa : 33; /* Region-Frame Absolute Address */ - unsigned long : 14; - unsigned long av : 1; /* ACCF-Validity Control */ - unsigned long acc: 4; /* Access-Control Bits */ - unsigned long f : 1; /* Fetch-Protection Bit */ - unsigned long fc : 1; /* Format-Control */ - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long iep: 1; /* Instruction-Execution-Protection */ - unsigned long : 2; - unsigned long i : 1; /* Region-Invalid Bit */ - unsigned long cr : 1; /* Common-Region Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long : 2; -}; - -union region3_table_entry { - unsigned long val; - struct region3_table_entry_fc0 fc0; - struct region3_table_entry_fc1 fc1; - struct { - unsigned long : 53; - unsigned long fc : 1; /* Format-Control */ - unsigned long : 4; - unsigned long i : 1; /* Region-Invalid Bit */ - unsigned long cr : 1; /* Common-Region Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long : 2; - }; -}; - -struct segment_entry_fc0 { - unsigned long pto: 53;/* Page-Table Origin */ - unsigned long fc : 1; /* Format-Control */ - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 3; - unsigned long i : 1; /* Segment-Invalid Bit */ - unsigned long cs : 1; /* Common-Segment Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long : 2; -}; - -struct segment_entry_fc1 { - unsigned long sfaa : 44; /* Segment-Frame Absolute Address */ - unsigned long : 3; - unsigned long av : 1; /* ACCF-Validity Control */ - unsigned long acc: 4; /* Access-Control Bits */ - unsigned long f : 1; /* Fetch-Protection Bit */ - unsigned long fc : 1; /* Format-Control */ - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long iep: 1; /* Instruction-Execution-Protection */ - unsigned long : 2; - unsigned long i : 1; /* Segment-Invalid Bit */ - unsigned long cs : 1; /* Common-Segment Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long : 2; -}; - -union segment_table_entry { - unsigned long val; - struct segment_entry_fc0 fc0; - struct segment_entry_fc1 fc1; - struct { - unsigned long : 53; - unsigned long fc : 1; /* Format-Control */ - unsigned long : 4; - unsigned long i : 1; /* Segment-Invalid Bit */ - unsigned long cs : 1; /* Common-Segment Bit */ - unsigned long tt : 2; /* Table-Type Bits */ - unsigned long : 2; - }; -}; - -enum { - TABLE_TYPE_SEGMENT = 0, - TABLE_TYPE_REGION3 = 1, - TABLE_TYPE_REGION2 = 2, - TABLE_TYPE_REGION1 = 3 -}; - -union page_table_entry { - unsigned long val; - struct { - unsigned long pfra : 52; /* Page-Frame Real Address */ - unsigned long z : 1; /* Zero Bit */ - unsigned long i : 1; /* Page-Invalid Bit */ - unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long iep: 1; /* Instruction-Execution-Protection */ - unsigned long : 8; - }; -}; +#define GMAP_SHADOW_FAKE_TABLE 1ULL /* * vaddress union in order to easily decode a virtual address into its @@ -262,119 +107,119 @@ struct aste { /* .. more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; new.kh++; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.kh--; if (!new.kh) new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); + read_unlock(&kvm->arch.sca_lock); if (!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -391,7 +236,8 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, if (ar >= NUM_ACRS) return -EINVAL; - save_access_regs(vcpu->run->s.regs.acrs); + if (vcpu->arch.acrs_loaded) + save_access_regs(vcpu->run->s.regs.acrs); alet.val = vcpu->run->s.regs.acrs[ar]; if (ar == 0 || alet.val == 0) { @@ -466,64 +312,53 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, return 0; } -struct trans_exc_code_bits { - unsigned long addr : 52; /* Translation-exception Address */ - unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 2; - unsigned long b56 : 1; - unsigned long : 3; - unsigned long b60 : 1; - unsigned long b61 : 1; - unsigned long as : 2; /* ASCE Identifier */ -}; - -enum { - FSI_UNKNOWN = 0, /* Unknown wether fetch or store */ - FSI_STORE = 1, /* Exception was due to store operation */ - FSI_FETCH = 2 /* Exception was due to fetch operation */ -}; - enum prot_type { PROT_TYPE_LA = 0, PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, + /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, enum gacc_mode mode, enum prot_type prot, bool terminate) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec; + union teid *teid; memset(pgm, 0, sizeof(*pgm)); pgm->code = code; - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + teid = (union teid *)&pgm->trans_exc_code; switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_DUMMY: + /* We should never get here, acts like termination */ + WARN_ON_ONCE(1); + break; case PROT_TYPE_IEP: - tec->b61 = 1; + teid->b61 = 1; fallthrough; case PROT_TYPE_LA: - tec->b56 = 1; + teid->b56 = 1; break; case PROT_TYPE_KEYC: - tec->b60 = 1; + teid->b60 = 1; break; case PROT_TYPE_ALC: - tec->b60 = 1; + teid->b60 = 1; fallthrough; case PROT_TYPE_DAT: - tec->b61 = 1; + teid->b61 = 1; break; } if (terminate) { - tec->b56 = 0; - tec->b60 = 0; - tec->b61 = 0; + teid->b56 = 0; + teid->b60 = 0; + teid->b61 = 0; } fallthrough; case PGM_ASCE_TYPE: @@ -537,9 +372,9 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, * exc_access_id has to be set to 0 for some instructions. Both * cases have to be handled by the caller. */ - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + teid->addr = gva >> PAGE_SHIFT; + teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH; + teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as; fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: @@ -619,7 +454,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * Returns: - zero on success; @gpa contains the resulting absolute address * - a negative value if guest access failed due to e.g. broken * guest mapping - * - a positve value if an access exception happened. In this case + * - a positive value if an access exception happened. In this case * the returned value is the program interruption code as defined * by the architecture */ @@ -642,7 +477,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; - ptr = asce.origin * PAGE_SIZE; + ptr = asce.rsto * PAGE_SIZE; switch (asce.dt) { case ASCE_TYPE_REGION1: if (vaddr.rfx01 > asce.tl) @@ -675,7 +510,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION1: { union region1_table_entry rfte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rfte.val)) return -EFAULT; @@ -693,7 +528,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION2: { union region2_table_entry rste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rste.val)) return -EFAULT; @@ -711,7 +546,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION3: { union region3_table_entry rtte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rtte.val)) return -EFAULT; @@ -739,7 +574,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &ste.val)) return -EFAULT; @@ -759,7 +594,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &pte.val)) return -EFAULT; @@ -781,7 +616,7 @@ absolute_address: *prot = PROT_TYPE_IEP; return PGM_PROTECTION; } - if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; return 0; @@ -968,8 +803,10 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return rc; } else { gpa = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, gpa)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; + prot = PROT_TYPE_DUMMY; + } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); @@ -993,6 +830,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, const gfn_t gfn = gpa_to_gfn(gpa); int rc; + if (!gfn_to_memslot(kvm, gfn)) + return PGM_ADDRESSING; if (mode == GACC_STORE) rc = kvm_write_guest_page(kvm, gfn, data, offset, len); else @@ -1086,7 +925,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); /* * Since we do the access further down ultimately via a move instruction * that does key checking and returns an error in case of a protection @@ -1112,8 +951,6 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION && try_storage_prot_override) rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], data, fragment_len, PAGE_SPO_ACC); - if (rc == PGM_PROTECTION) - prot = PROT_TYPE_KEYC; if (rc) break; len -= fragment_len; @@ -1123,11 +960,15 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc > 0) { bool terminate = (mode == GACC_STORE) && (idx > 0); + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; + else + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); if (nr_pages > ARRAY_SIZE(gpa_array)) vfree(gpas); return rc; @@ -1148,10 +989,121 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, gra += fragment_len; data += fragment_len; } + if (rc > 0) + vcpu->arch.pgm.code = rc; return rc; } /** + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. + * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. + */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** * guest_translate_address_with_key - translate guest logical into guest absolute address * @vcpu: virtual cpu * @gva: Guest virtual address @@ -1199,10 +1151,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, access_key); - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return rc; } @@ -1263,6 +1215,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + struct kvm *kvm; struct gmap *parent; union asce asce; union vaddress vaddr; @@ -1271,10 +1224,11 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *fake = 0; *dat_protection = 0; + kvm = sg->private; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; - ptr = asce.origin * PAGE_SIZE; + ptr = asce.rsto * PAGE_SIZE; if (asce.r) { *fake = 1; ptr = 0; @@ -1331,6 +1285,7 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r1_entry++; } fallthrough; case ASCE_TYPE_REGION2: { @@ -1359,6 +1314,7 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r2_entry++; } fallthrough; case ASCE_TYPE_REGION3: { @@ -1396,6 +1352,7 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r3_entry++; } fallthrough; case ASCE_TYPE_SEGMENT: { @@ -1429,6 +1386,7 @@ shadow_pgt: rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_sg_entry++; } } /* Return the parent address of the page table */ @@ -1437,6 +1395,44 @@ shadow_pgt: } /** + * shadow_pgt_lookup() - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_lock in read. + */ +static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, + int *dat_protection, int *fake) +{ + unsigned long pt_index; + unsigned long *table; + struct page *page; + int rc; + + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); + *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; +} + +/** * kvm_s390_shadow_fault - handle fault on a shadow page table * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure @@ -1459,15 +1455,18 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, int dat_protection, fake; int rc; + if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) + return -EFAULT; + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. */ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); - rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, &fake); @@ -1499,7 +1498,8 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); + vcpu->kvm->stat.gmap_shadow_pg_entry++; + ipte_unlock(vcpu->kvm); mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 1124ff282012..3fde45a151f2 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + /** * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu @@ -402,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, * @len: number of bytes to copy * * Copy @len bytes from @data (kernel space) to @gra (guest real address). - * It is up to the caller to ensure that the entire guest memory range is - * valid memory before calling this function. * Guest low address and key protection are not checked. * - * Returns zero on success or -EFAULT on error. + * Returns zero on success, -EFAULT when copying from @data failed, or + * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info + * is also stored to allow injecting into the guest (if applicable) using + * kvm_s390_inject_prog_cond(). * * If an error occurs data may have been copied partially to guest memory. */ @@ -425,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, * @len: number of bytes to copy * * Copy @len bytes from @gra (guest real address) to @data (kernel space). - * It is up to the caller to ensure that the entire guest memory range is - * valid memory before calling this function. * Guest key protection is not checked. * - * Returns zero on success or -EFAULT on error. + * Returns zero on success, -EFAULT when copying to @data failed, or + * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info + * is also stored to allow injecting into the guest (if applicable) using + * kvm_s390_inject_prog_cond(). * * If an error occurs data may have been copied partially to kernel space. */ @@ -440,9 +445,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); /* MVPG PEI indication bits */ diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c new file mode 100644 index 000000000000..56ef153eb8fe --- /dev/null +++ b/arch/s390/kvm/gmap-vsie.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 nested VMs. + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/compiler.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pgtable.h> +#include <linux/pagemap.h> +#include <linux/mman.h> + +#include <asm/lowcore.h> +#include <asm/gmap.h> +#include <asm/uv.h> + +#include "kvm-s390.h" + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + * + * Context: Called with parent->shadow_lock held + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->shadow_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_shadow_valid(sg, asce, edat_level)) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + refcount_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || + KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) + return ERR_PTR(-EFAULT); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->private = parent->private; + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + refcount_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + mmap_read_lock(parent->mm); + rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1), + PROT_READ, GMAP_NOTIFY_SHADOW); + mmap_read_unlock(parent->mm); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 3765c4223bf9..80879fc73c90 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = memdup_user(dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp); + bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp, + sizeof(*bp_data)); if (IS_ERR(bp_data)) return PTR_ERR(bp_data); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8bd42a20d924..c7908950c1f4 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -94,7 +94,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) vcpu->stat.exit_validity++; trace_kvm_s390_intercept_validity(vcpu, viwhy); - KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy, + KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy, current->pid, vcpu->kvm); /* do not warn on invalid runtime instrumentation mode */ @@ -217,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; @@ -228,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu) #define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) +static bool should_handle_per_event(const struct kvm_vcpu *vcpu) +{ + if (!guestdbg_enabled(vcpu) || !per_event(vcpu)) + return false; + if (guestdbg_sstep_enabled(vcpu) && + vcpu->arch.sie_block->iprcc != PGM_PER) { + /* + * __vcpu_run() will exit after delivering the concurrently + * indicated condition. + */ + return false; + } + return true; +} + static int handle_prog(struct kvm_vcpu *vcpu) { psw_t psw; @@ -242,7 +257,7 @@ static int handle_prog(struct kvm_vcpu *vcpu) if (kvm_s390_pv_cpu_is_protected(vcpu)) return -EOPNOTSUPP; - if (guestdbg_enabled(vcpu) && per_event(vcpu)) { + if (should_handle_per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) return rc; @@ -271,10 +286,18 @@ static int handle_prog(struct kvm_vcpu *vcpu) * handle_external_interrupt - used for external interruption interceptions * @vcpu: virtual cpu * - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if - * the new PSW does not have external interrupts disabled. In the first case, - * we've got to deliver the interrupt manually, and in the second case, we - * drop to userspace to handle the situation there. + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. + * + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. */ static int handle_external_interrupt(struct kvm_vcpu *vcpu) { @@ -285,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) vcpu->stat.exit_external_interrupt++; - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); - if (rc) - return rc; - /* We can not handle clock comparator or timer interrupt with bad PSW */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. + */ if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && (newpsw.mask & PSW_MASK_EXT)) return -EOPNOTSUPP; @@ -336,7 +367,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); + rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); if (rc != 0) return rc; @@ -345,7 +376,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); + rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); if (rc != 0) return rc; @@ -373,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) */ int handle_sthyi(struct kvm_vcpu *vcpu) { - int reg1, reg2, r = 0; - u64 code, addr, cc = 0, rc = 0; + int reg1, reg2, cc = 0, r = 0; + u64 code, addr, rc = 0; struct sthyi_sctns *sctns = NULL; if (!test_kvm_facility(vcpu->kvm, 74)) @@ -405,12 +436,14 @@ int handle_sthyi(struct kvm_vcpu *vcpu) return -ENOMEM; cc = sthyi_fill(sctns, &rc); - + if (cc < 0) { + free_page((unsigned long)sctns); + return cc; + } out: if (!cc) { if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)(sida_origin(vcpu->arch.sie_block)), - sctns, PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); } else { r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); if (r) { @@ -464,7 +497,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu) static int handle_pv_spx(struct kvm_vcpu *vcpu) { - u32 pref = *(u32 *)vcpu->arch.sie_block->sidad; + u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); kvm_s390_set_prefix(vcpu, pref); trace_kvm_s390_handle_prefix(vcpu, 1, pref); @@ -497,7 +530,7 @@ static int handle_pv_sclp(struct kvm_vcpu *vcpu) static int handle_pv_uvc(struct kvm_vcpu *vcpu) { - struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad; + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); struct uv_cb_cts uvcb = { .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, .header.len = sizeof(uvcb), @@ -511,12 +544,12 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) guest_uvcb->header.cmd); return 0; } - rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb); /* * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. */ - if (rc == -EINVAL) + if (rc == -EINVAL || rc == -ENXIO) return 0; /* * If we got -EAGAIN here, we simply return it. It will eventually @@ -528,16 +561,44 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) static int handle_pv_notification(struct kvm_vcpu *vcpu) { + int ret; + if (vcpu->arch.sie_block->ipa == 0xb210) return handle_pv_spx(vcpu); if (vcpu->arch.sie_block->ipa == 0xb220) return handle_pv_sclp(vcpu); if (vcpu->arch.sie_block->ipa == 0xb9a4) return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } return handle_instruction(vcpu); } +static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc) +{ + /* Process PER, also if the instruction is processed in user space. */ + if (!(vcpu->arch.sie_block->icptstatus & 0x02)) + return false; + if (rc != 0 && rc != -EOPNOTSUPP) + return false; + if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs) + /* __vcpu_run() will exit after delivering the interrupt. */ + return false; + return true; +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -572,8 +633,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) rc = handle_partial_execution(vcpu); break; case ICPT_KSS: - rc = kvm_s390_skey_check_enable(vcpu); - break; + /* Instruction will be redriven, skip the PER check. */ + return kvm_s390_skey_check_enable(vcpu); case ICPT_MCHKREQ: case ICPT_INT_ENABLE: /* @@ -591,18 +652,14 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) break; case ICPT_PV_PREF: rc = 0; - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu)); - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu)); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; } - /* process PER, also if the instrution is processed in user space */ - if (vcpu->arch.sie_block->icptstatus & 0x02 && - (!rc || rc == -EOPNOTSUPP)) + if (should_handle_per_ifetch(vcpu, rc)) per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index af96dc0549a4..60c360c18690 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -10,6 +10,7 @@ #define KMSG_COMPONENT "kvm-s390" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/hrtimer.h> @@ -19,18 +20,20 @@ #include <linux/slab.h> #include <linux/bitmap.h> #include <linux/vmalloc.h> +#include <asm/access-regs.h> #include <asm/asm-offsets.h> #include <asm/dis.h> #include <linux/uaccess.h> #include <asm/sclp.h> #include <asm/isc.h> #include <asm/gmap.h> -#include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -81,8 +84,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union esca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -93,8 +97,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union bsca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -114,8 +119,6 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - int rc, expect; - if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); @@ -124,21 +127,16 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old = *sigp_ctrl; - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old = *sigp_ctrl; - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } read_unlock(&vcpu->kvm->arch.sca_lock); - WARN_ON(rc != expect); /* cannot clear? */ } int psw_extint_disabled(struct kvm_vcpu *vcpu) @@ -241,12 +239,12 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); if ((u64)gisa != word >> 32) return -EBUSY; _word = (word & ~0xffUL) | iam; - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); return 0; } @@ -264,10 +262,10 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); _word = word & ~(0xffUL << 24); - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); } /** @@ -285,23 +283,18 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) u8 pending_mask, alert_mask; u64 word, _word; + word = READ_ONCE(gi->origin->u64.word[0]); do { - word = READ_ONCE(gi->origin->u64.word[0]); alert_mask = READ_ONCE(gi->alert.mask); pending_mask = (u8)(word >> 24) & alert_mask; if (pending_mask) return pending_mask; _word = (word & ~0xffUL) | alert_mask; - } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gi->origin->u64.word[0], &word, _word)); return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -312,11 +305,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -588,9 +576,9 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, mci.val = mchk->mcic; /* take care of lazy register loading */ - save_fpu_regs(); + kvm_s390_fpu_store(vcpu->run); save_access_regs(vcpu->run->s.regs.acrs); - if (MACHINE_HAS_GS && vcpu->arch.gs_enabled) + if (cpu_has_gs() && vcpu->arch.gs_enabled) save_gs_cb(current->thread.gs_cb); /* Extended save area */ @@ -643,7 +631,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); } else { @@ -652,7 +640,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, } rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA, vcpu->run->s.regs.gprs, 128); - rc |= put_guest_lc(vcpu, current->thread.fpu.fpc, + rc |= put_guest_lc(vcpu, vcpu->run->s.regs.fpc, (u32 __user *) __LC_FP_CREG_SAVE_AREA); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr, (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA); @@ -702,7 +690,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14. */ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -961,8 +949,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, (u64 *) __LC_PGM_LAST_BREAK); - rc |= put_guest_lc(vcpu, pgm_info.code, - (u16 *)__LC_PGM_INT_CODE); + rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW, @@ -1035,7 +1022,7 @@ static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) return 0; } ext = fi->srv_signal; - /* only clear the event bit */ + /* only clear the event bits */ fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); spin_unlock(&fi->lock); @@ -1045,7 +1032,7 @@ static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - return write_sclp(vcpu, SCCB_EVENT_PENDING); + return write_sclp(vcpu, ext.ext_params & SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1391,6 +1378,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc = 0; + bool delivered = false; unsigned long irq_type; unsigned long irqs; @@ -1464,6 +1452,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); clear_bit(irq_type, &li->pending_irqs); } + delivered |= !rc; + } + + /* + * We delivered at least one interrupt and modified the PC. Force a + * singlestep event now. + */ + if (delivered && guestdbg_sstep_enabled(vcpu)) { + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + + debug_exit->addr = vcpu->arch.sie_block->gpsw.addr; + debug_exit->type = KVM_SINGLESTEP; + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; } set_intercept_indicators(vcpu); @@ -2677,9 +2678,13 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) kvm_s390_clear_float_irqs(dev->kvm); break; case KVM_DEV_FLIC_APF_ENABLE: + if (kvm_is_ucontrol(dev->kvm)) + return -EINVAL; dev->kvm->arch.gmap->pfault_enabled = 1; break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: + if (kvm_is_ucontrol(dev->kvm)) + return -EINVAL; dev->kvm->arch.gmap->pfault_enabled = 0; /* * Make sure no async faults are in transition when @@ -2776,7 +2781,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr) mmap_read_lock(kvm->mm); get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(kvm->mm); return page; } @@ -2888,20 +2893,25 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - u64 uaddr; + u64 uaddr_s, uaddr_i; + int idx; switch (ue->type) { /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: + if (kvm_is_ucontrol(kvm)) + return -EINVAL; e->set = set_adapter_int; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); - if (uaddr == -EFAULT) - return -EFAULT; - e->adapter.summary_addr = uaddr; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); - if (uaddr == -EFAULT) + + idx = srcu_read_lock(&kvm->srcu); + uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); + uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); + srcu_read_unlock(&kvm->srcu, idx); + + if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; - e->adapter.ind_addr = uaddr; + e->adapter.summary_addr = uaddr_s; + e->adapter.ind_addr = uaddr_i; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; @@ -3102,9 +3112,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) static void process_gib_alert_list(void) { struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; struct kvm_s390_gisa *gisa; struct kvm *kvm; - u32 final, origin = 0UL; do { /* @@ -3130,9 +3140,10 @@ static void process_gib_alert_list(void) * interruptions asap. */ while (origin & GISA_ADDR_MASK) { - gisa = (struct kvm_s390_gisa *)(u64)origin; + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); origin = gisa->next_alert; - gisa->next_alert = (u32)(u64)gisa; + gisa->next_alert = gisa_phys; kvm = container_of(gisa, struct sie_page2, gisa)->kvm; gi = &kvm->arch.gisa_int; if (hrtimer_active(&gi->timer)) @@ -3150,7 +3161,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; gisa_clear_ipm(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) @@ -3163,11 +3174,10 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); gi->expires = 50 * 1000; /* 50 usec */ - hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - gi->timer.function = gisa_vcpu_kicker; + hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL); memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin); } void kvm_s390_gisa_enable(struct kvm *kvm) @@ -3200,14 +3210,15 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while (gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; - VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); + VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa); } void kvm_s390_gisa_disable(struct kvm *kvm) @@ -3311,29 +3322,111 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) +{ + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb)); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) { + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { .handler = gib_alert_irq_handler, - .lsi_ptr = &gib_alert_irq.lsi_mask, }; void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { + u32 gib_origin; int rc = 0; if (!css_general_characteristics.aiv) { @@ -3353,9 +3446,12 @@ int kvm_s390_gib_init(u8 nisc) rc = -EIO; goto out_free_gib; } + /* adapter interrupts used for AP (applicable here) don't use the LSI */ + *gib_alert_irq.lsi_ptr = 0xff; gib->nisc = nisc; - if (chsc_sgib((u32)(u64)gib)) { + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; @@ -3363,7 +3459,15 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } - KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + + KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc); goto out; out_unreg_gal: diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8fcb56141689..d5ad10791c25 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -23,6 +23,7 @@ #include <linux/mman.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/cpufeature.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/timer.h> @@ -31,22 +32,27 @@ #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/pgtable.h> +#include <linux/mmu_notifier.h> +#include <asm/access-regs.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/stp.h> #include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/nmi.h> -#include <asm/switch_to.h> #include <asm/isc.h> #include <asm/sclp.h> #include <asm/cpacf.h> #include <asm/timex.h> +#include <asm/asm.h> +#include <asm/fpu.h> #include <asm/ap.h> #include <asm/uv.h> -#include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -63,7 +69,15 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_float_mchk), STATS_DESC_COUNTER(VM, inject_pfault_done), STATS_DESC_COUNTER(VM, inject_service_signal), - STATS_DESC_COUNTER(VM, inject_virtio) + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward), + STATS_DESC_COUNTER(VM, gmap_shadow_reuse), + STATS_DESC_COUNTER(VM, gmap_shadow_create), + STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry), }; const struct kvm_stats_header kvm_vm_stats_header = { @@ -122,6 +136,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_io_other), STATS_DESC_COUNTER(VCPU, instruction_lpsw), STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_lpswey), STATS_DESC_COUNTER(VCPU, instruction_pfmf), STATS_DESC_COUNTER(VCPU, instruction_ptff), STATS_DESC_COUNTER(VCPU, instruction_sck), @@ -207,6 +222,14 @@ module_param(diag9c_forwarding_hz, uint, 0644); MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); /* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + +/* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond * this, this requires changes to code, but the external uapi can stay. @@ -245,17 +268,6 @@ debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -318,25 +330,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void *opaque) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -351,30 +344,48 @@ static inline int plo_test_bit(unsigned char nr) " lgr 0,%[function]\n" /* Parameter registers are ignored for "test bit" */ " plo 0,0,0,0(0)\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) + CC_IPM(cc) + : CC_OUT(cc, cc) : [function] "d" (function) + : CC_CLOBBER_LIST("0")); + return CC_TRANSFORM(cc) == 0; +} + +static __always_inline void pfcr_query(u8 (*query)[16]) +{ + asm volatile( + " lghi 0,0\n" + " .insn rsy,0xeb0000000016,0,0,%[query]\n" + : [query] "=QS" (*query) + : : "cc", "0"); - return cc == 0; } -static __always_inline void __insn32_query(unsigned int opcode, u8 *query) +static __always_inline void __sortl_query(u8 (*query)[32]) { asm volatile( " lghi 0,0\n" - " lgr 1,%[query]\n" + " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rrf,%[opc] << 16,2,4,6,0\n" + " .insn rre,0xb9380000,2,4\n" + : [query] "=R" (*query) : - : [query] "d" ((unsigned long)query), [opc] "i" (opcode) - : "cc", "memory", "0", "1"); + : "cc", "0", "1"); } -#define INSN_SORTL 0xb938 -#define INSN_DFLTCC 0xb939 +static __always_inline void __dfltcc_query(u8 (*query)[32]) +{ + asm volatile( + " lghi 0,0\n" + " la 1,%[query]\n" + /* Parameter registers are ignored */ + " .insn rrf,0xb9390000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "0", "1"); +} -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -426,18 +437,21 @@ static void kvm_s390_cpu_feat_init(void) kvm_s390_available_subfunc.kdsa); if (test_facility(150)) /* SORTL */ - __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); + __sortl_query(&kvm_s390_available_subfunc.sortl); if (test_facility(151)) /* DFLTCC */ - __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); + __dfltcc_query(&kvm_s390_available_subfunc.dfltcc); + + if (test_facility(201)) /* PFCR */ + pfcr_query(&kvm_s390_available_subfunc.pfcr); - if (MACHINE_HAS_ESOP) + if (machine_has_esop()) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). */ - if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + if (!sclp.has_sief2 || !machine_has_esop() || !sclp.has_64bscao || !test_facility(3) || !nested) return; allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); @@ -477,7 +491,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -487,11 +501,11 @@ int kvm_arch_init(void *opaque) kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); if (!kvm_s390_dbf_uv) - goto out; + goto err_kvm_uv; if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) - goto out; + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -499,23 +513,52 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; + } + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto err_pci; + } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); debug_unregister(kvm_s390_dbf_uv); } @@ -546,7 +589,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -563,7 +605,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: - case KVM_CAP_S390_MEM_OP_EXTENSION: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -571,12 +613,21 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_S390_HPAGE_1M: r = 0; - if (hpage && !kvm_is_ucontrol(kvm)) + if (hpage && !(kvm && kvm_is_ucontrol(kvm))) r = 1; break; case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. + */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: @@ -589,10 +640,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: - r = MACHINE_HAS_ESOP; + r = machine_has_esop(); break; case KVM_CAP_S390_VECTOR_REGISTERS: - r = MACHINE_HAS_VX; + r = test_facility(129); break; case KVM_CAP_S390_RI: r = test_facility(64); @@ -603,9 +654,38 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } @@ -712,7 +792,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) { r = -EBUSY; - } else if (MACHINE_HAS_VX) { + } else if (cpu_has_vx()) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); if (test_facility(134)) { @@ -735,6 +815,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 192); set_kvm_facility(kvm->arch.model.fac_list, 192); } + if (test_facility(198)) { + set_kvm_facility(kvm->arch.model.fac_mask, 198); + set_kvm_facility(kvm->arch.model.fac_list, 198); + } + if (test_facility(199)) { + set_kvm_facility(kvm->arch.model.fac_mask, 199); + set_kvm_facility(kvm->arch.model.fac_list, 199); + } r = 0; } else r = -EINVAL; @@ -817,6 +905,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -920,7 +1022,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%pK", + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *) kvm->arch.gmap->asce); break; } @@ -1019,6 +1121,42 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { unsigned long cx; @@ -1118,6 +1256,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_tod_clock gtod; @@ -1127,7 +1267,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -1158,7 +1298,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -1170,6 +1310,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) if (attr->flags) return -EINVAL; + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + switch (attr->attr) { case KVM_S390_VM_TOD_EXT: ret = kvm_s390_set_tod_ext(kvm, attr); @@ -1184,6 +1334,9 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENXIO; break; } + +out_unlock: + mutex_unlock(&kvm->lock); return ret; } @@ -1414,6 +1567,42 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + VM_EVENT(kvm, 3, "GET: guest PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]); + + return 0; +} + +#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \ +( \ + ((struct kvm_s390_vm_cpu_uv_feat){ \ + .ap = 1, \ + .ap_intr = 1, \ + }) \ + .feat \ +) + +static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr; + unsigned long data, filter; + + filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (get_user(data, &ptr->feat)) + return -EFAULT; + if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + kvm->arch.model.uv_feat_guest.feat = data; + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data); return 0; } @@ -1432,6 +1621,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = kvm_s390_set_processor_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_set_uv_feat(kvm, attr); + break; } return ret; } @@ -1592,6 +1784,9 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm, ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + VM_EVENT(kvm, 3, "GET: guest PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]); return 0; } @@ -1660,6 +1855,36 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1], ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2], ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]); + VM_EVENT(kvm, 3, "GET: host PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]); + + return 0; +} + +static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat = kvm->arch.model.uv_feat_guest.feat; + + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat; + + BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications)); + + feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); return 0; } @@ -1687,10 +1912,67 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = kvm_s390_get_machine_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_get_processor_uv_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + ret = kvm_s390_get_machine_uv_feat(kvm, attr); + break; } return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. + */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + old = READ_ONCE(sca->utility); + do { + new = old; + new.mtcr = val; + } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1711,6 +1993,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1736,6 +2021,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1782,6 +2070,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: ret = 0; break; default: @@ -1809,6 +2099,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1817,7 +2110,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } -static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1865,7 +2158,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return r; } -static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1983,6 +2276,10 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); ofs = 0; } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); while (ofs >= ms->npages && (mnode = rb_next(mnode))) { ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); @@ -2166,12 +2463,25 @@ out: return r; } -static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { struct kvm_vcpu *vcpu; - u16 rc, rrc; - int ret = 0; unsigned long i; + u16 _rc, _rrc; + int ret = 0; /* * We ignore failures and try to destroy as many CPUs as possible. @@ -2183,9 +2493,9 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) */ kvm_for_each_vcpu(i, vcpu, kvm) { mutex_lock(&vcpu->mutex); - if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { - *rcp = rc; - *rrcp = rrc; + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; ret = -EIO; } mutex_unlock(&vcpu->mutex); @@ -2196,6 +2506,17 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) return ret; } +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { unsigned long i; @@ -2205,7 +2526,7 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) struct kvm_vcpu *vcpu; /* Disable the GISA if the ultravisor does not support AIV. */ - if (!test_bit_inv(BIT_UV_FEAT_AIV, &uv_info.uv_feature_indications)) + if (!uv_has_feature(BIT_UV_FEAT_AIV)) kvm_s390_gisa_disable(kvm); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -2220,11 +2541,124 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) return r; } +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. + */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; int r = 0; u16 dummy; - void __user *argp = (void __user *)cmd->data; + + if (need_lock) + mutex_lock(&kvm->lock); switch (cmd->cmd) { case KVM_PV_ENABLE: { @@ -2240,9 +2674,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (r) break; - mmap_write_lock(current->mm); - r = gmap_mark_unmergeable(); - mmap_write_unlock(current->mm); + mmap_write_lock(kvm->mm); + r = gmap_helper_disable_cow_sharing(); + mmap_write_unlock(kvm->mm); if (r) break; @@ -2258,6 +2692,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); break; } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; case KVM_PV_DISABLE: { r = -EINVAL; if (!kvm_s390_pv_is_protected(kvm)) @@ -2271,7 +2730,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) */ if (r) break; - r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); /* no need to block service interrupts any more */ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); @@ -2356,47 +2815,104 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) cmd->rc, cmd->rrc); break; } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } default: r = -ENOTTY; } - return r; -} + if (need_lock) + mutex_unlock(&kvm->lock); -static bool access_key_invalid(u8 access_key) -{ - return access_key > 0xf; + return r; } -static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) { - void __user *uaddr = (void __user *)mop->buf; - u64 supported_flags; - void *tmpbuf = NULL; - int r, srcu_idx; - - supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION - | KVM_S390_MEMOP_F_CHECK_ONLY; if (mop->flags & ~supported_flags || !mop->size) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - /* - * This is technically a heuristic only, if the kvm->lock is not - * taken, it is not guaranteed that the vm is/remains non-protected. - * This is ok from a kernel perspective, wrongdoing is detected - * on the access, -EFAULT is returned and the vm may crash the - * next time it accesses the memory in question. - * There is no sane usecase to do switching and a memop on two - * different CPUs at the same time. - */ - if (kvm_s390_pv_get_handle(kvm)) - return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { - if (access_key_invalid(mop->key)) + if (mop->key > 0xf) return -EINVAL; } else { mop->key = 0; } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) @@ -2405,40 +2921,30 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_gpa(kvm, mop->gaddr)) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { r = PGM_ADDRESSING; goto out_unlock; } - switch (mop->op) { - case KVM_S390_MEMOP_ABSOLUTE_READ: { - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key); - } else { - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - } - break; + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; } - case KVM_S390_MEMOP_ABSOLUTE_WRITE: { - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key); - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - break; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; } - break; - } - default: - r = -EINVAL; + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); } out_unlock: @@ -2448,8 +2954,76 @@ out_unlock: return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -2467,14 +3041,9 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_CREATE_IRQCHIP: { - struct kvm_irq_routing_entry routing; - r = -EINVAL; - if (kvm->arch.use_irqchip) { - /* Set up dummy routing. */ - memset(&routing, 0, sizeof(routing)); - r = kvm_set_irq_routing(kvm, &routing, 0, 0); - } + if (kvm->arch.use_irqchip) + r = 0; break; } case KVM_SET_DEVICE_ATTR: { @@ -2563,9 +3132,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EINVAL; break; } - mutex_lock(&kvm->lock); + /* must be called without kvm->lock */ r = kvm_s390_handle_pv(kvm, &args); - mutex_unlock(&kvm->lock); if (copy_to_user(argp, &args, sizeof(args))) { r = -EFAULT; break; @@ -2581,6 +3149,19 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2610,7 +3191,7 @@ static int kvm_s390_apxa_installed(void) */ static void kvm_s390_set_crycb_format(struct kvm *kvm) { - kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; + kvm->arch.crypto.crycbd = virt_to_phys(kvm->arch.crypto.crycb); /* Clear the CRYCB format bits - i.e., set format 0 by default */ kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK); @@ -2742,6 +3323,14 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; @@ -2811,7 +3400,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* we emulate STHYI in kvm */ set_kvm_facility(kvm->arch.model.fac_mask, 74); set_kvm_facility(kvm->arch.model.fac_list, 74); - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { set_kvm_facility(kvm->arch.model.fac_mask, 147); set_kvm_facility(kvm->arch.model.fac_list, 147); } @@ -2822,8 +3411,17 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; + kvm->arch.model.uv_feat_guest.feat = 0; + kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -2835,8 +3433,20 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) VM_EVENT(kvm, 3, "vm created with type %lu", type); if (type & KVM_VM_S390_UCONTROL) { + struct kvm_userspace_memory_region2 fake_memslot = { + .slot = KVM_S390_UCONTROL_MEMSLOT, + .guest_phys_addr = 0, + .userspace_addr = 0, + .memory_size = ALIGN_DOWN(TASK_SIZE, _SEGMENT_SIZE), + .flags = 0, + }; + kvm->arch.gmap = NULL; kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; + /* one flat fake memslot covering the whole address-space */ + mutex_lock(&kvm->slots_lock); + KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); + mutex_unlock(&kvm->slots_lock); } else { if (sclp.hamax == U64_MAX) kvm->arch.mem_limit = TASK_SIZE_MAX; @@ -2856,7 +3466,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_vsie_init(kvm); if (use_gisa) kvm_s390_gisa_init(kvm); - KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -2877,6 +3489,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); @@ -2899,11 +3512,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* * We are already at the end of life and kvm->lock is not taken. * This is ok as the file descriptor is closed by now and nobody - * can mess with the pv state. To avoid lockdep_assert_held from - * complaining we do not use kvm_s390_pv_is_protected. + * can mess with the pv state. */ - if (kvm_s390_pv_get_handle(kvm)) - kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) @@ -2911,7 +3531,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - KVM_EVENT(3, "vm 0x%pK destroyed", kvm); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ @@ -2947,28 +3567,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -2999,6 +3621,7 @@ static int sca_switch_to_extended(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; if (kvm->arch.use_esca) return 0; @@ -3007,8 +3630,9 @@ static int sca_switch_to_extended(struct kvm *kvm) if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -3028,7 +3652,7 @@ static int sca_switch_to_extended(struct kvm *kvm) free_page((unsigned long)old_sca); - VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)", + VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)", old_sca, kvm->arch.sca); return 0; } @@ -3047,9 +3671,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } @@ -3144,7 +3766,6 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - gmap_enable(vcpu->arch.enabled_gmap); kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -3157,8 +3778,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING); - vcpu->arch.enabled_gmap = gmap_get_enabled(); - gmap_disable(vcpu->arch.enabled_gmap); } @@ -3176,8 +3795,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) } if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; - /* make vcpu_load load the right gmap on the first trigger */ - vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr) @@ -3199,6 +3816,13 @@ static bool kvm_has_pckmo_ecc(struct kvm *kvm) } +static bool kvm_has_pckmo_hmac(struct kvm *kvm) +{ + /* At least one HMAC subfunction must be present */ + return kvm_has_pckmo_subfunc(kvm, 118) || + kvm_has_pckmo_subfunc(kvm, 122); +} + static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { /* @@ -3211,7 +3835,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); vcpu->arch.sie_block->eca &= ~ECA_APIE; - vcpu->arch.sie_block->ecd &= ~ECD_ECC; + vcpu->arch.sie_block->ecd &= ~(ECD_ECC | ECD_HMAC); if (vcpu->kvm->arch.crypto.apie) vcpu->arch.sie_block->eca |= ECA_APIE; @@ -3219,9 +3843,11 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) /* Set up protected key support */ if (vcpu->kvm->arch.crypto.aes_kw) { vcpu->arch.sie_block->ecb3 |= ECB3_AES; - /* ecc is also wrapped with AES key */ + /* ecc/hmac is also wrapped with AES key */ if (kvm_has_pckmo_ecc(vcpu->kvm)) vcpu->arch.sie_block->ecd |= ECD_ECC; + if (kvm_has_pckmo_hmac(vcpu->kvm)) + vcpu->arch.sie_block->ecd |= ECD_HMAC; } if (vcpu->kvm->arch.crypto.dea_kw) @@ -3230,15 +3856,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -3248,7 +3877,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) @@ -3267,11 +3896,13 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ - if (MACHINE_HAS_ESOP) + /* pgste_set_pte has special handling for !machine_has_esop() */ + if (machine_has_esop()) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; if (!kvm_is_ucontrol(vcpu->kvm)) @@ -3303,9 +3934,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3317,13 +3947,15 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) if (rc) return rc; } - hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vcpu->arch.sie_block->hpid = HPID_KVM; kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); if (kvm_s390_pv_is_protected(vcpu->kvm)) { rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); @@ -3353,7 +3985,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; @@ -3373,6 +4005,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_DIAG318; + vcpu->arch.acrs_loaded = false; kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; @@ -3383,9 +4016,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 156)) vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. + * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format. */ - if (MACHINE_HAS_VX) + if (cpu_has_vx()) vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; else vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; @@ -3396,13 +4029,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto out_free_sie_block; } - VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p", vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); rc = kvm_s390_vcpu_setup(vcpu); if (rc) goto out_ucontrol_uninit; + + kvm_s390_update_topology_change_report(vcpu->kvm, 1); return 0; out_ucontrol_uninit: @@ -3479,6 +4114,8 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long prefix; unsigned long i; + trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); + if (gmap_is_shadow(gmap)) return; if (start >= 1UL << 31) @@ -3498,7 +4135,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ - if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= + if (get_lowcore()->avg_steal_timer * 100 / (TICK_USEC << 12) >= READ_ONCE(halt_poll_max_steal)) { vcpu->stat.halt_no_poll_steal++; return true; @@ -3660,7 +4297,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->run->s.regs.fpc = 0; /* * Do not reset these registers in the protected case, as some of - * them are overlayed and they are not accessible in this case + * them are overlaid and they are not accessible in this case * anyway. */ if (!kvm_s390_pv_cpu_is_protected(vcpu)) { @@ -3733,18 +4370,13 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - if (test_fp_ctl(fpu->fpc)) { - ret = -EINVAL; - goto out; - } vcpu->run->s.regs.fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); -out: vcpu_put(vcpu); return ret; } @@ -3753,9 +4385,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { vcpu_load(vcpu); - /* make sure we have the latest values */ - save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *) fpu->fprs, (__vector128 *) vcpu->run->s.regs.vrs); else @@ -3885,6 +4515,75 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } +static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) +{ + struct kvm *kvm = gmap->private; + gfn_t gfn = gpa_to_gfn(gaddr); + bool unlocked; + hva_t vmaddr; + gpa_t tmp; + int rc; + + if (kvm_is_ucontrol(kvm)) { + tmp = __gmap_translate(gmap, gaddr); + gfn = gpa_to_gfn(tmp); + } + + vmaddr = gfn_to_hva(kvm, gfn); + rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!rc) + rc = __gmap_link(gmap, gaddr, vmaddr); + return rc; +} + +/** + * __kvm_s390_mprotect_many() - Apply specified protection to guest pages + * @gmap: the gmap of the guest + * @gpa: the starting guest address + * @npages: how many pages to protect + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() + * + * Context: kvm->srcu and gmap->mm need to be held in read mode + */ +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits) +{ + unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + gpa_t end = gpa + npages * PAGE_SIZE; + int rc; + + for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { + rc = gmap_protect_one(gmap, gpa, prot, bits); + if (rc == -EAGAIN) { + __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); + rc = gmap_protect_one(gmap, gpa, prot, bits); + } + if (rc < 0) + return rc; + } + + return 0; +} + +static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +{ + gpa_t gaddr = kvm_s390_get_prefix(vcpu); + int idx, rc; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + mmap_read_lock(vcpu->arch.gmap->mm); + + rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + + mmap_read_unlock(vcpu->arch.gmap->mm); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return rc; +} + static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: @@ -3900,9 +4599,8 @@ retry: */ if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = gmap_mprotect_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2, PROT_WRITE); + + rc = kvm_s390_mprotect_notify_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -3957,8 +4655,6 @@ retry: goto retry; } - /* nothing to do, just clear the request */ - kvm_clear_request(KVM_REQ_UNHALT, vcpu); /* we left the vsie handler, nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); @@ -3993,13 +4689,6 @@ static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_t preempt_enable(); } -void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) -{ - mutex_lock(&kvm->lock); - __kvm_s390_set_tod_clock(kvm, gtod); - mutex_unlock(&kvm->lock); -} - int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) { if (!mutex_trylock(&kvm->lock)) @@ -4009,22 +4698,6 @@ int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clo return 1; } -/** - * kvm_arch_fault_in_page - fault-in guest page if necessary - * @vcpu: The corresponding virtual cpu - * @gpa: Guest physical address - * @writable: Whether the page should be writable or not - * - * Make sure that a guest page has been faulted-in on the host. - * - * Return: Zero on success, negative error code otherwise. - */ -long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable) -{ - return gmap_fault(vcpu->arch.gmap, gpa, - writable ? FAULT_FLAG_WRITE : 0); -} - static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, unsigned long token) { @@ -4092,12 +4765,11 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) if (!vcpu->arch.gmap->pfault_enabled) return false; - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); - hva += current->thread.gmap_addr & ~PAGE_MASK; + hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) return false; - return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); + return kvm_setup_async_pf(vcpu, current->thread.gmap_teid.addr * PAGE_SIZE, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) @@ -4119,7 +4791,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) return rc; } @@ -4135,6 +4807,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; + current->thread.gmap_int_code = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); trace_kvm_s390_sie_enter(vcpu, cpuflags); @@ -4142,7 +4815,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) return 0; } -static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) +static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu) { struct kvm_s390_pgm_info pgm_info = { .code = PGM_ADDRESSING, @@ -4178,10 +4851,182 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); } +static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) +{ + KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, + "Unexpected program interrupt 0x%x, TEID 0x%016lx", + current->thread.gmap_int_code, current->thread.gmap_teid.val); +} + +/* + * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu + * @vcpu: the vCPU whose gmap is to be fixed up + * @gfn: the guest frame number used for memslots (including fake memslots) + * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps + * @flags: FOLL_* flags + * + * Return: 0 on success, < 0 in case of error. + * Context: The mm lock must not be held before calling. May sleep. + */ +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +{ + struct kvm_memory_slot *slot; + unsigned int fault_flags; + bool writable, unlocked; + unsigned long vmaddr; + struct page *page; + kvm_pfn_t pfn; + int rc; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return vcpu_post_run_addressing_exception(vcpu); + + fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + if (vcpu->arch.gmap->pfault_enabled) + flags |= FOLL_NOWAIT; + vmaddr = __gfn_to_hva_memslot(slot, gfn); + +try_again: + pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + + /* Access outside memory, inject addressing exception */ + if (is_noslot_pfn(pfn)) + return vcpu_post_run_addressing_exception(vcpu); + /* Signal pending: try again */ + if (pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ + if (pfn == KVM_PFN_ERR_NEEDS_IO) { + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously */ + flags &= ~FOLL_NOWAIT; + goto try_again; + } + /* Any other error */ + if (is_error_pfn(pfn)) + return -EFAULT; + + /* Success */ + mmap_read_lock(vcpu->arch.gmap->mm); + /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ + rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); + if (!rc) + rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); + scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { + kvm_release_faultin_page(vcpu->kvm, page, false, writable); + } + mmap_read_unlock(vcpu->arch.gmap->mm); + return rc; +} + +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +{ + unsigned long gaddr_tmp; + gfn_t gfn; + + gfn = gpa_to_gfn(gaddr); + if (kvm_is_ucontrol(vcpu->kvm)) { + /* + * This translates the per-vCPU guest address into a + * fake guest address, which can then be used with the + * fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault + * resolution path, like normal VMs. + */ + mmap_read_lock(vcpu->arch.gmap->mm); + gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); + mmap_read_unlock(vcpu->arch.gmap->mm); + if (gaddr_tmp == -EFAULT) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; + return -EREMOTE; + } + gfn = gpa_to_gfn(gaddr_tmp); + } + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); +} + +static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) +{ + unsigned int flags = 0; + unsigned long gaddr; + int rc; + + gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; + if (kvm_s390_cur_gmap_fault_is_write()) + flags = FAULT_FLAG_WRITE; + + switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { + case 0: + vcpu->stat.exit_null++; + break; + case PGM_SECURE_STORAGE_ACCESS: + case PGM_SECURE_STORAGE_VIOLATION: + kvm_s390_assert_primary_as(vcpu); + /* + * This can happen after a reboot with asynchronous teardown; + * the new guest (normal or protected) will run on top of the + * previous protected guest. The old pages need to be destroyed + * so the new guest can use them. + */ + if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) { + /* + * Either KVM messed up the secure guest mapping or the + * same page is mapped into multiple secure guests. + * + * This exception is only triggered when a guest 2 is + * running and can therefore never occur in kernel + * context. + */ + pr_warn_ratelimited("Secure storage violation (%x) in task: %s, pid %d\n", + current->thread.gmap_int_code, current->comm, + current->pid); + send_sig(SIGSEGV, current, 0); + } + break; + case PGM_NON_SECURE_STORAGE_ACCESS: + kvm_s390_assert_primary_as(vcpu); + /* + * This is normal operation; a page belonging to a protected + * guest has not been imported yet. Try to import the page into + * the protected guest. + */ + rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr); + if (rc == -EINVAL) + send_sig(SIGSEGV, current, 0); + if (rc != -ENXIO) + break; + flags = FAULT_FLAG_WRITE; + fallthrough; + case PGM_PROTECTION: + case PGM_SEGMENT_TRANSLATION: + case PGM_PAGE_TRANSLATION: + case PGM_ASCE_TYPE: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + kvm_s390_assert_primary_as(vcpu); + return vcpu_dat_fault_handler(vcpu, gaddr, flags); + default: + KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", + current->thread.gmap_int_code, current->thread.gmap_teid.val); + send_sig(SIGSEGV, current, 0); + break; + } + return 0; +} + static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { struct mcck_volatile_info *mcck_info; struct sie_page *sie_page; + int rc; VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); @@ -4203,7 +5048,7 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) } if (vcpu->arch.sie_block->icptcode > 0) { - int rc = kvm_handle_sie_intercept(vcpu); + rc = kvm_handle_sie_intercept(vcpu); if (rc != -EOPNOTSUPP) return rc; @@ -4212,24 +5057,9 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; return -EREMOTE; - } else if (exit_reason != -EFAULT) { - vcpu->stat.exit_null++; - return 0; - } else if (kvm_is_ucontrol(vcpu->kvm)) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = - current->thread.gmap_addr; - vcpu->run->s390_ucontrol.pgm_code = 0x10; - return -EREMOTE; - } else if (current->thread.gmap_pfault) { - trace_kvm_s390_major_guest_pfault(vcpu); - current->thread.gmap_pfault = 0; - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1); } - return vcpu_post_run_fault_in_sie(vcpu); + + return vcpu_post_run_handle_fault(vcpu); } #define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) @@ -4246,7 +5076,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) do { rc = vcpu_pre_run(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) break; kvm_vcpu_srcu_read_unlock(vcpu); @@ -4263,10 +5093,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->run->s.regs.gprs, sizeof(sie_page->pv_grregs)); } - if (test_cpu_flag(CIF_FPU)) - load_fpu_regs(); exit_reason = sie64a(vcpu->arch.sie_block, - vcpu->run->s.regs.gprs); + vcpu->run->s.regs.gprs, + vcpu->arch.gmap->asce); if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(vcpu->run->s.regs.gprs, sie_page->pv_grregs, @@ -4351,9 +5180,9 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; } - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (current->thread.gs_cb) { vcpu->arch.host_gscb = current->thread.gs_cb; save_gs_cb(vcpu->arch.host_gscb); @@ -4385,19 +5214,8 @@ static void sync_regs(struct kvm_vcpu *vcpu) } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - /* save host (userspace) fprs/vrs */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; - + vcpu->arch.acrs_loaded = true; + kvm_s390_fpu_load(vcpu->run); /* Sync fmt2 only data */ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { sync_regs_fmt2(vcpu); @@ -4428,15 +5246,15 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu) kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (vcpu->arch.gs_enabled) save_gs_cb(current->thread.gs_cb); current->thread.gs_cb = vcpu->arch.host_gscb; restore_gs_cb(vcpu->arch.host_gscb); if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); vcpu->arch.host_gscb = NULL; preempt_enable(); } @@ -4458,12 +5276,8 @@ static void store_regs(struct kvm_vcpu *vcpu) kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); - /* Save guest register state */ - save_fpu_regs(); - vcpu->run->s.regs.fpc = current->thread.fpu.fpc; - /* Restore will be done lazily at return */ - current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; - current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + vcpu->arch.acrs_loaded = false; + kvm_s390_fpu_store(vcpu->run); if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) store_regs_fmt2(vcpu); } @@ -4471,9 +5285,19 @@ static void store_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; + DECLARE_KERNEL_FPU_ONSTACK32(fpu); int rc; - if (kvm_run->immediate_exit) + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. + */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (!vcpu->wants_to_run) return -EINTR; if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS || @@ -4503,6 +5327,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } + kernel_fpu_begin(&fpu, KERNEL_FPC | KERNEL_VXR); sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); @@ -4526,6 +5351,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) disable_cpu_timer_accounting(vcpu); store_regs(vcpu); + kernel_fpu_end(&fpu, KERNEL_FPC | KERNEL_VXR); kvm_sigset_deactivate(vcpu); @@ -4562,7 +5388,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) gpa -= __LC_FPREGS_SAVE_AREA; /* manually convert vector registers if necessary */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, fprs, 128); @@ -4600,8 +5426,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) * switch in the run ioctl. Let's update our copies before we save * it into the save area */ - save_fpu_regs(); - vcpu->run->s.regs.fpc = current->thread.fpu.fpc; + kvm_s390_fpu_store(vcpu->run); save_access_regs(vcpu->run->s.regs.acrs); return kvm_s390_store_status_unloaded(vcpu, addr); @@ -4768,6 +5593,7 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *sida_addr; int r = 0; if (mop->flags || !mop->size) @@ -4779,16 +5605,16 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, if (!kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: - if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), mop->size)) + if (copy_to_user(uaddr, sida_addr, mop->size)) r = -EFAULT; break; case KVM_S390_MEMOP_SIDA_WRITE: - if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), uaddr, mop->size)) + if (copy_from_user(sida_addr, uaddr, mop->size)) r = -EFAULT; break; } @@ -4799,62 +5625,54 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; void *tmpbuf = NULL; - int r = 0; - const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY - | KVM_S390_MEMOP_F_SKEY_PROTECTION; + int r; - if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) return -EINVAL; - if (mop->size > MEM_OP_MAX_SIZE) - return -E2BIG; if (kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; - if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { - if (access_key_invalid(mop->key)) - return -EINVAL; - } else { - mop->key = 0; - } if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } - switch (mop->op) { - case KVM_S390_MEMOP_LOGICAL_READ: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, - GACC_FETCH, mop->key); - break; - } + acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - break; - case KVM_S390_MEMOP_LOGICAL_WRITE: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, - GACC_STORE, mop->key); - break; + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; } + } else { if (copy_from_user(tmpbuf, uaddr, mop->size)) { r = -EFAULT; - break; + goto out_free; } r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - break; } +out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); +out_free: vfree(tmpbuf); return r; } @@ -4889,6 +5707,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int rc; switch (ioctl) { case KVM_S390_IRQ: { @@ -4896,7 +5715,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, if (copy_from_user(&s390irq, argp, sizeof(s390irq))) return -EFAULT; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; @@ -4906,10 +5726,67 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } + default: + rc = -ENOIOCTLCMD; + break; } - return -ENOIOCTLCMD; + + /* + * To simplify single stepping of userspace-emulated instructions, + * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see + * should_handle_per_ifetch()). However, if userspace emulation injects + * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens + * after (and not before) the interrupt delivery. + */ + if (!rc) + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; + + return rc; +} + +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. */ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; } long kvm_arch_vcpu_ioctl(struct file *filp, @@ -5021,7 +5898,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - r = gmap_fault(vcpu->arch.gmap, arg, 0); + idx = srcu_read_lock(&vcpu->kvm->srcu); + r = vcpu_dat_fault_handler(vcpu, arg, 0); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_ENABLE_CAP: @@ -5076,6 +5955,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } @@ -5097,6 +6003,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return true; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, @@ -5105,27 +6016,47 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, { gpa_t size; + if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) + return -EINVAL; + /* When we are protected, we should not change the memory slots */ if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; - if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY) - return 0; + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ + + if (new->userspace_addr & 0xffffful) + return -EINVAL; - /* A few sanity checks. We can have memory slots which have to be - located/ended at a segment boundary (1MB). The memory in userland is - ok to be fragmented into various different vmas. It is okay to mmap() - and munmap() stuff in this slot after doing this call at any time */ + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; - if (new->userspace_addr & 0xffffful) - return -EINVAL; + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } - size = new->npages * PAGE_SIZE; - if (size & 0xffffful) - return -EINVAL; + if (!kvm->arch.migration_mode) + return 0; - if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) - return -EINVAL; + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging being enabled to store + * its dirty bitmap. + */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); return 0; } @@ -5137,6 +6068,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int rc = 0; + if (kvm_is_ucontrol(kvm)) + return; + switch (change) { case KVM_MR_DELETE: rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, @@ -5172,7 +6106,7 @@ static inline unsigned long nonhyp_mask(int i) static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -5188,12 +6122,23 @@ static int __init kvm_s390_init(void) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = __kvm_s390_init(); + if (r) + return r; + + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 497d52a83c78..c44fe0c3a097 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,10 +20,31 @@ #include <asm/processor.h> #include <asm/sclp.h> +#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) + +static inline void kvm_s390_fpu_store(struct kvm_run *run) +{ + fpu_stfpc(&run->s.regs.fpc); + if (cpu_has_vx()) + save_vx_regs((__vector128 *)&run->s.regs.vrs); + else + save_fp_regs((freg_t *)&run->s.regs.fprs); +} + +static inline void kvm_s390_fpu_load(struct kvm_run *run) +{ + fpu_lfpc_safe(&run->s.regs.fpc); + if (cpu_has_vx()) + load_vx_regs((__vector128 *)&run->s.regs.vrs); + else + load_fp_regs((freg_t *)&run->s.regs.fprs); +} + /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; extern debug_info_t *kvm_s390_dbf_uv; @@ -119,6 +140,21 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; } +static inline u64 kvm_s390_get_base_disp_siy(struct kvm_vcpu *vcpu, u8 *ar) +{ + u32 base1 = vcpu->arch.sie_block->ipb >> 28; + s64 disp1; + + /* The displacement is a 20bit _SIGNED_ value */ + disp1 = sign_extend64(((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + + ((vcpu->arch.sie_block->ipb & 0xff00) << 4), 19); + + if (ar) + *ar = base1; + + return (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1; +} + static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2, u8 *ar_b1, u8 *ar_b2) @@ -233,16 +269,33 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) { - u32 gd = (u32)(u64)kvm->arch.gisa_int.origin; + u32 gd; + + if (!kvm->arch.gisa_int.origin) + return 0; + + gd = virt_to_phys(kvm->arch.gisa_int.origin); if (gd && sclp.has_gisaf) gd |= GISA_FORMAT1; return gd; } +static inline hva_t gpa_to_hva(struct kvm *kvm, gpa_t gpa) +{ + hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + + if (!kvm_is_error_hva(hva)) + hva |= offset_in_page(gpa); + return hva; +} + /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, @@ -250,6 +303,14 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, unsigned long tweak, u16 *rc, u16 *rrc); int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { @@ -261,16 +322,39 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) return vcpu->arch.pv.handle; } -static inline bool kvm_s390_pv_is_protected(struct kvm *kvm) +/** + * __kvm_s390_pv_destroy_page() - Destroy a guest page. + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: must be called holding the mm lock for gmap->mm + */ +static inline int __kvm_s390_pv_destroy_page(struct page *page) { - lockdep_assert_held(&kvm->lock); - return !!kvm_s390_pv_get_handle(kvm); -} + struct folio *folio = page_folio(page); + int rc; -static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) -{ - lockdep_assert_held(&vcpu->mutex); - return !!kvm_s390_pv_cpu_get_handle(vcpu); + /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; } /* implemented in interrupt.c */ @@ -352,15 +436,17 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); + +/* implemented in gmap-vsie.c */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); -long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); @@ -374,6 +460,15 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits); + +static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) +{ + return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); +} /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -461,7 +556,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); void kvm_s390_gisa_disable(struct kvm *kvm); void kvm_s390_gisa_enable(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ @@ -494,6 +589,13 @@ static inline int kvm_s390_use_sca_entries(void) void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct mcck_volatile_info *mcck_info); +static inline bool kvm_s390_cur_gmap_fault_is_write(void) +{ + if (current->thread.gmap_int_code == PGM_PROTECTION) + return true; + return test_facility(75) && (current->thread.gmap_teid.fsi == TEID_FSI_STORE); +} + /** * kvm_s390_vcpu_crypto_reset_all * @@ -508,6 +610,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); /** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** * diag9c_forwarding_hz * * Set the maximum number of diag9c forwarding per second diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..8c40154ff50f --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel, the information preserved in zpci_aipb and zpci_aif_sbv + * in case we insert the KVM module again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_virt(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists then AEN was already registered and + * we can reuse the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used as this is assigned + * during KVM module load. + */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + cur_pages = atomic_long_read(&user->locked_vm); + do { + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = NULL; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpretation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) +{ + struct zpci_dev *zdev = opaque; + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. + */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_reenable_device(zdev); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} + +static void kvm_s390_pci_unregister_kvm(void *opaque) +{ + struct zpci_dev *zdev = opaque; + struct kvm *kvm; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + zpci_reenable_device(zdev); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. + */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int __init kvm_s390_pci_init(void) +{ + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; + + if (!kvm_s390_pci_interp_allowed()) + return 0; + + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; + + if (!kvm_s390_pci_interp_allowed()) + return; + + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..ff0972dd5e71 --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev || + !aift->kzdev[si]) + return NULL; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int __init kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 83bb5cf97282..9253c70897a8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -13,7 +13,7 @@ #include <linux/errno.h> #include <linux/mm_types.h> #include <linux/pgtable.h> - +#include <linux/io.h> #include <asm/asm-offsets.h> #include <asm/facility.h> #include <asm/current.h> @@ -22,7 +22,6 @@ #include <asm/sysinfo.h> #include <asm/page-states.h> #include <asm/gmap.h> -#include <asm/io.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> @@ -58,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 133)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; restore_gs_cb(current->thread.gs_cb); preempt_enable(); @@ -150,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, address)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, address)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); @@ -442,7 +441,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -465,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); addr = kvm_s390_real_to_abs(vcpu, addr); - if (kvm_is_error_gpa(vcpu->kvm, addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); /* * We don't expect errors on modern systems, and do not care @@ -677,8 +676,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.crypto.pqap_hook) { pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; ret = pqap_hook(vcpu); - if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) - kvm_s390_set_psw_cc(vcpu, 3); + if (!ret) { + if (vcpu->run->s.regs.gprs[1] & 0x00ff0000) + kvm_s390_set_psw_cc(vcpu, 3); + else + kvm_s390_set_psw_cc(vcpu, 0); + } up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); return ret; } @@ -794,6 +797,36 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) return 0; } +static int handle_lpswey(struct kvm_vcpu *vcpu) +{ + psw_t new_psw; + u64 addr; + int rc; + u8 ar; + + vcpu->stat.instruction_lpswey++; + + if (!test_kvm_facility(vcpu->kvm, 193)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + addr = kvm_s390_get_base_disp_siy(vcpu, &ar); + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + vcpu->arch.sie_block->gpsw = new_psw; + if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + return 0; +} + static int handle_stidp(struct kvm_vcpu *vcpu) { u64 stidp_data = vcpu->kvm->arch.model.cpuid; @@ -873,10 +906,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -910,10 +951,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; } if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, - PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); rc = 0; } else { rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); @@ -1204,6 +1248,8 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) static int handle_essa(struct kvm_vcpu *vcpu) { + lockdep_assert_held(&vcpu->kvm->srcu); + /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; @@ -1253,12 +1299,8 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - int srcu_idx; - mmap_read_lock(vcpu->kvm->mm); - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); i = __do_essa(vcpu, orc); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); mmap_read_unlock(vcpu->kvm->mm); if (i < 0) return i; @@ -1448,6 +1490,8 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) case 0x61: case 0x62: return handle_ri(vcpu); + case 0x71: + return handle_lpswey(vcpu); default: return -EOPNOTSUPP; } @@ -1471,7 +1515,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) access_key = (operand2 & 0xf0) >> 4; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, GACC_STORE, access_key); @@ -1508,7 +1552,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index cc7c9599f43e..14c330ec8ceb 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -7,13 +7,120 @@ */ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/minmax.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> #include "kvm-s390.h" +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + +/** + * kvm_s390_pv_make_secure() - make one guest page secure + * @kvm: the guest + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error. + */ +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) +{ + unsigned long vmaddr; + + lockdep_assert_held(&kvm->srcu); + + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); + if (kvm_is_error_hva(vmaddr)) + return -EFAULT; + return make_hva_secure(kvm->mm, vmaddr, uvcb); +} + +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = gaddr, + }; + + return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb); +} + +/** + * kvm_s390_pv_destroy_page() - Destroy a guest page. + * @kvm: the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(kvm->mm); + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + if (page) + rc = __kvm_s390_pv_destroy_page(page); + kvm_release_page_clean(page); + mmap_read_unlock(kvm->mm); + return rc; +} + +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) { int cc; @@ -32,7 +139,7 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); - free_page(sida_origin(vcpu->arch.sie_block)); + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); vcpu->arch.sie_block->pv_handle_cpu = 0; vcpu->arch.sie_block->pv_handle_config = 0; memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); @@ -54,6 +161,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) .header.cmd = UVC_CMD_CREATE_SEC_CPU, .header.len = sizeof(uvcb), }; + void *sida_addr; int cc; if (kvm_s390_pv_cpu_get_handle(vcpu)) @@ -67,16 +175,17 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) /* Input */ uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); uvcb.num = vcpu->arch.sie_block->icpua; - uvcb.state_origin = (u64)vcpu->arch.sie_block; - uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); /* Alloc Secure Instruction Data Area Designation */ - vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vcpu->arch.sie_block->sidad) { + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); return -ENOMEM; } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); cc = uv_call(0, (u64)&uvcb); *rc = uvcb.header.rc; @@ -108,7 +217,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) vfree(kvm->arch.pv.stor_var); free_pages(kvm->arch.pv.stor_base, get_order(uv_info.guest_base_stor_len)); - memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); + kvm_s390_clear_pv_state(kvm); } static int kvm_s390_pv_alloc_vm(struct kvm *kvm) @@ -147,26 +256,363 @@ out_err: return -ENOMEM; } -/* this should not fail, but if it does, we must not free the donated memory */ -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) +{ + int cc; + + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) { + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; int cc; - /* make all pages accessible before destroying the guest */ - s390_reset_acc(kvm->mm); + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc && uvcb.header.rc != 0x104, + "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. + */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); - atomic_set(&kvm->mm->context.is_protected, 0); + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); - /* Inteded memory leak on "impossible" error */ - if (!cc) - kvm_s390_pv_dealloc_vm(kvm); + + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* + * Nothing to do if the counter was already 0. Otherwise make sure + * the counter does not reach 0 before calling s390_uv_destroy_range. + */ + if (!atomic_inc_not_zero(&kvm->mm->context.protected_count)) + return 0; + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); return cc ? -EIO : 0; } +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + int r; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. + */ + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_cgc uvcb = { @@ -184,27 +630,38 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; uvcb.guest_asce = kvm->arch.gmap->asce; - uvcb.guest_sca = (unsigned long)kvm->arch.sca; - uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; + uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; - KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x", - uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw); /* Outputs */ kvm->arch.pv.handle = uvcb.guest_handle; + atomic_inc(&kvm->mm->context.protected_count); if (cc) { - if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); - else + } else { + atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); + } return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } return 0; } @@ -224,8 +681,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, *rrc = uvcb.header.rrc; KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", *rc, *rrc); - if (!cc) - atomic_set(&kvm->mm->context.is_protected, 1); return cc ? -EINVAL : 0; } @@ -240,11 +695,29 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[0] = tweak, .tweak[1] = offset, }; - int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); + unsigned long vmaddr; + bool unlocked; *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; + if (ret == -ENXIO) { + mmap_read_lock(kvm->mm); + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(vmaddr)) { + ret = -EFAULT; + } else { + ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!ret) + ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); + } + mmap_read_unlock(kvm->mm); + if (!ret) + return -EAGAIN; + return ret; + } + if (ret && ret != -EAGAIN) KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", uvcb.gaddr, *rc, *rrc); @@ -263,6 +736,8 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", addr, size); + guard(srcu)(&kvm->srcu); + while (offset < size) { ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); if (ret == -EAGAIN) { @@ -298,3 +773,200 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) return -EINVAL; return 0; } + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned. For instance + * if we encounter a fault after writing the first page of data. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Lets only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 8aaee2892ec3..55c34cb35428 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { + if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; @@ -469,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) * * This interception will occur at the source cpu when a source cpu sends an * external call to a target cpu and the target cpu has the WAIT bit set in - * its cpuflags. Interception will occurr after the interrupt indicator bits at + * its cpuflags. Interception will occur after the interrupt indicator bits at * the target cpu have been set. All error cases will lead to instruction * interception, therefore nothing is to be checked or prepared. */ @@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 6f0209d45164..9e28f165c114 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu, __entry->sie_block = sie_block; ), - TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK", + TP_printk("create cpu %d at 0x%p, sie block at 0x%p", __entry->id, __entry->vcpu, __entry->sie_block) ); @@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css, __entry->kvm = kvm; ), - TP_printk("enabling channel I/O support (kvm @ %pK)\n", + TP_printk("enabling channel I/O support (kvm @ %p)\n", __entry->kvm) ); @@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed, __entry->id, __entry->isc) ); +/* + * Trace point for gmap notifier calls. + */ +TRACE_EVENT(kvm_s390_gmap_notifier, + TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow), + TP_ARGS(start, end, shadow), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, shadow) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->shadow = shadow; + ), + + TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)", + __entry->start, __entry->end, __entry->shadow) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index dada78b92691..13a9661d2b28 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -12,16 +12,22 @@ #include <linux/list.h> #include <linux/bitmap.h> #include <linux/sched/signal.h> +#include <linux/io.h> +#include <linux/mman.h> #include <asm/gmap.h> #include <asm/mmu_context.h> #include <asm/sclp.h> #include <asm/nmi.h> #include <asm/dis.h> -#include <asm/fpu/api.h> +#include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" +enum vsie_page_flags { + VSIE_PAGE_IN_USE = 0, +}; + struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ /* @@ -45,11 +51,40 @@ struct vsie_page { gpa_t gvrd_gpa; /* 0x0240 */ gpa_t riccbd_gpa; /* 0x0248 */ gpa_t sdnx_gpa; /* 0x0250 */ - __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ + /* + * guest address of the original SCB. Remains set for free vsie + * pages, so we can properly look them up in our addr_to_page + * radix tree. + */ + gpa_t scb_gpa; /* 0x0258 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; +/** + * gmap_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise; the + * caller has to request a new shadow gmap in this case. + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} + /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, __u16 reason_code) @@ -138,11 +173,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* Copy to APCB FORMAT1 from APCB FORMAT0 */ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, - unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) { struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); - if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) return -EFAULT; apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; @@ -157,19 +196,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original apcb in the guest2 + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the guest1 * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, unsigned long *apcb_h) + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); return 0; } @@ -178,20 +222,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original guest apcb + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the host * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); return 0; } @@ -200,7 +249,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb - Create a shadow copy of the apcb. * @vcpu: pointer to the virtual CPU * @crycb_s: pointer to shadow crycb - * @crycb_o: pointer to original guest crycb + * @crycb_gpa: guest physical address of original guest crycb * @crycb_h: pointer to the host crycb * @fmt_o: format of the original guest crycb. * @fmt_h: format of the host crycb. @@ -211,50 +260,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * Return 0 or an error number if the guest and host crycb are incompatible. */ static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, - const u32 crycb_o, + const u32 crycb_gpa, struct kvm_s390_crypto_cb *crycb_h, int fmt_o, int fmt_h) { - struct kvm_s390_crypto_cb *crycb; - - crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; - switch (fmt_o) { case CRYCB_FORMAT2: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) return -EACCES; if (fmt_h != CRYCB_FORMAT2) return -EINVAL; return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, - (unsigned long) &crycb->apcb1, + crycb_gpa, (unsigned long *)&crycb_h->apcb1); case CRYCB_FORMAT1: switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } break; case CRYCB_FORMAT0: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) return -EACCES; switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: case CRYCB_FORMAT0: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } } @@ -324,7 +369,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* we may only allow it if enabled for guest 2 */ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & (ECB3_AES | ECB3_DEA); - ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC; + ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & + (ECD_ECC | ECD_HMAC); if (!ecb3_flags && !ecd_flags) goto end; @@ -351,7 +397,7 @@ end: case -EACCES: return set_validity_icpt(scb_s, 0x003CU); } - scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2; + scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2; return 0; } @@ -494,7 +540,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->mso = new_mso; scb_s->prefix = new_prefix; - /* We have to definetly flush the tlb if this scb never ran */ + /* We have to definitely flush the tlb if this scb never ran */ if (scb_s->ihcpu != 0xffffU) scb_s->ihcpu = scb_o->ihcpu; @@ -503,6 +549,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ @@ -538,8 +592,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) scb_s->eca |= scb_o->eca & ECA_CEI; /* Epoch Extension */ - if (test_kvm_facility(vcpu->kvm, 139)) + if (test_kvm_facility(vcpu->kvm, 139)) { scb_s->ecd |= scb_o->ecd & ECD_MEF; + scb_s->epdx = scb_o->epdx; + } /* etoken */ if (test_kvm_facility(vcpu->kvm, 156)) @@ -562,24 +618,18 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct vsie_page *cur; unsigned long prefix; - struct page *page; int i; if (!gmap_is_shadow(gmap)) return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. */ for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!page) + cur = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!cur) continue; - cur = page_to_virt(page); if (READ_ONCE(cur->gmap) != gmap) continue; prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; @@ -644,16 +694,16 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) struct page *page; page = gfn_to_page(kvm, gpa_to_gfn(gpa)); - if (is_error_page(page)) + if (!page) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) { - kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); + kvm_release_page_dirty(pfn_to_page(hpa >> PAGE_SHIFT)); /* mark the page always as dirty for migration */ mark_page_dirty(kvm, gpa_to_gfn(gpa)); } @@ -836,7 +886,7 @@ unpin: static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, gpa_t gpa) { - hpa_t hpa = (hpa_t) vsie_page->scb_o; + hpa_t hpa = virt_to_phys(vsie_page->scb_o); if (hpa) unpin_guest_page(vcpu->kvm, gpa, hpa); @@ -861,7 +911,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } @@ -881,7 +931,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, (vaddr & 0xfffffffffffff000UL) | /* 52-53: store / fetch */ (((unsigned int) !write_flag) + 1) << 10, - /* 62-63: asce id (alway primary == 0) */ + /* 62-63: asce id (always primary == 0) */ .exc_access_id = 0, /* always primary */ .op_access_id = 0, /* not MVPG */ }; @@ -905,19 +955,19 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { int rc; - if (current->thread.gmap_int_code == PGM_PROTECTION) + if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) /* we can directly forward all protection exceptions */ return inject_fault(vcpu, PGM_PROTECTION, - current->thread.gmap_addr, 1); + current->thread.gmap_teid.addr * PAGE_SIZE, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr, NULL); + current->thread.gmap_teid.addr * PAGE_SIZE, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, - current->thread.gmap_addr, - current->thread.gmap_write_flag); + current->thread.gmap_teid.addr * PAGE_SIZE, + kvm_s390_cur_gmap_fault_is_write()); if (rc >= 0) - vsie_page->fault_addr = current->thread.gmap_addr; + vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; } return rc; } @@ -968,14 +1018,28 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac); + /* + * Alternate-STFLE-Interpretive-Execution facilities are not supported + * -> format-0 flcb + */ if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); + /* + * The facility list origin (FLO) is in bits 1 - 28 of the FLD + * so we need to mask here before reading. + */ + fac = fac & 0x7ffffff8U; + /* + * format-0 -> size of nested guest's facility list == guest's size + * guest's size == host's size, since STFLE is interpretatively executed + * using a format-0 for the guest, too. + */ if (read_guest_real(vcpu, fac, &vsie_page->fac, - sizeof(vsie_page->fac))) + stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); - scb_s->fac = (__u32)(__u64) &vsie_page->fac; + scb_s->fac = (u32)virt_to_phys(&vsie_page->fac); } return 0; } @@ -1117,11 +1181,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * also kick the vSIE. */ vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; + current->thread.gmap_int_code = 0; barrier(); - if (test_cpu_flag(CIF_FPU)) - load_fpu_regs(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) - rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); barrier(); vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; @@ -1143,7 +1206,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (rc > 0) rc = 0; /* we could still have an icpt */ - else if (rc == -EFAULT) + else if (current->thread.gmap_int_code) return handle_fault(vcpu, vsie_page); switch (scb_s->icptcode) { @@ -1194,15 +1257,17 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, * we're holding has been unshadowed. If the gmap is still valid, * we can safely reuse it. */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; return 0; + } /* release the old shadow - if any, and mark the prefix as unmapped */ release_gmap_shadow(vsie_page); gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); if (IS_ERR(gmap)) return PTR_ERR(gmap); - gmap->private = vcpu->kvm; + vcpu->kvm->stat.gmap_shadow_create++; WRITE_ONCE(vsie_page->gmap, gmap); return 0; } @@ -1264,19 +1329,31 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (!rc) rc = map_prefix(vcpu, vsie_page); if (!rc) { - gmap_enable(vsie_page->gmap); update_intervention_requests(vsie_page); rc = do_vsie_run(vcpu, vsie_page); - gmap_enable(vcpu->arch.gmap); } atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); if (rc == -EAGAIN) rc = 0; - if (rc || scb_s->icptcode || signal_pending(current) || + + /* + * Exit the loop if the guest needs to process the intercept + */ + if (rc || scb_s->icptcode) + break; + + /* + * Exit the loop if the host needs to process an intercept, + * but rewind the PSW to re-enter SIE once that's completed + * instead of passing a "no action" intercept to the guest. + */ + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) + kvm_s390_vcpu_sie_inhibited(vcpu)) { + kvm_s390_rewind_psw(vcpu, 4); break; + } cond_resched(); } @@ -1300,6 +1377,20 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return rc; } +/* Try getting a given vsie page, returning "true" on success. */ +static inline bool try_get_vsie_page(struct vsie_page *vsie_page) +{ + if (test_bit(VSIE_PAGE_IN_USE, &vsie_page->flags)) + return false; + return !test_and_set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + +/* Put a vsie page acquired through get_vsie_page / try_get_vsie_page. */ +static void put_vsie_page(struct vsie_page *vsie_page) +{ + clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + /* * Get or create a vsie page for a scb address. * @@ -1310,16 +1401,21 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) { struct vsie_page *vsie_page; - struct page *page; int nr_vcpus; rcu_read_lock(); - page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); rcu_read_unlock(); - if (page) { - if (page_ref_inc_return(page) == 2) - return page_to_virt(page); - page_ref_dec(page); + if (vsie_page) { + if (try_get_vsie_page(vsie_page)) { + if (vsie_page->scb_gpa == addr) + return vsie_page; + /* + * We raced with someone reusing + putting this vsie + * page before we grabbed it. + */ + put_vsie_page(vsie_page); + } } /* @@ -1330,36 +1426,40 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); - if (!page) { + vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); + if (!vsie_page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); } - page_ref_inc(page); - kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page; kvm->arch.vsie.page_count++; } else { /* reuse an existing entry that belongs to nobody */ while (true) { - page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; - if (page_ref_inc_return(page) == 2) + vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (try_get_vsie_page(vsie_page)) break; - page_ref_dec(page); kvm->arch.vsie.next++; kvm->arch.vsie.next %= nr_vcpus; } - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + if (vsie_page->scb_gpa != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + vsie_page->scb_gpa >> 9); } - page->index = addr; - /* double use of the same address */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { - page_ref_dec(page); + /* Mark it as invalid until it resides in the tree. */ + vsie_page->scb_gpa = ULONG_MAX; + + /* Double use of the same address or allocation failure. */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, + vsie_page)) { + put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; } + vsie_page->scb_gpa = addr; mutex_unlock(&kvm->arch.vsie.mutex); - vsie_page = page_to_virt(page); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); release_gmap_shadow(vsie_page); vsie_page->fault_addr = 0; @@ -1367,14 +1467,6 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) return vsie_page; } -/* put a vsie page acquired via get_vsie_page */ -static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) -{ - struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); - - page_ref_dec(page); -} - int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) { struct vsie_page *vsie_page; @@ -1395,8 +1487,10 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) + kvm_s390_vcpu_sie_inhibited(vcpu)) { + kvm_s390_rewind_psw(vcpu, 4); return 0; + } vsie_page = get_vsie_page(vcpu->kvm, scb_addr); if (IS_ERR(vsie_page)) @@ -1423,7 +1517,7 @@ out_unshadow: out_unpin_scb: unpin_scb(vcpu, vsie_page, scb_addr); out_put: - put_vsie_page(vcpu->kvm, vsie_page); + put_vsie_page(vsie_page); return rc < 0 ? rc : 0; } @@ -1439,18 +1533,18 @@ void kvm_s390_vsie_init(struct kvm *kvm) void kvm_s390_vsie_destroy(struct kvm *kvm) { struct vsie_page *vsie_page; - struct page *page; int i; mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = kvm->arch.vsie.pages[i]; + vsie_page = kvm->arch.vsie.pages[i]; kvm->arch.vsie.pages[i] = NULL; - vsie_page = page_to_virt(page); release_gmap_shadow(vsie_page); /* free the radix tree entry */ - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); - __free_page(page); + if (vsie_page->scb_gpa != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + vsie_page->scb_gpa >> 9); + free_page((unsigned long)vsie_page); } kvm->arch.vsie.page_count = 0; mutex_unlock(&kvm->arch.vsie.mutex); diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 580d2e3265cb..cd35cdbfa871 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,7 +3,9 @@ # Makefile for s390-specific library files.. # -lib-y += delay.o string.o uaccess.o find.o spinlock.o +obj-y += crypto/ +lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o +lib-y += csum-partial.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o @@ -22,4 +24,7 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o -obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o + +obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o +crc32-s390-y := crc32.o crc32le-vx.o crc32be-vx.o diff --git a/arch/s390/lib/crc32-vx.h b/arch/s390/lib/crc32-vx.h new file mode 100644 index 000000000000..652c96e1a822 --- /dev/null +++ b/arch/s390/lib/crc32-vx.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRC32_VX_S390_H +#define _CRC32_VX_S390_H + +#include <linux/types.h> + +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); + +#endif /* _CRC32_VX_S390_H */ diff --git a/arch/s390/lib/crc32.c b/arch/s390/lib/crc32.c new file mode 100644 index 000000000000..3c4b344417c1 --- /dev/null +++ b/arch/s390/lib/crc32.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CRC-32 implemented with the z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#define KMSG_COMPONENT "crc32-vx" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/crc32.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. + */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ + { \ + unsigned long prealign, aligned, remaining; \ + DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } \ + EXPORT_SYMBOL(___fname); + +DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) +DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) +DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) + +u32 crc32_optimizations(void) +{ + if (cpu_has_vx()) { + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + } + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("CRC-32 algorithms using z/Architecture Vector Extension Facility"); +MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/lib/crc32be-vx.c index 6b3d1009c392..fed7c9c70d05 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/lib/crc32be-vx.c @@ -12,20 +12,17 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#include <linux/linkage.h> -#include <asm/nospec-insn.h> -#include <asm/vx-insn.h> +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" /* Vector register range containing CRC-32 constants */ -#define CONST_R1R2 %v9 -#define CONST_R3R4 %v10 -#define CONST_R5 %v11 -#define CONST_R6 %v12 -#define CONST_RU_POLY %v13 -#define CONST_CRC_POLY %v14 - -.data -.align 8 +#define CONST_R1R2 9 +#define CONST_R3R4 10 +#define CONST_R5 11 +#define CONST_R6 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 /* * The CRC-32 constant block contains reduction constants to fold and @@ -48,7 +45,7 @@ * * Note that the constant definitions below are extended in order to compute * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. - * The righmost doubleword can be 0 to prevent contribution to the result or + * The rightmost doubleword can be 0 to prevent contribution to the result or * can be multiplied by 1 to perform an XOR without the need for a separate * VECTOR EXCLUSIVE OR instruction. * @@ -58,104 +55,74 @@ * P'(x) = 0xEDB88320 */ -.Lconstants_CRC_32_BE: - .quad 0x08833794c, 0x0e6228b11 # R1, R2 - .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4 - .quad 0x0f200aa66, 1 << 32 # R5, x32 - .quad 0x0490d678d, 1 # R6, 1 - .quad 0x104d101df, 0 # u - .quad 0x104C11DB7, 0 # P(x) - -.previous - - GEN_BR_THUNK %r14 - -.text -/* - * The CRC-32 function(s) use these calling conventions: - * - * Parameters: - * - * %r2: Initial CRC value, typically ~0; and final CRC (return) value. - * %r3: Input buffer pointer, performance might be improved if the - * buffer is on a doubleword boundary. - * %r4: Length of the buffer, must be 64 bytes or greater. +static unsigned long constants_CRC_32_BE[] = { + 0x08833794c, 0x0e6228b11, /* R1, R2 */ + 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */ + 0x0f200aa66, 1UL << 32, /* R5, x32 */ + 0x0490d678d, 1, /* R6, 1 */ + 0x104d101df, 0, /* u */ + 0x104C11DB7, 0, /* P(x) */ +}; + +/** + * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. * * Register usage: - * - * %r5: CRC-32 constant pool base pointer. * V0: Initial CRC value and intermediate constants and results. * V1..V4: Data for CRC computation. * V5..V8: Next data chunks that are fetched from the input buffer. - * * V9..V14: CRC-32 constants. */ -ENTRY(crc32_be_vgfm_16) +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ /* Load CRC-32 constants */ - larl %r5,.Lconstants_CRC_32_BE - VLM CONST_R1R2,CONST_CRC_POLY,0,%r5 + fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE); + fpu_vzero(0); /* Load the initial CRC value into the leftmost word of V0. */ - VZERO %v0 - VLVGF %v0,%r2,0 + fpu_vlvgf(0, crc, 0); /* Load a 64-byte data chunk and XOR with CRC */ - VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ - VX %v1,%v0,%v1 /* V1 ^= CRC */ - aghi %r3,64 /* BUF = BUF + 64 */ - aghi %r4,-64 /* LEN = LEN - 64 */ - - /* Check remaining buffer size and jump to proper folding method */ - cghi %r4,64 - jl .Lless_than_64bytes - -.Lfold_64bytes_loop: - /* Load the next 64-byte data chunk into V5 to V8 */ - VLM %v5,%v8,0,%r3 + fpu_vlm(1, 4, buf); + fpu_vx(1, 0, 1); + buf += 64; + size -= 64; + + while (size >= 64) { + /* Load the next 64-byte data chunk into V5 to V8 */ + fpu_vlm(5, 8, buf); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the reduction constants in V0. The intermediate result is + * then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + fpu_vgfmag(1, CONST_R1R2, 1, 5); + fpu_vgfmag(2, CONST_R1R2, 2, 6); + fpu_vgfmag(3, CONST_R1R2, 3, 7); + fpu_vgfmag(4, CONST_R1R2, 4, 8); + buf += 64; + size -= 64; + } - /* - * Perform a GF(2) multiplication of the doublewords in V1 with - * the reduction constants in V0. The intermediate result is - * then folded (accumulated) with the next data chunk in V5 and - * stored in V1. Repeat this step for the register contents - * in V2, V3, and V4 respectively. - */ - VGFMAG %v1,CONST_R1R2,%v1,%v5 - VGFMAG %v2,CONST_R1R2,%v2,%v6 - VGFMAG %v3,CONST_R1R2,%v3,%v7 - VGFMAG %v4,CONST_R1R2,%v4,%v8 - - /* Adjust buffer pointer and length for next loop */ - aghi %r3,64 /* BUF = BUF + 64 */ - aghi %r4,-64 /* LEN = LEN - 64 */ - - cghi %r4,64 - jnl .Lfold_64bytes_loop - -.Lless_than_64bytes: /* Fold V1 to V4 into a single 128-bit value in V1 */ - VGFMAG %v1,CONST_R3R4,%v1,%v2 - VGFMAG %v1,CONST_R3R4,%v1,%v3 - VGFMAG %v1,CONST_R3R4,%v1,%v4 - - /* Check whether to continue with 64-bit folding */ - cghi %r4,16 - jl .Lfinal_fold + fpu_vgfmag(1, CONST_R3R4, 1, 2); + fpu_vgfmag(1, CONST_R3R4, 1, 3); + fpu_vgfmag(1, CONST_R3R4, 1, 4); -.Lfold_16bytes_loop: + while (size >= 16) { + fpu_vl(2, buf); + fpu_vgfmag(1, CONST_R3R4, 1, 2); + buf += 16; + size -= 16; + } - VL %v2,0,,%r3 /* Load next data chunk */ - VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */ - - /* Adjust buffer pointer and size for folding next data chunk */ - aghi %r3,16 - aghi %r4,-16 - - /* Process remaining data chunks */ - cghi %r4,16 - jnl .Lfold_16bytes_loop - -.Lfinal_fold: /* * The R5 constant is used to fold a 128-bit value into an 96-bit value * that is XORed with the next 96-bit input data chunk. To use a single @@ -163,7 +130,7 @@ ENTRY(crc32_be_vgfm_16) * form an intermediate 96-bit value (with appended zeros) which is then * XORed with the intermediate reduction result. */ - VGFMG %v1,CONST_R5,%v1 + fpu_vgfmg(1, CONST_R5, 1); /* * Further reduce the remaining 96-bit value to a 64-bit value using a @@ -172,7 +139,7 @@ ENTRY(crc32_be_vgfm_16) * doubleword with R6. The result is a 64-bit value and is subject to * the Barret reduction. */ - VGFMG %v1,CONST_R6,%v1 + fpu_vgfmg(1, CONST_R6, 1); /* * The input values to the Barret reduction are the degree-63 polynomial @@ -193,20 +160,15 @@ ENTRY(crc32_be_vgfm_16) */ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ - VUPLLF %v2,%v1 - VGFMG %v2,CONST_RU_POLY,%v2 + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); /* * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in * V2 and XOR the intermediate result, T2(x), with the value in V1. * The final result is in the rightmost word of V2. */ - VUPLLF %v2,%v2 - VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 - -.Ldone: - VLGVF %r2,%v2,3 - BR_EX %r14 -ENDPROC(crc32_be_vgfm_16) - -.previous + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + return fpu_vlgvf(2, 3); +} diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/lib/crc32le-vx.c index 71caf0f4ec08..2f629f394df7 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/lib/crc32le-vx.c @@ -13,20 +13,17 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#include <linux/linkage.h> -#include <asm/nospec-insn.h> -#include <asm/vx-insn.h> +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" /* Vector register range containing CRC-32 constants */ -#define CONST_PERM_LE2BE %v9 -#define CONST_R2R1 %v10 -#define CONST_R4R3 %v11 -#define CONST_R5 %v12 -#define CONST_RU_POLY %v13 -#define CONST_CRC_POLY %v14 - -.data -.align 8 +#define CONST_PERM_LE2BE 9 +#define CONST_R2R1 10 +#define CONST_R4R3 11 +#define CONST_R5 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 /* * The CRC-32 constant block contains reduction constants to fold and @@ -59,62 +56,43 @@ * P'(x) = 0x82F63B78 */ -.Lconstants_CRC_32_LE: - .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask - .quad 0x1c6e41596, 0x154442bd4 # R2, R1 - .quad 0x0ccaa009e, 0x1751997d0 # R4, R3 - .octa 0x163cd6124 # R5 - .octa 0x1F7011641 # u' - .octa 0x1DB710641 # P'(x) << 1 - -.Lconstants_CRC_32C_LE: - .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask - .quad 0x09e4addf8, 0x740eef02 # R2, R1 - .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3 - .octa 0x0dd45aab8 # R5 - .octa 0x0dea713f1 # u' - .octa 0x105ec76f0 # P'(x) << 1 - -.previous - - GEN_BR_THUNK %r14 - -.text - -/* - * The CRC-32 functions use these calling conventions: - * - * Parameters: - * - * %r2: Initial CRC value, typically ~0; and final CRC (return) value. - * %r3: Input buffer pointer, performance might be improved if the - * buffer is on a doubleword boundary. - * %r4: Length of the buffer, must be 64 bytes or greater. +static unsigned long constants_CRC_32_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x1c6e41596, 0x154442bd4, /* R2, R1 */ + 0x0ccaa009e, 0x1751997d0, /* R4, R3 */ + 0x0, 0x163cd6124, /* R5 */ + 0x0, 0x1f7011641, /* u' */ + 0x0, 0x1db710641 /* P'(x) << 1 */ +}; + +static unsigned long constants_CRC_32C_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x09e4addf8, 0x740eef02, /* R2, R1 */ + 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */ + 0x0, 0x0dd45aab8, /* R5 */ + 0x0, 0x0dea713f1, /* u' */ + 0x0, 0x105ec76f0 /* P'(x) << 1 */ +}; + +/** + * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. + * @constants: CRC-32 constant pool base pointer. * * Register usage: - * - * %r5: CRC-32 constant pool base pointer. - * V0: Initial CRC value and intermediate constants and results. - * V1..V4: Data for CRC computation. - * V5..V8: Next data chunks that are fetched from the input buffer. - * V9: Constant for BE->LE conversion and shift operations - * + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9: Constant for BE->LE conversion and shift operations * V10..V14: CRC-32 constants. */ - -ENTRY(crc32_le_vgfm_16) - larl %r5,.Lconstants_CRC_32_LE - j crc32_le_vgfm_generic -ENDPROC(crc32_le_vgfm_16) - -ENTRY(crc32c_le_vgfm_16) - larl %r5,.Lconstants_CRC_32C_LE - j crc32_le_vgfm_generic -ENDPROC(crc32c_le_vgfm_16) - -ENTRY(crc32_le_vgfm_generic) +static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants) +{ /* Load CRC-32 constants */ - VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 + fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants); /* * Load the initial CRC value. @@ -123,90 +101,73 @@ ENTRY(crc32_le_vgfm_generic) * vector register and is later XORed with the LSB portion * of the loaded input data. */ - VZERO %v0 /* Clear V0 */ - VLVGF %v0,%r2,3 /* Load CRC into rightmost word */ + fpu_vzero(0); /* Clear V0 */ + fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */ /* Load a 64-byte data chunk and XOR with CRC */ - VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ - VPERM %v1,%v1,%v1,CONST_PERM_LE2BE - VPERM %v2,%v2,%v2,CONST_PERM_LE2BE - VPERM %v3,%v3,%v3,CONST_PERM_LE2BE - VPERM %v4,%v4,%v4,CONST_PERM_LE2BE + fpu_vlm(1, 4, buf); + fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); + fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); + + fpu_vx(1, 0, 1); /* V1 ^= CRC */ + buf += 64; + size -= 64; + + while (size >= 64) { + fpu_vlm(5, 8, buf); + fpu_vperm(5, 5, 5, CONST_PERM_LE2BE); + fpu_vperm(6, 6, 6, CONST_PERM_LE2BE); + fpu_vperm(7, 7, 7, CONST_PERM_LE2BE); + fpu_vperm(8, 8, 8, CONST_PERM_LE2BE); + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate + * result is then folded (accumulated) with the next data chunk + * in V5 and stored in V1. Repeat this step for the register + * contents in V2, V3, and V4 respectively. + */ + fpu_vgfmag(1, CONST_R2R1, 1, 5); + fpu_vgfmag(2, CONST_R2R1, 2, 6); + fpu_vgfmag(3, CONST_R2R1, 3, 7); + fpu_vgfmag(4, CONST_R2R1, 4, 8); + buf += 64; + size -= 64; + } - VX %v1,%v0,%v1 /* V1 ^= CRC */ - aghi %r3,64 /* BUF = BUF + 64 */ - aghi %r4,-64 /* LEN = LEN - 64 */ - - cghi %r4,64 - jl .Lless_than_64bytes - -.Lfold_64bytes_loop: - /* Load the next 64-byte data chunk into V5 to V8 */ - VLM %v5,%v8,0,%r3 - VPERM %v5,%v5,%v5,CONST_PERM_LE2BE - VPERM %v6,%v6,%v6,CONST_PERM_LE2BE - VPERM %v7,%v7,%v7,CONST_PERM_LE2BE - VPERM %v8,%v8,%v8,CONST_PERM_LE2BE - - /* - * Perform a GF(2) multiplication of the doublewords in V1 with - * the R1 and R2 reduction constants in V0. The intermediate result - * is then folded (accumulated) with the next data chunk in V5 and - * stored in V1. Repeat this step for the register contents - * in V2, V3, and V4 respectively. - */ - VGFMAG %v1,CONST_R2R1,%v1,%v5 - VGFMAG %v2,CONST_R2R1,%v2,%v6 - VGFMAG %v3,CONST_R2R1,%v3,%v7 - VGFMAG %v4,CONST_R2R1,%v4,%v8 - - aghi %r3,64 /* BUF = BUF + 64 */ - aghi %r4,-64 /* LEN = LEN - 64 */ - - cghi %r4,64 - jnl .Lfold_64bytes_loop - -.Lless_than_64bytes: /* * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 * and R4 and accumulating the next 128-bit chunk until a single 128-bit * value remains. */ - VGFMAG %v1,CONST_R4R3,%v1,%v2 - VGFMAG %v1,CONST_R4R3,%v1,%v3 - VGFMAG %v1,CONST_R4R3,%v1,%v4 - - cghi %r4,16 - jl .Lfinal_fold - -.Lfold_16bytes_loop: - - VL %v2,0,,%r3 /* Load next data chunk */ - VPERM %v2,%v2,%v2,CONST_PERM_LE2BE - VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */ + fpu_vgfmag(1, CONST_R4R3, 1, 2); + fpu_vgfmag(1, CONST_R4R3, 1, 3); + fpu_vgfmag(1, CONST_R4R3, 1, 4); + + while (size >= 16) { + fpu_vl(2, buf); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vgfmag(1, CONST_R4R3, 1, 2); + buf += 16; + size -= 16; + } - aghi %r3,16 - aghi %r4,-16 - - cghi %r4,16 - jnl .Lfold_16bytes_loop - -.Lfinal_fold: /* * Set up a vector register for byte shifts. The shift value must * be loaded in bits 1-4 in byte element 7 of a vector register. * Shift by 8 bytes: 0x40 * Shift by 4 bytes: 0x20 */ - VLEIB %v9,0x40,7 + fpu_vleib(9, 0x40, 7); /* * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes * to move R4 into the rightmost doubleword and set the leftmost * doubleword to 0x1. */ - VSRLB %v0,CONST_R4R3,%v9 - VLEIG %v0,1,0 + fpu_vsrlb(0, CONST_R4R3, 9); + fpu_vleig(0, 1, 0); /* * Compute GF(2) product of V1 and V0. The rightmost doubleword @@ -214,7 +175,7 @@ ENTRY(crc32_le_vgfm_generic) * multiplied by 0x1 and is then XORed with rightmost product. * Implicitly, the intermediate leftmost product becomes padded */ - VGFMG %v1,%v0,%v1 + fpu_vgfmg(1, 0, 1); /* * Now do the final 32-bit fold by multiplying the rightmost word @@ -229,10 +190,10 @@ ENTRY(crc32_le_vgfm_generic) * rightmost doubleword and the leftmost doubleword is zero to ignore * the leftmost product of V1. */ - VLEIB %v9,0x20,7 /* Shift by words */ - VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */ - VUPLLF %v1,%v1 /* Split rightmost doubleword */ - VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */ + fpu_vleib(9, 0x20, 7); /* Shift by words */ + fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */ + fpu_vupllf(1, 1); /* Split rightmost doubleword */ + fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */ /* * Apply a Barret reduction to compute the final 32-bit CRC value. @@ -254,20 +215,26 @@ ENTRY(crc32_le_vgfm_generic) */ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ - VUPLLF %v2,%v1 - VGFMG %v2,CONST_RU_POLY,%v2 + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); /* * Compute the GF(2) product of the CRC polynomial with T1(x) in * V2 and XOR the intermediate result, T2(x), with the value in V1. * The final result is stored in word element 2 of V2. */ - VUPLLF %v2,%v2 - VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + + return fpu_vlgvf(2, 2); +} -.Ldone: - VLGVF %r2,%v2,2 - BR_EX %r14 -ENDPROC(crc32_le_vgfm_generic) +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]); +} -.previous +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]); +} diff --git a/arch/s390/lib/crypto/Kconfig b/arch/s390/lib/crypto/Kconfig new file mode 100644 index 000000000000..e3f855ef4393 --- /dev/null +++ b/arch/s390/lib/crypto/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_S390 + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_SHA256_S390 + tristate + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile new file mode 100644 index 000000000000..5df30f1e7930 --- /dev/null +++ b/arch/s390/lib/crypto/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o +chacha_s390-y := chacha-glue.o chacha-s390.o + +obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o +sha256-s390-y := sha256.o diff --git a/arch/s390/lib/crypto/chacha-glue.c b/arch/s390/lib/crypto/chacha-glue.c new file mode 100644 index 000000000000..f95ba3483bbc --- /dev/null +++ b/arch/s390/lib/crypto/chacha-glue.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha stream cipher (s390 optimized) + * + * Copyright IBM Corp. 2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <crypto/chacha.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <asm/fpu.h> +#include "chacha-s390.h" + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { + chacha_crypt_generic(state, dst, src, bytes, nrounds); + } else { + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / + CHACHA_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return cpu_has_vx(); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/lib/crypto/chacha-s390.S index 9b033622191c..63f3102678c0 100644 --- a/arch/s390/crypto/chacha-s390.S +++ b/arch/s390/lib/crypto/chacha-s390.S @@ -8,32 +8,33 @@ #include <linux/linkage.h> #include <asm/nospec-insn.h> -#include <asm/vx-insn.h> +#include <asm/fpu-insn.h> #define SP %r15 #define FRAME (16 * 8 + 4 * 8) -.data -.align 32 + .data + .balign 32 -.Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral -.long 1,0,0,0 -.long 2,0,0,0 -.long 3,0,0,0 -.long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap -.long 0,1,2,3 -.long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma -.long 0x3320646e,0x3320646e,0x3320646e,0x3320646e -.long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 -.long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) -.previous + .previous GEN_BR_THUNK %r14 -.text + .text ############################################################################# # void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, @@ -78,10 +79,10 @@ #define XT2 %v29 #define XT3 %v30 -ENTRY(chacha20_vx_4x) +SYM_FUNC_START(chacha20_vx_4x) stmg %r6,%r7,6*8(SP) - larl %r7,.Lsigma + larl %r7,sigma lhi %r0,10 lhi %r1,0 @@ -403,7 +404,7 @@ ENTRY(chacha20_vx_4x) lmg %r6,%r7,6*8(SP) BR_EX %r14 -ENDPROC(chacha20_vx_4x) +SYM_FUNC_END(chacha20_vx_4x) #undef OUT #undef INP @@ -471,7 +472,7 @@ ENDPROC(chacha20_vx_4x) #define T2 %v29 #define T3 %v30 -ENTRY(chacha20_vx) +SYM_FUNC_START(chacha20_vx) clgfi LEN,256 jle chacha20_vx_4x stmg %r6,%r7,6*8(SP) @@ -481,7 +482,7 @@ ENTRY(chacha20_vx) la SP,0(%r1,SP) stg %r0,0(SP) # back-chain - larl %r7,.Lsigma + larl %r7,sigma lhi %r0,10 VLM K1,K2,0,KEY,0 # load key @@ -902,6 +903,6 @@ ENTRY(chacha20_vx) lmg %r6,%r7,FRAME+6*8(SP) la SP,FRAME(SP) BR_EX %r14 -ENDPROC(chacha20_vx) +SYM_FUNC_END(chacha20_vx) .previous diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/lib/crypto/chacha-s390.h index 733744ce30f5..733744ce30f5 100644 --- a/arch/s390/crypto/chacha-s390.h +++ b/arch/s390/lib/crypto/chacha-s390.h diff --git a/arch/s390/lib/crypto/sha256.c b/arch/s390/lib/crypto/sha256.c new file mode 100644 index 000000000000..7dfe120fafab --- /dev/null +++ b/arch/s390/lib/crypto/sha256.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <crypto/internal/sha2.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha256)) + cpacf_kimd(CPACF_KIMD_SHA_256, state, data, + nblocks * SHA256_BLOCK_SIZE); + else + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_cpacf_sha256); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_s390_mod_init(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + static_branch_enable(&have_cpacf_sha256); + return 0; +} +subsys_initcall(sha256_s390_mod_init); + +static void __exit sha256_s390_mod_exit(void) +{ +} +module_exit(sha256_s390_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)"); diff --git a/arch/s390/lib/csum-partial.c b/arch/s390/lib/csum-partial.c new file mode 100644 index 000000000000..458abd9bac70 --- /dev/null +++ b/arch/s390/lib/csum-partial.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <asm/checksum.h> +#include <asm/fpu.h> + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit). If copy is true copies to dst. + * + * Returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic. + * + * This function must be called with even lengths, except + * for the last fragment, which may be odd. + * + * It's best to have src and dst aligned on a 64-bit boundary. + */ +static __always_inline __wsum csum_copy(void *dst, const void *src, int len, __wsum sum, bool copy) +{ + DECLARE_KERNEL_FPU_ONSTACK8(vxstate); + + if (!cpu_has_vx()) { + if (copy) + memcpy(dst, src, len); + return cksm(dst, len, sum); + } + kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23); + fpu_vlvgf(16, (__force u32)sum, 1); + fpu_vzero(17); + fpu_vzero(18); + fpu_vzero(19); + while (len >= 64) { + fpu_vlm(20, 23, src); + if (copy) { + fpu_vstm(20, 23, dst); + dst += 64; + } + fpu_vcksm(16, 20, 16); + fpu_vcksm(17, 21, 17); + fpu_vcksm(18, 22, 18); + fpu_vcksm(19, 23, 19); + src += 64; + len -= 64; + } + while (len >= 32) { + fpu_vlm(20, 21, src); + if (copy) { + fpu_vstm(20, 21, dst); + dst += 32; + } + fpu_vcksm(16, 20, 16); + fpu_vcksm(17, 21, 17); + src += 32; + len -= 32; + } + while (len >= 16) { + fpu_vl(20, src); + if (copy) { + fpu_vst(20, dst); + dst += 16; + } + fpu_vcksm(16, 20, 16); + src += 16; + len -= 16; + } + if (len) { + fpu_vll(20, len - 1, src); + if (copy) + fpu_vstl(20, len - 1, dst); + fpu_vcksm(16, 20, 16); + } + fpu_vcksm(18, 19, 18); + fpu_vcksm(16, 17, 16); + fpu_vcksm(16, 18, 16); + sum = (__force __wsum)fpu_vlgvf(16, 1); + kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23); + return sum; +} + +__wsum csum_partial(const void *buff, int len, __wsum sum) +{ + return csum_copy(NULL, buff, len, sum, false); +} +EXPORT_SYMBOL(csum_partial); + +__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) +{ + return csum_copy(dst, src, len, 0, true); +} +EXPORT_SYMBOL(csum_partial_copy_nocheck); diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index f7f5adea8940..be14c58cb989 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -13,13 +13,10 @@ void __delay(unsigned long loops) { - /* - * To end the bloody studid and useless discussion about the - * BogoMips number I took the liberty to define the __delay - * function in a way that that resulting BogoMips number will - * yield the megahertz number of the cpu. The important function - * is udelay and that is done using the tod clock. -- martin. - */ + /* + * Loop 'loops' times. Callers must not assume a specific + * amount of time passes before this function returns. + */ asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } EXPORT_SYMBOL(__delay); diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline.S index 92ed8409a7a4..92ed8409a7a4 100644 --- a/arch/s390/lib/expoline/expoline.S +++ b/arch/s390/lib/expoline.S diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile deleted file mode 100644 index 854631d9cb03..000000000000 --- a/arch/s390/lib/expoline/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-y += expoline.o diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index dc0874f2e203..d026debf250c 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -5,8 +5,8 @@ * Copyright IBM Corp. 2012 */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/nospec-insn.h> GEN_BR_THUNK %r14 @@ -14,8 +14,7 @@ /* * void *memmove(void *dest, const void *src, size_t n) */ -WEAK(memmove) -ENTRY(__memmove) +SYM_FUNC_START(__memmove) ltgr %r4,%r4 lgr %r1,%r2 jz .Lmemmove_exit @@ -35,8 +34,7 @@ ENTRY(__memmove) la %r3,256(%r3) brctg %r0,.Lmemmove_forward_loop .Lmemmove_forward_remainder: - larl %r5,.Lmemmove_mvc - ex %r4,0(%r5) + exrl %r4,.Lmemmove_mvc .Lmemmove_exit: BR_EX %r14 .Lmemmove_reverse: @@ -48,7 +46,10 @@ ENTRY(__memmove) BR_EX %r14 .Lmemmove_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memmove) +SYM_FUNC_END(__memmove) +EXPORT_SYMBOL(__memmove) + +SYM_FUNC_ALIAS(memmove, __memmove) EXPORT_SYMBOL(memmove) /* @@ -66,8 +67,7 @@ EXPORT_SYMBOL(memmove) * return __builtin_memset(s, c, n); * } */ -WEAK(memset) -ENTRY(__memset) +SYM_FUNC_START(__memset) ltgr %r4,%r4 jz .Lmemset_exit ltgr %r3,%r3 @@ -82,8 +82,7 @@ ENTRY(__memset) la %r1,256(%r1) brctg %r3,.Lmemset_clear_loop .Lmemset_clear_remainder: - larl %r3,.Lmemset_xc - ex %r4,0(%r3) + exrl %r4,.Lmemset_xc .Lmemset_exit: BR_EX %r14 .Lmemset_fill: @@ -101,8 +100,7 @@ ENTRY(__memset) brctg %r5,.Lmemset_fill_loop .Lmemset_fill_remainder: stc %r3,0(%r1) - larl %r5,.Lmemset_mvc - ex %r4,0(%r5) + exrl %r4,.Lmemset_mvc BR_EX %r14 .Lmemset_fill_exit: stc %r3,0(%r1) @@ -111,7 +109,10 @@ ENTRY(__memset) xc 0(1,%r1),0(%r1) .Lmemset_mvc: mvc 1(1,%r1),0(%r1) -ENDPROC(__memset) +SYM_FUNC_END(__memset) +EXPORT_SYMBOL(__memset) + +SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) /* @@ -119,8 +120,7 @@ EXPORT_SYMBOL(memset) * * void *memcpy(void *dest, const void *src, size_t n) */ -WEAK(memcpy) -ENTRY(__memcpy) +SYM_FUNC_START(__memcpy) ltgr %r4,%r4 jz .Lmemcpy_exit aghi %r4,-1 @@ -129,8 +129,7 @@ ENTRY(__memcpy) lgr %r1,%r2 jnz .Lmemcpy_loop .Lmemcpy_remainder: - larl %r5,.Lmemcpy_mvc - ex %r4,0(%r5) + exrl %r4,.Lmemcpy_mvc .Lmemcpy_exit: BR_EX %r14 .Lmemcpy_loop: @@ -141,7 +140,10 @@ ENTRY(__memcpy) j .Lmemcpy_remainder .Lmemcpy_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memcpy) +SYM_FUNC_END(__memcpy) +EXPORT_SYMBOL(__memcpy) + +SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) /* @@ -152,7 +154,7 @@ EXPORT_SYMBOL(memcpy) * void *__memset64(uint64_t *s, uint64_t v, size_t count) */ .macro __MEMSET bits,bytes,insn -ENTRY(__memset\bits) +SYM_FUNC_START(__memset\bits) ltgr %r4,%r4 jz .L__memset_exit\bits cghi %r4,\bytes @@ -169,8 +171,7 @@ ENTRY(__memset\bits) brctg %r5,.L__memset_loop\bits .L__memset_remainder\bits: \insn %r3,0(%r1) - larl %r5,.L__memset_mvc\bits - ex %r4,0(%r5) + exrl %r4,.L__memset_mvc\bits BR_EX %r14 .L__memset_store\bits: \insn %r3,0(%r2) @@ -178,7 +179,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) -ENDPROC(__memset\bits) +SYM_FUNC_END(__memset\bits) .endm __MEMSET 16,2,sth diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 04d4c6cf898e..ad9da4038511 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -10,11 +10,14 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/jiffies.h> +#include <linux/sysctl.h> #include <linux/init.h> #include <linux/smp.h> #include <linux/percpu.h> +#include <linux/io.h> #include <asm/alternative.h> -#include <asm/io.h> +#include <asm/machine.h> +#include <asm/asm.h> int spin_retry = -1; @@ -36,6 +39,23 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static const struct ctl_table s390_spin_sysctl_table[] = { + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_spin_sysctls(void) +{ + register_sysctl_init("kernel", s390_spin_sysctl_table); + return 0; +} +arch_initcall(init_s390_spin_sysctls); + struct spin_wait { struct spin_wait *next, *prev; int node_id; @@ -75,25 +95,44 @@ static inline int arch_load_niai4(int *lock) int owner; asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ - " l %0,%1\n" - : "=d" (owner) : "Q" (*lock) : "memory"); + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */ + " l %[owner],%[lock]\n" + : [owner] "=d" (owner) : [lock] "R" (*lock) : "memory"); return owner; } -static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +#ifdef __HAVE_ASM_FLAG_OUTPUTS__ + +static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new) +{ + int cc; + + asm_inline volatile( + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */ + " cs %[old],%[new],%[lock]\n" + : [old] "+d" (old), [lock] "+Q" (*lock), "=@cc" (cc) + : [new] "d" (new) + : "memory"); + return cc == 0; +} + +#else /* __HAVE_ASM_FLAG_OUTPUTS__ */ + +static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new) { int expected = old; asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ - " cs %0,%3,%1\n" - : "=d" (old), "=Q" (*lock) - : "0" (old), "d" (new), "Q" (*lock) + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */ + " cs %[old],%[new],%[lock]\n" + : [old] "+d" (old), [lock] "+Q" (*lock) + : [new] "d" (new) : "cc", "memory"); return expected == old; } +#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */ + static inline struct spin_wait *arch_spin_decode_tail(int lock) { int ix, cpu; @@ -119,16 +158,16 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) struct spin_wait *node, *next; int lockval, ix, node_id, tail_id, old, new, owner, count; - ix = S390_lowcore.spinlock_index++; + ix = get_lowcore()->spinlock_index++; barrier(); - lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + lockval = spinlock_lockval(); /* cpu + 1 */ node = this_cpu_ptr(&spin_wait[ix]); node->prev = node->next = NULL; node_id = node->node_id; /* Enqueue the node for this CPU in the spinlock wait queue */ + old = READ_ONCE(lp->lock); while (1) { - old = READ_ONCE(lp->lock); if ((old & _Q_LOCK_CPU_MASK) == 0 && (old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) { /* @@ -139,7 +178,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) * waiter will get the lock. */ new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval; - if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + if (arch_try_cmpxchg(&lp->lock, &old, new)) /* Got the lock */ goto out; /* lock passing in progress */ @@ -147,7 +186,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) } /* Make the node of this CPU the new tail. */ new = node_id | (old & _Q_LOCK_MASK); - if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + if (arch_try_cmpxchg(&lp->lock, &old, new)) break; } /* Set the 'next' pointer of the tail node in the queue */ @@ -184,7 +223,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) if (!owner) { tail_id = old & _Q_TAIL_MASK; new = ((tail_id != node_id) ? tail_id : 0) | lockval; - if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + if (arch_try_cmpxchg(&lp->lock, &old, new)) /* Got the lock */ break; continue; @@ -192,7 +231,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) if (count-- >= 0) continue; count = spin_retry; - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + if (!machine_is_lpar() || arch_vcpu_is_preempted(owner - 1)) smp_yield_cpu(owner - 1); } @@ -205,14 +244,14 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp) } out: - S390_lowcore.spinlock_index--; + get_lowcore()->spinlock_index--; } static inline void arch_spin_lock_classic(arch_spinlock_t *lp) { int lockval, old, new, owner, count; - lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + lockval = spinlock_lockval(); /* cpu + 1 */ /* Pass the virtual CPU to the lock holder if it is not running */ owner = arch_spin_yield_target(READ_ONCE(lp->lock), NULL); @@ -226,7 +265,7 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) /* Try to get the lock if it is free. */ if (!owner) { new = (old & _Q_TAIL_MASK) | lockval; - if (arch_cmpxchg_niai8(&lp->lock, old, new)) { + if (arch_try_cmpxchg_niai8(&lp->lock, old, new)) { /* Got the lock */ return; } @@ -235,7 +274,7 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) if (count-- >= 0) continue; count = spin_retry; - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + if (!machine_is_lpar() || arch_vcpu_is_preempted(owner - 1)) smp_yield_cpu(owner - 1); } } @@ -251,14 +290,14 @@ EXPORT_SYMBOL(arch_spin_lock_wait); int arch_spin_trylock_retry(arch_spinlock_t *lp) { - int cpu = SPINLOCK_LOCKVAL; + int cpu = spinlock_lockval(); int owner, count; for (count = spin_retry; count > 0; count--) { owner = READ_ONCE(lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_try_cmpxchg(&lp->lock, &owner, cpu)) return 1; } } @@ -300,7 +339,7 @@ void arch_write_lock_wait(arch_rwlock_t *rw) while (1) { old = READ_ONCE(rw->cnts); if ((old & 0x1ffff) == 0 && - __atomic_cmpxchg_bool(&rw->cnts, old, old | 0x10000)) + arch_try_cmpxchg(&rw->cnts, &old, old | 0x10000)) /* Got the lock */ break; barrier(); @@ -317,7 +356,7 @@ void arch_spin_relax(arch_spinlock_t *lp) cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK; if (!cpu) return; - if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) + if (machine_is_lpar() && !arch_vcpu_is_preempted(cpu - 1)) return; smp_yield_cpu(cpu - 1); } diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 7d8741818239..099de76e8b1a 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -15,6 +15,7 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/export.h> +#include <asm/asm.h> /* * Helper functions to find the end of a string @@ -77,50 +78,6 @@ EXPORT_SYMBOL(strnlen); #endif /** - * strcpy - Copy a %NUL terminated string - * @dest: Where to copy the string to - * @src: Where to copy the string from - * - * returns a pointer to @dest - */ -#ifdef __HAVE_ARCH_STRCPY -char *strcpy(char *dest, const char *src) -{ - char *ret = dest; - - asm volatile( - " lghi 0,0\n" - "0: mvst %[dest],%[src]\n" - " jo 0b\n" - : [dest] "+&a" (dest), [src] "+&a" (src) - : - : "cc", "memory", "0"); - return ret; -} -EXPORT_SYMBOL(strcpy); -#endif - -/** - * strncpy - Copy a length-limited, %NUL-terminated string - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @n: The maximum number of bytes to copy - * - * The result is not %NUL-terminated if the source exceeds - * @n bytes. - */ -#ifdef __HAVE_ARCH_STRNCPY -char *strncpy(char *dest, const char *src, size_t n) -{ - size_t len = __strnend(src, n) - src; - memset(dest + len, 0, n - len); - memcpy(dest, src, len); - return dest; -} -EXPORT_SYMBOL(strncpy); -#endif - -/** * strcat - Append one %NUL-terminated string to another * @dest: The string to be appended to * @src: The string to append to it @@ -180,9 +137,6 @@ EXPORT_SYMBOL(strlcat); * @n: The maximum numbers of bytes to copy * * returns a pointer to @dest - * - * Note that in contrast to strncpy, strncat ensures the result is - * terminated. */ #ifdef __HAVE_ARCH_STRNCAT char *strncat(char *dest, const char *src, size_t n) @@ -238,12 +192,11 @@ static inline int clcle(const char *s1, unsigned long l1, asm volatile( "0: clcle %[r1],%[r3],0\n" " jo 0b\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [r1] "+d" (r1.pair), [r3] "+d" (r3.pair) : - : "cc", "memory"); - return cc; + : CC_CLOBBER_LIST("memory")); + return CC_TRANSFORM(cc); } /** diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c index 9e62d62812e5..9021298c3e8a 100644 --- a/arch/s390/lib/test_kprobes.c +++ b/arch/s390/lib/test_kprobes.c @@ -72,4 +72,5 @@ static struct kunit_suite kprobes_test_suite = { kunit_test_suites(&kprobes_test_suite); +MODULE_DESCRIPTION("KUnit tests for kprobes"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c index 9894009fc1f2..f96b6a3737e7 100644 --- a/arch/s390/lib/test_modules.c +++ b/arch/s390/lib/test_modules.c @@ -29,4 +29,5 @@ static struct kunit_suite modules_test_suite = { kunit_test_suites(&modules_test_suite); +MODULE_DESCRIPTION("KUnit test that modules with many relocations are loaded properly"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index 5a053b393d5c..6e42100875e7 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -47,7 +47,7 @@ static void print_backtrace(char *bt) static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, unsigned long sp) { - int frame_count, prev_is_func2, seen_func2_func1, seen_kretprobe_trampoline; + int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline; const int max_frames = 128; struct unwind_state state; size_t bt_pos = 0; @@ -63,7 +63,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, frame_count = 0; prev_is_func2 = 0; seen_func2_func1 = 0; - seen_kretprobe_trampoline = 0; + seen_arch_rethook_trampoline = 0; unwind_for_each_frame(&state, task, regs, sp) { unsigned long addr = unwind_get_return_address(&state); char sym[KSYM_SYMBOL_LEN]; @@ -89,8 +89,8 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) seen_func2_func1 = 1; prev_is_func2 = str_has_prefix(sym, "unwindme_func2"); - if (str_has_prefix(sym, "__kretprobe_trampoline+0x0/")) - seen_kretprobe_trampoline = 1; + if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/")) + seen_arch_rethook_trampoline = 1; } /* Check the results. */ @@ -106,8 +106,8 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } - if (seen_kretprobe_trampoline) { - kunit_err(current_test, "__kretprobe_trampoline+0x0 in unwinding results\n"); + if (seen_arch_rethook_trampoline) { + kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n"); ret = -EINVAL; } if (ret || force_bt) @@ -270,9 +270,9 @@ static void notrace __used test_unwind_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2]; + struct unwindme *u = (struct unwindme *)arch_ftrace_regs(fregs)->regs.gprs[2]; - u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL, + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &arch_ftrace_regs(fregs)->regs : NULL, (u->flags & UWM_SP) ? u->sp : 0); } @@ -350,15 +350,15 @@ static noinline int unwindme_func3(struct unwindme *u) /* This function must appear in the backtrace. */ static noinline int unwindme_func2(struct unwindme *u) { - unsigned long flags; + unsigned long flags, mflags; int rc; if (u->flags & UWM_SWITCH_STACK) { local_irq_save(flags); - local_mcck_disable(); - rc = call_on_stack(1, S390_lowcore.nodat_stack, + local_mcck_save(mflags); + rc = call_on_stack(1, get_lowcore()->nodat_stack, int, unwindme_func3, struct unwindme *, u); - local_mcck_enable(); + local_mcck_restore(mflags); local_irq_restore(flags); return rc; } else { @@ -519,4 +519,5 @@ static struct kunit_suite test_unwind_suite = { kunit_test_suites(&test_unwind_suite); +MODULE_DESCRIPTION("KUnit test for unwind_for_each_frame"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S new file mode 100644 index 000000000000..96214f51f49b --- /dev/null +++ b/arch/s390/lib/tishift.S @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> +#include <linux/linkage.h> +#include <asm/nospec-insn.h> + + .section .noinstr.text, "ax" + + GEN_BR_THUNK %r14 + +SYM_FUNC_START(__ashlti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + srlg %r3,%r1,0(%r3) + sllg %r0,%r0,0(%r4) + sllg %r1,%r1,0(%r4) + ogr %r0,%r3 + j 1f +0: sllg %r0,%r1,-64(%r4) + lghi %r1,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) + +SYM_FUNC_START(__ashrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srag %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srag %r1,%r0,-64(%r4) + srag %r0,%r0,63 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__lshrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srlg %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srlg %r1,%r0,-64(%r4) + lghi %r0,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index d7b3b193d108..fa7d98fa1320 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -12,67 +12,79 @@ #include <linux/export.h> #include <linux/mm.h> #include <asm/asm-extable.h> +#include <asm/ctlreg.h> #ifdef CONFIG_DEBUG_ENTRY void debug_user_asce(int exit) { - unsigned long cr1, cr7; + struct lowcore *lc = get_lowcore(); + struct ctlreg cr1, cr7; - __ctl_store(cr1, 1, 1); - __ctl_store(cr7, 7, 7); - if (cr1 == S390_lowcore.kernel_asce && cr7 == S390_lowcore.user_asce) + local_ctl_store(1, &cr1); + local_ctl_store(7, &cr7); + if (cr1.val == lc->user_asce.val && cr7.val == lc->user_asce.val) return; panic("incorrect ASCE on kernel %s\n" "cr1: %016lx cr7: %016lx\n" - "kernel: %016llx user: %016llx\n", - exit ? "exit" : "entry", cr1, cr7, - S390_lowcore.kernel_asce, S390_lowcore.user_asce); - + "kernel: %016lx user: %016lx\n", + exit ? "exit" : "entry", cr1.val, cr7.val, + lc->kernel_asce.val, lc->user_asce.val); } #endif /*CONFIG_DEBUG_ENTRY */ -static unsigned long raw_copy_from_user_key(void *to, const void __user *from, - unsigned long size, unsigned long key) +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) { - unsigned long tmp1, tmp2; + unsigned long osize; union oac spec = { .oac2.key = key, .oac2.as = PSW_BITS_AS_SECONDARY, .oac2.k = 1, .oac2.a = 1, }; + int cc; - tmp1 = -4096UL; - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%2),0(%1),%0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: mvcos 0(%2),0(%1),%4\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (from), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} - -unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - return raw_copy_from_user_key(to, from, n, 0); + while (1) { + osize = size; + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_FROM(0b, 0b) + EX_TABLE_UA_MVCOS_FROM(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) + : [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (CC_TRANSFORM(cc) == 0) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } } -EXPORT_SYMBOL(raw_copy_from_user); unsigned long _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) @@ -81,8 +93,9 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, might_fault(); if (!should_fail_usercopy()) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); @@ -90,48 +103,37 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, } EXPORT_SYMBOL(_copy_from_user_key); -static unsigned long raw_copy_to_user_key(void __user *to, const void *from, - unsigned long size, unsigned long key) +static uaccess_kmsan_or_inline __must_check unsigned long +raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) { - unsigned long tmp1, tmp2; + unsigned long osize; union oac spec = { .oac1.key = key, .oac1.as = PSW_BITS_AS_SECONDARY, .oac1.k = 1, .oac1.a = 1, }; + int cc; - tmp1 = -4096UL; - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%1),0(%2),%0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: mvcos 0(%1),0(%2),%4\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} - -unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - return raw_copy_to_user_key(to, from, n, 0); + while (1) { + osize = size; + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: nopr %%r7\n" + CC_IPM(cc) + EX_TABLE_UA_MVCOS_TO(0b, 0b) + EX_TABLE_UA_MVCOS_TO(1b, 0b) + : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) + : CC_CLOBBER_LIST("memory", "0")); + if (CC_TRANSFORM(cc) == 0) + return osize - size; + size -= 4096; + to += 4096; + from += 4096; + } } -EXPORT_SYMBOL(raw_copy_to_user); unsigned long _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) @@ -143,37 +145,3 @@ unsigned long _copy_to_user_key(void __user *to, const void *from, return raw_copy_to_user_key(to, from, n, key); } EXPORT_SYMBOL(_copy_to_user_key); - -unsigned long __clear_user(void __user *to, unsigned long size) -{ - unsigned long tmp1, tmp2; - union oac spec = { - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.a = 1, - }; - - tmp1 = -4096UL; - asm volatile( - " lr 0,%[spec]\n" - "0: mvcos 0(%1),0(%4),%0\n" - " jz 4f\n" - "1: algr %0,%2\n" - " slgr %1,%2\n" - " j 0b\n" - "2: la %3,4095(%1)\n"/* %4 = to + 4095 */ - " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */ - " slgr %3,%1\n" - " clgr %0,%3\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: mvcos 0(%1),0(%4),%3\n" - " slgr %0,%3\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), [spec] "d" (spec.val) - : "cc", "memory", "0"); - return size; -} -EXPORT_SYMBOL(__clear_user); diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c index fb924a8041dc..ce7bcf7c0032 100644 --- a/arch/s390/lib/xor.c +++ b/arch/s390/lib/xor.c @@ -15,7 +15,6 @@ static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) { asm volatile( - " larl 1,2f\n" " aghi %0,-1\n" " jm 3f\n" " srlg 0,%0,8\n" @@ -25,12 +24,12 @@ static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, " la %1,256(%1)\n" " la %2,256(%2)\n" " brctg 0,0b\n" - "1: ex %0,0(1)\n" + "1: exrl %0,2f\n" " j 3f\n" "2: xc 0(1,%1),0(%2)\n" "3:\n" : : "d" (bytes), "a" (p1), "a" (p2) - : "0", "1", "cc", "memory"); + : "0", "cc", "memory"); } static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, @@ -38,9 +37,8 @@ static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p3) { asm volatile( - " larl 1,2f\n" " aghi %0,-1\n" - " jm 3f\n" + " jm 4f\n" " srlg 0,%0,8\n" " ltgr 0,0\n" " jz 1f\n" @@ -50,14 +48,14 @@ static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, " la %2,256(%2)\n" " la %3,256(%3)\n" " brctg 0,0b\n" - "1: ex %0,0(1)\n" - " ex %0,6(1)\n" - " j 3f\n" + "1: exrl %0,2f\n" + " exrl %0,3f\n" + " j 4f\n" "2: xc 0(1,%1),0(%2)\n" - " xc 0(1,%1),0(%3)\n" - "3:\n" + "3: xc 0(1,%1),0(%3)\n" + "4:\n" : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3) - : : "0", "1", "cc", "memory"); + : : "0", "cc", "memory"); } static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, @@ -66,9 +64,8 @@ static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p4) { asm volatile( - " larl 1,2f\n" " aghi %0,-1\n" - " jm 3f\n" + " jm 5f\n" " srlg 0,%0,8\n" " ltgr 0,0\n" " jz 1f\n" @@ -80,16 +77,16 @@ static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, " la %3,256(%3)\n" " la %4,256(%4)\n" " brctg 0,0b\n" - "1: ex %0,0(1)\n" - " ex %0,6(1)\n" - " ex %0,12(1)\n" - " j 3f\n" + "1: exrl %0,2f\n" + " exrl %0,3f\n" + " exrl %0,4f\n" + " j 5f\n" "2: xc 0(1,%1),0(%2)\n" - " xc 0(1,%1),0(%3)\n" - " xc 0(1,%1),0(%4)\n" - "3:\n" + "3: xc 0(1,%1),0(%3)\n" + "4: xc 0(1,%1),0(%4)\n" + "5:\n" : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4) - : : "0", "1", "cc", "memory"); + : : "0", "cc", "memory"); } static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, @@ -101,7 +98,7 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, asm volatile( " larl 1,2f\n" " aghi %0,-1\n" - " jm 3f\n" + " jm 6f\n" " srlg 0,%0,8\n" " ltgr 0,0\n" " jz 1f\n" @@ -115,19 +112,19 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, " la %4,256(%4)\n" " la %5,256(%5)\n" " brctg 0,0b\n" - "1: ex %0,0(1)\n" - " ex %0,6(1)\n" - " ex %0,12(1)\n" - " ex %0,18(1)\n" - " j 3f\n" + "1: exrl %0,2f\n" + " exrl %0,3f\n" + " exrl %0,4f\n" + " exrl %0,5f\n" + " j 6f\n" "2: xc 0(1,%1),0(%2)\n" - " xc 0(1,%1),0(%3)\n" - " xc 0(1,%1),0(%4)\n" - " xc 0(1,%1),0(%5)\n" - "3:\n" + "3: xc 0(1,%1),0(%3)\n" + "4: xc 0(1,%1),0(%4)\n" + "5: xc 0(1,%1),0(%5)\n" + "6:\n" : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4), "+a" (p5) - : : "0", "1", "cc", "memory"); + : : "0", "cc", "memory"); } struct xor_block_template xor_block_xc = { diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 57e4f3a24829..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -7,9 +7,10 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9141ed4c52e9..e2a6eb92420f 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,16 +90,17 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(virt_to_pfn(addr), 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); nr--; + cond_resched(); } return nr; } -static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +static long __cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) { struct cmm_page_array *pa; unsigned long addr; @@ -123,6 +124,21 @@ static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) return nr; } +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + long inc = 0; + + while (nr) { + inc = min(256L, nr); + nr -= inc; + inc = __cmm_free_pages(inc, counter, list); + if (inc) + break; + cond_resched(); + } + return nr + inc; +} + static int cmm_oom_notify(struct notifier_block *self, unsigned long dummy, void *parm) { @@ -185,10 +201,10 @@ static void cmm_set_timer(void) { if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { if (timer_pending(&cmm_timer)) - del_timer(&cmm_timer); + timer_delete(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } static void cmm_timer_fn(struct timer_list *unused) @@ -243,7 +259,7 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static int cmm_pages_handler(struct ctl_table *ctl, int write, +static int cmm_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); @@ -262,7 +278,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, +static int cmm_timed_pages_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -282,7 +298,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, return 0; } -static int cmm_timeout_handler(struct ctl_table *ctl, int write, +static int cmm_timeout_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; @@ -316,7 +332,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, @@ -332,17 +348,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -389,7 +394,7 @@ static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV @@ -419,7 +424,7 @@ out_smsg: #endif unregister_sysctl_table(cmm_sysctl_header); out_sysctl: - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); return rc; } module_init(cmm_init); @@ -432,10 +437,11 @@ static void __exit cmm_exit(void) #endif unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); - del_timer_sync(&cmm_timer); + timer_delete_sync(&cmm_timer); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_exit(cmm_exit); +MODULE_DESCRIPTION("Cooperative memory management interface"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9f9af5298dd6..ac604b176660 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,69 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpufeature.h> #include <linux/set_memory.h> #include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sort.h> #include <linux/mm.h> #include <linux/kfence.h> #include <linux/kasan.h> -#include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> #include <asm/nospec-branch.h> #include <asm/sections.h> +#include <asm/maccess.h> static unsigned long max_addr; struct addr_marker { + int is_start; unsigned long start_address; + unsigned long size; const char *name; }; -enum address_markers_idx { - IDENTITY_BEFORE_NR = 0, - IDENTITY_BEFORE_END_NR, - KERNEL_START_NR, - KERNEL_END_NR, -#ifdef CONFIG_KFENCE - KFENCE_START_NR, - KFENCE_END_NR, -#endif - IDENTITY_AFTER_NR, - IDENTITY_AFTER_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif - VMEMMAP_NR, - VMEMMAP_END_NR, - VMALLOC_NR, - VMALLOC_END_NR, - MODULES_NR, - MODULES_END_NR, -}; - -static struct addr_marker address_markers[] = { - [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, - [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, - [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, -#ifdef CONFIG_KFENCE - [KFENCE_START_NR] = {0, "KFence Pool Start"}, - [KFENCE_END_NR] = {0, "KFence Pool End"}, -#endif - [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, - [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, -#ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, -#endif - [VMEMMAP_NR] = {0, "vmemmap Area Start"}, - [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, - [VMALLOC_NR] = {0, "vmalloc Area Start"}, - [VMALLOC_END_NR] = {0, "vmalloc Area End"}, - [MODULES_NR] = {0, "Modules Area Start"}, - [MODULES_END_NR] = {0, "Modules Area End"}, - { -1, NULL } -}; +static struct addr_marker *markers; +static unsigned int markers_cnt; struct pg_state { struct ptdump_state ptdump; @@ -108,7 +70,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -123,12 +84,26 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) * in which case we have two lpswe instructions in lowcore that need * to be executable. */ - if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear())) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, unsigned int prot, int level) +{ + struct seq_file *m = st->seq; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s %s ]---\n", st->marker->name, + st->marker->is_start ? "Start" : "End"); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -153,10 +128,8 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, if (level == -1) addr = max_addr; if (st->level == -1) { - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - st->start_address = addr; - st->current_prot = prot; - st->level = level; + pt_dump_seq_puts(m, "---[ Kernel Virtual Address Space ]---\n"); + note_page_update_state(st, addr, prot, level); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { note_prot_wx(st, addr); @@ -170,22 +143,52 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (addr >= st->marker[1].start_address) { - st->marker++; - pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); - } - st->start_address = addr; - st->current_prot = prot; - st->level = level; + note_page_update_state(st, addr, prot, level); } } -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void) +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -203,24 +206,33 @@ void ptdump_check_wx(void) }, }; - if (!MACHINE_HAS_NX) - return; + if (!cpu_has_nx()) + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", - (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + (nospec_uses_trampoline() || !cpu_has_bear()) ? "unexpected " : ""); + + return true; + } } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .ptdump = { - .note_page = note_page, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, .range = (struct ptdump_range[]) { {.start = 0, .end = max_addr}, {.start = 0, .end = 0}, @@ -232,7 +244,7 @@ static int ptdump_show(struct seq_file *m, void *v) .check_wx = false, .wx_pages = 0, .start_address = 0, - .marker = address_markers, + .marker = markers, }; get_online_mems(); @@ -245,22 +257,66 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); #endif /* CONFIG_PTDUMP_DEBUGFS */ -/* - * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple - * insertion sort to preserve the original order of markers with the same - * start address. - */ -static void sort_address_markers(void) +static int ptdump_cmp(const void *a, const void *b) { - struct addr_marker tmp; - int i, j; - - for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { - tmp = address_markers[i]; - for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) - address_markers[j + 1] = address_markers[j]; - address_markers[j + 1] = tmp; + const struct addr_marker *ama = a; + const struct addr_marker *amb = b; + + if (ama->start_address > amb->start_address) + return 1; + if (ama->start_address < amb->start_address) + return -1; + /* + * If the start addresses of two markers are identical sort markers in an + * order that considers areas contained within other areas correctly. + */ + if (ama->is_start && amb->is_start) { + if (ama->size > amb->size) + return -1; + if (ama->size < amb->size) + return 1; + return 0; } + if (!ama->is_start && !amb->is_start) { + if (ama->size > amb->size) + return 1; + if (ama->size < amb->size) + return -1; + return 0; + } + if (ama->is_start) + return 1; + if (amb->is_start) + return -1; + return 0; +} + +static int add_marker(unsigned long start, unsigned long end, const char *name) +{ + size_t oldsize, newsize; + + oldsize = markers_cnt * sizeof(*markers); + newsize = oldsize + 2 * sizeof(*markers); + if (!oldsize) + markers = kvmalloc(newsize, GFP_KERNEL); + else + markers = kvrealloc(markers, newsize, GFP_KERNEL); + if (!markers) + goto error; + markers[markers_cnt].is_start = 1; + markers[markers_cnt].start_address = start; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + markers[markers_cnt].is_start = 0; + markers[markers_cnt].start_address = end; + markers[markers_cnt].size = end - start; + markers[markers_cnt].name = name; + markers_cnt++; + return 0; +error: + markers_cnt = 0; + return -ENOMEM; } static int pt_dump_init(void) @@ -268,28 +324,48 @@ static int pt_dump_init(void) #ifdef CONFIG_KFENCE unsigned long kfence_start = (unsigned long)__kfence_pool; #endif + unsigned long lowcore = (unsigned long)get_lowcore(); + int rc; + /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. */ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (get_lowcore()->kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); - address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; - address_markers[MODULES_NR].start_address = MODULES_VADDR; - address_markers[MODULES_END_NR].start_address = MODULES_END; - address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; - address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; - address_markers[VMALLOC_NR].start_address = VMALLOC_START; - address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + /* start + end markers - must be added first */ + rc = add_marker(0, -1UL, NULL); + rc |= add_marker((unsigned long)_stext, (unsigned long)_end, "Kernel Image"); + rc |= add_marker(lowcore, lowcore + sizeof(struct lowcore), "Lowcore"); + rc |= add_marker(__identity_base, __identity_base + ident_map_size, "Identity Mapping"); + rc |= add_marker((unsigned long)__samode31, (unsigned long)__eamode31, "Amode31 Area"); + rc |= add_marker(MODULES_VADDR, MODULES_END, "Modules Area"); + rc |= add_marker(__abs_lowcore, __abs_lowcore + ABS_LOWCORE_MAP_SIZE, "Lowcore Area"); + rc |= add_marker(__memcpy_real_area, __memcpy_real_area + MEMCPY_REAL_SIZE, "Real Memory Copy Area"); + rc |= add_marker((unsigned long)vmemmap, (unsigned long)vmemmap + vmemmap_size, "vmemmap Area"); + rc |= add_marker(VMALLOC_START, VMALLOC_END, "vmalloc Area"); #ifdef CONFIG_KFENCE - address_markers[KFENCE_START_NR].start_address = kfence_start; - address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; + rc |= add_marker(kfence_start, kfence_start + KFENCE_POOL_SIZE, "KFence Pool"); +#endif +#ifdef CONFIG_KMSAN + rc |= add_marker(KMSAN_VMALLOC_SHADOW_START, KMSAN_VMALLOC_SHADOW_END, "Kmsan vmalloc Shadow"); + rc |= add_marker(KMSAN_VMALLOC_ORIGIN_START, KMSAN_VMALLOC_ORIGIN_END, "Kmsan vmalloc Origins"); + rc |= add_marker(KMSAN_MODULES_SHADOW_START, KMSAN_MODULES_SHADOW_END, "Kmsan Modules Shadow"); + rc |= add_marker(KMSAN_MODULES_ORIGIN_START, KMSAN_MODULES_ORIGIN_END, "Kmsan Modules Origins"); +#endif +#ifdef CONFIG_KASAN + rc |= add_marker(KASAN_SHADOW_START, KASAN_SHADOW_END, "Kasan Shadow"); #endif - sort_address_markers(); + if (rc) + goto error; + sort(&markers[1], markers_cnt - 1, sizeof(*markers), ptdump_cmp, NULL); #ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); #endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; +error: + kvfree(markers); + return -ENOMEM; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 1e4d2187541a..7498e858c401 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -7,6 +7,7 @@ #include <linux/panic.h> #include <asm/asm-extable.h> #include <asm/extable.h> +#include <asm/fpu.h> const struct exception_table_entry *s390_search_extables(unsigned long addr) { @@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r return true; } -static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs) { unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); @@ -35,26 +36,83 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p return true; } -static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { - unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); - size_t len = FIELD_GET(EX_DATA_LEN, ex->data); regs->gprs[reg_err] = -EFAULT; - memset((void *)regs->gprs[reg_addr], 0, len); + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; regs->psw.addr = extable_fixup(ex); return true; } -static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); - unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; - regs->gprs[reg_err] = -EFAULT; - regs->gprs[reg_zero] = 0; + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + fpu_sfpc(0); + regs->psw.addr = extable_fixup(ex); + return true; +} + +struct insn_ssf { + u64 opc1 : 8; + u64 r3 : 4; + u64 opc2 : 4; + u64 b1 : 4; + u64 d1 : 12; + u64 b2 : 4; + u64 d2 : 12; +} __packed; + +static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex, + bool from, struct pt_regs *regs) +{ + unsigned long uaddr, remainder; + struct insn_ssf *insn; + + /* + * If the faulting user space access crossed a page boundary retry by + * limiting the access to the first page (adjust length accordingly). + * Then the mvcos instruction will either complete with condition code + * zero, or generate another fault where the user space access did not + * cross a page boundary. + * If the faulting user space access did not cross a page boundary set + * length to zero and retry. In this case no user space access will + * happen, and the mvcos instruction will complete with condition code + * zero. + * In both cases the instruction will complete with condition code + * zero (copying finished), and the register which contains the + * length, indicates the number of bytes copied. + */ regs->psw.addr = extable_fixup(ex); + insn = (struct insn_ssf *)regs->psw.addr; + if (from) + uaddr = regs->gprs[insn->b2] + insn->d2; + else + uaddr = regs->gprs[insn->b1] + insn->d1; + remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1)); + if (regs->gprs[insn->r3] <= remainder) + remainder = 0; + regs->gprs[insn->r3] = remainder; return true; } @@ -70,12 +128,20 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_fixup(ex, regs); case EX_TYPE_BPF: return ex_handler_bpf(ex, regs); - case EX_TYPE_UA_STORE: - return ex_handler_ua_store(ex, regs); - case EX_TYPE_UA_LOAD_MEM: - return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_FAULT: + return ex_handler_ua_fault(ex, regs); case EX_TYPE_UA_LOAD_REG: - return ex_handler_ua_load_reg(ex, regs); + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + case EX_TYPE_FPC: + return ex_handler_fpc(ex, regs); + case EX_TYPE_UA_MVCOS_TO: + return ex_handler_ua_mvcos(ex, false, regs); + case EX_TYPE_UA_MVCOS_FROM: + return ex_handler_ua_mvcos(ex, true, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 5060956b8e7d..f7da53e212f5 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -21,6 +21,7 @@ #include <linux/ioport.h> #include <linux/refcount.h> #include <linux/pgtable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/page.h> #include <asm/ebcdic.h> @@ -28,6 +29,7 @@ #include <asm/extmem.h> #include <asm/cpcmd.h> #include <asm/setup.h> +#include <asm/asm.h> #define DCSS_PURGESEG 0x08 #define DCSS_LOADSHRX 0x20 @@ -134,20 +136,21 @@ dcss_diag(int *func, void *parameter, unsigned long *ret1, unsigned long *ret2) { unsigned long rx, ry; - int rc; + int cc; - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; diag_stat_inc(DIAG_STAT_X064); asm volatile( - " diag %0,%1,0x64\n" - " ipm %2\n" - " srl %2,28\n" - : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + " diag %[rx],%[ry],0x64\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry) + : + : CC_CLOBBER); *ret1 = rx; *ret2 = ry; - return rc; + return CC_TRANSFORM(cc); } static inline int @@ -178,7 +181,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -253,7 +256,7 @@ segment_type (char* name) int rc; struct dcss_segment seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; dcss_mkname(name, seg.dcss_name); @@ -289,15 +292,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -326,9 +331,9 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; /* Check for overlapping resources before adding the mapping. */ @@ -386,7 +391,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -414,7 +419,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr, struct dcss_segment *seg; int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -ENOSYS; mutex_lock(&dcss_lock); @@ -525,6 +530,14 @@ segment_modify_shared (char *name, int do_nonshared) return rc; } +static void __dcss_diag_purge_on_cpu_0(void *data) +{ + struct dcss_segment *seg = (struct dcss_segment *)data; + unsigned long dummy; + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); +} + /* * Decrease the use count of a DCSS segment and remove * it from the address space if nobody is using it @@ -533,10 +546,9 @@ segment_modify_shared (char *name, int do_nonshared) void segment_unload(char *name) { - unsigned long dummy; struct dcss_segment *seg; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -551,7 +563,14 @@ segment_unload(char *name) kfree(seg->res); vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); list_del(&seg->list); - dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + /* + * Workaround for z/VM issue, where calling the DCSS unload diag on + * a non-IPL CPU would cause bogus sclp maximum memory detection on + * next IPL. + * IPL CPU 0 cannot be set offline, so the dcss_diag() call can + * directly be scheduled to that CPU. + */ + smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1); kfree(seg); out_unlock: mutex_unlock(&dcss_lock); @@ -568,7 +587,7 @@ segment_save(char *name) char cmd2[80]; int i, response; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return; mutex_lock(&dcss_lock); @@ -638,10 +657,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e173b6187ad5..e1ad05bfd28a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,13 +3,15 @@ * S390 version * Copyright IBM Corp. 1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> @@ -32,123 +34,93 @@ #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/kfence.h> +#include <linux/pagewalk.h> #include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> -#include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> #include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) -#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) -#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) -#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) -#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) - -enum fault_type { - KERNEL_FAULT, - USER_FAULT, - GMAP_FAULT, -}; - -static unsigned long store_indication __read_mostly; - -static int __init fault_init(void) -{ - if (test_facility(75)) - store_indication = 0xc00; - return 0; -} -early_initcall(fault_init); - /* * Find out which address space caused the exception. */ -static enum fault_type get_fault_type(struct pt_regs *regs) +static bool is_kernel_fault(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ - if (user_mode(regs)) - return USER_FAULT; - if (!IS_ENABLED(CONFIG_PGSTE)) - return KERNEL_FAULT; - if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) - return USER_FAULT; - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ - return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ - return KERNEL_FAULT; + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return get_kernel_nofault(dummy, (unsigned long *)p); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (test_facility(75)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: pr_cont("\n"); return; @@ -158,163 +130,118 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + get_fault_address(regs), teid.val); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); - switch (get_fault_type(regs)) { - case USER_FAULT: - asce = S390_lowcore.user_asce; - pr_cont("user "); - break; - case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; - pr_cont("gmap "); - break; - case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; pr_cont("kernel "); - break; - default: - unreachable(); + } else { + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) +{ + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; +} +arch_initcall(init_s390_fault_sysctls); + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { + unsigned long address; + bool is_write; + + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } if (fixup_exception(regs)) return; - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); + } dump_fault_info(regs); die(regs, "Oops"); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_sigbus(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); -} - -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - fallthrough; - case VM_FAULT_BADCONTEXT: - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -323,161 +250,116 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suppression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; vm_fault_t fault; bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ clear_thread_flag(TIF_PER_TRAP); - if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - address = trans_exc_code & __FAIL_ADDR_MASK; - is_write = (trans_exc_code & store_indication) == 0x400; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. - */ - fault = VM_FAULT_BADCONTEXT; - type = get_fault_type(regs); - switch (type) { - case KERNEL_FAULT: - if (kfence_handle_page_fault(address, is_write, regs)) - return 0; - goto out; - case USER_FAULT: - case GMAP_FAULT: - if (faulthandler_disabled() || !mm) - goto out; - break; - } - + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || is_write) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - mmap_read_lock(mm); - - gmap = NULL; - if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; - current->thread.gmap_addr = address; - current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); - current->thread.gmap_int_code = regs->int_code & 0xffff; - address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (gmap->pfault_enabled) - flags |= FAULT_FLAG_RETRY_NOWAIT; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ + return handle_fault_error(regs, SEGV_ACCERR); fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { - fault = VM_FAULT_SIGNAL; - if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* - * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has - * not been released - */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); goto retry; } - if (IS_ENABLED(CONFIG_PGSTE) && gmap) { - address = __gmap_link(gmap, current->thread.gmap_addr, - address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } - } - fault = 0; -out_up: mmap_read_unlock(mm); -out: - return fault; +done: + if (!(fault & VM_FAULT_ERROR)) + return; + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + pr_emerg("Unexpected fault flags: %08x\n", fault); + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these @@ -490,281 +372,51 @@ void do_protection_exception(struct pt_regs *regs) * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + /* Low-address protection in user mode: cannot happen */ + dump_fault_info(regs); + die(regs, "Low-address protection"); + } + /* + * Low-address protection in kernel mode means + * NULL pointer write access in kernel mode. + */ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_ACCESS_FLAGS; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; - -void pfault_fini(void) -{ - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... - * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. - */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; - - /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. - */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (task_is_running(tsk)) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); - } - } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int pfault_cpu_dead(unsigned int cpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; - -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; -} -early_initcall(pfault_irq_init); - -#endif /* CONFIG_PFAULT */ - #if IS_ENABLED(CONFIG_PGSTE) void do_secure_storage_access(struct pt_regs *regs) { - unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; + struct folio_walk fw; struct mm_struct *mm; - struct page *page; + struct folio *folio; int rc; /* - * bit 61 tells us if the address is valid, if it's not we - * have a major problem and should stop the kernel or send a - * SIGSEGV to the process. Unfortunately bit 61 is not - * reliable without the misc UV feature so we need to check - * for that as well. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && - !test_bit_inv(61, ®s->int_parm_long)) { + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { /* * When this happens, userspace did something that it * was not supposed to do, e.g. branching into secure @@ -774,80 +426,43 @@ void do_secure_storage_access(struct pt_regs *regs) send_sig(SIGSEGV, current, 0); return; } - /* - * The kernel should never run into this case and we - * have no way out of this situation. + * The kernel should never run into this case and + * there is no way out of this situation. */ panic("Unexpected PGM 0x3d with TEID bit 61=0"); } - - switch (get_fault_type(regs)) { - case USER_FAULT: + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { - mmap_read_unlock(mm); - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - break; - } - page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) { + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { mmap_read_unlock(mm); - break; + return; } - if (arch_make_page_accessible(page)) + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) send_sig(SIGSEGV, current, 0); - put_page(page); mmap_read_unlock(mm); - break; - case KERNEL_FAULT: - page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) - break; - rc = arch_make_page_accessible(page); - put_page(page); - if (rc) - BUG(); - break; - case GMAP_FAULT: - default: - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); } } NOKPROBE_SYMBOL(do_secure_storage_access); -void do_non_secure_storage_access(struct pt_regs *regs) -{ - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - - if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); - return; - } - - if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) - send_sig(SIGSEGV, current, 0); -} -NOKPROBE_SYMBOL(do_non_secure_storage_access); - -void do_secure_storage_violation(struct pt_regs *regs) -{ - /* - * Either KVM messed up the secure guest mapping or the same - * page is mapped into multiple secure guests. - * - * This exception is only triggered when a guest 2 is running - * and can therefore never occur in kernel context. - */ - printk_ratelimited(KERN_WARNING - "Secure storage violation in task: %s, pid %d\n", - current->comm, current->pid); - send_sig(SIGSEGV, current, 0); -} - #endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b8ae4a4aa2ba..012a4366a2ad 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -8,6 +8,7 @@ * Janosch Frank <frankja@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/kernel.h> #include <linux/pagewalk.h> #include <linux/swap.h> @@ -18,20 +19,43 @@ #include <linux/ksm.h> #include <linux/mman.h> #include <linux/pgtable.h> - +#include <asm/page-states.h> #include <asm/pgalloc.h> +#include <asm/machine.h> +#include <asm/gmap_helpers.h> #include <asm/gmap.h> -#include <asm/tlb.h> +#include <asm/page.h> + +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -58,21 +82,17 @@ static struct gmap *gmap_alloc(unsigned long limit) gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -85,6 +105,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -116,7 +137,7 @@ EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { - if (MACHINE_HAS_IDTE) + if (cpu_has_idte()) __tlb_flush_idte(gmap->asce); else __tlb_flush_global(); @@ -173,30 +194,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, CRST_ALLOC_ORDER); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - /* Free all page tables. */ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -204,6 +241,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -267,37 +305,6 @@ void gmap_remove(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_remove); -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/** - * gmap_get_enabled - get a pointer to the currently enabled gmap - * - * Returns a pointer to the currently enabled gmap. 0 if none is enabled. - */ -struct gmap *gmap_get_enabled(void) -{ - return (struct gmap *) S390_lowcore.gmap; -} -EXPORT_SYMBOL_GPL(gmap_get_enabled); - /* * gmap_alloc_table is assumed to be called with mmap_lock held */ @@ -308,17 +315,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); @@ -327,22 +332,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset, mask; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -354,16 +360,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -482,26 +491,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) EXPORT_SYMBOL_GPL(__gmap_translate); /** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_translate(gmap, gaddr); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** * gmap_unlink - disconnect a page table from the gmap shadow tables * @mm: pointer to the parent mm_struct * @table: pointer to the host page table @@ -557,7 +546,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -565,7 +554,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -573,7 +562,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -585,12 +574,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* Are we allowed to use huge pages? */ - if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); @@ -600,12 +589,14 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); if (!rc) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; @@ -622,113 +613,27 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) radix_tree_preload_end(); return rc; } - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - mmap_read_lock(gmap->mm); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_lock during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); +EXPORT_SYMBOL(__gmap_link); /* * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { - struct vm_area_struct *vma; unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + + mmap_assert_locked(gmap->mm); /* Find the vm address for the guest address */ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; - - vma = vma_lookup(gmap->mm, vmaddr); - if (!vma || is_vm_hugetlb_page(vma)) - return; - - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) { - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); - } + gmap_helper_zap_one_page(gmap->mm, vmaddr); } } EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - mmap_read_lock(gmap->mm); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - if (!vma) - continue; - /* - * We do not discard pages that are backed by - * hugetlbfs, so we don't have to refault them. - */ - if (is_vm_hugetlb_page(vma)) - continue; - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); - } - mmap_read_unlock(gmap->mm); -} -EXPORT_SYMBOL_GPL(gmap_discard); - static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); @@ -790,8 +695,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { const int asce_type = gmap->asce & _ASCE_TYPE_MASK; unsigned long *table = gmap->table; @@ -813,7 +717,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -821,7 +725,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -829,7 +733,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -837,11 +741,12 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -896,12 +801,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -932,7 +837,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) } /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_large(*pmdp)) + if (!pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); return pmdp; } @@ -944,7 +849,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) */ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) { - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); } @@ -1012,7 +917,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1026,7 +931,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1038,86 +943,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Called with sg->mm->mmap_lock in read. + * Context: Called with sg->mm->mmap_lock in read. */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - unsigned long vmaddr, dist; pmd_t *pmdp; - int rc; + int rc = 0; BUG_ON(gmap_is_shadow(gmap)); - while (len) { - rc = -EAGAIN; - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (pmdp) { - if (!pmd_large(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; - } - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); - len = len < dist ? 0 : len - dist; - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; - } - } - gmap_pmd_op_end(gmap, pmdp); - } - if (rc) { - if (rc == -EINVAL) - return rc; - /* -EAGAIN, fixup of userspace mm and gmap */ - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - } - } - return 0; -} + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. - */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) -{ - int rc; + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - mmap_read_lock(gmap->mm); - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - mmap_read_unlock(gmap->mm); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -1150,12 +1009,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1249,7 +1108,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1304,7 +1163,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); + gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1322,7 +1181,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1335,23 +1194,23 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; - struct page *page; + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } /** @@ -1365,21 +1224,20 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; - struct page *page; + struct ptdesc *ptdesc; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + page_table_free_pgste(ptdesc); } } @@ -1392,7 +1250,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1401,13 +1260,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1422,20 +1280,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(sgt); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1449,7 +1306,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1458,13 +1316,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1479,7 +1336,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1487,12 +1344,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r3t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1506,8 +1362,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1515,13 +1372,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1536,23 +1392,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); - list_del(&page->lru); + page = phys_to_page(r2t); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1563,7 +1419,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1573,7 +1429,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1589,142 +1445,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table @@ -1736,7 +1457,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1748,19 +1469,17 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1775,13 +1494,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1798,8 +1516,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1832,19 +1549,17 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1859,13 +1574,12 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1882,8 +1596,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1916,19 +1629,17 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1943,13 +1654,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1966,8 +1676,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1983,45 +1692,22 @@ out_free: } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_pgt_lookup - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) { - unsigned long *table; - struct page *page; - int rc; + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; } -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); /** * gmap_shadow_pgt - instantiate a shadow page table @@ -2040,19 +1726,21 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; - struct page *page; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2070,7 +1758,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2085,8 +1772,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2097,7 +1783,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } @@ -2152,7 +1838,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); break; } @@ -2163,7 +1849,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2249,7 +1935,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -2257,12 +1942,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { @@ -2302,10 +1984,10 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + else if (cpu_has_idte()) __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); @@ -2322,13 +2004,12 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); if (pmdp) { - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) __pmdp_csp(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); @@ -2368,27 +2049,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_csp); */ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if (MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); - else if (MACHINE_HAS_IDTE) + else if (cpu_has_idte()) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2403,29 +2082,27 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); */ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); - if (MACHINE_HAS_TLB_GUEST) + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); + if (machine_has_tlb_guest()) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + else if (cpu_has_idte()) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2481,7 +2158,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], if (!pmdp) return; - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) bitmap_fill(bitmap, _PAGE_ENTRIES); } else { @@ -2491,7 +2168,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); @@ -2510,15 +2187,16 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops thp_split_walk_ops = { .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2530,33 +2208,6 @@ static inline void thp_split_mm(struct mm_struct *mm) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Remove all empty zero pages from the mapping for lazy refaulting - * - This must be called after mm->context.has_pgste is set, to avoid - * future creation of zero pages - * - This must be called after THP was enabled - */ -static int __zap_zero_pages(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) -{ - unsigned long addr; - - for (addr = start; addr != end; addr += PAGE_SIZE) { - pte_t *ptep; - spinlock_t *ptl; - - ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (is_zero_pfn(pte_pfn(*ptep))) - ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); - pte_unmap_unlock(ptep, ptl); - } - return 0; -} - -static const struct mm_walk_ops zap_zero_walk_ops = { - .pmd_entry = __zap_zero_pages, -}; - -/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2566,36 +2217,15 @@ int s390_enable_sie(void) /* Do we have pgstes? if yes, we are done */ if (mm_has_pgste(mm)) return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); -int gmap_mark_unmergeable(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret; - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); - if (ret) - return ret; - } - mm->def_flags &= ~VM_MERGEABLE; - return 0; -} -EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); - /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2626,7 +2256,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, { pmd_t *pmd = (pmd_t *)pte; unsigned long start, end; - struct page *page = pmd_page(*pmd); + struct folio *folio = page_folio(pmd_page(*pmd)); /* * The write check makes sure we do not set a key on shared @@ -2639,9 +2269,9 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE - 1; + end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &page->flags); + set_bit(PG_arch_1, &folio->flags); cond_resched(); return 0; } @@ -2650,6 +2280,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) @@ -2662,7 +2293,7 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - rc = gmap_mark_unmergeable(); + rc = gmap_helper_disable_cow_sharing(); if (rc) { mm->context.uses_skeys = 0; goto out_up; @@ -2687,6 +2318,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) @@ -2697,41 +2329,124 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. */ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + struct folio *folio; + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + folio = pfn_folio(pfns[i]); + /* we always have an extra reference */ + uv_destroy_folio(folio); + /* get rid of the extra reference */ + folio_put(folio); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; - -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; - /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing - */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + unsigned long asce; + struct page *page; + void *table; + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..a45d417ad951 --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2025 + */ +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagewalk.h> +#include <linux/ksm.h> +#include <asm/gmap_helpers.h> + +/** + * ptep_zap_swap_entry() - discard a swap entry. + * @mm: the mm + * @entry: the swap entry that needs to be zapped + * + * Discards the given swap entry. If the swap entry was an actual swap + * entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) + dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + + mmap_assert_locked(mm); + + /* Find the vm address for the guest address */ + vma = vma_lookup(mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(mm, vmaddr, &ptl); + if (unlikely(!ptep)) + return; + if (pte_swap(*ptep)) + ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); + pte_unmap_unlock(ptep, ptl); +} +EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page); + +/** + * gmap_helper_discard() - discard user pages in the given range + * @mm: the mm + * @vmaddr: starting userspace address + * @end: end address (first address outside the range) + * + * All userpace pages in the range [@vamddr, @end) are discarded and unmapped. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + + while (vmaddr < end) { + vma = find_vma_intersection(mm, vmaddr, end); + if (!vma) + return; + if (!is_vm_hugetlb_page(vma)) + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + vmaddr = vma->vm_end; + } +} +EXPORT_SYMBOL_GPL(gmap_helper_discard); + +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + unsigned long *found_addr = walk->private; + + /* Return 1 of the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked. + */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages + * @mm: the mm whose zero pages are to be unshared + * + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390, that even if mm were to remove a page table, + * and racing with walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm) i.e. + * mm->context.allow_cow_sharing is set to 0. + */ +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress. + */ + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * gmap_helper_disable_cow_sharing() - disable all COW sharing + * + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zerpages that are mapped. + * + * Not that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int gmap_helper_disable_cow_sharing(void) +{ + struct mm_struct *mm = current->mm; + int rc; + + mmap_assert_write_locked(mm); + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages by anonymous pages. */ + rc = __gmap_helper_unshare_zeropages(mm); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; +} +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 10e51ef9c79a..e88c02c9e642 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -9,12 +9,13 @@ #define KMSG_COMPONENT "hugetlb" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include <asm/pgalloc.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/security.h> +#include <asm/pgalloc.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -24,6 +25,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -48,6 +50,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -66,6 +69,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -73,13 +80,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -114,6 +126,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -121,7 +138,7 @@ static inline pte_t __rste_to_pte(unsigned long rste) static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) { - struct page *page; + struct folio *folio; unsigned long size, paddr; if (!mm_uses_skeys(mm) || @@ -129,27 +146,25 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) return; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - page = pud_page(__pud(rste)); + folio = page_folio(pud_page(__pud(rste))); size = PUD_SIZE; paddr = rste & PUD_MASK; } else { - page = pmd_page(__pmd(rste)); + folio = page_folio(pmd_page(__pmd(rste))); size = PMD_SIZE; paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &page->flags)) - __storage_key_init_range(paddr, paddr + size - 1); + if (!test_and_set_bit(PG_arch_1, &folio->flags)) + __storage_key_init_range(paddr, paddr + size); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; rste = __pte_to_rste(pte); - if (!MACHINE_HAS_NX) - rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { @@ -163,15 +178,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } -pte_t huge_ptep_get(pte_t *ptep) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; pud_t *pudp = (pud_t *) ptep; @@ -217,130 +238,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_large(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; } -int pmd_huge(pmd_t pmd) -{ - return pmd_large(pmd); -} - -int pud_huge(pud_t pud) -{ - return pud_large(pud); -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + if (cpu_has_edat1() && size == PMD_SIZE) return true; - else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + else if (cpu_has_edat2() && size == PUD_SIZE) return true; else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (mm->get_unmapped_area == arch_get_unmapped_area) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 6a0ac00d5a42..074bf4fb4ce2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -8,6 +8,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/cpufeature.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -31,29 +32,38 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> -#include <linux/platform-feature.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> #include <asm/kfence.h> -#include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> -#include <asm/tlb.h> +#include <asm/abs_lowcore.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> #include <linux/virtio_config.h> +#include <linux/execmem.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +unsigned long __bootdata_preserved(page_noexec_mask); +EXPORT_SYMBOL(page_noexec_mask); + +unsigned long __bootdata_preserved(segment_noexec_mask); +EXPORT_SYMBOL(segment_noexec_mask); + +unsigned long __bootdata_preserved(region_noexec_mask); +EXPORT_SYMBOL(region_noexec_mask); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -61,27 +71,17 @@ EXPORT_SYMBOL(zero_page_mask); static void __init setup_zero_pages(void) { + unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) + while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -92,41 +92,12 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init(max_zone_pfns); } @@ -135,30 +106,31 @@ void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + if (cpu_has_nx()) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -175,42 +147,21 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); /* make sure bounce buffers are shared */ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - set_max_mapnr(max_low_pfn); - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - pv_init(); - kfence_split_mapping(); - /* Setup guest page hinting */ - cmma_init(); - /* this will put all low memory onto the freelists */ - memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); -} - -void free_initmem(void) -{ - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); - free_reserved_area(sclp_early_sccb, - sclp_early_sccb + EXT_SCCB_READ_SCP, - POISON_FREE_INITMEM, "unused early sccb"); - free_initmem_default(POISON_FREE_INITMEM); } unsigned long memory_block_size_bytes(void) @@ -222,6 +173,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -236,16 +222,13 @@ struct s390_cma_mem_data { static int s390_cma_check_range(struct cma *cma, void *data) { struct s390_cma_mem_data *mem_data; - unsigned long start, end; mem_data = data; - start = cma_get_base(cma); - end = start + cma_get_size(cma); - if (end < mem_data->start) - return 0; - if (start >= mem_data->end) - return 0; - return -EBUSY; + + if (cma_intersects(cma, mem_data->start, mem_data->end)) + return -EBUSY; + + return 0; } static int s390_cma_mem_notifier(struct notifier_block *nb, @@ -282,10 +265,7 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(start, size, true)); @@ -308,3 +288,32 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_EXECMEM */ diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 9f988d4582ed..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include <linux/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/uv.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * __init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW, - POPULATE_SHALLOW -}; -static void __init kasan_early_pgtable_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!has_nx || mode == POPULATE_ONE2ONE) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - - /* - * The first 1MB of 1:1 mapping is mapped with 4KB pages - */ - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - if (mode == POPULATE_SHALLOW) { - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - if (mode == POPULATE_ZERO_SHADOW) { - pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } else if (has_edat && address) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - } - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); - break; - case POPULATE_SHALLOW: - /* should never happen */ - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long memsize; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* - * Kasan currently supports standby memory but only if it follows - * online memory (default allocation), i.e. no memory holes. - * - memsize represents end of online memory - * - ident_map_size represents online + standby and memory limits - * accounted. - * Kasan maps "memsize" right away. - * [0, memsize] - as identity mapping - * [__sha(0), __sha(memsize)] - shadow memory for identity mapping - * The rest [memsize, ident_map_size] if memsize < ident_map_size - * could be mapped/unmapped dynamically later during memory hotplug. - */ - memsize = min(memsize, ident_map_size); - - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | | - * | |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- vmalloc area -+ \ | | - * | vmalloc_size | \ | | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - * - * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | kasan | - * | |/ | zero | - * +- shadow start --+ | page | - * | 1/8 addr space | | mapping | - * +- shadow end -+ | (untracked) | - * | ... gap ... |\ | | - * +- vmalloc area -+ \ +- vmalloc area -+ - * | vmalloc_size | \ |shallow populate| - * +- modules vaddr -+ \ +- modules area -+ - * | 2Gb | \|shallow populate| - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - /* shallowly populate kasan shadow for vmalloc and modules */ - kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), - POPULATE_SHALLOW); - } - /* populate kasan shadow for untracked memory */ - kasan_early_pgtable_populate(__sha(ident_map_size), - IS_ENABLED(CONFIG_KASAN_VMALLOC) ? - __sha(VMALLOC_START) : - __sha(MODULES_VADDR), - POPULATE_ZERO_SHADOW); - kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 421efa46946b..44426e0f2944 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -12,10 +12,18 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> +#include <linux/uio.h> +#include <linux/io.h> #include <asm/asm-extable.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/sections.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -41,7 +49,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz } /* - * s390_kernel_write - write to kernel memory bypassing DAT + * __s390_kernel_write - write to kernel memory bypassing DAT * @dst: destination address * @src: source address * @size: number of bytes to copy @@ -54,166 +62,84 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -notrace void *s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - if (!(flags & PSW_MASK_DAT)) { - memcpy(dst, src, size); - } else { - while (size) { - copied = s390_kernel_write_odd(tmp, src, size); - tmp += copied; - src += copied; - size -= copied; - } + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); return dst; } -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - union register_pair _dst, _src; - int rc = -EFAULT; - - _dst.even = (unsigned long) dest; - _dst.odd = (unsigned long) count; - _src.even = (unsigned long) src; - _src.odd = (unsigned long) count; - asm volatile ( - "0: mvcle %[dst],%[src],0\n" - "1: jo 0b\n" - " lhi %[rc],0\n" - "2:\n" - EX_TABLE(1b,2b) - : [rc] "+&d" (rc), [dst] "+&d" (_dst.pair), [src] "+&d" (_src.pair) - : : "cc", "memory"); - return rc; -} - -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) -{ - int irqs_disabled, rc; - unsigned long flags; - - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; -} - -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, unsigned long src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long _dest = (unsigned long)dest; - unsigned long _src = (unsigned long)src; - unsigned long _count = (unsigned long)count; - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = call_on_stack(3, S390_lowcore.nodat_stack, - unsigned long, _memcpy_real, - unsigned long, _dest, - unsigned long, _src, - unsigned long, _count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real(_dest, _src, _count); -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, unsigned long src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(phys_addr_t addr) +static int get_swapped_owner(phys_addr_t addr) { phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -226,17 +152,34 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) { void *ptr = phys_to_virt(addr); void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; cpus_read_lock(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, ptr, size); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; + } + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); } - preempt_enable(); +out: + put_cpu(); cpus_read_unlock(); return bounce; } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index d545f5c39f7e..40a526d28184 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/compat.h> #include <linux/security.h> +#include <linux/hugetlb.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -37,7 +38,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -50,7 +51,6 @@ static inline unsigned long mmap_base(unsigned long rnd, { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -60,24 +60,29 @@ static inline unsigned long mmap_base(unsigned long rnd, * Top of mmap area (just below the process stack). * Leave at least a ~128 MB hole. */ - gap_min = SZ_128M; - gap_max = (STACK_TOP / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5); return PAGE_ALIGN(STACK_TOP - gap - rnd); } +static int get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -93,15 +98,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, goto check_asce_limit; } - info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) return addr; @@ -112,11 +114,11 @@ check_asce_limit: unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -136,13 +138,11 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* @@ -182,9 +182,35 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } + +static pgprot_t protection_map[16] __ro_after_init; + +void __init setup_protection_map(void) +{ + pgprot_t *pm = protection_map; + + pm[VM_NONE] = PAGE_NONE; + pm[VM_READ] = PAGE_RO; + pm[VM_WRITE] = PAGE_RO; + pm[VM_WRITE | VM_READ] = PAGE_RO; + pm[VM_EXEC] = PAGE_RX; + pm[VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_EXEC | VM_WRITE] = PAGE_RX; + pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX; + pm[VM_SHARED] = PAGE_NONE; + pm[VM_SHARED | VM_READ] = PAGE_RO; + pm[VM_SHARED | VM_WRITE] = PAGE_RW; + pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW; + pm[VM_SHARED | VM_EXEC] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX; + pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX; + pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX; +} + +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index d5ea09d78938..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,210 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/asm-extable.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - unsigned long tmp = 0; - int rc = -EOPNOTSUPP; - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - "0: la %[rc],0\n" - "1:\n" - EX_TABLE(0b,1b) - : [rc] "+&d" (rc), [tmp] "+&d" (tmp) - : [cmd] "i" (ESSA_GET_STATE)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = phys_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = phys_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = phys_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = phys_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct page *page; - unsigned long start, end, ix; - int i; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -218,14 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 85195c18b2e8..348e759840e7 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -3,13 +3,17 @@ * Copyright IBM Corp. 2011 * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgalloc.h> #include <asm/kfence.h> #include <asm/page.h> +#include <asm/asm.h> #include <asm/set_memory.h> static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) @@ -24,7 +28,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) unsigned long boundary, size; while (start < end) { - if (MACHINE_HAS_EDAT1) { + if (cpu_has_edat1()) { /* set storage keys for a 1MB frame */ size = 1UL << 20; boundary = (start + size) & ~(size - 1); @@ -41,7 +45,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -60,7 +64,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long *table, mask; mask = 0; - if (MACHINE_HAS_EDAT2) { + if (cpu_has_edat2()) { switch (dtt) { case CRDTE_DTT_REGION3: mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); @@ -73,8 +77,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, break; } table = (unsigned long *)((unsigned long)old & mask); - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); - } else if (MACHINE_HAS_IDTE) { + crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val); + } else if (cpu_has_idte()) { cspg(old, *old, new); } else { csp((unsigned int *)old + 1, *old, new); @@ -96,11 +100,17 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -146,11 +156,17 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -167,7 +183,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, if (pmd_none(*pmdp)) return -EINVAL; next = pmd_addr_end(addr, end); - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PMD_MASK); need_split |= !!(addr + PMD_SIZE > next); @@ -232,6 +248,12 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -248,7 +270,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); @@ -298,11 +320,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -313,18 +330,103 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) { - if (!MACHINE_HAS_NX) + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + + if (!cpu_has_nx()) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); +} + +bool kernel_page_present(struct page *page) +{ + unsigned long addr; + unsigned int cc; + + addr = (unsigned long)page_address(page); + asm volatile( + " lra %[addr],0(%[addr])\n" + CC_IPM(cc) + : CC_OUT(cc, cc), [addr] "+a" (addr) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc) == 0; } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..e6175d75e4b0 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/asm-offsets.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm_inline volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 2de48b2c1b04..b449fd2605b0 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,70 +10,41 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> -#include <asm/gmap.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PGSTE - -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } -}; - -static int __init page_table_register_sysctl(void) -{ - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#endif /* CONFIG_PGSTE */ - unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + unsigned long *table; - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; + struct ctlreg asce; /* change all active ASCEs to avoid the creation of new TLBs */ if (current->active_mm == mm) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 7, 7); + asce.val = mm->context.asce; + get_lowcore()->user_asce = asce; + local_ctl_load(7, &asce); + if (!test_thread_flag(TIF_ASCE_PRIMARY)) + local_ctl_load(1, &asce); } __tlb_flush_local(); } @@ -83,6 +54,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) unsigned long *pgd = NULL, *p4d = NULL, *__pgd; unsigned long asce_limit = mm->context.asce_limit; + mmap_assert_write_locked(mm); + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ VM_BUG_ON(asce_limit < _REGION2_SIZE); @@ -94,23 +67,18 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) if (unlikely(!p4d)) goto err_p4d; crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(p4d)); } if (end > _REGION1_SIZE) { pgd = crst_table_alloc(mm); if (unlikely(!pgd)) goto err_pgd; crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + pagetable_pgd_ctor(virt_to_ptdesc(pgd)); } spin_lock_bh(&mm->page_table_lock); - /* - * This routine gets called with mmap_lock lock held and there is - * no reason to optimize for the case of otherwise. However, if - * that would ever change, the below check will let us know. - */ - VM_BUG_ON(asce_limit != mm->context.asce_limit); - if (p4d) { __pgd = (unsigned long *) mm->pgd; p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); @@ -136,292 +104,82 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) return 0; err_pgd: + pagetable_dtor(virt_to_ptdesc(p4d)); crst_table_free(mm, p4d); err_p4d: return -ENOMEM; } -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - #ifdef CONFIG_PGSTE -struct page *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc; } -void page_table_free_pgste(struct page *page) +void page_table_free_pgste(struct ptdesc *ptdesc) { - __free_page(page); + pagetable_free(ptdesc); } #endif /* CONFIG_PGSTE */ -/* - * A 2KB-pgtable is either upper or lower half of a normal page. - * The second half of the page may be unused or used as another - * 2KB-pgtable. - * - * Whenever possible the parent page for a new 2KB-pgtable is picked - * from the list of partially allocated pages mm_context_t::pgtable_list. - * In case the list is empty a new parent page is allocated and added to - * the list. - * - * When a parent page gets fully allocated it contains 2KB-pgtables in both - * upper and lower halves and is removed from mm_context_t::pgtable_list. - * - * When 2KB-pgtable is freed from to fully allocated parent page that - * page turns partially allocated and added to mm_context_t::pgtable_list. - * - * If 2KB-pgtable is freed from the partially allocated parent page that - * page turns unused and gets removed from mm_context_t::pgtable_list. - * Furthermore, the unused parent page is released. - * - * As follows from the above, no unallocated or fully allocated parent - * pages are contained in mm_context_t::pgtable_list. - * - * The upper byte (bits 24-31) of the parent page _refcount is used - * for tracking contained 2KB-pgtables and has the following format: - * - * PP AA - * 01234567 upper byte (bits 24-31) of struct page::_refcount - * || || - * || |+--- upper 2KB-pgtable is allocated - * || +---- lower 2KB-pgtable is allocated - * |+------- upper 2KB-pgtable is pending for removal - * +-------- lower 2KB-pgtable is pending for removal - * - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why - * using _refcount is possible). - * - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still unallocated; - * - removed from mm_context_t::pgtable_list in case both hales of the - * parent page are allocated; - * These operations are protected with mm_context_t::lock. - * - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 - * and the corresponding PP bit is set to 1 in a single atomic operation. - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually - * exclusive and may never be both set to 1! - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still allocated; - * - removed from mm_context_t::pgtable_list in case the second half of - * the parent page is unallocated; - * These operations are protected with mm_context_t::lock. - * - * It is important to understand that mm_context_t::lock only protects - * mm_context_t::pgtable_list and AA bits, but not the parent page itself - * and PP bits. - * - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, - * while both AA bits and the second PP bit are already unset. Then the - * parent page does not contain any 2KB-pgtable fragment anymore, and it has - * also been removed from mm_context_t::pgtable_list. It is safe to release - * the page therefore. - * - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable - * while the PP bits are never used, nor such a page is added to or removed - * from mm_context_t::pgtable_list. - */ unsigned long *page_table_alloc(struct mm_struct *mm) { + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - /* - * The pending removal bits must also be checked. - * Failure to do so might lead to an impossible - * value of (i.e 0x13 or 0x23) written to _refcount. - * Such values violate the assumption that pending and - * allocation bits are mutually exclusive, and the rest - * of the code unrails as result. That could lead to - * a whole bunch of races and corruptions. - */ - mask = (mask | (mask >> 4)) & 0x03U; - if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 0x01U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 0x03U << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } -static void page_table_release_check(struct page *page, void *table, - unsigned int half, unsigned int mask) +void page_table_free(struct mm_struct *mm, unsigned long *table) { - char msg[128]; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) - return; - snprintf(msg, sizeof(msg), - "Invalid pgtable %p release half 0x%02x mask 0x%02x", - table, half, mask); - dump_page(page, msg); + pagetable_dtor_free(ptdesc); } -void page_table_free(struct mm_struct *mm, unsigned long *table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - unsigned int mask, bit, half; - struct page *page; - - page = virt_to_page(table); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release - * will happen outside of the critical section from this - * function or from __tlb_remove_table() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - half = 0x01U << bit; - } else { - half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - } + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = virt_to_page(table); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); - return; - } - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); /* - * Mark the page for delayed release. The actual release will happen - * outside of the critical section from __tlb_remove_table() or from - * page_table_free() + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. + * Turn to the generic pte_free_defer() version once gmap is removed. */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); - tlb_remove_table(tlb, table); -} - -void __tlb_remove_table(void *_table) -{ - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); - - switch (half) { - case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); - return; - case 0x01U: /* lower 2K of a 4K page table */ - case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - break; - case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - break; - } - - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -448,16 +206,21 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + if (!table) + return; + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -469,7 +232,7 @@ static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ return (next - 1) < (end - 1) ? next : end; \ } -BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(page, PAGE_SIZE) BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) @@ -493,7 +256,7 @@ static int base_page_walk(unsigned long *origin, unsigned long addr, if (!alloc) return 0; pte = origin; - pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT; do { next = base_page_addr_end(addr, end); *pte = base_lra(addr); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4909dcd762e8..7df70cd8f739 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -4,6 +4,7 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -19,10 +20,10 @@ #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +#include <asm/machine.h> pgprot_t pgprot_writecombine(pgprot_t prot) { @@ -34,22 +35,12 @@ pgprot_t pgprot_writecombine(pgprot_t prot) } EXPORT_SYMBOL_GPL(pgprot_writecombine); -pgprot_t pgprot_writethrough(pgprot_t prot) -{ - /* - * mio_wb_bit_mask may be set on a different CPU, but it is only set - * once at init and only read afterwards. - */ - return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); -} -EXPORT_SYMBOL_GPL(pgprot_writethrough); - static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -69,7 +60,7 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, { unsigned long opt, asce; - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL || nodat) @@ -94,7 +85,7 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) ptep_ipte_local(mm, addr, ptep, nodat); else @@ -125,32 +116,23 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, static inline pgste_t pgste_get_lock(pte_t *ptep) { - unsigned long new = 0; + unsigned long value = 0; #ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; #endif - return __pgste(new); + return __pgste(value); } static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); #endif } @@ -182,10 +164,10 @@ static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, skey = (unsigned long) page_get_storage_key(address); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ + pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ /* Copy page access key and fetch protection bit to pgste */ - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); #endif return pgste; @@ -219,7 +201,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) if ((pte_val(entry) & _PAGE_PRESENT) && (pte_val(entry) & _PAGE_WRITE) && !(pte_val(entry) & _PAGE_INVALID)) { - if (!MACHINE_HAS_ESOP) { + if (!machine_has_esop()) { /* * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. @@ -229,7 +211,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ - pgste_val(pgste) |= PGSTE_UC_BIT; + pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); } #endif set_pte(ptep, entry); @@ -245,7 +227,7 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm, bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); if (bits) { - pgste_val(pgste) ^= bits; + pgste = __pgste(pgste_val(pgste) ^ bits); ptep_notify(mm, addr, ptep, bits); } #endif @@ -302,6 +284,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -344,8 +351,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pgste_t pgste; struct mm_struct *mm = vma->vm_mm; - if (!MACHINE_HAS_NX) - pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); @@ -360,7 +365,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -372,12 +377,12 @@ static inline void pmdp_idte_local(struct mm_struct *mm, static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) { + if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); - } else if (MACHINE_HAS_IDTE) { + } else if (cpu_has_idte()) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); @@ -397,7 +402,7 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pmdp_idte_local(mm, addr, pmdp); else @@ -454,7 +459,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); @@ -491,7 +496,7 @@ EXPORT_SYMBOL(pmdp_xchg_lazy); static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else @@ -501,15 +506,15 @@ static inline void pudp_idte_local(struct mm_struct *mm, static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - if (MACHINE_HAS_TLB_GUEST) + if (machine_has_tlb_guest()) __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + else if (cpu_has_idte()) __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else /* * Invalid bit position is the same for pmd and pud, so we can - * re-use _pmd_csp() here + * reuse _pmd_csp() here */ __pmdp_csp((pmd_t *) pudp); } @@ -523,7 +528,7 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, if (pud_val(old) & _REGION_ENTRY_INVALID) return old; atomic_inc(&mm->context.flush_count); - if (MACHINE_HAS_TLB_LC && + if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) pudp_idte_local(mm, addr, pudp); else @@ -595,7 +600,7 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, /* the mm_has_pgste() check is done in set_pte_at() */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); pgste_set_key(ptep, pgste, entry, mm); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); @@ -608,7 +613,7 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; + pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -653,7 +658,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } - pgste_val(pgste) |= bit; + pgste = set_pgste_bit(pgste, bit); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); return 0; @@ -673,7 +678,7 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, if (!(pte_val(spte) & _PAGE_INVALID) && !((pte_val(spte) & _PAGE_PROTECT) && !(pte_val(pte) & _PAGE_PROTECT))) { - pgste_val(spgste) |= PGSTE_VSIE_BIT; + spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); tpgste = pgste_get_lock(tptep); tpte = __pte((pte_val(spte) & PAGE_MASK) | (pte_val(pte) & _PAGE_PROTECT)); @@ -705,9 +710,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } @@ -731,7 +736,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -744,8 +749,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* Clear storage key ACC and F, but set R/C */ preempt_disable(); pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); @@ -766,13 +771,13 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; + pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); @@ -804,14 +809,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return key ? -EFAULT : 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; /* @@ -825,12 +830,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); + new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); keyul = (unsigned long) key; - pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); + new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); if (!(pte_val(*ptep) & _PAGE_INVALID)) { unsigned long bits, skey; @@ -841,12 +848,12 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, /* Set storage key ACC and FP */ page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; + new = set_pgste_bit(new, bits << 52); } /* changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -913,14 +920,14 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; cc = page_reset_referenced(paddr); @@ -930,21 +937,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ - pgste_val(new) &= ~PGSTE_GR_BIT; + new = clear_pgste_bit(new, PGSTE_GR_BIT); if (!(pte_val(*ptep) & _PAGE_INVALID)) { paddr = pte_val(*ptep) & PAGE_MASK; cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ - pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); } /* Reflect guest's logical view, not physical */ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; /* Changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - pgste_val(new) |= PGSTE_UC_BIT; + new = set_pgste_bit(new, PGSTE_UC_BIT); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); @@ -975,14 +984,14 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; *key = page_get_storage_key(paddr); @@ -992,6 +1001,8 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; @@ -1106,7 +1117,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, if (res) pgstev |= _PGSTE_GPS_ZERO; - pgste_val(pgste) = pgstev; + pgste = __pgste(pgstev); pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); return res; @@ -1139,8 +1150,8 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long hva, return -EFAULT; new = pgste_get_lock(ptep); - pgste_val(new) &= ~bits; - pgste_val(new) |= value & bits; + new = clear_pgste_bit(new, bits); + new = set_pgste_bit(new, value & bits); pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c2583f921ca8..448dd6ed1069 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,6 +4,7 @@ */ #include <linux/memory_hotplug.h> +#include <linux/cpufeature.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -11,13 +12,19 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> +#include <asm/abs_lowcore.h> #include <asm/cacheflush.h> +#include <asm/maccess.h> #include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/set_memory.h> +#include <asm/physmem_info.h> static DEFINE_MUTEX(vmem_mutex); @@ -30,11 +37,15 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) return; free_pages(addr, order); } @@ -44,8 +55,10 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -61,6 +74,7 @@ pte_t __ref *vmem_pte_alloc(void) if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } @@ -150,27 +164,25 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; pte_t *pte; prot = pgprot_val(PAGE_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_PAGE_NOEXEC; - pte = pte_offset_kernel(pmd, addr); for (; addr < end; addr += PAGE_SIZE, pte++) { if (!add) { if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -207,7 +219,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -215,24 +228,21 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, pte_t *pte; prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_SEGMENT_ENTRY_NOEXEC; - pmd = pmd_offset(pud, addr); for (; addr < end; addr = next, pmd++) { next = pmd_addr_end(addr, end); if (!add) { if (pmd_none(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -240,12 +250,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && addr && direct && + cpu_has_edat1() && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; continue; - } else if (!direct && MACHINE_HAS_EDAT1) { + } else if (!direct && cpu_has_edat1()) { void *new_page; /* @@ -255,7 +265,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -269,12 +279,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { if (!direct) vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -289,27 +299,19 @@ out: static void try_free_pmd_table(pud_t *pud, unsigned long start) { - const unsigned long end = start + PUD_SIZE; pmd_t *pmd; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -317,15 +319,13 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, pmd_t *pmd; prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) - prot &= ~_REGION_ENTRY_NOEXEC; pud = pud_offset(p4d, addr); for (; addr < end; addr = next, pud++) { next = pud_addr_end(addr, end); if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -336,7 +336,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && addr && direct && + cpu_has_edat2() && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -346,10 +346,10 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -364,29 +364,20 @@ out: static void try_free_pud_table(p4d_t *p4d, unsigned long start) { - const unsigned long end = start + P4D_SIZE; pud_t *pud; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - pud = pud_offset(p4d, start); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -405,7 +396,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -418,29 +409,20 @@ out: static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { - const unsigned long end = start + PGDIR_SIZE; p4d_t *p4d; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - p4d = p4d_offset(pgd, start); for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -449,6 +431,9 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > __abs_lowcore)) + return -EINVAL; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -462,7 +447,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -475,14 +460,16 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -490,7 +477,8 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) */ static int vmem_add_range(unsigned long start, unsigned long size) { - return add_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } /* @@ -498,7 +486,8 @@ static int vmem_add_range(unsigned long start, unsigned long size) */ static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } /* @@ -511,21 +500,25 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } +#ifdef CONFIG_MEMORY_HOTPLUG + void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } +#endif + void vmem_remove_mapping(unsigned long start, unsigned long size) { mutex_lock(&vmem_mutex); @@ -538,7 +531,7 @@ struct range arch_get_mappable_range(void) struct range mhp_range; mhp_range.start = 0; - mhp_range.end = VMEM_MAX_PHYS - 1; + mhp_range.end = max_mappable - 1; return mhp_range; } @@ -561,33 +554,116 @@ int vmem_add_mapping(unsigned long start, unsigned long size) } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. */ +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + void __init vmem_map_init(void) { - phys_addr_t base, end; - u64 i; - - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!cpu_has_bear()) set_memory_x(0, 1); - + if (debug_pagealloc_enabled()) + __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index af35052d06ed..c7f8313ba449 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -30,11 +30,13 @@ #include <asm/facility.h> #include <asm/nospec-branch.h> #include <asm/set_memory.h> +#include <asm/text-patching.h> +#include <asm/unwind.h> #include "bpf_jit.h" struct bpf_jit { u32 seen; /* Flags to remember seen eBPF instructions */ - u32 seen_reg[16]; /* Array to remember which registers are used */ + u16 seen_regs; /* Mask to remember which registers are used */ u32 *addrs; /* Array with relative instruction addresses */ u8 *prg_buf; /* Start of program */ int size; /* Size of program and literal pool */ @@ -46,18 +48,21 @@ struct bpf_jit { int lit64; /* Current position in 64-bit literal pool */ int base_ip; /* Base address for literal pool */ int exit_ip; /* Address of exit */ - int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ - int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ + int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ + int prologue_plt; /* Start of prologue hotpatch PLT */ + int kern_arena; /* Pool offset of kernel arena address */ + u64 user_arena; /* User arena address */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ #define SEEN_LITERAL BIT(1) /* code uses literals */ #define SEEN_FUNC BIT(2) /* calls C functions */ -#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM) +#define NVREGS 0xffc0 /* %r6-%r15 */ + /* * s390 registers */ @@ -68,6 +73,10 @@ struct bpf_jit { #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ +#define REG_3 BPF_REG_2 /* Register 3 */ +#define REG_4 BPF_REG_3 /* Register 4 */ +#define REG_7 BPF_REG_6 /* Register 7 */ +#define REG_8 BPF_REG_7 /* Register 8 */ #define REG_14 BPF_REG_0 /* Register 14 */ /* @@ -112,8 +121,20 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) { u32 r1 = reg2hex[b1]; - if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1]) - jit->seen_reg[r1] = 1; + if (r1 >= 6 && r1 <= 15) + jit->seen_regs |= (1 << r1); +} + +static s32 off_to_pcrel(struct bpf_jit *jit, u32 off) +{ + return off - jit->prg; +} + +static s64 ptr_to_pcrel(struct bpf_jit *jit, const void *ptr) +{ + if (jit->prg_buf) + return (const u8 *)ptr - ((const u8 *)jit->prg_buf + jit->prg); + return 0; } #define REG_SET_SEEN(b1) \ @@ -121,8 +142,6 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) reg_set_seen(jit, b1); \ }) -#define REG_SEEN(b1) jit->seen_reg[reg2hex[(b1)]] - /* * EMIT macros for code generation */ @@ -192,7 +211,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT4_PCREL_RIC(op, mask, target) \ ({ \ - int __rel = ((target) - jit->prg) / 2; \ + int __rel = off_to_pcrel(jit, target) / 2; \ _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ }) @@ -230,7 +249,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ @@ -239,7 +258,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -248,29 +267,41 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ + int rel = off_to_pcrel(jit, addrs[(i) + (off) + 1]) / 2;\ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) +static void emit6_pcrel_ril(struct bpf_jit *jit, u32 op, s64 pcrel) +{ + u32 pc32dbl = (s32)(pcrel / 2); + + _EMIT6(op | pc32dbl >> 16, pc32dbl & 0xffff); +} + +static void emit6_pcrel_rilb(struct bpf_jit *jit, u32 op, u8 b, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | reg_high(b) << 16, pcrel); + REG_SET_SEEN(b); +} + #define EMIT6_PCREL_RILB(op, b, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ - REG_SET_SEEN(b); \ -}) + emit6_pcrel_rilb(jit, op, b, off_to_pcrel(jit, target)) -#define EMIT6_PCREL_RIL(op, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | rel >> 16, rel & 0xffff); \ -}) +#define EMIT6_PCREL_RILB_PTR(op, b, target_ptr) \ + emit6_pcrel_rilb(jit, op, b, ptr_to_pcrel(jit, target_ptr)) + +static void emit6_pcrel_rilc(struct bpf_jit *jit, u32 op, u8 mask, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | mask << 20, pcrel); +} #define EMIT6_PCREL_RILC(op, mask, target) \ -({ \ - EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ -}) + emit6_pcrel_rilc(jit, op, mask, off_to_pcrel(jit, target)) + +#define EMIT6_PCREL_RILC_PTR(op, mask, target_ptr) \ + emit6_pcrel_rilc(jit, op, mask, ptr_to_pcrel(jit, target_ptr)) #define _EMIT6_IMM(op, imm) \ ({ \ @@ -430,12 +461,12 @@ static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth) /* * Return first seen register (from start) */ -static int get_start(struct bpf_jit *jit, int start) +static int get_start(u16 seen_regs, int start) { int i; for (i = start; i <= 15; i++) { - if (jit->seen_reg[i]) + if (seen_regs & (1 << i)) return i; } return 0; @@ -444,15 +475,15 @@ static int get_start(struct bpf_jit *jit, int start) /* * Return last seen register (from start) (gap >= 2) */ -static int get_end(struct bpf_jit *jit, int start) +static int get_end(u16 seen_regs, int start) { int i; for (i = start; i < 15; i++) { - if (!jit->seen_reg[i] && !jit->seen_reg[i + 1]) + if (!(seen_regs & (3 << i))) return i - 1; } - return jit->seen_reg[15] ? 15 : 14; + return (seen_regs & (1 << 15)) ? 15 : 14; } #define REGS_SAVE 1 @@ -461,8 +492,10 @@ static int get_end(struct bpf_jit *jit, int start) * Save and restore clobbered registers (6-15) on stack. * We save/restore registers in chunks with gap >= 2 registers. */ -static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) +static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth, + u16 extra_regs) { + u16 seen_regs = jit->seen_regs | extra_regs; const int last = 15, save_restore_size = 6; int re = 6, rs; @@ -476,10 +509,10 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) } do { - rs = get_start(jit, re); + rs = get_start(seen_regs, re); if (!rs) break; - re = get_end(jit, rs + 1); + re = get_end(seen_regs, rs + 1); if (op == REGS_SAVE) save_regs(jit, rs, re); else @@ -492,7 +525,7 @@ static void bpf_skip(struct bpf_jit *jit, int size) { if (size >= 6 && !is_valid_rel(size)) { /* brcl 0xf,size */ - EMIT6_PCREL_RIL(0xc0f4000000, size); + EMIT6_PCREL_RILC(0xc0040000, 0xf, size); size -= 6; } else if (size >= 4 && is_valid_rel(size)) { /* brc 0xf,size */ @@ -507,27 +540,78 @@ static void bpf_skip(struct bpf_jit *jit, int size) } /* + * PLT for hotpatchable calls. The calling convention is the same as for the + * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. + */ +struct bpf_plt { + char code[16]; + void *ret; + void *target; +} __packed; +extern const struct bpf_plt bpf_plt; +asm( + ".pushsection .rodata\n" + " .balign 8\n" + "bpf_plt:\n" + " lgrl %r0,bpf_plt_ret\n" + " lgrl %r1,bpf_plt_target\n" + " br %r1\n" + " .balign 8\n" + "bpf_plt_ret: .quad 0\n" + "bpf_plt_target: .quad 0\n" + " .popsection\n" +); + +static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target) +{ + memcpy(plt, &bpf_plt, sizeof(*plt)); + plt->ret = ret; + plt->target = target; +} + +/* * Emit function prologue * * Save registers and create stack frame if necessary. - * See stack frame layout desription in "bpf_jit.h"! + * See stack frame layout description in "bpf_jit.h"! */ -static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, + u32 stack_depth) { - if (jit->seen & SEEN_TAIL_CALL) { + /* No-op for hotpatching */ + /* brcl 0,prologue_plt */ + EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); + jit->prologue_plt_ret = jit->prg; + + if (!bpf_is_subprog(fp)) { + /* Initialize the tail call counter in the main program. */ /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); } else { /* - * There are no tail calls. Insert nops in order to have - * tail_call_start at a predictable offset. + * Skip the tail call counter initialization in subprograms. + * Insert nops in order to have tail_call_start at a + * predictable offset. */ bpf_skip(jit, 6); } /* Tail calls have to skip above initialization */ jit->tail_call_start = jit->prg; - /* Save registers */ - save_restore_regs(jit, REGS_SAVE, stack_depth); + if (fp->aux->exception_cb) { + /* + * Switch stack, the new address is in the 2nd parameter. + * + * Arrange the restoration of %r6-%r15 in the epilogue. + * Do not restore them now, the prog does not need them. + */ + /* lgr %r15,%r3 */ + EMIT4(0xb9040000, REG_15, REG_3); + jit->seen_regs |= NVREGS; + } else { + /* Save registers */ + save_restore_regs(jit, REGS_SAVE, stack_depth, + fp->aux->exception_boundary ? NVREGS : 0); + } /* Setup literal pool */ if (is_first_pass(jit) || (jit->seen & SEEN_LITERAL)) { if (!is_first_pass(jit) && @@ -543,21 +627,46 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) } /* Setup stack and backchain */ if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* lgr %w1,%r15 (backchain) */ - EMIT4(0xb9040000, REG_W1, REG_15); + /* lgr %w1,%r15 (backchain) */ + EMIT4(0xb9040000, REG_W1, REG_15); /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); /* aghi %r15,-STK_OFF */ EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* stg %w1,152(%r15) (backchain) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, - REG_15, 152); + /* stg %w1,152(%r15) (backchain) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, 152); } } /* + * Jump using a register either directly or via an expoline thunk + */ +#define EMIT_JUMP_REG(reg) do { \ + if (nospec_uses_trampoline()) \ + /* brcl 0xf,__s390_indirect_jump_rN */ \ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0x0f, \ + __s390_indirect_jump_r ## reg); \ + else \ + /* br %rN */ \ + _EMIT2(0x07f0 | reg); \ +} while (0) + +/* + * Call r1 either directly or via __s390_indirect_jump_r1 thunk + */ +static void call_r1(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r1); + else + /* basr %r14,%r1 */ + EMIT2(0x0d00, REG_14, REG_1); +} + +/* * Function epilogue */ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) @@ -566,72 +675,122 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* Load exit code: lgr %r2,%b0 */ EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ - save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (nospec_uses_trampoline()) { - jit->r14_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r14 thunk */ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - } - /* br %r14 */ - _EMIT2(0x07fe); - - if ((nospec_uses_trampoline()) && - (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { - jit->r1_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r1 thunk */ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - /* br %r1 */ - _EMIT2(0x07f1); + save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); + EMIT_JUMP_REG(14); + + jit->prg = ALIGN(jit->prg, 8); + jit->prologue_plt = jit->prg; + if (jit->prg_buf) + bpf_jit_plt((struct bpf_plt *)(jit->prg_buf + jit->prg), + jit->prg_buf + jit->prologue_plt_ret, NULL); + jit->prg += sizeof(struct bpf_plt); +} + +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(x); + if (x->data != -1) + regs->gprs[x->data] = 0; + return true; +} + +/* + * A single BPF probe instruction + */ +struct bpf_jit_probe { + int prg; /* JITed instruction offset */ + int nop_prg; /* JITed nop offset */ + int reg; /* Register to clear on exception */ + int arena_reg; /* Register to use for arena addressing */ +}; + +static void bpf_jit_probe_init(struct bpf_jit_probe *probe) +{ + probe->prg = -1; + probe->nop_prg = -1; + probe->reg = -1; + probe->arena_reg = REG_0; +} + +/* + * Handlers of certain exceptions leave psw.addr pointing to the instruction + * directly after the failing one. Therefore, create two exception table + * entries and also add a nop in case two probing instructions come directly + * after each other. + */ +static void bpf_jit_probe_emit_nop(struct bpf_jit *jit, + struct bpf_jit_probe *probe) +{ + if (probe->prg == -1 || probe->nop_prg != -1) + /* The probe is not armed or nop is already emitted. */ + return; + + probe->nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); +} + +static void bpf_jit_probe_load_pre(struct bpf_jit *jit, struct bpf_insn *insn, + struct bpf_jit_probe *probe) +{ + if (BPF_MODE(insn->code) != BPF_PROBE_MEM && + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32) + return; + + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + /* lgrl %r1,kern_arena */ + EMIT6_PCREL_RILB(0xc4080000, REG_W1, jit->kern_arena); + probe->arena_reg = REG_W1; } + probe->prg = jit->prg; + probe->reg = reg2hex[insn->dst_reg]; } -static int get_probe_mem_regno(const u8 *insn) +static void bpf_jit_probe_store_pre(struct bpf_jit *jit, struct bpf_insn *insn, + struct bpf_jit_probe *probe) { - /* - * insn must point to llgc, llgh, llgf or lg, which have destination - * register at the same position. - */ - if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */ - return -1; - if (insn[5] != 0x90 && /* llgc */ - insn[5] != 0x91 && /* llgh */ - insn[5] != 0x16 && /* llgf */ - insn[5] != 0x04) /* lg */ - return -1; - return insn[1] >> 4; + if (BPF_MODE(insn->code) != BPF_PROBE_MEM32) + return; + + /* lgrl %r1,kern_arena */ + EMIT6_PCREL_RILB(0xc4080000, REG_W1, jit->kern_arena); + probe->arena_reg = REG_W1; + probe->prg = jit->prg; } -bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) +static void bpf_jit_probe_atomic_pre(struct bpf_jit *jit, + struct bpf_insn *insn, + struct bpf_jit_probe *probe) { - regs->psw.addr = extable_fixup(x); - regs->gprs[x->data] = 0; - return true; + if (BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + return; + + /* lgrl %r1,kern_arena */ + EMIT6_PCREL_RILB(0xc4080000, REG_W1, jit->kern_arena); + /* agr %r1,%dst */ + EMIT4(0xb9080000, REG_W1, insn->dst_reg); + probe->arena_reg = REG_W1; + probe->prg = jit->prg; } -static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, - int probe_prg, int nop_prg) +static int bpf_jit_probe_post(struct bpf_jit *jit, struct bpf_prog *fp, + struct bpf_jit_probe *probe) { struct exception_table_entry *ex; - int reg, prg; + int i, prg; s64 delta; u8 *insn; - int i; + if (probe->prg == -1) + /* The probe is not armed. */ + return 0; + bpf_jit_probe_emit_nop(jit, probe); if (!fp->aux->extable) /* Do nothing during early JIT passes. */ return 0; - insn = jit->prg_buf + probe_prg; - reg = get_probe_mem_regno(insn); - if (WARN_ON_ONCE(reg < 0)) - /* JIT bug - unexpected probe instruction. */ - return -1; - if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + insn = jit->prg_buf + probe->prg; + if (WARN_ON_ONCE(probe->prg + insn_length(*insn) != probe->nop_prg)) /* JIT bug - gap between probe and nop instructions. */ return -1; for (i = 0; i < 2; i++) { @@ -640,29 +799,58 @@ static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, return -1; ex = &fp->aux->extable[jit->excnt]; /* Add extable entries for probe and nop instructions. */ - prg = i == 0 ? probe_prg : nop_prg; + prg = i == 0 ? probe->prg : probe->nop_prg; delta = jit->prg_buf + prg - (u8 *)&ex->insn; if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) /* JIT bug - code and extable must be close. */ return -1; ex->insn = delta; /* - * Always land on the nop. Note that extable infrastructure - * ignores fixup field, it is handled by ex_handler_bpf(). + * Land on the current instruction. Note that the extable + * infrastructure ignores the fixup field; it is handled by + * ex_handler_bpf(). */ - delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + delta = jit->prg_buf + jit->prg - (u8 *)&ex->fixup; if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) /* JIT bug - landing pad and extable must be close. */ return -1; ex->fixup = delta; ex->type = EX_TYPE_BPF; - ex->data = reg; + ex->data = probe->reg; jit->excnt++; } return 0; } /* + * Sign-extend the register if necessary + */ +static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +{ + if (!(flags & BTF_FMODEL_SIGNED_ARG)) + return 0; + + switch (size) { + case 1: + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + return 0; + case 2: + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + return 0; + case 4: + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + return 0; + case 8: + return 0; + default: + return -1; + } +} + +/* * Compile one eBPF instruction into s390x code * * NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of @@ -672,34 +860,80 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i, bool extra_pass, u32 stack_depth) { struct bpf_insn *insn = &fp->insnsi[i]; + s32 branch_oc_off = insn->off; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; + struct bpf_jit_probe probe; int last, insn_count = 1; u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; - int probe_prg = -1; unsigned int mask; - int nop_prg; int err; - if (BPF_CLASS(insn->code) == BPF_LDX && - BPF_MODE(insn->code) == BPF_PROBE_MEM) - probe_prg = jit->prg; + bpf_jit_probe_init(&probe); switch (insn->code) { /* * BPF_MOV */ - case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ - /* llgfr %dst,%src */ - EMIT4(0xb9160000, dst_reg, src_reg); - if (insn_is_zext(&insn[1])) - insn_count = 2; + case BPF_ALU | BPF_MOV | BPF_X: + switch (insn->off) { + case 0: /* DST = (u32) SRC */ + /* llgfr %dst,%src */ + EMIT4(0xb9160000, dst_reg, src_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case 8: /* DST = (u32)(s8) SRC */ + /* lbr %dst,%src */ + EMIT4(0xb9260000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + case 16: /* DST = (u32)(s16) SRC */ + /* lhr %dst,%src */ + EMIT4(0xb9270000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + } break; - case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ - /* lgr %dst,%src */ - EMIT4(0xb9040000, dst_reg, src_reg); + case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + int patch_brc; + + /* ltgr %dst,%src */ + EMIT4(0xb9020000, dst_reg, src_reg); + /* brc 8,0f */ + patch_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, 0); + /* iihf %dst,user_arena>>32 */ + EMIT6_IMM(0xc0080000, dst_reg, jit->user_arena >> 32); + /* 0: */ + if (jit->prg_buf) + *(u16 *)(jit->prg_buf + patch_brc + 2) = + (jit->prg - patch_brc) >> 1; + break; + } + switch (insn->off) { + case 0: /* DST = SRC */ + /* lgr %dst,%src */ + EMIT4(0xb9040000, dst_reg, src_reg); + break; + case 8: /* DST = (s8) SRC */ + /* lgbr %dst,%src */ + EMIT4(0xb9060000, dst_reg, src_reg); + break; + case 16: /* DST = (s16) SRC */ + /* lghr %dst,%src */ + EMIT4(0xb9070000, dst_reg, src_reg); + break; + case 32: /* DST = (s32) SRC */ + /* lgfr %dst,%src */ + EMIT4(0xb9140000, dst_reg, src_reg); + break; + } break; case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ /* llilf %dst,imm */ @@ -808,66 +1042,115 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * BPF_DIV / BPF_MOD */ - case BPF_ALU | BPF_DIV | BPF_X: /* dst = (u32) dst / (u32) src */ - case BPF_ALU | BPF_MOD | BPF_X: /* dst = (u32) dst % (u32) src */ + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); - /* dlr %w0,%src */ - EMIT4(0xb9970000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) src */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dlr %w0,%src */ + EMIT4(0xb9970000, REG_W0, src_reg); + break; + case 1: /* dst = (u32) ((s32) dst {/,%} (s32) src) */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgfr %r0,%src */ + EMIT4(0xb91d0000, REG_W0, src_reg); + break; + } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); if (insn_is_zext(&insn[1])) insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ - case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */ + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = dst {/,%} src */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlgr %w0,%src */ + EMIT4(0xb9870000, REG_W0, src_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) src */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsgr %w0,%src */ + EMIT4(0xb90d0000, REG_W0, src_reg); + break; + } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); break; } - case BPF_ALU | BPF_DIV | BPF_K: /* dst = (u32) dst / (u32) imm */ - case BPF_ALU | BPF_MOD | BPF_K: /* dst = (u32) dst % (u32) imm */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; if (imm == 1) { if (BPF_OP(insn->code) == BPF_MOD) - /* lhgi %dst,0 */ + /* lghi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); else EMIT_ZERO(dst_reg); break; } - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) { - /* dl %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, - EMIT_CONST_U32(imm)); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dl %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgf %r0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x001d, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + } } else { - /* lgfrl %dst,imm */ - EMIT6_PCREL_RILB(0xc40c0000, dst_reg, - _EMIT_CONST_U32(imm)); - jit->seen |= SEEN_LITERAL; - /* dlr %w0,%dst */ - EMIT4(0xb9970000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* lrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40d0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dlr %w0,%dst */ + EMIT4(0xb9970000, REG_W0, dst_reg); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %w1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* lgfrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40c0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); @@ -875,8 +1158,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ - case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */ + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; @@ -886,21 +1169,50 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7090000, dst_reg, 0); break; } - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { - /* dlg %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, - EMIT_CONST_U64(imm)); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x000d, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + } } else { - /* lgrl %dst,imm */ - EMIT6_PCREL_RILB(0xc4080000, dst_reg, - _EMIT_CONST_U64(imm)); - jit->seen |= SEEN_LITERAL; - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, dst_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); @@ -1113,6 +1425,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } break; case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: switch (imm) { case 16: /* dst = (u16) cpu_to_le16(dst) */ /* lrvr %dst,%dst */ @@ -1146,51 +1459,99 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * BPF_ST(X) */ case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */ - /* stcy %src,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0072, src_reg, dst_reg, REG_0, off); + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + bpf_jit_probe_store_pre(jit, insn, &probe); + /* stcy %src,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0072, src_reg, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_STX | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = src */ - /* sthy %src,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0070, src_reg, dst_reg, REG_0, off); + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + bpf_jit_probe_store_pre(jit, insn, &probe); + /* sthy %src,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0070, src_reg, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_STX | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = src */ - /* sty %src,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0050, src_reg, dst_reg, REG_0, off); + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + bpf_jit_probe_store_pre(jit, insn, &probe); + /* sty %src,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0050, src_reg, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_STX | BPF_MEM | BPF_DW: /* (u64 *)(dst + off) = src */ - /* stg %src,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, src_reg, dst_reg, REG_0, off); + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + bpf_jit_probe_store_pre(jit, insn, &probe); + /* stg %src,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, src_reg, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_ST | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = imm */ + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: /* lhi %w0,imm */ EMIT4_IMM(0xa7080000, REG_W0, (u8) imm); - /* stcy %w0,off(dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0072, REG_W0, dst_reg, REG_0, off); + bpf_jit_probe_store_pre(jit, insn, &probe); + /* stcy %w0,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0072, REG_W0, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_ST | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = imm */ + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: /* lhi %w0,imm */ EMIT4_IMM(0xa7080000, REG_W0, (u16) imm); - /* sthy %w0,off(dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0070, REG_W0, dst_reg, REG_0, off); + bpf_jit_probe_store_pre(jit, insn, &probe); + /* sthy %w0,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0070, REG_W0, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_ST | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = imm */ + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: /* llilf %w0,imm */ EMIT6_IMM(0xc00f0000, REG_W0, (u32) imm); - /* sty %w0,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0050, REG_W0, dst_reg, REG_0, off); + bpf_jit_probe_store_pre(jit, insn, &probe); + /* sty %w0,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0050, REG_W0, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; case BPF_ST | BPF_MEM | BPF_DW: /* *(u64 *)(dst + off) = imm */ + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: /* lgfi %w0,imm */ EMIT6_IMM(0xc0010000, REG_W0, imm); - /* stg %w0,off(%dst) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, dst_reg, REG_0, off); + bpf_jit_probe_store_pre(jit, insn, &probe); + /* stg %w0,off(%dst,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, dst_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; break; /* @@ -1198,17 +1559,36 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ case BPF_STX | BPF_ATOMIC | BPF_DW: case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: { bool is32 = BPF_SIZE(insn->code) == BPF_W; + /* + * Unlike loads and stores, atomics have only a base register, + * but no index register. For the non-arena case, simply use + * %dst as a base. For the arena case, use the work register + * %r1: first, load the arena base into it, and then add %dst + * to it. + */ + probe.arena_reg = dst_reg; + switch (insn->imm) { -/* {op32|op64} {%w0|%src},%src,off(%dst) */ #define EMIT_ATOMIC(op32, op64) do { \ + bpf_jit_probe_atomic_pre(jit, insn, &probe); \ + /* {op32|op64} {%w0|%src},%src,off(%arena) */ \ EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ - src_reg, dst_reg, off); \ - if (is32 && (insn->imm & BPF_FETCH)) \ - EMIT_ZERO(src_reg); \ + src_reg, probe.arena_reg, off); \ + err = bpf_jit_probe_post(jit, fp, &probe); \ + if (err < 0) \ + return err; \ + if (insn->imm & BPF_FETCH) { \ + /* bcr 14,0 - see atomic_fetch_{add,and,or,xor}() */ \ + _EMIT2(0x07e0); \ + if (is32) \ + EMIT_ZERO(src_reg); \ + } \ } while (0) case BPF_ADD: case BPF_ADD | BPF_FETCH: @@ -1231,25 +1611,50 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT_ATOMIC(0x00f7, 0x00e7); break; #undef EMIT_ATOMIC - case BPF_XCHG: - /* {ly|lg} %w0,off(%dst) */ + case BPF_XCHG: { + struct bpf_jit_probe load_probe = probe; + int loop_start; + + bpf_jit_probe_atomic_pre(jit, insn, &load_probe); + /* {ly|lg} %w0,off(%arena) */ EMIT6_DISP_LH(0xe3000000, is32 ? 0x0058 : 0x0004, REG_W0, REG_0, - dst_reg, off); - /* 0: {csy|csg} %w0,%src,off(%dst) */ + load_probe.arena_reg, off); + bpf_jit_probe_emit_nop(jit, &load_probe); + /* Reuse {ly|lg}'s arena_reg for {csy|csg}. */ + if (load_probe.prg != -1) { + probe.prg = jit->prg; + probe.arena_reg = load_probe.arena_reg; + } + loop_start = jit->prg; + /* 0: {csy|csg} %w0,%src,off(%arena) */ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, - REG_W0, src_reg, dst_reg, off); + REG_W0, src_reg, probe.arena_reg, off); + bpf_jit_probe_emit_nop(jit, &probe); /* brc 4,0b */ - EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6); + EMIT4_PCREL_RIC(0xa7040000, 4, loop_start); /* {llgfr|lgr} %src,%w0 */ EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0); + /* Both probes should land here on exception. */ + err = bpf_jit_probe_post(jit, fp, &load_probe); + if (err < 0) + return err; + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; if (is32 && insn_is_zext(&insn[1])) insn_count = 2; break; + } case BPF_CMPXCHG: - /* 0: {csy|csg} %b0,%src,off(%dst) */ + bpf_jit_probe_atomic_pre(jit, insn, &probe); + /* 0: {csy|csg} %b0,%src,off(%arena) */ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, - BPF_REG_0, src_reg, dst_reg, off); + BPF_REG_0, src_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; break; default: pr_err("Unknown atomic operation %02x\n", insn->imm); @@ -1264,42 +1669,97 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_B: - /* llgc %dst,0(off,%src) */ - EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* llgc %dst,off(%src,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_B: /* dst = *(s8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* lgb %dst,off(%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0077, dst_reg, src_reg, REG_0, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_H: - /* llgh %dst,0(off,%src) */ - EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* llgh %dst,off(%src,%arena) */ + EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_H: /* dst = *(s16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* lgh %dst,off(%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0015, dst_reg, src_reg, REG_0, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + bpf_jit_probe_load_pre(jit, insn, &probe); /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; - EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); + EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_W: /* dst = *(s32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* lgf %dst,off(%src) */ + jit->seen |= SEEN_MEM; + EMIT6_DISP_LH(0xe3000000, 0x0014, dst_reg, src_reg, REG_0, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; + break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_DW: - /* lg %dst,0(off,%src) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + bpf_jit_probe_load_pre(jit, insn, &probe); + /* lg %dst,off(%src,%arena) */ jit->seen |= SEEN_MEM; - EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); + EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, + probe.arena_reg, off); + err = bpf_jit_probe_post(jit, fp, &probe); + if (err < 0) + return err; break; /* * BPF_JMP / CALL */ case BPF_JMP | BPF_CALL: { - u64 func; + const struct btf_func_model *m; bool func_addr_fixed; - int ret; + int j, ret; + u64 func; ret = bpf_jit_get_func_addr(fp, insn, extra_pass, &func, &func_addr_fixed); @@ -1308,15 +1768,38 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* + * Copy the tail call counter to where the callee expects it. + * + * Note 1: The callee can increment the tail call counter, but + * we do not load it back, since the x86 JIT does not do this + * either. + * + * Note 2: We assume that the verifier does not let us call the + * main program, which clears the tail call counter on entry. + */ + /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, + 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); + + /* Sign-extend the kfunc arguments. */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + m = bpf_jit_find_kfunc_model(fp, insn); + if (!m) + return -1; + + for (j = 0; j < m->nr_args; j++) { + if (sign_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) + return -1; + } + } + /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (nospec_uses_trampoline()) { - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); - } else { - /* basr %r14,%w1 */ - EMIT2(0x0d00, REG_14, REG_W1); - } + /* %r1() */ + call_r1(jit); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); break; @@ -1329,10 +1812,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * B1: pointer to ctx * B2: pointer to bpf_array * B3: index in bpf_array - */ - jit->seen |= SEEN_TAIL_CALL; - - /* + * * if (index >= array->map.max_entries) * goto out; */ @@ -1384,7 +1864,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE, stack_depth); + save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); /* * goto *(prog->bpf_func + tail_call_start); @@ -1393,8 +1873,17 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* lg %r1,bpf_func(%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0, offsetof(struct bpf_prog, bpf_func)); - /* bc 0xf,tail_call_start(%r1) */ - _EMIT4(0x47f01000 + jit->tail_call_start); + if (nospec_uses_trampoline()) { + jit->seen |= SEEN_FUNC; + /* aghi %r1,tail_call_start */ + EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); + /* brcl 0xf,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + __s390_indirect_jump_r1); + } else { + /* bc 0xf,tail_call_start(%r1) */ + _EMIT4(0x47f01000 + jit->tail_call_start); + } /* out: */ if (jit->prg_buf) { *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = @@ -1437,6 +1926,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * instruction itself (loop) and for BPF with offset 0 we * branch to the instruction behind the branch. */ + case BPF_JMP32 | BPF_JA: /* if (true) */ + branch_oc_off = imm; + fallthrough; case BPF_JMP | BPF_JA: /* if (true) */ mask = 0xf000; /* j */ goto branch_oc; @@ -1605,14 +2097,16 @@ branch_xu: break; branch_oc: if (!is_first_pass(jit) && - can_use_rel(jit, addrs[i + off + 1])) { + can_use_rel(jit, addrs[i + branch_oc_off + 1])) { /* brc mask,off */ EMIT4_PCREL_RIC(0xa7040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } else { /* brcl mask,off */ EMIT6_PCREL_RILC(0xc0040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } break; } @@ -1621,22 +2115,6 @@ branch_oc: return -1; } - if (probe_prg != -1) { - /* - * Handlers of certain exceptions leave psw.addr pointing to - * the instruction directly after the failing one. Therefore, - * create two exception table entries and also add a nop in - * case two probing instructions come directly after each - * other. - */ - nop_prg = jit->prg; - /* bcr 0,%0 */ - _EMIT2(0x0700); - err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); - if (err < 0) - return err; - } - return insn_count; } @@ -1682,13 +2160,19 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, bool extra_pass, u32 stack_depth) { int i, insn_count, lit32_size, lit64_size; + u64 kern_arena; jit->lit32 = jit->lit32_start; jit->lit64 = jit->lit64_start; jit->prg = 0; jit->excnt = 0; - bpf_jit_prologue(jit, stack_depth); + kern_arena = bpf_arena_get_kern_vm_start(fp->aux->arena); + if (kern_arena) + jit->kern_arena = _EMIT_CONST_U64(kern_arena); + jit->user_arena = bpf_arena_get_user_vm_start(fp->aux->arena); + + bpf_jit_prologue(jit, fp, stack_depth); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { @@ -1735,9 +2219,25 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, struct bpf_prog *fp) { struct bpf_binary_header *header; + struct bpf_insn *insn; u32 extable_size; u32 code_size; + int i; + for (i = 0; i < fp->len; i++) { + insn = &fp->insnsi[i]; + + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC && + (BPF_SIZE(insn->code) == BPF_DW || + BPF_SIZE(insn->code) == BPF_W) && + insn->imm == BPF_XCHG) + /* + * bpf_jit_insn() emits a load and a compare-and-swap, + * both of which need to be probed. + */ + fp->aux->num_exentries += 1; + } /* We need two entries per insn. */ fp->aux->num_exentries *= 2; @@ -1836,7 +2336,11 @@ skip_init_ctx: print_fn_code(jit.prg_buf, jit.size_prg); } if (!fp->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) { + bpf_jit_binary_free(header); + fp = orig_fp; + goto free_addrs; + } } else { jit_data->header = header; jit_data->ctx = jit; @@ -1859,3 +2363,600 @@ out: tmp : orig_fp); return fp; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + struct bpf_plt expected_plt, current_plt, new_plt, *plt; + struct { + u16 opc; + s32 disp; + } __packed insn; + char *ret; + int err; + + /* Verify the branch to be patched. */ + err = copy_from_kernel_nofault(&insn, ip, sizeof(insn)); + if (err < 0) + return err; + if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) + return -EINVAL; + + if (t == BPF_MOD_JUMP && + insn.disp == ((char *)new_addr - (char *)ip) >> 1) { + /* + * The branch already points to the destination, + * there is no PLT. + */ + } else { + /* Verify the PLT. */ + plt = ip + (insn.disp << 1); + err = copy_from_kernel_nofault(¤t_plt, plt, + sizeof(current_plt)); + if (err < 0) + return err; + ret = (char *)ip + 6; + bpf_jit_plt(&expected_plt, ret, old_addr); + if (memcmp(¤t_plt, &expected_plt, sizeof(current_plt))) + return -EINVAL; + /* Adjust the call address. */ + bpf_jit_plt(&new_plt, ret, new_addr); + s390_kernel_write(&plt->target, &new_plt.target, + sizeof(void *)); + } + + /* Adjust the mask of the branch. */ + insn.opc = 0xc004 | (new_addr ? 0xf0 : 0); + s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1); + + /* Make the new code visible to the other CPUs. */ + text_poke_sync_lock(); + + return 0; +} + +struct bpf_tramp_jit { + struct bpf_jit common; + int orig_stack_args_off;/* Offset of arguments placed on stack by the + * func_addr's original caller + */ + int stack_size; /* Trampoline stack size */ + int backchain_off; /* Offset of backchain */ + int stack_args_off; /* Offset of stack arguments for calling + * func_addr, has to be at the top + */ + int reg_args_off; /* Offset of register arguments for calling + * func_addr + */ + int ip_off; /* For bpf_get_func_ip(), has to be at + * (ctx - 16) + */ + int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at + * (ctx - 8) + */ + int bpf_args_off; /* Offset of BPF_PROG context, which consists + * of BPF arguments followed by return value + */ + int retval_off; /* Offset of return value (see above) */ + int r7_r8_off; /* Offset of saved %r7 and %r8, which are used + * for __bpf_prog_enter() return value and + * func_addr respectively + */ + int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int tccnt_off; /* Offset of saved tailcall counter */ + int r14_off; /* Offset of saved %r14, has to be at the + * bottom */ + int do_fexit; /* do_fexit: label */ +}; + +static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) +{ + /* llihf %dst_reg,val_hi */ + EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32)); + /* oilf %rdst_reg,val_lo */ + EMIT6_IMM(0xc00d0000, dst_reg, val); +} + +static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_link *tlink, bool save_ret) +{ + struct bpf_jit *jit = &tjit->common; + int cookie_off = tjit->run_ctx_off + + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = tlink->link.prog; + int patch; + + /* + * run_ctx.cookie = tlink->cookie; + */ + + /* %r0 = tlink->cookie */ + load_imm64(jit, REG_W0, tlink->cookie); + /* stg %r0,cookie_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + + /* + * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) + * goto skip; + */ + + /* %r1 = __bpf_prog_enter */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* la %r3,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + /* ltgr %r7,%r2 */ + EMIT4(0xb9020000, REG_7, REG_2); + /* brcl 8,skip */ + patch = jit->prg; + EMIT6_PCREL_RILC(0xc0040000, 8, 0); + + /* + * retval = bpf_func(args, p->insnsi); + */ + + /* %r1 = p->bpf_func */ + load_imm64(jit, REG_1, (u64)p->bpf_func); + /* la %r2,bpf_args_off(%r15) */ + EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); + /* %r3 = p->insnsi */ + if (!p->jited) + load_imm64(jit, REG_3, (u64)p->insnsi); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + if (save_ret) { + if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + return -1; + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + } + + /* skip: */ + if (jit->prg_buf) + *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1; + + /* + * __bpf_prog_exit(p, start, &run_ctx); + */ + + /* %r1 = __bpf_prog_exit */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* lgr %r3,%r7 */ + EMIT4(0xb9040000, REG_3, REG_7); + /* la %r4,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + + return 0; +} + +static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size) +{ + int stack_offset = tjit->stack_size; + + tjit->stack_size += size; + return stack_offset; +} + +/* ABI uses %r2 - %r6 for parameter passing. */ +#define MAX_NR_REG_ARGS 5 + +/* The "L" field of the "mvc" instruction is 8 bits. */ +#define MAX_MVC_SIZE 256 +#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64)) + +/* -mfentry generates a 6-byte nop on s390x. */ +#define S390X_PATCH_SIZE 6 + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, + struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + int nr_bpf_args, nr_reg_args, nr_stack_args; + struct bpf_jit *jit = &tjit->common; + int arg, bpf_arg_off; + int i, j; + + /* Support as many stack arguments as "mvc" instruction can handle. */ + nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS); + nr_stack_args = m->nr_args - nr_reg_args; + if (nr_stack_args > MAX_NR_STACK_ARGS) + return -ENOTSUPP; + + /* Return to %r14 in the struct_ops case. */ + if (flags & BPF_TRAMP_F_INDIRECT) + flags |= BPF_TRAMP_F_SKIP_FRAME; + + /* + * Compute how many arguments we need to pass to BPF programs. + * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or + * smaller are packed into 1 or 2 registers; larger arguments are + * passed via pointers. + * In s390x ABI, arguments that are 8 bytes or smaller are packed into + * a register; larger arguments are passed via pointers. + * We need to deal with this difference. + */ + nr_bpf_args = 0; + for (i = 0; i < m->nr_args; i++) { + if (m->arg_size[i] <= 8) + nr_bpf_args += 1; + else if (m->arg_size[i] <= 16) + nr_bpf_args += 2; + else + return -ENOTSUPP; + } + + /* + * Calculate the stack layout. + */ + + /* + * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x + * ABI requires, put our backchain at the end of the allocated memory. + */ + tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->backchain_off = tjit->stack_size - sizeof(u64); + tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); + tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + tjit->ip_off = alloc_stack(tjit, sizeof(u64)); + tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); + tjit->retval_off = alloc_stack(tjit, sizeof(u64)); + tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); + tjit->run_ctx_off = alloc_stack(tjit, + sizeof(struct bpf_tramp_run_ctx)); + tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2); + /* + * In accordance with the s390x ABI, the caller has allocated + * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's + * backchain, and the rest we can use. + */ + tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64); + tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + + /* lgr %r1,%r15 */ + EMIT4(0xb9040000, REG_1, REG_15); + /* aghi %r15,-stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stg %r1,backchain_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, + tjit->backchain_off); + /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); + /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + for (i = 0, j = 0; i < m->nr_args; i++) { + if (i < MAX_NR_REG_ARGS) + arg = REG_2 + i; + else + arg = tjit->orig_stack_args_off + + (i - MAX_NR_REG_ARGS) * sizeof(u64); + bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64); + if (m->arg_size[i] <= 8) { + if (i < MAX_NR_REG_ARGS) + /* stg %arg,bpf_arg_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, arg, + REG_0, REG_15, bpf_arg_off); + else + /* mvc bpf_arg_off(8,%r15),arg(%r15) */ + _EMIT6(0xd207f000 | bpf_arg_off, + 0xf000 | arg); + j += 1; + } else { + if (i < MAX_NR_REG_ARGS) { + /* mvc bpf_arg_off(16,%r15),0(%arg) */ + _EMIT6(0xd20ff000 | bpf_arg_off, + reg2hex[arg] << 12); + } else { + /* lg %r1,arg(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0, + REG_15, arg); + /* mvc bpf_arg_off(16,%r15),0(%r1) */ + _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000); + } + j += 2; + } + } + /* stmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* stg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off); + + if (flags & BPF_TRAMP_F_ORIG_STACK) { + /* + * The ftrace trampoline puts the return address (which is the + * address of the original function + S390X_PATCH_SIZE) into + * %r0; see ftrace_shared_hotpatch_trampoline_br and + * ftrace_init_nop() for details. + */ + + /* lgr %r8,%r0 */ + EMIT4(0xb9040000, REG_8, REG_0); + } else { + /* %r8 = func_addr + S390X_PATCH_SIZE */ + load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); + } + + /* + * ip = func_addr; + * arg_cnt = m->nr_args; + */ + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* %r0 = func_addr */ + load_imm64(jit, REG_0, (u64)func_addr); + /* stg %r0,ip_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->ip_off); + } + /* lghi %r0,nr_bpf_args */ + EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); + /* stg %r0,arg_cnt_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->arg_cnt_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * __bpf_tramp_enter(im); + */ + + /* %r1 = __bpf_tramp_enter */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fentry->links[i], + flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + /* + * retval = 0; + */ + + /* xc retval_off(8,%r15),retval_off(%r15) */ + _EMIT6(0xd707f000 | tjit->retval_off, + 0xf000 | tjit->retval_off); + + for (i = 0; i < fmod_ret->nr_links; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + return -EINVAL; + + /* + * if (retval) + * goto do_fexit; + */ + + /* ltg %r0,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15, + tjit->retval_off); + /* brcl 7,do_fexit */ + EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit); + } + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * retval = func_addr(args); + */ + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */ + if (nr_stack_args) + _EMIT6(0xd200f000 | + (nr_stack_args * sizeof(u64) - 1) << 16 | + tjit->stack_args_off, + 0xf000 | tjit->orig_stack_args_off); + /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); + /* lgr %r1,%r8 */ + EMIT4(0xb9040000, REG_1, REG_8); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + + im->ip_after_call = jit->prg_buf + jit->prg; + + /* + * The following nop will be patched by bpf_tramp_image_put(). + */ + + /* brcl 0,im->ip_epilogue */ + EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); + } + + /* do_fexit: */ + tjit->do_fexit = jit->prg; + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = jit->prg_buf + jit->prg; + + /* + * __bpf_tramp_exit(im); + */ + + /* %r1 = __bpf_tramp_exit */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* lgr %r1,%r8 */ + if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + EMIT4(0xb9040000, REG_1, REG_8); + /* lmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* lg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off); + /* lg %r2,retval_off(%r15) */ + if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, + tjit->retval_off); + /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + 0xf000 | tjit->tccnt_off); + /* aghi %r15,stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); + if (flags & BPF_TRAMP_F_SKIP_FRAME) + EMIT_JUMP_REG(14); + else + EMIT_JUMP_REG(1); + + return 0; +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *orig_call) +{ + struct bpf_tramp_image im; + struct bpf_tramp_jit tjit; + int ret; + + memset(&tjit, 0, sizeof(tjit)); + + ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, + tlinks, orig_call); + + return ret < 0 ? ret : tjit.common.prg; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_jit tjit; + int ret; + + /* Compute offsets, check whether the code fits. */ + memset(&tjit, 0, sizeof(tjit)); + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + + tjit.common.prg = 0; + tjit.common.prg_buf = image; + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + return ret < 0 ? ret : tjit.common.prg; +} + +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} + +bool bpf_jit_supports_arena(void) +{ + return true; +} + +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + } + return true; +} + +bool bpf_jit_supports_exceptions(void) +{ + /* + * Exceptions require unwinding support, which is always available, + * because the kernel is always built with backchain. + */ + return true; +} + +void arch_bpf_stack_walk(bool (*consume_fn)(void *, u64, u64, u64), + void *cookie) +{ + unsigned long addr, prev_addr = 0; + struct unwind_state state; + + unwind_for_each_frame(&state, NULL, NULL, 0) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + /* + * addr is a return address and state.sp is the value of %r15 + * at this address. exception_cb needs %r15 at entry to the + * function containing addr, so take the next state.sp. + * + * There is no bp, and the exception_cb prog does not need one + * to perform a quasi-longjmp. The common code requires a + * non-zero bp, so pass sp there as well. + */ + if (prev_addr && !consume_fn(cookie, prev_addr, state.sp, + state.sp)) + break; + prev_addr = addr; + } +} diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index bf557a1b789c..1810e0944a4e 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,7 +3,8 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ - pci_bus.o + pci_bus.o pci_kvm_hook.o pci_report.o pci_fixup.o obj-$(CONFIG_PCI_IOV) += pci_iov.o +obj-$(CONFIG_SYSFS) += pci_sysfs.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index bc980fd313d5..cd6676c2d602 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -28,7 +28,10 @@ #include <linux/jump_label.h> #include <linux/pci.h> #include <linux/printk.h> +#include <linux/lockdep.h> +#include <linux/list_sort.h> +#include <asm/machine.h> #include <asm/isc.h> #include <asm/airq.h> #include <asm/facility.h> @@ -42,6 +45,7 @@ /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); +static DEFINE_MUTEX(zpci_add_remove_lock); static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); @@ -61,6 +65,21 @@ DEFINE_STATIC_KEY_FALSE(have_mio); static struct kmem_cache *zdev_fmb_cache; +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + +void zpci_zdev_put(struct zpci_dev *zdev) +{ + if (!zdev) + return; + mutex_lock(&zpci_add_remove_lock); + kref_put_lock(&zdev->kref, zpci_release_device, &zpci_list_lock); + mutex_unlock(&zpci_add_remove_lock); +} + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -110,21 +129,26 @@ EXPORT_SYMBOL_GPL(pci_proc_domain); /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, - u64 base, u64 limit, u64 iota) + u64 base, u64 limit, u64 iota, u8 *status) { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); struct zpci_fib fib = {0}; - u8 cc, status; + u8 cc; - WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; - fib.pal = limit; - fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; - cc = zpci_mod_fc(req, &fib, &status); + /* Work around off by one in ISM virt device */ + if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) + fib.pal = limit + (1 << 12); + else + fib.pal = limit; + fib.iota = iota; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, status); if (cc) - zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); return cc; } +EXPORT_SYMBOL_GPL(zpci_register_ioat); /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) @@ -133,6 +157,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc) zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); @@ -143,7 +169,9 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) int zpci_fmb_enable_device(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_iommu_ctrs *ctrs; struct zpci_fib fib = {0}; + unsigned long flags; u8 cc, status; if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) @@ -155,11 +183,20 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) WARN_ON((u64) zdev->fmb & 0xf); /* reset software counters */ - atomic64_set(&zdev->allocated_pages, 0); - atomic64_set(&zdev->mapped_pages, 0); - atomic64_set(&zdev->unmapped_pages, 0); + spin_lock_irqsave(&zdev->dom_lock, flags); + ctrs = zpci_get_iommu_ctrs(zdev); + if (ctrs) { + atomic64_set(&ctrs->mapped_pages, 0); + atomic64_set(&ctrs->unmapped_pages, 0); + atomic64_set(&ctrs->global_rpcits, 0); + atomic64_set(&ctrs->sync_map_rpcits, 0); + atomic64_set(&ctrs->sync_rpcits, 0); + } + spin_unlock_irqrestore(&zdev->dom_lock, flags); + fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) { kmem_cache_free(zdev_fmb_cache, zdev->fmb); @@ -178,6 +215,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) if (!zdev->fmb) return -EINVAL; + fib.gd = zdev->gisa; + /* Function measurement is disabled if fmb address is zero */ cc = zpci_mod_fc(req, &fib, &status); if (cc == 3) /* Function already gone. */ @@ -225,68 +264,25 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return 0; } -/* combine single writes by using store-block insn */ -void __iowrite64_copy(void __iomem *to, const void *from, size_t count) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t prot) { - zpci_memcpy_toio(to, from, count); -} - -static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) -{ - unsigned long offset, vaddr; - struct vm_struct *area; - phys_addr_t last_addr; - - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. + */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) addr; + return (void __iomem *)phys_addr; - offset = addr & ~PAGE_MASK; - addr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { - free_vm_area(area); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); -} - -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) -{ - return __ioremap(addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); -void __iomem *ioremap(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(ioremap); - -void __iomem *ioremap_wc(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem *ioremap_wt(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wt); - void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); @@ -531,8 +527,7 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, return r; } -int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources) +int zpci_setup_bus_resources(struct zpci_dev *zdev) { unsigned long addr, size, flags; struct resource *res; @@ -568,7 +563,6 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev, return -ENOMEM; } zdev->bars[i].res = res; - pci_add_resource(resources, res); } zdev->has_resources = 1; @@ -577,17 +571,23 @@ int zpci_setup_bus_resources(struct zpci_dev *zdev, static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { + struct resource *res; int i; + pci_lock_rescan_remove(); for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (!zdev->bars[i].size || !zdev->bars[i].res) + res = zdev->bars[i].res; + if (!res) continue; + release_resource(res); + pci_bus_remove_resource(zdev->zbus->bus, res); zpci_free_iomap(zdev, zdev->bars[i].map_idx); - release_resource(zdev->bars[i].res); - kfree(zdev->bars[i].res); + zdev->bars[i].res = NULL; + kfree(res); } zdev->has_resources = 0; + pci_unlock_rescan_remove(); } int pcibios_device_add(struct pci_dev *pdev) @@ -601,8 +601,6 @@ int pcibios_device_add(struct pci_dev *pdev) if (pdev->is_physfn) pdev->no_vf_scan = 1; - pdev->dev.groups = zpci_attr_groups; - pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -700,6 +698,24 @@ int zpci_enable_device(struct zpci_dev *zdev) zpci_update_fh(zdev, fh); return rc; } +EXPORT_SYMBOL_GPL(zpci_enable_device); + +int zpci_reenable_device(struct zpci_dev *zdev) +{ + u8 status; + int rc; + + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + rc = zpci_iommu_register_ioat(zdev, &status); + if (rc) + zpci_disable_device(zdev); + + return rc; +} +EXPORT_SYMBOL_GPL(zpci_reenable_device); int zpci_disable_device(struct zpci_dev *zdev) { @@ -723,6 +739,7 @@ int zpci_disable_device(struct zpci_dev *zdev) } return rc; } +EXPORT_SYMBOL_GPL(zpci_disable_device); /** * zpci_hot_reset_device - perform a reset of the given zPCI function @@ -738,12 +755,12 @@ int zpci_disable_device(struct zpci_dev *zdev) * equivalent to its state during boot when first probing a driver. * Consequently after reset the PCI function requires re-initialization via the * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors() - * and enabling the function via e.g.pci_enablde_device_flags().The caller + * and enabling the function via e.g. pci_enable_device_flags(). The caller * must guard against concurrent reset attempts. * * In most cases this function should not be called directly but through * pci_reset_function() or pci_reset_bus() which handle the save/restore and - * locking. + * locking - asserted by lockdep. * * Return: 0 on success and an error value otherwise */ @@ -751,6 +768,7 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) { int rc; + lockdep_assert_held(&zdev->state_lock); zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); if (zdev_enabled(zdev)) { /* Disables device access, DMAs and IRQs (reset state) */ @@ -766,21 +784,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) return rc; } - rc = zpci_enable_device(zdev); - if (rc) - return rc; + rc = zpci_reenable_device(zdev); - if (zdev->dma_table) - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); - else - rc = zpci_dma_init_device(zdev); - if (rc) { - zpci_disable_device(zdev); - return rc; - } - - return 0; + return rc; } /** @@ -789,8 +795,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) * @fh: Current Function Handle of the device to be created * @state: Initial state after creation either Standby or Configured * - * Creates a new zpci device and adds it to its, possibly newly created, zbus - * as well as zpci_list. + * Allocates a new struct zpci_dev and queries the platform for its details. + * If successful the device can subsequently be added to the zPCI subsystem + * using zpci_add_device(). * * Returns: the zdev on success or an error pointer otherwise */ @@ -799,7 +806,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) struct zpci_dev *zdev; int rc; - zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); if (!zdev) return ERR_PTR(-ENOMEM); @@ -814,9 +820,35 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) goto error; zdev->state = state; - kref_init(&zdev->kref); - mutex_init(&zdev->lock); + mutex_init(&zdev->state_lock); + mutex_init(&zdev->fmb_lock); + mutex_init(&zdev->kzdev_lock); + + return zdev; +error: + zpci_dbg(0, "crt fid:%x, rc:%d\n", fid, rc); + kfree(zdev); + return ERR_PTR(rc); +} + +/** + * zpci_add_device() - Add a previously created zPCI device to the zPCI subsystem + * @zdev: The zPCI device to be added + * + * A struct zpci_dev is added to the zPCI subsystem and to a virtual PCI bus creating + * a new one as necessary. A hotplug slot is created and events start to be handled. + * If successful from this point on zpci_zdev_get() and zpci_zdev_put() must be used. + * If adding the struct zpci_dev fails the device was not added and should be freed. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_add_device(struct zpci_dev *zdev) +{ + int rc; + + mutex_lock(&zpci_add_remove_lock); + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state); rc = zpci_init_iommu(zdev); if (rc) goto error; @@ -825,18 +857,19 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) if (rc) goto error_destroy_iommu; + kref_init(&zdev->kref); spin_lock(&zpci_list_lock); list_add_tail(&zdev->entry, &zpci_list); spin_unlock(&zpci_list_lock); - - return zdev; + mutex_unlock(&zpci_add_remove_lock); + return 0; error_destroy_iommu: zpci_destroy_iommu(zdev); error: - zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); - kfree(zdev); - return ERR_PTR(rc); + zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc); + mutex_unlock(&zpci_add_remove_lock); + return rc; } bool zpci_is_device_configured(struct zpci_dev *zdev) @@ -853,32 +886,15 @@ bool zpci_is_device_configured(struct zpci_dev *zdev) * @fh: The general function handle supplied by the platform * * Given a device in the configuration state Configured, enables, scans and - * adds it to the common code PCI subsystem if possible. If the PCI device is - * parked because we can not yet create a PCI bus because we have not seen - * function 0, it is ignored but will be scanned once function 0 appears. - * If any failure occurs, the zpci_dev is left disabled. + * adds it to the common code PCI subsystem if possible. If any failure occurs, + * the zpci_dev is left disabled. * * Return: 0 on success, or an error code otherwise */ int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { - int rc; - zpci_update_fh(zdev, fh); - /* the PCI function will be scanned once function 0 appears */ - if (!zdev->zbus->bus) - return 0; - - /* For function 0 on a multi-function bus scan whole bus as we might - * have to pick up existing functions waiting for it to allow creating - * the PCI bus - */ - if (zdev->devfn == 0 && zdev->zbus->multifunction) - rc = zpci_bus_scan_bus(zdev->zbus); - else - rc = zpci_bus_scan_device(zdev); - - return rc; + return zpci_bus_scan_device(zdev); } /** @@ -895,14 +911,13 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) { int rc; + lockdep_assert_held(&zdev->state_lock); + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + return 0; + if (zdev->zbus->bus) zpci_bus_remove_device(zdev, false); - if (zdev->dma_table) { - rc = zpci_dma_exit_device(zdev); - if (rc) - return rc; - } if (zdev_enabled(zdev)) { rc = zpci_disable_device(zdev); if (rc) @@ -919,67 +934,50 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) } /** - * zpci_device_reserved() - Mark device as resverved + * zpci_device_reserved() - Mark device as reserved * @zdev: the zpci_dev that was reserved * * Handle the case that a given zPCI function was reserved by another system. - * After a call to this function the zpci_dev can not be found via - * get_zdev_by_fid() anymore but may still be accessible via existing - * references though it will not be functional anymore. */ void zpci_device_reserved(struct zpci_dev *zdev) { - if (zdev->has_hp_slot) - zpci_exit_slot(zdev); - /* - * Remove device from zpci_list as it is going away. This also - * makes sure we ignore subsequent zPCI events for this device. - */ - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); + lockdep_assert_held(&zdev->state_lock); + /* We may declare the device reserved multiple times */ + if (zdev->state == ZPCI_FN_STATE_RESERVED) + return; zdev->state = ZPCI_FN_STATE_RESERVED; zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + /* + * The underlying device is gone. Allow the zdev to be freed + * as soon as all other references are gone by accounting for + * the removal as a dropped reference. + */ zpci_zdev_put(zdev); } void zpci_release_device(struct kref *kref) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); - int ret; - if (zdev->zbus->bus) - zpci_bus_remove_device(zdev, false); + lockdep_assert_held(&zpci_add_remove_lock); + WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED); + /* + * We already hold zpci_list_lock thanks to kref_put_lock(). + * This makes sure no new reference can be taken from the list. + */ + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); - if (zdev->dma_table) - zpci_dma_exit_device(zdev); - if (zdev_enabled(zdev)) - zpci_disable_device(zdev); + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); - switch (zdev->state) { - case ZPCI_FN_STATE_CONFIGURED: - ret = sclp_pci_deconfigure(zdev->fid); - zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); - fallthrough; - case ZPCI_FN_STATE_STANDBY: - if (zdev->has_hp_slot) - zpci_exit_slot(zdev); - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); - zpci_dbg(3, "rsv fid:%x\n", zdev->fid); - fallthrough; - case ZPCI_FN_STATE_RESERVED: - if (zdev->has_resources) - zpci_cleanup_bus_resources(zdev); - zpci_bus_device_unregister(zdev); - zpci_destroy_iommu(zdev); - fallthrough; - default: - break; - } + if (zdev->has_resources) + zpci_cleanup_bus_resources(zdev); + + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); + kfree_rcu(zdev, rcu); } int zpci_report_error(struct pci_dev *pdev, @@ -1094,7 +1092,7 @@ char * __init pcibios_setup(char *str) return NULL; } if (!strcmp(str, "nomio")) { - S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO; + clear_machine_feature(MFEATURE_PCI_MIO); return NULL; } if (!strcmp(str, "force_floating")) { @@ -1113,6 +1111,50 @@ bool zpci_is_enabled(void) return s390_pci_initialized; } +static int zpci_cmp_rid(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct zpci_dev *za = container_of(a, struct zpci_dev, entry); + struct zpci_dev *zb = container_of(b, struct zpci_dev, entry); + + /* + * PCI functions without RID available maintain original order + * between themselves but sort before those with RID. + */ + if (za->rid == zb->rid) + return za->rid_available > zb->rid_available; + /* + * PCI functions with RID sort by RID ascending. + */ + return za->rid > zb->rid; +} + +static void zpci_add_devices(struct list_head *scan_list) +{ + struct zpci_dev *zdev, *tmp; + + list_sort(NULL, scan_list, &zpci_cmp_rid); + list_for_each_entry_safe(zdev, tmp, scan_list, entry) { + list_del_init(&zdev->entry); + if (zpci_add_device(zdev)) + kfree(zdev); + } +} + +int zpci_scan_devices(void) +{ + LIST_HEAD(scan_list); + int rc; + + rc = clp_scan_pci_devices(&scan_list); + if (rc) + return rc; + + zpci_add_devices(&scan_list); + zpci_bus_scan_busses(); + return 0; +} + static int __init pci_base_init(void) { int rc; @@ -1125,9 +1167,9 @@ static int __init pci_base_init(void) return 0; } - if (MACHINE_HAS_PCI_MIO) { + if (test_machine_feature(MFEATURE_PCI_MIO)) { static_branch_enable(&have_mio); - ctl_set_bit(2, 5); + system_ctl_set_bit(2, CR2_MIO_ADDRESSING_BIT); } rc = zpci_debug_init(); @@ -1142,21 +1184,14 @@ static int __init pci_base_init(void) if (rc) goto out_irq; - rc = zpci_dma_init(); - if (rc) - goto out_dma; - - rc = clp_scan_pci_devices(); + rc = zpci_scan_devices(); if (rc) goto out_find; - zpci_bus_scan_busses(); s390_pci_initialized = 1; return 0; out_find: - zpci_dma_exit(); -out_dma: zpci_irq_exit(); out_irq: zpci_mem_exit(); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 5d77acbd1c87..81bdb54ad5e3 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -19,6 +19,7 @@ #include <linux/jump_label.h> #include <linux/pci.h> #include <linux/printk.h> +#include <linux/dma-direct.h> #include <asm/pci_clp.h> #include <asm/pci_dma.h> @@ -41,26 +42,19 @@ static int zpci_nb_devices; */ static int zpci_bus_prepare_device(struct zpci_dev *zdev) { - struct resource_entry *window, *n; - struct resource *res; - int rc; + int rc, i; if (!zdev_enabled(zdev)) { rc = zpci_enable_device(zdev); if (rc) return rc; - rc = zpci_dma_init_device(zdev); - if (rc) { - zpci_disable_device(zdev); - return rc; - } } if (!zdev->has_resources) { - zpci_setup_bus_resources(zdev, &zdev->zbus->resources); - resource_list_for_each_entry_safe(window, n, &zdev->zbus->resources) { - res = window->res; - pci_bus_add_resource(zdev->zbus->bus, res, 0); + zpci_setup_bus_resources(zdev); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (zdev->bars[i].res) + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res); } } @@ -87,9 +81,8 @@ int zpci_bus_scan_device(struct zpci_dev *zdev) if (!pdev) return -ENODEV; - pci_bus_add_device(pdev); pci_lock_rescan_remove(); - pci_bus_add_devices(zdev->zbus->bus); + pci_bus_add_device(pdev); pci_unlock_rescan_remove(); return 0; @@ -132,11 +125,8 @@ void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error) * @zbus: the zbus to be scanned * * Enables and scans all PCI functions on the bus making them available to the - * common PCI code. If there is no function 0 on the zbus nothing is scanned. If - * a function does not have a slot yet because it was added to the zbus before - * function 0 the slot is created. If a PCI function fails to be initialized - * an error will be returned but attempts will still be made for all other - * functions on the bus. + * common PCI code. If a PCI function fails to be initialized an error will be + * returned but attempts will still be made for all other functions on the bus. * * Return: 0 on success, an error value otherwise */ @@ -145,9 +135,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus) struct zpci_dev *zdev; int devfn, rc, ret = 0; - if (!zbus->function[0]) - return 0; - for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) { zdev = zbus->function[devfn]; if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) { @@ -182,28 +169,34 @@ void zpci_bus_scan_busses(void) mutex_unlock(&zbus_list_lock); } +static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) +{ + return !s390_pci_no_rid && zdev->rid_available && + !zdev->vfn; +} + /* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus * @zbus: the zbus holding the zdevices - * @f0: function 0 of the bus + * @fr: PCI root function that will determine the bus's domain, and bus speed * @ops: the pci operations * - * Function zero is taken as a parameter as this is used to determine the - * domain, multifunction property and maximum bus speed of the entire bus. + * The PCI function @fr determines the domain (its UID), multifunction property + * and maximum bus speed of the entire bus. * * Return: 0 on success, an error code otherwise */ -static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *f0, struct pci_ops *ops) +static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops) { struct pci_bus *bus; int domain; - domain = zpci_alloc_domain((u16)f0->uid); + domain = zpci_alloc_domain((u16)fr->uid); if (domain < 0) return domain; zbus->domain_nr = domain; - zbus->multifunction = f0->rid_available; - zbus->max_bus_speed = f0->max_bus_speed; + zbus->multifunction = zpci_bus_is_multifunction_root(fr); + zbus->max_bus_speed = fr->max_bus_speed; /* * Note that the zbus->resources are taken over and zbus->resources @@ -216,7 +209,6 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *f0, s } zbus->bus = bus; - pci_bus_add_devices(bus); return 0; } @@ -247,13 +239,15 @@ static void zpci_bus_put(struct zpci_bus *zbus) kref_put(&zbus->kref, zpci_bus_release); } -static struct zpci_bus *zpci_bus_get(int pchid) +static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) { struct zpci_bus *zbus; mutex_lock(&zbus_list_lock); list_for_each_entry(zbus, &zbus_list, bus_next) { - if (pchid == zbus->pchid) { + if (!zbus->multifunction) + continue; + if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { kref_get(&zbus->kref); goto out_unlock; } @@ -264,7 +258,7 @@ out_unlock: return zbus; } -static struct zpci_bus *zpci_bus_alloc(int pchid) +static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) { struct zpci_bus *zbus; @@ -272,7 +266,8 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) if (!zbus) return NULL; - zbus->pchid = pchid; + zbus->topo = topo; + zbus->topo_is_tid = topo_is_tid; INIT_LIST_HEAD(&zbus->bus_next); mutex_lock(&zbus_list_lock); list_add_tail(&zbus->bus_next, &zbus_list); @@ -289,10 +284,32 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) return zbus; } +static void pci_dma_range_setup(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + u64 aligned_end, size; + dma_addr_t dma_start; + int ret; + + dma_start = PAGE_ALIGN(zdev->start_dma); + aligned_end = PAGE_ALIGN_DOWN(zdev->end_dma + 1); + if (aligned_end >= dma_start) + size = aligned_end - dma_start; + else + size = 0; + WARN_ON_ONCE(size == 0); + + ret = dma_direct_set_offset(&pdev->dev, 0, dma_start, size); + if (ret) + pr_err("Failed to allocate DMA range map for %s\n", pci_name(pdev)); +} + void pcibios_bus_add_device(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); + pci_dma_range_setup(pdev); + /* * With pdev->no_vf_scan the common PCI probing code does not * perform PF/VF linking. @@ -303,51 +320,18 @@ void pcibios_bus_add_device(struct pci_dev *pdev) } } -/* zpci_bus_create_hotplug_slots - Add hotplug slot(s) for device added to bus - * @zdev: the zPCI device that was newly added - * - * Add the hotplug slot(s) for the newly added PCI function. Normally this is - * simply the slot for the function itself. If however we are adding the - * function 0 on a zbus, it might be that we already registered functions on - * that zbus but could not create their hotplug slots yet so add those now too. - * - * Return: 0 on success, an error code otherwise - */ -static int zpci_bus_create_hotplug_slots(struct zpci_dev *zdev) +static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) { - struct zpci_bus *zbus = zdev->zbus; - int devfn, rc = 0; - - rc = zpci_init_slot(zdev); - if (rc) - return rc; - zdev->has_hp_slot = 1; + int rc = -EINVAL; - if (zdev->devfn == 0 && zbus->multifunction) { - /* Now that function 0 is there we can finally create the - * hotplug slots for those functions with devfn != 0 that have - * been parked in zbus->function[] waiting for us to be able to - * create the PCI bus. - */ - for (devfn = 1; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) { - zdev = zbus->function[devfn]; - if (zdev && !zdev->has_hp_slot) { - rc = zpci_init_slot(zdev); - if (rc) - return rc; - zdev->has_hp_slot = 1; - } + if (zbus->multifunction) { + if (!zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + return rc; } - + zdev->devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; } - return rc; -} - -static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) -{ - int rc = -EINVAL; - if (zbus->function[zdev->devfn]) { pr_err("devfn %04x is already assigned\n", zdev->devfn); return rc; @@ -356,17 +340,10 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) zbus->function[zdev->devfn] = zdev; zpci_nb_devices++; - if (zbus->bus) { - if (zbus->multifunction && !zdev->rid_available) { - WARN_ONCE(1, "rid_available not set for multifunction\n"); - goto error; - } - - zpci_bus_create_hotplug_slots(zdev); - } else { - /* Hotplug slot will be created once function 0 appears */ - zbus->multifunction = 1; - } + rc = zpci_init_slot(zdev); + if (rc) + goto error; + zdev->has_hp_slot = 1; return 0; @@ -377,10 +354,25 @@ error: return rc; } +static bool zpci_bus_is_isolated_vf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + + if (!zdev->vfn) + return false; + + pdev = zpci_iov_find_parent_pf(zbus, zdev); + if (!pdev) + return true; + pci_dev_put(pdev); + return false; +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { + bool topo_is_tid = zdev->tid_avail; struct zpci_bus *zbus = NULL; - int rc = -EBADF; + int topo, rc = -EBADF; if (zpci_nb_devices == ZPCI_NR_DEVICES) { pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", @@ -388,19 +380,28 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) return -ENOSPC; } - if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) - return -EINVAL; - - if (!s390_pci_no_rid && zdev->rid_available) - zbus = zpci_bus_get(zdev->pchid); + topo = topo_is_tid ? zdev->tid : zdev->pchid; + zbus = zpci_bus_get(topo, topo_is_tid); + /* + * An isolated VF gets its own domain/bus even if there exists + * a matching domain/bus already + */ + if (zbus && zpci_bus_is_isolated_vf(zbus, zdev)) { + zpci_bus_put(zbus); + zbus = NULL; + } if (!zbus) { - zbus = zpci_bus_alloc(zdev->pchid); + zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) return -ENOMEM; } - if (zdev->devfn == 0) { + if (!zbus->bus) { + /* The UID of the first PCI function registered with a zpci_bus + * is used as the domain number for that bus. Currently there + * is exactly one zpci_bus per domain. + */ rc = zpci_bus_create_pci_bus(zbus, zdev, ops); if (rc) goto error; diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index e96c9860e064..ae3d7a9159bd 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -6,6 +6,10 @@ * Pierre Morel <pmorel@linux.ibm.com> * */ +#ifndef __S390_PCI_BUS_H +#define __S390_PCI_BUS_H + +#include <linux/pci.h> int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); void zpci_bus_device_unregister(struct zpci_dev *zdev); @@ -17,11 +21,8 @@ int zpci_bus_scan_device(struct zpci_dev *zdev); void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); void zpci_release_device(struct kref *kref); -static inline void zpci_zdev_put(struct zpci_dev *zdev) -{ - if (zdev) - kref_put(&zdev->kref, zpci_release_device); -} + +void zpci_zdev_put(struct zpci_dev *zdev); static inline void zpci_zdev_get(struct zpci_dev *zdev) { @@ -30,8 +31,7 @@ static inline void zpci_zdev_get(struct zpci_dev *zdev) int zpci_alloc_domain(int domain); void zpci_free_domain(int domain); -int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources); +int zpci_setup_bus_resources(struct zpci_dev *zdev); static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, unsigned int devfn) @@ -41,3 +41,4 @@ static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn]; } +#endif /* __S390_PCI_BUS_H */ diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 375e0a5120bc..241f7251c873 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -20,6 +20,7 @@ #include <asm/asm-extable.h> #include <asm/pci_debug.h> #include <asm/pci_clp.h> +#include <asm/asm.h> #include <asm/clp.h> #include <uapi/asm/clp.h> @@ -52,18 +53,20 @@ static inline void zpci_err_clp(unsigned int rsp, int rc) static inline int clp_get_ilp(unsigned long *ilp) { unsigned long mask; - int cc = 3; + int cc, exception; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rrf,0xb9a00000,%[mask],%[cmd],8,0\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [mask] "=d" (mask) : [cmd] "a" (1) - : "cc"); + : CC_OUT(cc, cc), [mask] "=d" (mask), [exc] "+d" (exception) + : [cmd] "a" (1) + : CC_CLOBBER); *ilp = mask; - return cc; + return exception ? 3 : CC_TRANSFORM(cc); } /* @@ -72,19 +75,20 @@ static inline int clp_get_ilp(unsigned long *ilp) static __always_inline int clp_req(void *data, unsigned int lps) { struct { u8 _[CLP_BLK_SIZE]; } *req = data; + int cc, exception; u64 ignored; - int cc = 3; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rrf,0xb9a00000,%[ign],%[req],0,%[lps]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [ign] "=d" (ignored), "+m" (*req) + : CC_OUT(cc, cc), [ign] "=d" (ignored), "+m" (*req), [exc] "+d" (exception) : [req] "a" (req), [lps] "i" (lps) - : "cc"); - return cc; + : CC_CLOBBER); + return exception ? 3 : CC_TRANSFORM(cc); } static void *clp_alloc_block(gfp_t gfp_mask) @@ -106,6 +110,9 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->max_msi = response->noi; zdev->fmb_update = response->mui; zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; + zdev->rtr_avail = response->rtr; switch (response->version) { case 1: @@ -160,12 +167,16 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->pft = response->pft; zdev->vfn = response->vfn; zdev->port = response->port; + zdev->fidparm = response->fidparm; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; - zdev->rid_available = response->rid_avail; zdev->is_physfn = response->is_physfn; - if (!s390_pci_no_rid && zdev->rid_available) - zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; + zdev->rid_available = response->rid_avail; + if (zdev->rid_available) + zdev->rid = response->rid; + zdev->tid_avail = response->tid_avail; + if (zdev->tid_avail) + zdev->tid = response->tid; memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); if (response->util_str_avail) { @@ -229,12 +240,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma { struct clp_req_rsp_set_pci *rrb; int rc, retries = 100; + u32 gisa = 0; *fh = 0; rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + do { memset(rrb, 0, sizeof(*rrb)); rrb->request.hdr.len = sizeof(rrb->request); @@ -243,6 +258,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma rrb->request.fh = zdev->fh; rrb->request.oc = command; rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; rc = clp_req(rrb, CLP_LPS_PCI); if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { @@ -400,6 +416,7 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid, static void __clp_add(struct clp_fh_list_entry *entry, void *data) { + struct list_head *scan_list = data; struct zpci_dev *zdev; if (!entry->vendor_id) @@ -410,10 +427,13 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) zpci_zdev_put(zdev); return; } - zpci_create_device(entry->fid, entry->fh, entry->config_state); + zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + if (IS_ERR(zdev)) + return; + list_add_tail(&zdev->entry, scan_list); } -int clp_scan_pci_devices(void) +int clp_scan_pci_devices(struct list_head *scan_list) { struct clp_req_rsp_list_pci *rrb; int rc; @@ -422,7 +442,7 @@ int clp_scan_pci_devices(void) if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, NULL, __clp_add); + rc = clp_list_pci(rrb, scan_list, __clp_add); clp_free_block(rrb); return rc; @@ -650,7 +670,6 @@ static const struct file_operations clp_misc_fops = { .release = clp_misc_release, .unlocked_ioctl = clp_misc_ioctl, .compat_ioctl = clp_misc_ioctl, - .llseek = no_llseek, }; static struct miscdevice clp_misc_device = { @@ -659,9 +678,4 @@ static struct miscdevice clp_misc_device = { .fops = &clp_misc_fops, }; -static int __init clp_misc_init(void) -{ - return misc_register(&clp_misc_device); -} - -device_initcall(clp_misc_init); +builtin_misc_device(clp_misc_device); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index ca6bd98eec13..38014206c16b 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = { }; static char *pci_sw_names[] = { - "Allocated pages", "Mapped pages", "Unmapped pages", + "Global RPCITs", + "Sync Map RPCITs", + "Sync RPCITs", }; static void pci_fmb_show(struct seq_file *m, char *name[], int length, @@ -70,12 +72,22 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length, static void pci_sw_counter_show(struct seq_file *m) { struct zpci_dev *zdev = m->private; - atomic64_t *counter = &zdev->allocated_pages; + struct zpci_iommu_ctrs *ctrs; + atomic64_t *counter; + unsigned long flags; int i; + spin_lock_irqsave(&zdev->dom_lock, flags); + ctrs = zpci_get_iommu_ctrs(m->private); + if (!ctrs) + goto unlock; + + counter = &ctrs->mapped_pages; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); +unlock: + spin_unlock_irqrestore(&zdev->dom_lock, flags); } static int pci_perf_show(struct seq_file *m, void *v) @@ -85,9 +97,9 @@ static int pci_perf_show(struct seq_file *m, void *v) if (!zdev) return 0; - mutex_lock(&zdev->lock); + mutex_lock(&zdev->fmb_lock); if (!zdev->fmb) { - mutex_unlock(&zdev->lock); + mutex_unlock(&zdev->fmb_lock); seq_puts(m, "FMB statistics disabled\n"); return 0; } @@ -124,7 +136,7 @@ static int pci_perf_show(struct seq_file *m, void *v) } pci_sw_counter_show(m); - mutex_unlock(&zdev->lock); + mutex_unlock(&zdev->fmb_lock); return 0; } @@ -142,7 +154,7 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf, if (rc) return rc; - mutex_lock(&zdev->lock); + mutex_lock(&zdev->fmb_lock); switch (val) { case 0: rc = zpci_fmb_disable_device(zdev); @@ -151,7 +163,7 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf, rc = zpci_fmb_enable_device(zdev); break; } - mutex_unlock(&zdev->lock); + mutex_unlock(&zdev->fmb_lock); return rc ? rc : count; } diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c deleted file mode 100644 index f46833a25526..000000000000 --- a/arch/s390/pci/pci_dma.c +++ /dev/null @@ -1,715 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 2012 - * - * Author(s): - * Jan Glauber <jang@linux.vnet.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/iommu-helper.h> -#include <linux/dma-map-ops.h> -#include <linux/vmalloc.h> -#include <linux/pci.h> -#include <asm/pci_dma.h> - -static struct kmem_cache *dma_region_table_cache; -static struct kmem_cache *dma_page_table_cache; -static int s390_iommu_strict; -static u64 s390_iommu_aperture; -static u32 s390_iommu_aperture_factor = 1; - -static int zpci_refresh_global(struct zpci_dev *zdev) -{ - return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma, - zdev->iommu_pages * PAGE_SIZE); -} - -unsigned long *dma_alloc_cpu_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) - *entry = ZPCI_TABLE_INVALID; - return table; -} - -static void dma_free_cpu_table(void *table) -{ - kmem_cache_free(dma_region_table_cache, table); -} - -static unsigned long *dma_alloc_page_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) - *entry = ZPCI_PTE_INVALID; - return table; -} - -static void dma_free_page_table(void *table) -{ - kmem_cache_free(dma_page_table_cache, table); -} - -static unsigned long *dma_get_seg_table_origin(unsigned long *entry) -{ - unsigned long *sto; - - if (reg_entry_isvalid(*entry)) - sto = get_rt_sto(*entry); - else { - sto = dma_alloc_cpu_table(); - if (!sto) - return NULL; - - set_rt_sto(entry, virt_to_phys(sto)); - validate_rt_entry(entry); - entry_clr_protected(entry); - } - return sto; -} - -static unsigned long *dma_get_page_table_origin(unsigned long *entry) -{ - unsigned long *pto; - - if (reg_entry_isvalid(*entry)) - pto = get_st_pto(*entry); - else { - pto = dma_alloc_page_table(); - if (!pto) - return NULL; - set_st_pto(entry, virt_to_phys(pto)); - validate_st_entry(entry); - entry_clr_protected(entry); - } - return pto; -} - -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) -{ - unsigned long *sto, *pto; - unsigned int rtx, sx, px; - - rtx = calc_rtx(dma_addr); - sto = dma_get_seg_table_origin(&rto[rtx]); - if (!sto) - return NULL; - - sx = calc_sx(dma_addr); - pto = dma_get_page_table_origin(&sto[sx]); - if (!pto) - return NULL; - - px = calc_px(dma_addr); - return &pto[px]; -} - -void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags) -{ - if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(entry); - } else { - set_pt_pfaa(entry, page_addr); - validate_pt_entry(entry); - } - - if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(entry); - else - entry_clr_protected(entry); -} - -static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - phys_addr_t page_addr = (pa & PAGE_MASK); - unsigned long irq_flags; - unsigned long *entry; - int i, rc = 0; - - if (!nr_pages) - return -EINVAL; - - spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); - if (!zdev->dma_table) { - rc = -EINVAL; - goto out_unlock; - } - - for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) { - rc = -ENOMEM; - goto undo_cpu_trans; - } - dma_update_cpu_trans(entry, page_addr, flags); - page_addr += PAGE_SIZE; - dma_addr += PAGE_SIZE; - } - -undo_cpu_trans: - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { - flags = ZPCI_PTE_INVALID; - while (i-- > 0) { - page_addr -= PAGE_SIZE; - dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) - break; - dma_update_cpu_trans(entry, page_addr, flags); - } - } -out_unlock: - spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); - return rc; -} - -static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, - size_t size, int flags) -{ - unsigned long irqflags; - int ret; - - /* - * With zdev->tlb_refresh == 0, rpcit is not required to establish new - * translations when previously invalid translation-table entries are - * validated. With lazy unmap, rpcit is skipped for previously valid - * entries, but a global rpcit is then required before any address can - * be re-used, i.e. after each iommu bitmap wrap-around. - */ - if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { - if (!zdev->tlb_refresh) - return 0; - } else { - if (!s390_iommu_strict) - return 0; - } - - ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, - PAGE_ALIGN(size)); - if (ret == -ENOMEM && !s390_iommu_strict) { - /* enable the hypervisor to free some resources */ - if (zpci_refresh_global(zdev)) - goto out; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); - ret = 0; - } -out: - return ret; -} - -static int dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - int rc; - - rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); - if (rc) - return rc; - - rc = __dma_purge_tlb(zdev, dma_addr, size, flags); - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) - __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); - - return rc; -} - -void dma_free_seg_table(unsigned long entry) -{ - unsigned long *sto = get_rt_sto(entry); - int sx; - - for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) - if (reg_entry_isvalid(sto[sx])) - dma_free_page_table(get_st_pto(sto[sx])); - - dma_free_cpu_table(sto); -} - -void dma_cleanup_tables(unsigned long *table) -{ - int rtx; - - if (!table) - return; - - for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) - if (reg_entry_isvalid(table[rtx])) - dma_free_seg_table(table[rtx]); - - dma_free_cpu_table(table); -} - -static unsigned long __dma_alloc_iommu(struct device *dev, - unsigned long start, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, - start, size, zdev->start_dma >> PAGE_SHIFT, - dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT), - 0); -} - -static dma_addr_t dma_alloc_address(struct device *dev, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long offset, flags; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - offset = __dma_alloc_iommu(dev, zdev->next_bit, size); - if (offset == -1) { - if (!s390_iommu_strict) { - /* global flush before DMA addresses are reused */ - if (zpci_refresh_global(zdev)) - goto out_error; - - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - } - /* wrap-around */ - offset = __dma_alloc_iommu(dev, 0, size); - if (offset == -1) - goto out_error; - } - zdev->next_bit = offset + size; - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - - return zdev->start_dma + offset * PAGE_SIZE; - -out_error: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - return DMA_MAPPING_ERROR; -} - -static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long flags, offset; - - offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - if (!zdev->iommu_bitmap) - goto out; - - if (s390_iommu_strict) - bitmap_clear(zdev->iommu_bitmap, offset, size); - else - bitmap_set(zdev->lazy_bitmap, offset, size); - -out: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); -} - -static inline void zpci_err_dma(unsigned long rc, unsigned long addr) -{ - struct { - unsigned long rc; - unsigned long addr; - } __packed data = {rc, addr}; - - zpci_err_hex(&data, sizeof(data)); -} - -static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long pa = page_to_phys(page) + offset; - int flags = ZPCI_PTE_VALID; - unsigned long nr_pages; - dma_addr_t dma_addr; - int ret; - - /* This rounds up number of pages based on size and offset */ - nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); - dma_addr = dma_alloc_address(dev, nr_pages); - if (dma_addr == DMA_MAPPING_ERROR) { - ret = -ENOSPC; - goto out_err; - } - - /* Use rounded up size */ - size = nr_pages * PAGE_SIZE; - - if (direction == DMA_NONE || direction == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - ret = dma_update_trans(zdev, pa, dma_addr, size, flags); - if (ret) - goto out_free; - - atomic64_add(nr_pages, &zdev->mapped_pages); - return dma_addr + (offset & ~PAGE_MASK); - -out_free: - dma_free_address(dev, dma_addr, nr_pages); -out_err: - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return DMA_MAPPING_ERROR; -} - -static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - int npages, ret; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr = dma_addr & PAGE_MASK; - ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE, - ZPCI_PTE_INVALID); - if (ret) { - zpci_err("unmap error:\n"); - zpci_err_dma(ret, dma_addr); - return; - } - - atomic64_add(npages, &zdev->unmapped_pages); - dma_free_address(dev, dma_addr, npages); -} - -static void *s390_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - struct page *page; - phys_addr_t pa; - dma_addr_t map; - - size = PAGE_ALIGN(size); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - pa = page_to_phys(page); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); - if (dma_mapping_error(dev, map)) { - __free_pages(page, get_order(size)); - return NULL; - } - - atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages); - if (dma_handle) - *dma_handle = map; - return phys_to_virt(pa); -} - -static void s390_dma_free(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - size = PAGE_ALIGN(size); - atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages); - s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); - free_pages((unsigned long)vaddr, get_order(size)); -} - -/* Map a segment into a contiguous dma address area */ -static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - size_t size, dma_addr_t *handle, - enum dma_data_direction dir) -{ - unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - dma_addr_t dma_addr_base, dma_addr; - int flags = ZPCI_PTE_VALID; - struct scatterlist *s; - phys_addr_t pa = 0; - int ret; - - dma_addr_base = dma_alloc_address(dev, nr_pages); - if (dma_addr_base == DMA_MAPPING_ERROR) - return -ENOMEM; - - dma_addr = dma_addr_base; - if (dir == DMA_NONE || dir == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)); - ret = __dma_update_trans(zdev, pa, dma_addr, - s->offset + s->length, flags); - if (ret) - goto unmap; - - dma_addr += s->offset + s->length; - } - ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); - if (ret) - goto unmap; - - *handle = dma_addr_base; - atomic64_add(nr_pages, &zdev->mapped_pages); - - return ret; - -unmap: - dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, - ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, nr_pages); - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return ret; -} - -static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s = sg, *start = sg, *dma = sg; - unsigned int max = dma_get_max_seg_size(dev); - unsigned int size = s->offset + s->length; - unsigned int offset = s->offset; - int count = 0, i, ret; - - for (i = 1; i < nr_elements; i++) { - s = sg_next(s); - - s->dma_length = 0; - - if (s->offset || (size & ~PAGE_MASK) || - size + s->length > max) { - ret = __s390_dma_map_sg(dev, start, size, - &dma->dma_address, dir); - if (ret) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - size = offset = s->offset; - start = s; - dma = sg_next(dma); - count++; - } - size += s->length; - } - ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir); - if (ret) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - return count + 1; -unmap: - for_each_sg(sg, s, count, i) - s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), - dir, attrs); - - return ret; -} - -static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nr_elements, i) { - if (s->dma_length) - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, - dir, attrs); - s->dma_address = 0; - s->dma_length = 0; - } -} - -int zpci_dma_init_device(struct zpci_dev *zdev) -{ - int rc; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. - */ - WARN_ON(zdev->s390_domain); - - spin_lock_init(&zdev->iommu_bitmap_lock); - spin_lock_init(&zdev->dma_table_lock); - - zdev->dma_table = dma_alloc_cpu_table(); - if (!zdev->dma_table) { - rc = -ENOMEM; - goto out; - } - - /* - * Restrict the iommu bitmap size to the minimum of the following: - * - s390_iommu_aperture which defaults to high_memory - * - 3-level pagetable address limit minus start_dma offset - * - DMA address range allowed by the hardware (clp query pci fn) - * - * Also set zdev->end_dma to the actual end address of the usable - * range, instead of the theoretical maximum as reported by hardware. - * - * This limits the number of concurrently usable DMA mappings since - * for each DMA mapped memory address we need a DMA address including - * extra DMA addresses for multiple mappings of the same memory address. - */ - zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3(s390_iommu_aperture, - ZPCI_TABLE_SIZE_RT - zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); - zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; - zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; - zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->iommu_bitmap) { - rc = -ENOMEM; - goto free_dma_table; - } - if (!s390_iommu_strict) { - zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->lazy_bitmap) { - rc = -ENOMEM; - goto free_bitmap; - } - - } - if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table))) { - rc = -EIO; - goto free_bitmap; - } - - return 0; -free_bitmap: - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; -free_dma_table: - dma_free_cpu_table(zdev->dma_table); - zdev->dma_table = NULL; -out: - return rc; -} - -int zpci_dma_exit_device(struct zpci_dev *zdev) -{ - int cc = 0; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. - */ - WARN_ON(zdev->s390_domain); - if (zdev_enabled(zdev)) - cc = zpci_unregister_ioat(zdev, 0); - /* - * cc == 3 indicates the function is gone already. This can happen - * if the function was deconfigured/disabled suddenly and we have not - * received a new handle yet. - */ - if (cc && cc != 3) - return -EIO; - - dma_cleanup_tables(zdev->dma_table); - zdev->dma_table = NULL; - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; - zdev->next_bit = 0; - return 0; -} - -static int __init dma_alloc_cpu_table_caches(void) -{ - dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", - ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN, - 0, NULL); - if (!dma_region_table_cache) - return -ENOMEM; - - dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", - ZPCI_PT_SIZE, ZPCI_PT_ALIGN, - 0, NULL); - if (!dma_page_table_cache) { - kmem_cache_destroy(dma_region_table_cache); - return -ENOMEM; - } - return 0; -} - -int __init zpci_dma_init(void) -{ - s390_iommu_aperture = (u64)high_memory; - if (!s390_iommu_aperture_factor) - s390_iommu_aperture = ULONG_MAX; - else - s390_iommu_aperture *= s390_iommu_aperture_factor; - - return dma_alloc_cpu_table_caches(); -} - -void zpci_dma_exit(void) -{ - kmem_cache_destroy(dma_page_table_cache); - kmem_cache_destroy(dma_region_table_cache); -} - -const struct dma_map_ops s390_pci_dma_ops = { - .alloc = s390_dma_alloc, - .free = s390_dma_free, - .map_sg = s390_dma_map_sg, - .unmap_sg = s390_dma_unmap_sg, - .map_page = s390_dma_map_pages, - .unmap_page = s390_dma_unmap_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - /* dma_supported is unconditionally true without a callback */ -}; -EXPORT_SYMBOL_GPL(s390_pci_dma_ops); - -static int __init s390_iommu_setup(char *str) -{ - if (!strcmp(str, "strict")) - s390_iommu_strict = 1; - return 1; -} - -__setup("s390_iommu=", s390_iommu_setup); - -static int __init s390_iommu_aperture_setup(char *str) -{ - if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) - s390_iommu_aperture_factor = 1; - return 1; -} - -__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index b9324ca2eb94..2fbee3887d13 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -16,6 +16,7 @@ #include <asm/sclp.h> #include "pci_bus.h" +#include "pci_report.h" /* Content Code Description for PCI Function Error */ struct zpci_ccdf_err { @@ -59,9 +60,16 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) } } -static bool is_passed_through(struct zpci_dev *zdev) +static bool is_passed_through(struct pci_dev *pdev) { - return zdev->s390_domain; + struct zpci_dev *zdev = to_zpci(pdev); + bool ret; + + mutex_lock(&zdev->kzdev_lock); + ret = !!zdev->kzdev; + mutex_unlock(&zdev->kzdev_lock); + + return ret; } static bool is_driver_supported(struct pci_driver *driver) @@ -162,6 +170,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) { pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct zpci_dev *zdev = to_zpci(pdev); + char *status_str = "success"; struct pci_driver *driver; /* @@ -176,32 +186,40 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) } pdev->error_state = pci_channel_io_frozen; - if (is_passed_through(to_zpci(pdev))) { + if (is_passed_through(pdev)) { pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", pci_name(pdev)); + status_str = "failed (pass-through)"; goto out_unlock; } driver = to_pci_driver(pdev->dev.driver); if (!is_driver_supported(driver)) { - if (!driver) + if (!driver) { pr_info("%s: Cannot be recovered because no driver is bound to the device\n", pci_name(pdev)); - else + status_str = "failed (no driver)"; + } else { pr_info("%s: The %s driver bound to the device does not support error recovery\n", pci_name(pdev), driver->name); + status_str = "failed (no driver support)"; + } goto out_unlock; } ers_res = zpci_event_notify_error_detected(pdev, driver); - if (ers_result_indicates_abort(ers_res)) + if (ers_result_indicates_abort(ers_res)) { + status_str = "failed (abort on detection)"; goto out_unlock; + } if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { ers_res = zpci_event_do_error_state_clear(pdev, driver); - if (ers_result_indicates_abort(ers_res)) + if (ers_result_indicates_abort(ers_res)) { + status_str = "failed (abort on MMIO enable)"; goto out_unlock; + } } if (ers_res == PCI_ERS_RESULT_NEED_RESET) @@ -210,6 +228,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) if (ers_res != PCI_ERS_RESULT_RECOVERED) { pr_err("%s: Automatic recovery failed; operator intervention is required\n", pci_name(pdev)); + status_str = "failed (driver can't recover)"; goto out_unlock; } @@ -218,6 +237,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) driver->err_handler->resume(pdev); out_unlock: pci_dev_unlock(pdev); + zpci_report_status(zdev, "recovery", status_str); return ers_res; } @@ -239,7 +259,7 @@ static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) * we will inject the error event and let the guest recover the device * itself. */ - if (is_passed_through(to_zpci(pdev))) + if (is_passed_through(pdev)) goto out; driver = to_pci_driver(pdev->dev.driver); if (driver && driver->err_handler && driver->err_handler->error_detected) @@ -260,6 +280,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) zpci_err_hex(ccdf, sizeof(*ccdf)); if (zdev) { + mutex_lock(&zdev->state_lock); zpci_update_fh(zdev, ccdf->fh); if (zdev->zbus->bus) pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); @@ -272,21 +293,24 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) goto no_pdev; switch (ccdf->pec) { - case 0x003a: /* Service Action or Error Recovery Successful */ + case 0x002a: /* Error event concerns FMB */ + case 0x002b: + case 0x002c: + break; + case 0x0040: /* Service Action or Error Recovery Failed */ + case 0x003b: + zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + break; + default: /* PCI function left in the error state attempt to recover */ ers_res = zpci_event_attempt_error_recovery(pdev); if (ers_res != PCI_ERS_RESULT_RECOVERED) zpci_event_io_failure(pdev, pci_channel_io_perm_failure); break; - default: - /* - * Mark as frozen not permanently failed because the device - * could be subsequently recovered by the platform. - */ - zpci_event_io_failure(pdev, pci_channel_io_frozen); - break; } pci_dev_put(pdev); no_pdev: + if (zdev) + mutex_unlock(&zdev->state_lock); zpci_zdev_put(zdev); } @@ -306,13 +330,27 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) /* Even though the device is already gone we still * need to free zPCI resources as part of the disable. */ - if (zdev->dma_table) - zpci_dma_exit_device(zdev); if (zdev_enabled(zdev)) zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; } +static void zpci_event_reappear(struct zpci_dev *zdev) +{ + lockdep_assert_held(&zdev->state_lock); + /* + * The zdev is in the reserved state. This means that it was presumed to + * go away but there are still undropped references. Now, the platform + * announced its availability again. Bring back the lingering zdev + * to standby. This is safe because we hold a temporary reference + * now so that it won't go away. Account for the re-appearance of the + * underlying device by incrementing the reference count. + */ + zdev->state = ZPCI_FN_STATE_STANDBY; + zpci_zdev_get(zdev); + zpci_dbg(1, "rea fid:%x, fh:%x\n", zdev->fid, zdev->fh); +} + static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); @@ -321,29 +359,48 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", ccdf->fid, ccdf->fh, ccdf->pec); + + if (existing_zdev) + mutex_lock(&zdev->state_lock); + switch (ccdf->pec) { case 0x0301: /* Reserved|Standby -> Configured */ if (!zdev) { zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); if (IS_ERR(zdev)) break; + if (zpci_add_device(zdev)) { + kfree(zdev); + break; + } } else { + if (zdev->state == ZPCI_FN_STATE_RESERVED) + zpci_event_reappear(zdev); /* the configuration request may be stale */ - if (zdev->state != ZPCI_FN_STATE_STANDBY) + else if (zdev->state != ZPCI_FN_STATE_STANDBY) break; zdev->state = ZPCI_FN_STATE_CONFIGURED; } zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ - if (!zdev) - zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); - else + if (!zdev) { + zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); + if (IS_ERR(zdev)) + break; + if (zpci_add_device(zdev)) { + kfree(zdev); + break; + } + } else { + if (zdev->state == ZPCI_FN_STATE_RESERVED) + zpci_event_reappear(zdev); zpci_update_fh(zdev, ccdf->fh); + } break; case 0x0303: /* Deconfiguration requested */ if (zdev) { - /* The event may have been queued before we confirgured + /* The event may have been queued before we configured * the device. */ if (zdev->state != ZPCI_FN_STATE_CONFIGURED) @@ -354,7 +411,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0304: /* Configured -> Standby|Reserved */ if (zdev) { - /* The event may have been queued before we confirgured + /* The event may have been queued before we configured * the device.: */ if (zdev->state == ZPCI_FN_STATE_CONFIGURED) @@ -368,7 +425,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ zpci_remove_reserved_devices(); - clp_scan_pci_devices(); + zpci_scan_devices(); break; case 0x0308: /* Standby -> Reserved */ if (!zdev) @@ -378,8 +435,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) default: break; } - if (existing_zdev) + if (existing_zdev) { + mutex_unlock(&zdev->state_lock); zpci_zdev_put(zdev); + } } void zpci_event_availability(void *data) diff --git a/arch/s390/pci/pci_fixup.c b/arch/s390/pci/pci_fixup.c new file mode 100644 index 000000000000..35688b645098 --- /dev/null +++ b/arch/s390/pci/pci_fixup.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Exceptions for specific devices, + * + * Copyright IBM Corp. 2025 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + */ +#include <linux/pci.h> + +static void zpci_ism_bar_no_mmap(struct pci_dev *pdev) +{ + /* + * ISM's BAR is special. Drivers written for ISM know + * how to handle this but others need to be aware of their + * special nature e.g. to prevent attempts to mmap() it. + */ + pdev->non_mappable_bars = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_ISM, + zpci_ism_bar_no_mmap); diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 1a822b7799f8..eb978c8012be 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -15,6 +15,7 @@ #include <asm/pci_debug.h> #include <asm/pci_io.h> #include <asm/processor.h> +#include <asm/asm.h> #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ @@ -57,16 +58,16 @@ static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status, /* Modify PCI Function Controls */ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) { - u8 cc; + int cc; asm volatile ( " .insn rxy,0xe300000000d0,%[req],%[fib]\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=d" (cc), [req] "+d" (req), [fib] "+Q" (*fib) - : : "cc"); + CC_IPM(cc) + : CC_OUT(cc, cc), [req] "+d" (req), [fib] "+Q" (*fib) + : + : CC_CLOBBER); *status = req >> 24 & 0xff; - return cc; + return CC_TRANSFORM(cc); } u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) @@ -92,22 +93,22 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) return cc; } +EXPORT_SYMBOL_GPL(zpci_mod_fc); /* Refresh PCI Translations */ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) { union register_pair addr_range = {.even = addr, .odd = range}; - u8 cc; + int cc; asm volatile ( " .insn rre,0xb9d30000,%[fn],%[addr_range]\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=d" (cc), [fn] "+d" (fn) + CC_IPM(cc) + : CC_OUT(cc, cc), [fn] "+d" (fn) : [addr_range] "d" (addr_range.pair) - : "cc"); + : CC_CLOBBER); *status = fn >> 24 & 0xff; - return cc; + return CC_TRANSFORM(cc); } int zpci_refresh_trans(u64 fn, u64 addr, u64 range) @@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; @@ -149,25 +150,29 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return 0; } +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); /* PCI Load */ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) { union register_pair req_off = {.even = req, .odd = offset}; - int cc = -ENXIO; + int cc, exception; u64 __data; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rre,0xb9d20000,%[data],%[req_off]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), - [req_off] "+&d" (req_off.pair) :: "cc"); + : CC_OUT(cc, cc), [data] "=d" (__data), + [req_off] "+d" (req_off.pair), [exc] "+d" (exception) + : + : CC_CLOBBER); *status = req_off.even >> 24 & 0xff; *data = __data; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) @@ -220,20 +225,23 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) { union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; - int cc = -ENXIO; + int cc, exception; u64 __data; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), - [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc"); + : CC_OUT(cc, cc), [data] "=d" (__data), + [ioaddr_len] "+d" (ioaddr_len.pair), [exc] "+d" (exception) + : + : CC_CLOBBER); *status = ioaddr_len.odd >> 24 & 0xff; *data = __data; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) @@ -256,19 +264,20 @@ EXPORT_SYMBOL_GPL(zpci_load); static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) { union register_pair req_off = {.even = req, .odd = offset}; - int cc = -ENXIO; + int cc, exception; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rre,0xb9d00000,%[data],%[req_off]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair) + : CC_OUT(cc, cc), [req_off] "+d" (req_off.pair), [exc] "+d" (exception) : [data] "d" (data) - : "cc"); + : CC_CLOBBER); *status = req_off.even >> 24 & 0xff; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } int __zpci_store(u64 data, u64 req, u64 offset) @@ -309,19 +318,20 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) { union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; - int cc = -ENXIO; + int cc, exception; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + : CC_OUT(cc, cc), [ioaddr_len] "+d" (ioaddr_len.pair), [exc] "+d" (exception) : [data] "d" (data) - : "cc", "memory"); + : CC_CLOBBER_LIST("memory")); *status = ioaddr_len.odd >> 24 & 0xff; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) @@ -343,19 +353,20 @@ EXPORT_SYMBOL_GPL(zpci_store); /* PCI Store Block */ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) { - int cc = -ENXIO; + int cc, exception; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rsy,0xeb00000000d0,%[req],%[offset],%[data]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [req] "+d" (req) + : CC_OUT(cc, cc), [req] "+d" (req), [exc] "+d" (exception) : [offset] "d" (offset), [data] "Q" (*data) - : "cc"); + : CC_CLOBBER); *status = req >> 24 & 0xff; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } int __zpci_store_block(const u64 *data, u64 req, u64 offset) @@ -396,19 +407,20 @@ static inline int zpci_write_block_fh(volatile void __iomem *dst, static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) { - int cc = -ENXIO; + int cc, exception; - asm volatile ( + exception = 1; + asm_inline volatile ( " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" - "0: ipm %[cc]\n" - " srl %[cc],28\n" + "0: lhi %[exc],0\n" "1:\n" + CC_IPM(cc) EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [len] "+d" (len) + : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception) : [ioaddr] "d" (ioaddr), [data] "Q" (*data) - : "cc"); + : CC_CLOBBER); *status = len >> 24 & 0xff; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } int zpci_write_block(volatile void __iomem *dst, diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index ead062bf2b41..191e56a623f6 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -60,18 +60,35 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in return 0; } -int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +/** + * zpci_iov_find_parent_pf - Find the parent PF, if any, of the given function + * @zbus: The bus that the PCI function is on, or would be added on + * @zdev: The PCI function + * + * Finds the parent PF, if it exists and is configured, of the given PCI function + * and increments its refcount. Th PF is searched for on the provided bus so the + * caller has to ensure that this is the correct bus to search. This function may + * be used before adding the PCI function to a zbus. + * + * Return: Pointer to the struct pci_dev of the parent PF or NULL if it not + * found. If the function is not a VF or has no RequesterID information, + * NULL is returned as well. + */ +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { - int i, cand_devfn; - struct zpci_dev *zdev; + int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the + return NULL; + /* Non-VFs and VFs without RID available don't have a parent */ + if (!zdev->vfn || !zdev->rid_available) + return NULL; + /* Linux vfid starts at 0 vfn at 1 */ + vfid = zdev->vfn - 1; + devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + /* + * If the parent PF for the given VF is also configured in the * instance, it must be on the same zbus. * We can then identify the parent PF by checking what * devfn the VF would have if it belonged to that PF using the PF's @@ -85,15 +102,26 @@ int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn if (!pdev) continue; cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } + if (cand_devfn == devfn) + return pdev; /* balance pci_get_slot() */ pci_dev_put(pdev); } } + return NULL; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + struct zpci_dev *zdev = to_zpci(virtfn); + struct pci_dev *pdev_pf; + int rc = 0; + + pdev_pf = zpci_iov_find_parent_pf(zbus, zdev); + if (pdev_pf) { + /* Linux' vfids start at 0 while zdev->vfn starts at 1 */ + rc = zpci_iov_link_virtfn(pdev_pf, virtfn, zdev->vfn - 1); + pci_dev_put(pdev_pf); + } return rc; } diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h index b2c828003bad..d2c2793eb0f3 100644 --- a/arch/s390/pci/pci_iov.h +++ b/arch/s390/pci/pci_iov.h @@ -10,6 +10,8 @@ #ifndef __S390_PCI_IOV_H #define __S390_PCI_IOV_H +#include <linux/pci.h> + #ifdef CONFIG_PCI_IOV void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn); @@ -17,6 +19,8 @@ void zpci_iov_map_resources(struct pci_dev *pdev); int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev); + #else /* CONFIG_PCI_IOV */ static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} @@ -26,5 +30,10 @@ static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *v { return 0; } + +static inline struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + return NULL; +} #endif /* CONFIG_PCI_IOV */ #endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 500cd2dbdf53..84482a921332 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -11,16 +11,10 @@ #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> static enum {FLOATING, DIRECTED} irq_delivery; -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 -#define SIC_IRQ_MODE_DIRECT 4 -#define SIC_IRQ_MODE_D_ALL 16 -#define SIC_IRQ_MODE_D_SINGLE 17 -#define SIC_IRQ_MODE_SET_CPU 18 - /* * summary bit vector * FLOATING - summary bit per function @@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev) fib.fmt = 1; fib.fmt1.noi = zdev->msi_nr_irqs; fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) u8 cc, status; fib.fmt = 1; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -133,7 +132,7 @@ static int zpci_clear_irq(struct zpci_dev *zdev) static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_desc *entry = irq_data_get_msi_desc(data); struct msi_msg msg = entry->msg; int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); @@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = { static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; unsigned long bit; int irqs_on = 0; @@ -163,8 +163,8 @@ static void zpci_handle_cpu_local_irq(bool rescan) if (!rescan || irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) break; bit = 0; continue; @@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data) static void zpci_handle_fallback_irq(void) { struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; unsigned long cpu; int irqs_on = 0; @@ -201,8 +202,8 @@ static void zpci_handle_fallback_irq(void) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; cpu = 0; continue; @@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void) } } -static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + bool floating = !tpi_info->directed_irq; + if (floating) { inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); @@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) } } -static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + union zpci_sic_iib iib = {{0}}; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -241,8 +247,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; si = 0; continue; @@ -262,61 +268,87 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) } } -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, + unsigned long *bit) { - struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs, cpu; - unsigned long bit; - struct msi_desc *msi; - struct msi_msg msg; - int cpu_addr; - int rc, irq; - - zdev->aisb = -1UL; - zdev->msi_first_bit = -1U; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - if (irq_delivery == DIRECTED) { /* Allocate cpu vector bits */ - bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); - if (bit == -1UL) + *bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (*bit == -1UL) return -EIO; } else { /* Allocate adapter summary indicator bit */ - bit = airq_iv_alloc_bit(zpci_sbv); - if (bit == -1UL) + *bit = airq_iv_alloc_bit(zpci_sbv); + if (*bit == -1UL) return -EIO; - zdev->aisb = bit; + zdev->aisb = *bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); if (!zdev->aibv) return -ENOMEM; /* Wire up shortcut pointer */ - zpci_ibv[bit] = zdev->aibv; + zpci_ibv[*bit] = zdev->aibv; /* Each function has its own interrupt vector */ - bit = 0; + *bit = 0; + } + return 0; +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu; + struct zpci_dev *zdev = to_zpci(pdev); + struct msi_desc *msi; + struct msi_msg msg; + unsigned long bit; + int cpu_addr; + int rc, irq; + + zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; + + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + if (msi_vecs < nvec) { + pr_info("%s requested %d irqs, allocate system limit of %d", + pci_name(pdev), nvec, zdev->max_msi); } - /* Request MSI interrupts */ + rc = __alloc_airq(zdev, msi_vecs, &bit); + if (rc < 0) + return rc; + + /* + * Request MSI interrupts: + * When using MSI, nvec_used interrupt sources and their irq + * descriptors are controlled through one msi descriptor. + * Thus the outer loop over msi descriptors shall run only once, + * while two inner loops iterate over the interrupt vectors. + * When using MSI-X, each interrupt vector/irq descriptor + * is bound to exactly one msi descriptor (nvec_used is one). + * So the inner loops are executed once, while the outer iterates + * over the MSI-X descriptors. + */ hwirq = bit; msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { - rc = -EIO; if (hwirq - bit >= msi_vecs) break; - irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, - (irq_delivery == DIRECTED) ? - msi->affinity : NULL); + irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used); + irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE, + (irq_delivery == DIRECTED) ? + msi->affinity : NULL); if (irq < 0) return -ENOMEM; - rc = irq_set_msi_desc(irq, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_percpu_irq); + + for (i = 0; i < irqs_per_msi; i++) { + rc = irq_set_msi_desc_off(irq, i, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq + i, &zpci_irq_chip, + handle_percpu_irq); + } + msg.data = hwirq - bit; if (irq_delivery == DIRECTED) { if (msi->affinity) @@ -329,31 +361,35 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) msg.address_lo |= (cpu_addr << 8); for_each_possible_cpu(cpu) { - airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + for (i = 0; i < irqs_per_msi; i++) + airq_iv_set_data(zpci_ibv[cpu], + hwirq + i, irq + i); } } else { msg.address_lo = zdev->msi_addr & 0xffffffff; - airq_iv_set_data(zdev->aibv, hwirq, irq); + for (i = 0; i < irqs_per_msi; i++) + airq_iv_set_data(zdev->aibv, hwirq + i, irq + i); } msg.address_hi = zdev->msi_addr >> 32; pci_write_msi_msg(irq, &msg); - hwirq++; + hwirq += irqs_per_msi; } zdev->msi_first_bit = bit; - zdev->msi_nr_irqs = msi_vecs; + zdev->msi_nr_irqs = hwirq - bit; rc = zpci_set_irq(zdev); if (rc) return rc; - return (msi_vecs == nvec) ? 0 : msi_vecs; + return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs; } void arch_teardown_msi_irqs(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); struct msi_desc *msi; + unsigned int i; int rc; /* Disable interrupts */ @@ -363,8 +399,10 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) /* Release MSI interrupts */ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); + for (i = 0; i < msi->nvec_used; i++) { + irq_set_msi_desc(msi->irq + i, NULL); + irq_free_desc(msi->irq + i); + } msi->msg.address_lo = 0; msi->msg.address_hi = 0; msi->msg.data = 0; @@ -402,11 +440,12 @@ static struct airq_struct zpci_airq = { static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; - iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + iib.cdiib.dibv_addr = virt_to_phys(zpci_ibv[smp_processor_id()]->vector); - __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); - zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); } static int __init zpci_directed_irq_init(void) @@ -414,14 +453,14 @@ static int __init zpci_directed_irq_init(void) union zpci_sic_iib iib = {{0}}; unsigned int cpu; - zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); if (!zpci_sbv) return -ENOMEM; iib.diib.isc = PCI_ISC; iib.diib.nr_cpus = num_possible_cpus(); iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); - __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), GFP_KERNEL); @@ -436,7 +475,7 @@ static int __init zpci_directed_irq_init(void) zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, AIRQ_IV_DATA | AIRQ_IV_CACHELINE | - (!cpu ? AIRQ_IV_ALLOC : 0)); + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); if (!zpci_ibv[cpu]) return -ENOMEM; } @@ -453,7 +492,7 @@ static int __init zpci_floating_irq_init(void) if (!zpci_ibv) return -ENOMEM; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); if (!zpci_sbv) goto out_free; @@ -466,6 +505,7 @@ out_free: int __init zpci_irq_init(void) { + union zpci_sic_iib iib = {{0}}; int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; @@ -497,7 +537,7 @@ int __init zpci_irq_init(void) * Enable floating IRQs (with suppression after one IRQ). When using * directed IRQs this enables the fallback path. */ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); return 0; out_airq: diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c new file mode 100644 index 000000000000..ff34baf50a3e --- /dev/null +++ b/arch/s390/pci/pci_kvm_hook.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2022. All rights reserved. + * Author(s): Pierre Morel <pmorel@linux.ibm.com> + */ +#include <linux/kvm_host.h> + +struct zpci_kvm_hook zpci_kvm_hook; +EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 080c88620723..51e7a28af899 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -14,6 +14,7 @@ #include <asm/asm-extable.h> #include <asm/pci_io.h> #include <asm/pci_debug.h> +#include <asm/asm.h> static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset) { @@ -30,20 +31,24 @@ static inline int __pcistb_mio_inuser( void __iomem *ioaddr, const void __user *src, u64 len, u8 *status) { - int cc = -ENXIO; - - asm volatile ( - " sacf 256\n" - "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" - "1: ipm %[cc]\n" - " srl %[cc],28\n" - "2: sacf 768\n" + int cc, exception; + bool sacf_flag; + + exception = 1; + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( + " sacf 256\n" + "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" + "1: lhi %[exc],0\n" + "2: sacf 768\n" + CC_IPM(cc) EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : [cc] "+d" (cc), [len] "+d" (len) + : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception) : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) - : "cc", "memory"); + : CC_CLOBBER_LIST("memory")); + disable_sacf_uaccess(sacf_flag); *status = len >> 24 & 0xff; - return cc; + return exception ? -ENXIO : CC_TRANSFORM(cc); } static inline int __pcistg_mio_inuser( @@ -51,7 +56,8 @@ static inline int __pcistg_mio_inuser( u64 ulen, u8 *status) { union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; - int cc = -ENXIO; + int cc, exception; + bool sacf_flag; u64 val = 0; u64 cnt = ulen; u8 tmp; @@ -61,25 +67,29 @@ static inline int __pcistg_mio_inuser( * a register, then store it to PCI at @ioaddr while in secondary * address space. pcistg then uses the user mappings. */ - asm volatile ( - " sacf 256\n" - "0: llgc %[tmp],0(%[src])\n" - " sllg %[val],%[val],8\n" - " aghi %[src],1\n" - " ogr %[val],%[tmp]\n" - " brctg %[cnt],0b\n" - "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n" - "2: ipm %[cc]\n" - " srl %[cc],28\n" - "3: sacf 768\n" - EX_TABLE(0b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + exception = 1; + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( + " sacf 256\n" + "0: llgc %[tmp],0(%[src])\n" + "4: sllg %[val],%[val],8\n" + " aghi %[src],1\n" + " ogr %[val],%[tmp]\n" + " brctg %[cnt],0b\n" + "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n" + "2: lhi %[exc],0\n" + "3: sacf 768\n" + CC_IPM(cc) + EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + : [src] "+a" (src), [cnt] "+d" (cnt), + [val] "+d" (val), [tmp] "=d" (tmp), [exc] "+d" (exception), + CC_OUT(cc, cc), [ioaddr_len] "+&d" (ioaddr_len.pair) : - [src] "+a" (src), [cnt] "+d" (cnt), - [val] "+d" (val), [tmp] "=d" (tmp), - [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) - :: "cc", "memory"); + : CC_CLOBBER_LIST("memory")); + disable_sacf_uaccess(sacf_flag); *status = ioaddr_len.odd >> 24 & 0xff; + cc = exception ? -ENXIO : CC_TRANSFORM(cc); /* did we read everything from user memory? */ if (!cc && cnt != 0) cc = -EFAULT; @@ -97,9 +107,9 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64 __force) src, n, - ZPCI_MAX_WRITE_SIZE); + size = zpci_get_max_io_size((u64 __force) dst, + (u64 __force) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = __pcistb_mio_inuser(dst, src, size, &status); else @@ -118,12 +128,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, const void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -169,11 +178,17 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); - if (ret) - goto out_unlock_mmap; + args.address = mmio_addr; + args.vma = vma; + ret = follow_pfnmap_start(&args); + if (ret) { + fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL); + ret = follow_pfnmap_start(&args); + if (ret) + goto out_unlock_mmap; + } - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) @@ -181,7 +196,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, ret = zpci_memcpy_toio(io_addr, buf, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); out_free: @@ -195,9 +210,10 @@ static inline int __pcilg_mio_inuser( u64 ulen, u8 *status) { union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + bool sacf_flag; u64 cnt = ulen; int shift = ulen * 8; - int cc = -ENXIO; + int cc, exception; u64 val, tmp; /* @@ -205,27 +221,34 @@ static inline int __pcilg_mio_inuser( * user space) into a register using pcilg then store these bytes at * user address @dst */ - asm volatile ( - " sacf 256\n" - "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" - "1: ipm %[cc]\n" - " srl %[cc],28\n" - " ltr %[cc],%[cc]\n" - " jne 4f\n" - "2: ahi %[shift],-8\n" - " srlg %[tmp],%[val],0(%[shift])\n" - "3: stc %[tmp],0(%[dst])\n" - " aghi %[dst],1\n" - " brctg %[cnt],2b\n" - "4: sacf 768\n" - EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) + exception = 1; + sacf_flag = enable_sacf_uaccess(); + asm_inline volatile ( + " sacf 256\n" + "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" + "1: lhi %[exc],0\n" + " jne 4f\n" + "2: ahi %[shift],-8\n" + " srlg %[tmp],%[val],0(%[shift])\n" + "3: stc %[tmp],0(%[dst])\n" + "5: aghi %[dst],1\n" + " brctg %[cnt],2b\n" + /* + * Use xr to clear exc and set condition code to zero + * to ensure flag output is correct for this branch. + */ + " xr %[exc],%[exc]\n" + "4: sacf 768\n" + CC_IPM(cc) + EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b) + : [ioaddr_len] "+&d" (ioaddr_len.pair), [exc] "+d" (exception), + CC_OUT(cc, cc), [val] "=d" (val), + [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), + [shift] "+a" (shift) : - [ioaddr_len] "+&d" (ioaddr_len.pair), - [cc] "+d" (cc), [val] "=d" (val), - [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), - [shift] "+d" (shift) - :: "cc", "memory"); - + : CC_CLOBBER_LIST("memory")); + disable_sacf_uaccess(sacf_flag); + cc = exception ? -ENXIO : CC_TRANSFORM(cc); /* did we write everything to the user space buffer? */ if (!cc && cnt != 0) cc = -EFAULT; @@ -242,9 +265,9 @@ static inline int __memcpy_fromio_inuser(void __user *dst, u8 status; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64 __force) dst, n, - ZPCI_MAX_READ_SIZE); + size = zpci_get_max_io_size((u64 __force) src, + (u64 __force) dst, n, + ZPCI_MAX_READ_SIZE); rc = __pcilg_mio_inuser(dst, src, size, &status); if (rc) break; @@ -260,12 +283,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst, SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -305,14 +327,20 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out_unlock_mmap; ret = -EACCES; - if (!(vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_READ)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); - if (ret) - goto out_unlock_mmap; + args.vma = vma; + args.address = mmio_addr; + ret = follow_pfnmap_start(&args); + if (ret) { + fixup_user_fault(current->mm, mmio_addr, 0, NULL); + ret = follow_pfnmap_start(&args); + if (ret) + goto out_unlock_mmap; + } - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { @@ -322,7 +350,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, ret = zpci_memcpy_fromio(buf, io_addr, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); diff --git a/arch/s390/pci/pci_report.c b/arch/s390/pci/pci_report.c new file mode 100644 index 000000000000..1b494e5ecc4d --- /dev/null +++ b/arch/s390/pci/pci_report.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/sprintf.h> +#include <linux/pci.h> + +#include <asm/sclp.h> +#include <asm/debug.h> +#include <asm/pci_debug.h> + +#include "pci_report.h" + +#define ZPCI_ERR_LOG_ID_KERNEL_REPORT 0x4714 + +struct zpci_report_error_data { + u64 timestamp; + u64 err_log_id; + char log_data[]; +} __packed; + +#define ZPCI_REPORT_SIZE (PAGE_SIZE - sizeof(struct err_notify_sccb)) +#define ZPCI_REPORT_DATA_SIZE (ZPCI_REPORT_SIZE - sizeof(struct zpci_report_error_data)) + +struct zpci_report_error { + struct zpci_report_error_header header; + struct zpci_report_error_data data; +} __packed; + +static const char *zpci_state_str(pci_channel_state_t state) +{ + switch (state) { + case pci_channel_io_normal: + return "normal"; + case pci_channel_io_frozen: + return "frozen"; + case pci_channel_io_perm_failure: + return "permanent-failure"; + default: + return "invalid"; + }; +} + +static int debug_log_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf, + size_t out_buf_size) +{ + unsigned long sec, usec; + unsigned int level; + char *except_str; + int rc = 0; + + level = entry->level; + sec = entry->clock; + usec = do_div(sec, USEC_PER_SEC); + + if (entry->exception) + except_str = "*"; + else + except_str = "-"; + rc += scnprintf(out_buf, out_buf_size, "%011ld:%06lu %1u %1s %04u ", + sec, usec, level, except_str, + entry->cpu); + return rc; +} + +static int debug_prolog_header(debug_info_t *id, struct debug_view *view, + char *out_buf, size_t out_buf_size) +{ + return scnprintf(out_buf, out_buf_size, "sec:usec level except cpu msg\n"); +} + +static struct debug_view debug_log_view = { + "pci_msg_log", + &debug_prolog_header, + &debug_log_header_fn, + &debug_sprintf_format_fn, + NULL, + NULL +}; + +/** + * zpci_report_status - Report the status of operations on a PCI device + * @zdev: The PCI device for which to report status + * @operation: A string representing the operation reported + * @status: A string representing the status of the operation + * + * This function creates a human readable report about an operation such as + * PCI device recovery and forwards this to the platform using the SCLP Write + * Event Data mechanism. Besides the operation and status strings the report + * also contains additional information about the device deemed useful for + * debug such as the currently bound device driver, if any, and error state. + * Additionally a string representation of pci_debug_msg_id, or as much as fits, + * is also included. + * + * Return: 0 on success an error code < 0 otherwise. + */ +int zpci_report_status(struct zpci_dev *zdev, const char *operation, const char *status) +{ + struct zpci_report_error *report; + struct pci_driver *driver = NULL; + struct pci_dev *pdev = NULL; + char *buf, *end; + int ret; + + if (!zdev || !zdev->zbus) + return -ENODEV; + + /* Protected virtualization hosts get nothing from us */ + if (prot_virt_guest) + return -ENODATA; + + report = (void *)get_zeroed_page(GFP_KERNEL); + if (!report) + return -ENOMEM; + if (zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + if (pdev) + driver = to_pci_driver(pdev->dev.driver); + + buf = report->data.log_data; + end = report->data.log_data + ZPCI_REPORT_DATA_SIZE; + buf += scnprintf(buf, end - buf, "report: %s\n", operation); + buf += scnprintf(buf, end - buf, "status: %s\n", status); + buf += scnprintf(buf, end - buf, "state: %s\n", + (pdev) ? zpci_state_str(pdev->error_state) : "n/a"); + buf += scnprintf(buf, end - buf, "driver: %s\n", (driver) ? driver->name : "n/a"); + ret = debug_dump(pci_debug_msg_id, &debug_log_view, buf, end - buf, true); + if (ret < 0) + pr_err("Reading PCI debug messages failed with code %d\n", ret); + else + buf += ret; + + report->header.version = 1; + report->header.action = SCLP_ERRNOTIFY_AQ_INFO_LOG; + report->header.length = buf - (char *)&report->data; + report->data.timestamp = ktime_get_clocktai_seconds(); + report->data.err_log_id = ZPCI_ERR_LOG_ID_KERNEL_REPORT; + + ret = sclp_pci_report(&report->header, zdev->fh, zdev->fid); + if (ret) + pr_err("Reporting PCI status failed with code %d\n", ret); + else + pr_info("Reported PCI device status\n"); + + free_page((unsigned long)report); + + return ret; +} diff --git a/arch/s390/pci/pci_report.h b/arch/s390/pci/pci_report.h new file mode 100644 index 000000000000..e08003d51a97 --- /dev/null +++ b/arch/s390/pci/pci_report.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2024 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ +#ifndef __S390_PCI_REPORT_H +#define __S390_PCI_REPORT_H + +struct zpci_dev; + +int zpci_report_status(struct zpci_dev *zdev, const char *operation, const char *status); + +#endif /* __S390_PCI_REPORT_H */ diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index cae280e5c047..0ecad08e1b1e 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -23,7 +23,7 @@ static ssize_t name##_show(struct device *dev, \ { \ struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); \ \ - return sprintf(buf, fmt, zdev->member); \ + return sysfs_emit(buf, fmt, zdev->member); \ } \ static DEVICE_ATTR_RO(name) @@ -34,6 +34,7 @@ zpci_attr(pfgid, "0x%02x\n", pfgid); zpci_attr(vfn, "0x%04x\n", vfn); zpci_attr(pft, "0x%02x\n", pft); zpci_attr(port, "%d\n", port); +zpci_attr(fidparm, "0x%02x\n", fidparm); zpci_attr(uid, "0x%x\n", uid); zpci_attr(segment0, "0x%02x\n", pfip[0]); zpci_attr(segment1, "0x%02x\n", pfip[1]); @@ -45,10 +46,34 @@ static ssize_t mio_enabled_show(struct device *dev, { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - return sprintf(buf, zpci_use_mio(zdev) ? "1\n" : "0\n"); + return sysfs_emit(buf, zpci_use_mio(zdev) ? "1\n" : "0\n"); } static DEVICE_ATTR_RO(mio_enabled); +static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev) +{ + int ret; + + pci_stop_and_remove_bus_device(pdev); + if (zdev_enabled(zdev)) { + ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state the FH may indicate an enabled device but + * disable says the device is already disabled don't + * treat it as an error here. + */ + if (ret == -EINVAL) + ret = 0; + if (ret) + return ret; + } + + ret = zpci_reenable_device(zdev); + + return ret; +} + static ssize_t recover_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -69,6 +94,12 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, */ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); WARN_ON_ONCE(!kn); + + /* Device needs to be configured and state must not change */ + mutex_lock(&zdev->state_lock); + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + goto out; + /* device_remove_file() serializes concurrent calls ignoring all but * the first */ @@ -81,39 +112,13 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, */ pci_lock_rescan_remove(); if (pci_dev_is_added(pdev)) { - pci_stop_and_remove_bus_device(pdev); - if (zdev->dma_table) { - ret = zpci_dma_exit_device(zdev); - if (ret) - goto out; - } - - if (zdev_enabled(zdev)) { - ret = zpci_disable_device(zdev); - /* - * Due to a z/VM vs LPAR inconsistency in the error - * state the FH may indicate an enabled device but - * disable says the device is already disabled don't - * treat it as an error here. - */ - if (ret == -EINVAL) - ret = 0; - if (ret) - goto out; - } - - ret = zpci_enable_device(zdev); - if (ret) - goto out; - ret = zpci_dma_init_device(zdev); - if (ret) { - zpci_disable_device(zdev); - goto out; - } - pci_rescan_bus(zdev->zbus->bus); + ret = _do_recover(pdev, zdev); } -out: + pci_rescan_bus(zdev->zbus->bus); pci_unlock_rescan_remove(); + +out: + mutex_unlock(&zdev->state_lock); if (kn) sysfs_unbreak_active_protection(kn); return ret ? ret : count; @@ -121,7 +126,7 @@ out: static DEVICE_ATTR_WO(recover); static ssize_t util_string_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); @@ -131,10 +136,10 @@ static ssize_t util_string_read(struct file *filp, struct kobject *kobj, return memory_read_from_buffer(buf, count, &off, zdev->util_str, sizeof(zdev->util_str)); } -static BIN_ATTR_RO(util_string, CLP_UTIL_STR_LEN); +static const BIN_ATTR_RO(util_string, CLP_UTIL_STR_LEN); static ssize_t report_error_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct zpci_report_error_header *report = (void *) buf; @@ -150,7 +155,7 @@ static ssize_t report_error_write(struct file *filp, struct kobject *kobj, return ret ? ret : count; } -static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); +static const BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); static ssize_t uid_is_unique_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -159,7 +164,6 @@ static ssize_t uid_is_unique_show(struct device *dev, } static DEVICE_ATTR_RO(uid_is_unique); -#ifndef CONFIG_DMI /* analogous to smbios index */ static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -185,13 +189,12 @@ static struct attribute *zpci_ident_attrs[] = { NULL, }; -static struct attribute_group zpci_ident_attr_group = { +const struct attribute_group zpci_ident_attr_group = { .attrs = zpci_ident_attrs, .is_visible = zpci_index_is_visible, }; -#endif -static struct bin_attribute *zpci_bin_attrs[] = { +static const struct bin_attribute *const zpci_bin_attrs[] = { &bin_attr_util_string, &bin_attr_report_error, NULL, @@ -204,6 +207,7 @@ static struct attribute *zpci_dev_attrs[] = { &dev_attr_pfgid.attr, &dev_attr_pft.attr, &dev_attr_port.attr, + &dev_attr_fidparm.attr, &dev_attr_vfn.attr, &dev_attr_uid.attr, &dev_attr_recover.attr, @@ -212,9 +216,9 @@ static struct attribute *zpci_dev_attrs[] = { NULL, }; -static struct attribute_group zpci_attr_group = { +const struct attribute_group zpci_attr_group = { .attrs = zpci_dev_attrs, - .bin_attrs = zpci_bin_attrs, + .bin_attrs_new = zpci_bin_attrs, }; static struct attribute *pfip_attrs[] = { @@ -224,16 +228,8 @@ static struct attribute *pfip_attrs[] = { &dev_attr_segment3.attr, NULL, }; -static struct attribute_group pfip_attr_group = { + +const struct attribute_group pfip_attr_group = { .name = "pfip", .attrs = pfip_attrs, }; - -const struct attribute_group *zpci_attr_groups[] = { - &zpci_attr_group, - &pfip_attr_group, -#ifndef CONFIG_DMI - &zpci_ident_attr_group, -#endif - NULL, -}; diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index d237bc6841cb..bd39b36e7bd6 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD := y - purgatory-y := head.o purgatory.o string.o sha256.o mem.o targets += $(purgatory-y) purgatory.lds purgatory purgatory.chk purgatory.ro @@ -10,25 +8,22 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS +CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -KCOV_INSTRUMENT := n -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - -KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS += -D__DISABLE_EXPORTS # Since we link purgatory with -r unresolved symbols are not checked, so we # also link a purgatory.chk binary without -r to check for unresolved symbols. diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S index 6f835124ee82..db3ab2402621 100644 --- a/arch/s390/purgatory/head.S +++ b/arch/s390/purgatory/head.S @@ -76,9 +76,9 @@ diag %r0,%r1,0x308 .endm -.text -.align PAGE_SIZE -ENTRY(purgatory_start) + .text + .balign PAGE_SIZE +SYM_CODE_START(purgatory_start) /* The purgatory might be called after a diag308 so better set * architecture and addressing mode. */ @@ -100,7 +100,7 @@ ENTRY(purgatory_start) * checksum verification only (%r2 = 0 -> verification only). * * Check now and preserve over C function call by storing in - * %r10 whith + * %r10 with * 1 -> checksum verification only * 0 -> load new kernel */ @@ -156,7 +156,7 @@ ENTRY(purgatory_start) agr %r10,%r9 /* Buffer location (in crash memory) and size. As the purgatory is - * behind the point of no return it can re-use the stack as buffer. + * behind the point of no return it can reuse the stack as buffer. */ larl %r11,purgatory_end larl %r12,stack @@ -245,45 +245,21 @@ ENTRY(purgatory_start) /* start crash kernel */ START_NEXT_KERNEL .base_dst 1 - - -load_psw_mask: - .long 0x00080000,0x80000000 - - .align 8 -disabled_wait_psw: - .quad 0x0002000180000000 - .quad 0x0000000000000000 + .do_checksum_verification - -gprregs: - .rept 10 - .quad 0 - .endr - -/* Macro to define a global variable with name and size (in bytes) to be - * shared with C code. - * - * Add the .size and .type attribute to satisfy checks on the Elf_Sym during - * purgatory load. - */ -.macro GLOBAL_VARIABLE name,size -\name: - .global \name - .size \name,\size - .type \name,object - .skip \size,0 -.endm - -GLOBAL_VARIABLE purgatory_sha256_digest,32 -GLOBAL_VARIABLE purgatory_sha_regions,16*__KEXEC_SHA_REGION_SIZE -GLOBAL_VARIABLE kernel_entry,8 -GLOBAL_VARIABLE kernel_type,8 -GLOBAL_VARIABLE crash_start,8 -GLOBAL_VARIABLE crash_size,8 - - .align PAGE_SIZE -stack: +SYM_CODE_END(purgatory_start) + +SYM_DATA_LOCAL(load_psw_mask, .long 0x00080000,0x80000000) + .balign 8 +SYM_DATA_LOCAL(disabled_wait_psw, .quad 0x0002000180000000,.do_checksum_verification) +SYM_DATA_LOCAL(gprregs, .fill 10,8,0) +SYM_DATA(purgatory_sha256_digest, .skip 32) +SYM_DATA(purgatory_sha_regions, .skip 16*__KEXEC_SHA_REGION_SIZE) +SYM_DATA(kernel_entry, .skip 8) +SYM_DATA(kernel_type, .skip 8) +SYM_DATA(crash_start, .skip 8) +SYM_DATA(crash_size, .skip 8) + .balign PAGE_SIZE +SYM_DATA_START_LOCAL(stack) /* The buffer to move this code must be as big as the code. */ .skip stack-purgatory_start - .align PAGE_SIZE -purgatory_end: + .balign PAGE_SIZE +SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, purgatory_end) diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S index 8293753100ae..25f512b1de12 100644 --- a/arch/s390/purgatory/kexec-purgatory.S +++ b/arch/s390/purgatory/kexec-purgatory.S @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> .section .rodata, "a" - .align 8 -kexec_purgatory: - .globl kexec_purgatory + .balign 8 +SYM_DATA_START(kexec_purgatory) .incbin "arch/s390/purgatory/purgatory.ro" -.Lkexec_purgatroy_end: +SYM_DATA_END_LABEL(kexec_purgatory, SYM_L_LOCAL, kexec_purgatory_end) - .align 8 -kexec_purgatory_size: - .globl kexec_purgatory_size - .quad .Lkexec_purgatroy_end - kexec_purgatory + .balign 8 +SYM_DATA(kexec_purgatory_size, .quad kexec_purgatory_end-kexec_purgatory) diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore index ea62f37b79ef..e6af51d9d183 100644 --- a/arch/s390/tools/.gitignore +++ b/arch/s390/tools/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only gen_facilities gen_opcode_table +relocs diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index f9dd47ff9ac4..f2862364fb42 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -25,3 +25,8 @@ $(kapi)/facility-defs.h: $(obj)/gen_facilities FORCE $(kapi)/dis-defs.h: $(obj)/gen_opcode_table FORCE $(call filechk,dis-defs.h) + +hostprogs += relocs +PHONY += relocs +relocs: $(obj)/relocs + @: diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 530dd941d140..d5c68ade71ab 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -46,6 +46,7 @@ static struct facility_def facility_defs[] = { #endif #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES 53, /* load-and-zero-rightmost-byte, etc. */ + 129, /* vector */ #endif #ifdef CONFIG_HAVE_MARCH_Z14_FEATURES 58, /* miscellaneous-instruction-extension 2 */ @@ -53,6 +54,9 @@ static struct facility_def facility_defs[] = { #ifdef CONFIG_HAVE_MARCH_Z15_FEATURES 61, /* miscellaneous-instruction-extension 3 */ #endif +#ifdef CONFIG_HAVE_MARCH_Z17_FEATURES + 84, /* miscellaneous-instruction-extension 4 */ +#endif -1 /* END */ } }, @@ -108,9 +112,12 @@ static struct facility_def facility_defs[] = { 15, /* AP Facilities Test */ 156, /* etoken facility */ 165, /* nnpa facility */ + 170, /* ineffective-nonconstrained-transaction facility */ 193, /* bear enhancement facility */ 194, /* rdp enhancement facility */ 196, /* processor activity instrumentation facility */ + 197, /* processor activity instrumentation extension 1 */ + 201, /* concurrent-functions facility */ -1 /* END */ } }, diff --git a/arch/s390/tools/gen_opcode_table.c b/arch/s390/tools/gen_opcode_table.c index a1bc02b29c81..7d76c417f83f 100644 --- a/arch/s390/tools/gen_opcode_table.c +++ b/arch/s390/tools/gen_opcode_table.c @@ -201,6 +201,17 @@ static int cmp_long_insn(const void *a, const void *b) return strcmp(((struct insn *)a)->name, ((struct insn *)b)->name); } +static void print_insn_name(const char *name) +{ + size_t i, len; + + len = strlen(name); + printf("{"); + for (i = 0; i < len; i++) + printf(" \'%c\',", name[i]); + printf(" }"); +} + static void print_long_insn(struct gen_opcode *desc) { struct insn *insn; @@ -223,7 +234,9 @@ static void print_long_insn(struct gen_opcode *desc) insn = &desc->insn[i]; if (insn->name_len < 6) continue; - printf("\t[LONG_INSN_%s] = \"%s\", \\\n", insn->upper, insn->name); + printf("\t[LONG_INSN_%s] = ", insn->upper); + print_insn_name(insn->name); + printf(", \\\n"); } printf("}\n\n"); } @@ -236,11 +249,13 @@ static void print_opcode(struct insn *insn, int nr) if (insn->type->byte != 0) opcode += 2; printf("\t[%4d] = { .opfrag = 0x%s, .format = INSTR_%s, ", nr, opcode, insn->format); - if (insn->name_len < 6) - printf(".name = \"%s\" ", insn->name); - else - printf(".offset = LONG_INSN_%s ", insn->upper); - printf("}, \\\n"); + if (insn->name_len < 6) { + printf(".name = "); + print_insn_name(insn->name); + } else { + printf(".offset = LONG_INSN_%s", insn->upper); + } + printf(" }, \\\n"); } static void add_to_group(struct gen_opcode *desc, struct insn *insn, int offset) diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 5f008e794898..def2659f6602 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -527,9 +527,9 @@ b938 sortl RRE_RR b939 dfltcc RRF_R0RR2 b93a kdsa RRE_RR b93b nnpa RRE_00 -b93c ppno RRE_RR -b93e kimd RRE_RR -b93f klmd RRE_RR +b93c prno RRE_RR +b93e kimd RRF_U0RR +b93f klmd RRF_U0RR b941 cfdtr RRF_UURF b942 clgdtr RRF_UURF b943 clfdtr RRF_UURF @@ -549,6 +549,10 @@ b964 nngrk RRF_R0RR2 b965 ocgrk RRF_R0RR2 b966 nogrk RRF_R0RR2 b967 nxgrk RRF_R0RR2 +b968 clzg RRE_RR +b969 ctzg RRE_RR +b96c bextg RRF_R0RR2 +b96d bdepg RRF_R0RR2 b972 crt RRF_U0RR b973 clrt RRF_U0RR b974 nnrk RRF_R0RR2 @@ -796,6 +800,16 @@ e35b sy RXY_RRRD e35c mfy RXY_RRRD e35e aly RXY_RRRD e35f sly RXY_RRRD +e360 lxab RXY_RRRD +e361 llxab RXY_RRRD +e362 lxah RXY_RRRD +e363 llxah RXY_RRRD +e364 lxaf RXY_RRRD +e365 llxaf RXY_RRRD +e366 lxag RXY_RRRD +e367 llxag RXY_RRRD +e368 lxaq RXY_RRRD +e369 llxaq RXY_RRRD e370 sthy RXY_RRRD e371 lay RXY_RRRD e372 stcy RXY_RRRD @@ -880,6 +894,8 @@ e63c vupkz VSI_URDV e63d vstrl VSI_URDV e63f vstrlr VRS_RRDV e649 vlip VRI_V0UU2 +e64a vcvdq VRI_VV0UU +e64e vcvbq VRR_VV0U2 e650 vcvb VRR_RV0UU e651 vclzdp VRR_VV0U2 e652 vcvbg VRR_RV0UU @@ -893,7 +909,7 @@ e65b vpsop VRI_VVUUU2 e65c vupkzl VRR_VV0U2 e65d vcfn VRR_VV0UU2 e65e vclfnl VRR_VV0UU2 -e65f vtp VRR_0V +e65f vtp VRR_0V0U e670 vpkzr VRI_VVV0UU2 e671 vap VRI_VVV0UU2 e672 vsrpr VRI_VVV0UU2 @@ -908,6 +924,7 @@ e67b vrp VRI_VVV0UU2 e67c vscshp VRR_VVV e67d vcsph VRR_VVV0U0 e67e vsdp VRI_VVV0UU2 +e67f vtz VRR_0VVU e700 vleb VRX_VRRDU e701 vleh VRX_VRRDU e702 vleg VRX_VRRDU @@ -948,6 +965,7 @@ e74d vrep VRI_VVUU e750 vpopct VRR_VV0U e752 vctz VRR_VV0U e753 vclz VRR_VV0U +e754 vgem VRR_VV0U e756 vlr VRX_VV e75c vistr VRR_VV0U0U e75f vseg VRR_VV0U @@ -985,6 +1003,8 @@ e784 vpdi VRR_VVV0U e785 vbperm VRR_VVV e786 vsld VRI_VVV0U e787 vsrd VRI_VVV0U +e788 veval VRI_VVV0UV +e789 vblend VRR_VVVU0V e78a vstrc VRR_VVVUU0V e78b vstrs VRR_VVVUU0V e78c vperm VRR_VVV0V @@ -1010,6 +1030,10 @@ e7ac vmale VRR_VVVU0V e7ad vmalo VRR_VVVU0V e7ae vmae VRR_VVVU0V e7af vmao VRR_VVVU0V +e7b0 vdl VRR_VVV0UU +e7b1 vrl VRR_VVV0UU +e7b2 vd VRR_VVV0UU +e7b3 vr VRR_VVV0UU e7b4 vgfm VRR_VVV0U e7b8 vmsl VRR_VVVUU0V e7b9 vaccc VRR_VVVU0V @@ -1017,12 +1041,12 @@ e7bb vac VRR_VVVU0V e7bc vgfma VRR_VVVU0V e7bd vsbcbi VRR_VVVU0V e7bf vsbi VRR_VVVU0V -e7c0 vclgd VRR_VV0UUU -e7c1 vcdlg VRR_VV0UUU -e7c2 vcgd VRR_VV0UUU -e7c3 vcdg VRR_VV0UUU -e7c4 vlde VRR_VV0UU2 -e7c5 vled VRR_VV0UUU +e7c0 vclfp VRR_VV0UUU +e7c1 vcfpl VRR_VV0UUU +e7c2 vcsfp VRR_VV0UUU +e7c3 vcfps VRR_VV0UUU +e7c4 vfll VRR_VV0UU2 +e7c5 vflr VRR_VV0UUU e7c7 vfi VRR_VV0UUU e7ca wfk VRR_VV0UU2 e7cb wfc VRR_VV0UU2 @@ -1094,9 +1118,9 @@ eb54 niy SIY_URD eb55 cliy SIY_URD eb56 oiy SIY_URD eb57 xiy SIY_URD -eb60 lric RSY_RDRU -eb61 stric RSY_RDRU -eb62 mric RSY_RDRU +eb60 lric RSY_RURD2 +eb61 stric RSY_RURD2 +eb62 mric RSY_RURD2 eb6a asi SIY_IRD eb6e alsi SIY_IRD eb71 lpswey SIY_RD @@ -1104,7 +1128,7 @@ eb7a agsi SIY_IRD eb7e algsi SIY_IRD eb80 icmh RSY_RURD eb81 icmy RSY_RURD -eb8a sqbs RSY_RDRU +eb8a sqbs RSY_RURD2 eb8e mvclu RSY_RRRD eb8f clclu RSY_RRRD eb90 stmy RSY_RRRD diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c new file mode 100644 index 000000000000..30a732c808f3 --- /dev/null +++ b/arch/s390/tools/relocs.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <elf.h> +#include <byteswap.h> +#define USE_BSD +#include <endian.h> + +#define ELF_BITS 64 + +#define ELF_MACHINE EM_S390 +#define ELF_MACHINE_NAME "IBM S/390" +#define SHT_REL_TYPE SHT_RELA +#define Elf_Rel Elf64_Rela + +#define ELF_CLASS ELFCLASS64 +#define ELF_ENDIAN ELFDATA2MSB +#define ELF_R_SYM(val) ELF64_R_SYM(val) +#define ELF_R_TYPE(val) ELF64_R_TYPE(val) +#define ELF_ST_TYPE(o) ELF64_ST_TYPE(o) +#define ELF_ST_BIND(o) ELF64_ST_BIND(o) +#define ELF_ST_VISIBILITY(o) ELF64_ST_VISIBILITY(o) + +#define ElfW(type) _ElfW(ELF_BITS, type) +#define _ElfW(bits, type) __ElfW(bits, type) +#define __ElfW(bits, type) Elf##bits##_##type + +#define Elf_Addr ElfW(Addr) +#define Elf_Ehdr ElfW(Ehdr) +#define Elf_Phdr ElfW(Phdr) +#define Elf_Shdr ElfW(Shdr) +#define Elf_Sym ElfW(Sym) + +static Elf_Ehdr ehdr; +static unsigned long shnum; +static unsigned int shstrndx; + +struct relocs { + uint32_t *offset; + unsigned long count; + unsigned long size; +}; + +static struct relocs relocs64; +#define FMT PRIu64 + +struct section { + Elf_Shdr shdr; + struct section *link; + Elf_Rel *reltab; +}; + +static struct section *secs; + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#define le64_to_cpu(val) (val) +#define be16_to_cpu(val) bswap_16(val) +#define be32_to_cpu(val) bswap_32(val) +#define be64_to_cpu(val) bswap_64(val) +#endif + +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#define le64_to_cpu(val) bswap_64(val) +#define be16_to_cpu(val) (val) +#define be32_to_cpu(val) (val) +#define be64_to_cpu(val) (val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + if (ehdr.e_ident[EI_DATA] == ELFDATA2LSB) + return le16_to_cpu(val); + else + return be16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + if (ehdr.e_ident[EI_DATA] == ELFDATA2LSB) + return le32_to_cpu(val); + else + return be32_to_cpu(val); +} + +#define elf_half_to_cpu(x) elf16_to_cpu(x) +#define elf_word_to_cpu(x) elf32_to_cpu(x) + +static uint64_t elf64_to_cpu(uint64_t val) +{ + return be64_to_cpu(val); +} + +#define elf_addr_to_cpu(x) elf64_to_cpu(x) +#define elf_off_to_cpu(x) elf64_to_cpu(x) +#define elf_xword_to_cpu(x) elf64_to_cpu(x) + +static void die(char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) + die("Cannot read ELF header: %s\n", strerror(errno)); + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) + die("No ELF magic\n"); + if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) + die("Not a %d bit executable\n", ELF_BITS); + if (ehdr.e_ident[EI_DATA] != ELF_ENDIAN) + die("ELF endian mismatch\n"); + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) + die("Unknown ELF version\n"); + + /* Convert the fields to native endian */ + ehdr.e_type = elf_half_to_cpu(ehdr.e_type); + ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine); + ehdr.e_version = elf_word_to_cpu(ehdr.e_version); + ehdr.e_entry = elf_addr_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf_off_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf_off_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf_word_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf_half_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf_half_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf_half_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = elf_half_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf_half_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf_half_to_cpu(ehdr.e_shstrndx); + + shnum = ehdr.e_shnum; + shstrndx = ehdr.e_shstrndx; + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) + die("Unsupported ELF header type\n"); + if (ehdr.e_machine != ELF_MACHINE) + die("Not for %s\n", ELF_MACHINE_NAME); + if (ehdr.e_version != EV_CURRENT) + die("Unknown ELF version\n"); + if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) + die("Bad Elf header size\n"); + if (ehdr.e_phentsize != sizeof(Elf_Phdr)) + die("Bad program header entry\n"); + if (ehdr.e_shentsize != sizeof(Elf_Shdr)) + die("Bad section header entry\n"); + + if (shnum == SHN_UNDEF || shstrndx == SHN_XINDEX) { + Elf_Shdr shdr; + + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno)); + + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) + die("Cannot read initial ELF section header: %s\n", strerror(errno)); + + if (shnum == SHN_UNDEF) + shnum = elf_xword_to_cpu(shdr.sh_size); + + if (shstrndx == SHN_XINDEX) + shstrndx = elf_word_to_cpu(shdr.sh_link); + } + + if (shstrndx >= shnum) + die("String table index out of bounds\n"); +} + +static void read_shdrs(FILE *fp) +{ + Elf_Shdr shdr; + int i; + + secs = calloc(shnum, sizeof(struct section)); + if (!secs) + die("Unable to allocate %ld section headers\n", shnum); + + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno)); + + for (i = 0; i < shnum; i++) { + struct section *sec = &secs[i]; + + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) { + die("Cannot read ELF section headers %d/%ld: %s\n", + i, shnum, strerror(errno)); + } + + sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf_addr_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf_off_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf_xword_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf_word_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf_word_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf_xword_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf_xword_to_cpu(shdr.sh_entsize); + + if (sec->shdr.sh_link < shnum) + sec->link = &secs[sec->shdr.sh_link]; + } + +} + +static void read_relocs(FILE *fp) +{ + int i, j; + + for (i = 0; i < shnum; i++) { + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL_TYPE) + continue; + + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) + die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size); + + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno)); + + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size) + die("Cannot read symbol table: %s\n", strerror(errno)); + + for (j = 0; j < sec->shdr.sh_size / sizeof(Elf_Rel); j++) { + Elf_Rel *rel = &sec->reltab[j]; + + rel->r_offset = elf_addr_to_cpu(rel->r_offset); + rel->r_info = elf_xword_to_cpu(rel->r_info); +#if (SHT_REL_TYPE == SHT_RELA) + rel->r_addend = elf_xword_to_cpu(rel->r_addend); +#endif + } + } +} + +static void add_reloc(struct relocs *r, uint32_t offset) +{ + if (r->count == r->size) { + unsigned long newsize = r->size + 50000; + void *mem = realloc(r->offset, newsize * sizeof(r->offset[0])); + + if (!mem) + die("realloc of %ld entries for relocs failed\n", newsize); + + r->offset = mem; + r->size = newsize; + } + r->offset[r->count++] = offset; +} + +static int do_reloc(struct section *sec, Elf_Rel *rel) +{ + unsigned int r_type = ELF64_R_TYPE(rel->r_info); + ElfW(Addr) offset = rel->r_offset; + + switch (r_type) { + case R_390_NONE: + case R_390_PC32: + case R_390_PC64: + case R_390_PC16DBL: + case R_390_PC32DBL: + case R_390_PLT32DBL: + case R_390_GOTENT: + case R_390_GOTPCDBL: + case R_390_GOTOFF64: + break; + case R_390_64: + add_reloc(&relocs64, offset); + break; + default: + die("Unsupported relocation type: %d\n", r_type); + break; + } + + return 0; +} + +static void walk_relocs(void) +{ + int i; + + /* Walk through the relocations */ + for (i = 0; i < shnum; i++) { + struct section *sec_applies; + int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL_TYPE) + continue; + + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) + continue; + + for (j = 0; j < sec->shdr.sh_size / sizeof(Elf_Rel); j++) { + Elf_Rel *rel = &sec->reltab[j]; + + do_reloc(sec, rel); + } + } +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const uint32_t *a, *b; + + a = va; b = vb; + return (*a == *b) ? 0 : (*a > *b) ? 1 : -1; +} + +static void sort_relocs(struct relocs *r) +{ + qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); +} + +static int print_reloc(uint32_t v) +{ + return fprintf(stdout, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1; +} + +static void emit_relocs(void) +{ + int i; + + walk_relocs(); + sort_relocs(&relocs64); + + printf(".section \".vmlinux.relocs_64\",\"a\"\n"); + for (i = 0; i < relocs64.count; i++) + print_reloc(relocs64.offset[i]); +} + +static void process(FILE *fp) +{ + read_ehdr(fp); + read_shdrs(fp); + read_relocs(fp); + emit_relocs(); +} + +static void usage(void) +{ + die("relocs vmlinux\n"); +} + +int main(int argc, char **argv) +{ + unsigned char e_ident[EI_NIDENT]; + const char *fname; + FILE *fp; + + fname = NULL; + + if (argc != 2) + usage(); + + fname = argv[1]; + + fp = fopen(fname, "r"); + if (!fp) + die("Cannot open %s: %s\n", fname, strerror(errno)); + + if (fread(&e_ident, 1, EI_NIDENT, fp) != EI_NIDENT) + die("Cannot read %s: %s", fname, strerror(errno)); + + rewind(fp); + + process(fp); + + fclose(fp); + return 0; +} |