Diffstat (limited to 'arch/arm64/kernel')
119 files changed, 20600 insertions, 9172 deletions
diff --git a/arch/arm64/kernel/.gitignore b/arch/arm64/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/arm64/kernel/.gitignore +++ b/arch/arm64/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index f2b4e816b6de..2f361a883d8c 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -1,67 +1,97 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux kernel. # -CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) -AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_insn.o = -pg -CFLAGS_REMOVE_return_address.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) -CFLAGS_setup.o = -DUTS_MACHINE='"$(UTS_MACHINE)"' +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector + +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + +# It's not safe to invoke KCOV when portions of the kernel environment aren't +# available or are out-of-sync with HW state. Since `noinstr` doesn't always +# inhibit KCOV instrumentation, disable it for the entire compilation unit. +KCOV_INSTRUMENT_entry-common.o := n +KCOV_INSTRUMENT_idle.o := n # Object file lists. -arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ - entry-fpsimd.o process.o ptrace.o setup.o signal.o \ - sys.o stacktrace.o time.o traps.o io.o vdso.o \ - hyp-stub.o psci.o cpu_ops.o insn.o \ +obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ + entry-common.o entry-fpsimd.o process.o ptrace.o \ + setup.o signal.o sys.o stacktrace.o time.o traps.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ - smp.o smp_spin_table.o topology.o smccc-call.o + smp.o smp_spin_table.o topology.o smccc-call.o \ + syscall.o proton-pack.o idreg-override.o idle.o \ + patching.o -extra-$(CONFIG_EFI) := efi-entry.o +targets += efi-entry.o OBJCOPYFLAGS := --prefix-symbols=__efistub_ $(obj)/%.stub.o: $(obj)/%.o FORCE $(call if_changed,objcopy) -arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ - sys_compat.o entry32.o -arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o -arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o -arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o -arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o -arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o -arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o -arm64-obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -arm64-obj-$(CONFIG_CPU_IDLE) += cpuidle.o -arm64-obj-$(CONFIG_JUMP_LABEL) += jump_label.o -arm64-obj-$(CONFIG_KGDB) += kgdb.o -arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o -arm64-obj-$(CONFIG_PCI) += pci.o -arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o -arm64-obj-$(CONFIG_ACPI) += acpi.o -arm64-obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o -arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o -arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o -arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o 
-arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o -arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ +obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ + sys_compat.o +obj-$(CONFIG_COMPAT) += sigreturn32.o +obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o +obj-$(CONFIG_KUSER_HELPERS) += kuser32.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o +obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_CPU_PM) += sleep.o suspend.o +obj-$(CONFIG_CPU_IDLE) += cpuidle.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o \ + efi-rt-wrapper.o +obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o +obj-$(CONFIG_ACPI) += acpi.o +obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o +obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o +obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ +obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o +obj-$(CONFIG_ELF_CORE) += elfcore.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o -arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o +obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o +obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o -arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o +obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MTE) += mte.o +obj-y += vdso-wrap.o +obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o + +# Force dependency (vdso*-wrap.S includes vdso.so through incbin) +$(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so +$(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so -obj-y += $(arm64-obj-y) vdso/ probes/ -obj-m += $(arm64-obj-m) -head-y := head.o -extra-y += $(head-y) vmlinux.lds +obj-y += probes/ +obj-y += head.o +extra-y += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" endif -# will be included by each individual module but not by the core kernel itself -extra-$(CONFIG_DYNAMIC_FTRACE) += ftrace-mod.o +# for cleaning +subdir- += vdso vdso32 diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index b3162715ed78..a5a256e3f9fe 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 Specific Low-Level ACPI Boot Support * @@ -7,35 +8,31 @@ * Author: Hanjun Guo <hanjun.guo@linaro.org> * Author: Tomasz Nowicki <tomasz.nowicki@linaro.org> * Author: Naresh Bhat <naresh.bhat@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> -#include <linux/bootmem.h> #include <linux/cpumask.h> +#include <linux/efi.h> #include <linux/efi-bgrt.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irq_work.h> #include <linux/memblock.h> #include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/pgtable.h> +#include <acpi/ghes.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> +#include <asm/daifflags.h> #include <asm/smp_plat.h> -#ifdef CONFIG_ACPI_APEI -# include <linux/efi.h> -# include <asm/pgtable.h> -#endif - int acpi_noirq = 1; /* skip ACPI IRQ initialization */ int acpi_disabled = 1; EXPORT_SYMBOL(acpi_disabled); @@ -66,29 +63,22 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); -static int __init dt_scan_depth1_nodes(unsigned long node, - const char *uname, int depth, - void *data) +static bool __init dt_is_stub(void) { - /* - * Ignore anything not directly under the root node; we'll - * catch its parent instead. - */ - if (depth != 1) - return 0; + int node; - if (strcmp(uname, "chosen") == 0) - return 0; + fdt_for_each_subnode(node, initial_boot_params, 0) { + const char *name = fdt_get_name(initial_boot_params, node, NULL); + if (strcmp(name, "chosen") == 0) + continue; + if (strcmp(name, "hypervisor") == 0 && + of_flat_dt_is_compatible(node, "xen,xen")) + continue; - if (strcmp(uname, "hypervisor") == 0 && - of_flat_dt_is_compatible(node, "xen,xen")) - return 0; + return false; + } - /* - * This node at depth 1 is neither a chosen node nor a xen node, - * which we do not expect. - */ - return 1; + return true; } /* @@ -117,7 +107,7 @@ bool __init acpi_psci_present(void) } /* Whether HVC must be used instead of SMC as the PSCI conduit */ -bool __init acpi_psci_use_hvc(void) +bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } @@ -157,10 +147,14 @@ static int __init acpi_fadt_sanity_check(void) */ if (table->revision < 5 || (table->revision == 5 && fadt->minor_revision < 1)) { - pr_err("Unsupported FADT revision %d.%d, should be 5.1+\n", + pr_err(FW_BUG "Unsupported FADT revision %d.%d, should be 5.1+\n", table->revision, fadt->minor_revision); - ret = -EINVAL; - goto out; + + if (!fadt->arm_boot_flags) { + ret = -EINVAL; + goto out; + } + pr_err("FADT has ARM boot flags set, assuming 5.1\n"); } if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { @@ -205,8 +199,7 @@ void __init acpi_boot_table_init(void) * and ACPI has not been [force] enabled (acpi=on|force) */ if (param_acpi_off || - (!param_acpi_on && !param_acpi_force && - of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + (!param_acpi_on && !param_acpi_force && !dt_is_stub())) goto done; /* @@ -230,24 +223,35 @@ void __init acpi_boot_table_init(void) done: if (acpi_disabled) { - if (earlycon_init_is_deferred) + if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { - parse_spcr(earlycon_init_is_deferred); + acpi_parse_spcr(earlycon_acpi_spcr_enable, true); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } } -#ifdef CONFIG_ACPI_APEI -pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) +static pgprot_t __acpi_get_writethrough_mem_attribute(void) +{ + /* + * Although UEFI specifies the use of Normal Write-through for + * EFI_MEMORY_WT, it is seldom used in practice and not implemented + * by most (all?) CPUs. 
Rather than allocate a MAIR just for this + * purpose, emit a warning and use Normal Non-cacheable instead. + */ + pr_warn_once("No MAIR allocation for EFI_MEMORY_WT; treating as Normal Non-cacheable\n"); + return __pgprot(PROT_NORMAL_NC); +} + +pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) { /* * According to "Table 8 Map: EFI memory types to AArch64 memory * types" of UEFI 2.5 section 2.3.6.1, each EFI memory type is * mapped to a corresponding MAIR attribute encoding. * The EFI memory attribute advises all possible capabilities - * of a memory region. We use the most efficient capability. + * of a memory region. */ u64 attr; @@ -255,10 +259,155 @@ pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) attr = efi_mem_attributes(addr); if (attr & EFI_MEMORY_WB) return PAGE_KERNEL; - if (attr & EFI_MEMORY_WT) - return __pgprot(PROT_NORMAL_WT); if (attr & EFI_MEMORY_WC) return __pgprot(PROT_NORMAL_NC); + if (attr & EFI_MEMORY_WT) + return __acpi_get_writethrough_mem_attribute(); return __pgprot(PROT_DEVICE_nGnRnE); } -#endif + +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. + */ + prot = __pgprot(PROT_DEVICE_nGnRnE); + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + if (memblock_is_map_memory(phys) || + !memblock_is_region_memory(phys, size)) { + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + } + /* + * Mapping kernel memory is permitted if the region in + * question is covered by a single memblock with the + * NOMAP attribute set: this enables the use of ACPI + * table overrides passed via initramfs, which are + * reserved in memory using arch_reserve_mem_area() + * below. As this particular use case only requires + * read access, fall through to the R/O mapping case. + */ + fallthrough; + + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. + */ + prot = PAGE_KERNEL_RO; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. 
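As an aside on the attribute selection used by __acpi_get_mem_attribute() and the default branch of acpi_os_ioremap(): the precedence is Write-Back, then Write-Combine, then Write-Through (degraded to Normal Non-cacheable, since no MAIR slot is allocated for WT), and finally Device-nGnRnE. A minimal user-space sketch of that ordering, assuming the standard UEFI EFI_MEMORY_* bit values and using plain strings in place of the kernel's pgprot constants:

#include <stdint.h>
#include <stdio.h>

/* UEFI memory attribute bits (UEFI spec memory attribute definitions). */
#define EFI_MEMORY_UC 0x1ULL
#define EFI_MEMORY_WC 0x2ULL
#define EFI_MEMORY_WT 0x4ULL
#define EFI_MEMORY_WB 0x8ULL

/* Mirror of the precedence in the patch: WB, then WC, then WT, else device. */
static const char *attr_to_mapping(uint64_t attr)
{
	if (attr & EFI_MEMORY_WB)
		return "PAGE_KERNEL (Normal Write-Back)";
	if (attr & EFI_MEMORY_WC)
		return "PROT_NORMAL_NC (Normal Non-cacheable)";
	if (attr & EFI_MEMORY_WT)
		return "PROT_NORMAL_NC (WT degraded, no MAIR slot for Write-Through)";
	return "PROT_DEVICE_nGnRnE (Device memory)";
}

int main(void)
{
	uint64_t samples[] = {
		EFI_MEMORY_WB | EFI_MEMORY_WT,	/* strongest cacheable type wins */
		EFI_MEMORY_WC,
		EFI_MEMORY_WT,
		0,				/* nothing advertised: device mapping */
	};

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("attr=0x%llx -> %s\n",
		       (unsigned long long)samples[i], attr_to_mapping(samples[i]));
	return 0;
}

The ordering matters because firmware advertises all possible capabilities of a region, so the strongest cacheable attribute present is the one chosen.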
+ */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__phys_to_virt(phys); + fallthrough; + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if (region->attribute & EFI_MEMORY_WC) + prot = __pgprot(PROT_NORMAL_NC); + else if (region->attribute & EFI_MEMORY_WT) + prot = __acpi_get_writethrough_mem_attribute(); + } + } + return ioremap_prot(phys, size, pgprot_val(prot)); +} + +/* + * Claim Synchronous External Aborts as a firmware first notification. + * + * Used by KVM and the arch do_sea handler. + * @regs may be NULL when called from process context. + */ +int apei_claim_sea(struct pt_regs *regs) +{ + int err = -ENOENT; + bool return_to_irqs_enabled; + unsigned long current_flags; + + if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES)) + return err; + + current_flags = local_daif_save_flags(); + + /* current_flags isn't useful here as daif doesn't tell us about pNMI */ + return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); + + if (regs) + return_to_irqs_enabled = interrupts_enabled(regs); + + /* + * SEA can interrupt SError, mask it and describe this as an NMI so + * that APEI defers the handling. + */ + local_daif_restore(DAIF_ERRCTX); + nmi_enter(); + err = ghes_notify_sea(); + nmi_exit(); + + /* + * APEI NMI-like notifications are deferred to irq_work. Unless + * we interrupted irqs-masked code, we can do that now. + */ + if (!err) { + if (return_to_irqs_enabled) { + local_daif_restore(DAIF_PROCCTX_NOIRQ); + __irq_enter(); + irq_work_run(); + __irq_exit(); + } else { + pr_warn_ratelimited("APEI work queued but not completed"); + err = -EINPROGRESS; + } + } + + local_daif_restore(current_flags); + + return err; +} + +void arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + memblock_mark_nomap(addr, size); +} diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index f01fab637dab..e51535a5f939 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ACPI 5.1 based NUMA setup for ARM64 * Lots of code was borrowed from arch/x86/mm/srat.c @@ -17,7 +18,6 @@ #include <linux/acpi.h> #include <linux/bitmap.h> -#include <linux/bootmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> @@ -25,36 +25,73 @@ #include <linux/module.h> #include <linux/topology.h> -#include <acpi/processor.h> #include <asm/numa.h> -static int cpus_in_srat; +static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; -struct __node_cpu_hwid { - u32 node_id; /* logical node containing this CPU */ - u64 cpu_hwid; /* MPIDR for this CPU */ -}; +int __init acpi_numa_get_nid(unsigned int cpu) +{ + return acpi_early_node_map[cpu]; +} + +static inline int get_cpu_for_acpi_id(u32 uid) +{ + int cpu; -static struct __node_cpu_hwid early_node_cpu_hwid[NR_CPUS] = { -[0 ... 
NR_CPUS - 1] = {NUMA_NO_NODE, PHYS_CPUID_INVALID} }; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + if (uid == get_acpi_id_for_cpu(cpu)) + return cpu; -int acpi_numa_get_nid(unsigned int cpu, u64 hwid) + return -EINVAL; +} + +static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, + const unsigned long end) { - int i; + struct acpi_srat_gicc_affinity *pa; + int cpu, pxm, node; - for (i = 0; i < cpus_in_srat; i++) { - if (hwid == early_node_cpu_hwid[i].cpu_hwid) - return early_node_cpu_hwid[i].node_id; - } + if (srat_disabled()) + return -EINVAL; + + pa = (struct acpi_srat_gicc_affinity *)header; + if (!pa) + return -EINVAL; + + if (!(pa->flags & ACPI_SRAT_GICC_ENABLED)) + return 0; - return NUMA_NO_NODE; + pxm = pa->proximity_domain; + node = pxm_to_node(pxm); + + /* + * If we can't map the UID to a logical cpu this + * means that the UID is not part of possible cpus + * so we do not need a NUMA mapping for it, skip + * the SRAT entry and keep parsing. + */ + cpu = get_cpu_for_acpi_id(pa->acpi_processor_uid); + if (cpu < 0) + return 0; + + acpi_early_node_map[cpu] = node; + pr_info("SRAT: PXM %d -> MPIDR 0x%llx -> Node %d\n", pxm, + cpu_logical_map(cpu), node); + + return 0; +} + +void __init acpi_map_cpus_to_nodes(void) +{ + acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_GICC_AFFINITY, + acpi_parse_gicc_pxm, 0); } /* Callback for Proximity Domain -> ACPI processor UID mapping */ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { int pxm, node; - phys_cpuid_t mpidr; if (srat_disabled()) return; @@ -69,46 +106,15 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) if (!(pa->flags & ACPI_SRAT_GICC_ENABLED)) return; - if (cpus_in_srat >= NR_CPUS) { - pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not be able to use all cpus\n", - NR_CPUS); - return; - } - pxm = pa->proximity_domain; node = acpi_map_pxm_to_node(pxm); - if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + if (node == NUMA_NO_NODE) { pr_err("SRAT: Too many proximity domains %d\n", pxm); bad_srat(); return; } - mpidr = acpi_map_madt_entry(pa->acpi_processor_uid); - if (mpidr == PHYS_CPUID_INVALID) { - pr_err("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in MADT\n", - pxm, pa->acpi_processor_uid); - bad_srat(); - return; - } - - early_node_cpu_hwid[cpus_in_srat].node_id = node; - early_node_cpu_hwid[cpus_in_srat].cpu_hwid = mpidr; node_set(node, numa_nodes_parsed); - cpus_in_srat++; - pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d\n", - pxm, mpidr, node); } -int __init arm64_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret) { - pr_info("Failed to initialise from firmware\n"); - return ret; - } - - return srat_disabled() ? -EINVAL : 0; -} diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index 98a20e58758b..b1990e38aed0 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 ACPI Parking Protocol implementation * * Authors: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> * Mark Salter <msalter@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
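The acpi_numa.c rework above parses SRAT GICC affinity entries by ACPI processor UID rather than by MPIDR: each enabled entry is mapped to a logical CPU via get_cpu_for_acpi_id() and its proximity domain is recorded in acpi_early_node_map[]. A compact user-space sketch of that flow, with a deliberately simplified record layout (only the three fields actually consumed), a hypothetical cpu_uid[] table standing in for get_acpi_id_for_cpu(), and pxm_to_node() elided:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS			8
#define NUMA_NO_NODE		(-1)
#define ACPI_SRAT_GICC_ENABLED	(1)	/* bit 0 of the affinity flags */

/* Simplified stand-in for ACPICA's struct acpi_srat_gicc_affinity; only the
 * fields consumed by acpi_parse_gicc_pxm() are kept. */
struct gicc_affinity {
	uint32_t proximity_domain;
	uint32_t acpi_processor_uid;
	uint32_t flags;
};

/* Hypothetical firmware-assigned UID for each logical cpu. */
static const uint32_t cpu_uid[NR_CPUS] = { 0, 1, 2, 3, 4, 5, 6, 7 };
static int early_node_map[NR_CPUS] = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE };

static int get_cpu_for_acpi_id(uint32_t uid)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (uid == cpu_uid[cpu])
			return cpu;
	return -1;
}

static void parse_gicc_pxm(const struct gicc_affinity *pa)
{
	if (!(pa->flags & ACPI_SRAT_GICC_ENABLED))
		return;		/* disabled entries carry no mapping */

	int cpu = get_cpu_for_acpi_id(pa->acpi_processor_uid);
	if (cpu < 0)
		return;		/* UID is not a possible cpu: skip, keep parsing */

	early_node_map[cpu] = (int)pa->proximity_domain;	/* pxm_to_node() elided */
	printf("SRAT: PXM %u -> cpu %d\n", (unsigned int)pa->proximity_domain, cpu);
}

int main(void)
{
	struct gicc_affinity sample = {
		.proximity_domain = 1,
		.acpi_processor_uid = 3,
		.flags = ACPI_SRAT_GICC_ENABLED,
	};

	parse_gicc_pxm(&sample);
	return 0;
}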
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/acpi.h> #include <linux/mm.h> @@ -110,7 +99,8 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) * that read this address need to convert this address to the * Boot-Loader's endianness before jumping. */ - writeq_relaxed(__pa_symbol(secondary_entry), &mailbox->entry_point); + writeq_relaxed(__pa_symbol(secondary_entry), + &mailbox->entry_point); writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); arch_send_wakeup_ipi_mask(cpumask_of(cpu)); diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 6dd0a3a3e5c9..91263d09ea65 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -1,66 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * alternative runtime patching * inspired by the x86 version * * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "alternatives: " fmt #include <linux/init.h> #include <linux/cpu.h> +#include <linux/elf.h> #include <asm/cacheflush.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> +#include <asm/module.h> #include <asm/sections.h> +#include <asm/vdso.h> #include <linux/stop_machine.h> -#define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) +#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) +#define ALT_CAP(a) ((a)->cpufeature & ~ARM64_CB_BIT) +#define ALT_HAS_CB(a) ((a)->cpufeature & ARM64_CB_BIT) + +/* Volatile, as we may be patching the guts of READ_ONCE() */ +static volatile int all_alternatives_applied; + +static DECLARE_BITMAP(applied_alternatives, ARM64_NCAPS); + struct alt_region { struct alt_instr *begin; struct alt_instr *end; }; +bool alternative_is_applied(u16 cpufeature) +{ + if (WARN_ON(cpufeature >= ARM64_NCAPS)) + return false; + + return test_bit(cpufeature, applied_alternatives); +} + /* * Check if the target PC is within an alternative block. */ -static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) +static __always_inline bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) { - unsigned long replptr; - - if (kernel_text_address(pc)) - return 1; - - replptr = (unsigned long)ALT_REPL_PTR(alt); - if (pc >= replptr && pc <= (replptr + alt->alt_len)) - return 0; - - /* - * Branching into *another* alternate sequence is doomed, and - * we're not even trying to fix it up. 
- */ - BUG(); + unsigned long replptr = (unsigned long)ALT_REPL_PTR(alt); + return !(pc >= replptr && pc <= (replptr + alt->alt_len)); } #define align_down(x, a) ((unsigned long)(x) & ~(((unsigned long)(a)) - 1)) -static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) +static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) { u32 insn; @@ -105,60 +101,152 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp return insn; } -static void __apply_alternatives(void *alt_region, bool use_linear_alias) +static noinstr void patch_alternative(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + __le32 *replptr; + int i; + + replptr = ALT_REPL_PTR(alt); + for (i = 0; i < nr_inst; i++) { + u32 insn; + + insn = get_alt_insn(alt, origptr + i, replptr + i); + updptr[i] = cpu_to_le32(insn); + } +} + +/* + * We provide our own, private D-cache cleaning function so that we don't + * accidentally call into the cache.S code, which is patched by us at + * runtime. + */ +static void clean_dcache_range_nopatch(u64 start, u64 end) +{ + u64 cur, d_size, ctr_el0; + + ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + d_size = 4 << cpuid_feature_extract_unsigned_field(ctr_el0, + CTR_EL0_DminLine_SHIFT); + cur = start & ~(d_size - 1); + do { + /* + * We must clean+invalidate to the PoC in order to avoid + * Cortex-A53 errata 826319, 827319, 824069 and 819472 + * (this corresponds to ARM64_WORKAROUND_CLEAN_CACHE) + */ + asm volatile("dc civac, %0" : : "r" (cur) : "memory"); + } while (cur += d_size, cur < end); +} + +static void __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *feature_mask) { struct alt_instr *alt; - struct alt_region *region = alt_region; - __le32 *origptr, *replptr, *updptr; + __le32 *origptr, *updptr; + alternative_cb_t alt_cb; for (alt = region->begin; alt < region->end; alt++) { - u32 insn; - int i, nr_inst; + int nr_inst; + int cap = ALT_CAP(alt); - if (!cpus_have_cap(alt->cpufeature)) + if (!test_bit(cap, feature_mask)) continue; - BUG_ON(alt->alt_len != alt->orig_len); + if (!cpus_have_cap(cap)) + continue; - pr_info_once("patching kernel code\n"); + if (ALT_HAS_CB(alt)) + BUG_ON(alt->alt_len != 0); + else + BUG_ON(alt->alt_len != alt->orig_len); origptr = ALT_ORIG_PTR(alt); - replptr = ALT_REPL_PTR(alt); - updptr = use_linear_alias ? lm_alias(origptr) : origptr; - nr_inst = alt->alt_len / sizeof(insn); + updptr = is_module ? origptr : lm_alias(origptr); + nr_inst = alt->orig_len / AARCH64_INSN_SIZE; + + if (ALT_HAS_CB(alt)) + alt_cb = ALT_REPL_PTR(alt); + else + alt_cb = patch_alternative; - for (i = 0; i < nr_inst; i++) { - insn = get_alt_insn(alt, origptr + i, replptr + i); - updptr[i] = cpu_to_le32(insn); + alt_cb(alt, origptr, updptr, nr_inst); + + if (!is_module) { + clean_dcache_range_nopatch((u64)origptr, + (u64)(origptr + nr_inst)); } + } - flush_icache_range((uintptr_t)origptr, - (uintptr_t)(origptr + nr_inst)); + /* + * The core module code takes care of cache maintenance in + * flush_module_icache(). 
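Conceptually, __apply_alternatives() above walks a table of alternative entries, skips those whose capability is not enabled, and then either copies the replacement instructions into place or hands the site to a callback when the ARM64_CB_BIT is set in the cpufeature field. A user-space sketch of that loop follows; the entry layout, the CB_BIT value and the capability bitmap are illustrative stand-ins, and branch fix-ups plus all cache and barrier maintenance are omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCAPS	64
#define CB_BIT	(1u << 15)	/* stand-in for ARM64_CB_BIT in the cpufeature field */
#define NOP	0xd503201fu	/* AArch64 NOP encoding */

struct alt_entry {
	uint32_t *orig;		/* instructions to patch */
	const uint32_t *repl;	/* replacement sequence (unused for callbacks) */
	uint16_t cpufeature;	/* capability id, optionally ORed with CB_BIT */
	int nr_inst;
	void (*cb)(struct alt_entry *alt, uint32_t *updptr, int nr_inst);
};

static bool cap_enabled(const uint64_t *caps, unsigned int cap)
{
	return caps[cap / 64] & (1ull << (cap % 64));
}

static void patch_default(struct alt_entry *alt, uint32_t *updptr, int nr_inst)
{
	/* The real patch_alternative() additionally fixes up branch immediates. */
	memcpy(updptr, alt->repl, nr_inst * sizeof(uint32_t));
}

static void apply_alternatives(struct alt_entry *alts, int n, const uint64_t *caps)
{
	for (int i = 0; i < n; i++) {
		struct alt_entry *alt = &alts[i];
		unsigned int cap = alt->cpufeature & ~CB_BIT;

		if (!cap_enabled(caps, cap))
			continue;
		if (alt->cpufeature & CB_BIT)
			alt->cb(alt, alt->orig, alt->nr_inst);
		else
			patch_default(alt, alt->orig, alt->nr_inst);
	}
}

int main(void)
{
	uint32_t text[2] = { NOP, NOP };
	const uint32_t repl[2] = { 0xd2800020u, 0xd2800041u };	/* mov x0,#1 ; mov x1,#2 */
	uint64_t caps[NCAPS / 64] = { 1ull << 7 };		/* pretend capability 7 is set */
	struct alt_entry alts[] = {
		{ .orig = text, .repl = repl, .cpufeature = 7, .nr_inst = 2 },
	};

	apply_alternatives(alts, 1, caps);
	printf("%08x %08x\n", text[0], text[1]);
	return 0;
}

The real code also cleans the D-cache for each patched range via clean_dcache_range_nopatch(), so that the patcher never calls into cache code it may itself be rewriting.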
+ */ + if (!is_module) { + dsb(ish); + icache_inval_all_pou(); + isb(); + + /* Ignore ARM64_CB bit from feature mask */ + bitmap_or(applied_alternatives, applied_alternatives, + feature_mask, ARM64_NCAPS); + bitmap_and(applied_alternatives, applied_alternatives, + cpu_hwcaps, ARM64_NCAPS); } } +void apply_alternatives_vdso(void) +{ + struct alt_region region; + const struct elf64_hdr *hdr; + const struct elf64_shdr *shdr; + const struct elf64_shdr *alt; + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); + + bitmap_fill(all_capabilities, ARM64_NCAPS); + + hdr = (struct elf64_hdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + + region = (struct alt_region){ + .begin = (void *)hdr + alt->sh_offset, + .end = (void *)hdr + alt->sh_offset + alt->sh_size, + }; + + __apply_alternatives(®ion, false, &all_capabilities[0]); +} + +static const struct alt_region kernel_alternatives = { + .begin = (struct alt_instr *)__alt_instructions, + .end = (struct alt_instr *)__alt_instructions_end, +}; + /* * We might be patching the stop_machine state machine, so implement a * really simple polling protocol here. */ static int __apply_alternatives_multi_stop(void *unused) { - static int patched = 0; - struct alt_region region = { - .begin = (struct alt_instr *)__alt_instructions, - .end = (struct alt_instr *)__alt_instructions_end, - }; - /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(patched)) + while (!all_alternatives_applied) cpu_relax(); isb(); } else { - BUG_ON(patched); - __apply_alternatives(®ion, true); + DECLARE_BITMAP(remaining_capabilities, ARM64_NCAPS); + + bitmap_complement(remaining_capabilities, boot_capabilities, + ARM64_NCAPS); + + BUG_ON(all_alternatives_applied); + __apply_alternatives(&kernel_alternatives, false, + remaining_capabilities); /* Barriers provided by the cache flushing */ - WRITE_ONCE(patched, 1); + all_alternatives_applied = 1; } return 0; @@ -166,16 +254,48 @@ static int __apply_alternatives_multi_stop(void *unused) void __init apply_alternatives_all(void) { + pr_info("applying system-wide alternatives\n"); + + apply_alternatives_vdso(); /* better not try code patching on a live SMP system */ stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask); } -void apply_alternatives(void *start, size_t length) +/* + * This is called very early in the boot process (directly after we run + * a feature detect on the boot CPU). No need to worry about other CPUs + * here. 
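The "really simple polling protocol" in __apply_alternatives_multi_stop() boils down to: CPU 0 patches while every other CPU spins on all_alternatives_applied, then executes an ISB once the flag is set. A rough user-space analogue using threads, purely to illustrate the hand-off (stop_machine(), isb() and the cache maintenance that provide the real ordering guarantees have no equivalent here):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int all_alternatives_applied;

static void *cpu_thread(void *arg)
{
	long id = (long)arg;

	if (id == 0) {
		usleep(1000);		/* pretend to patch the kernel text */
		printf("cpu0: alternatives applied\n");
		atomic_store(&all_alternatives_applied, 1);
	} else {
		while (!atomic_load(&all_alternatives_applied))
			;		/* cpu_relax() in the real code */
		printf("cpu%ld: saw patched text\n", id);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (long i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, cpu_thread, (void *)i);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

(Build with -pthread; the busy-wait stands in for the cpu_relax() loop and the final isb().)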
+ */ +void __init apply_boot_alternatives(void) +{ + /* If called on non-boot cpu things could go wrong */ + WARN_ON(smp_processor_id() != 0); + + pr_info("applying boot alternatives\n"); + + __apply_alternatives(&kernel_alternatives, false, + &boot_capabilities[0]); +} + +#ifdef CONFIG_MODULES +void apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, .end = start + length, }; + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); - __apply_alternatives(®ion, false); + bitmap_fill(all_capabilities, ARM64_NCAPS); + + __apply_alternatives(®ion, true, &all_capabilities[0]); +} +#endif + +noinstr void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + for (int i = 0; i < nr_inst; i++) + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); } +EXPORT_SYMBOL(alt_cb_patch_nops); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c deleted file mode 100644 index 67368c7329c0..000000000000 --- a/arch/arm64/kernel/arm64ksyms.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Based on arch/arm/kernel/armksyms.c - * - * Copyright (C) 2000 Russell King - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/cryptohash.h> -#include <linux/delay.h> -#include <linux/in6.h> -#include <linux/syscalls.h> -#include <linux/uaccess.h> -#include <linux/io.h> -#include <linux/arm-smccc.h> -#include <linux/kprobes.h> - -#include <asm/checksum.h> - -EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); - - /* user mem (segment) */ -EXPORT_SYMBOL(__arch_copy_from_user); -EXPORT_SYMBOL(__arch_copy_to_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(raw_copy_in_user); - - /* physical memory */ -EXPORT_SYMBOL(memstart_addr); - - /* string / mem functions */ -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strcmp); -EXPORT_SYMBOL(strncmp); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strnlen); -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(__memset); -EXPORT_SYMBOL(__memcpy); -EXPORT_SYMBOL(__memmove); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(memcmp); - - /* atomic bitops */ -EXPORT_SYMBOL(set_bit); -EXPORT_SYMBOL(test_and_set_bit); -EXPORT_SYMBOL(clear_bit); -EXPORT_SYMBOL(test_and_clear_bit); -EXPORT_SYMBOL(change_bit); -EXPORT_SYMBOL(test_and_change_bit); - -#ifdef CONFIG_FUNCTION_TRACER -EXPORT_SYMBOL(_mcount); -NOKPROBE_SYMBOL(_mcount); -#endif - - /* arm-smccc */ -EXPORT_SYMBOL(__arm_smccc_smc); -EXPORT_SYMBOL(__arm_smccc_hvc); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index d06fbe4cd38d..fb0e7c7b2e20 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * 
it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/cpu.h> @@ -13,6 +10,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/uaccess.h> #include <asm/cpufeature.h> #include <asm/insn.h> @@ -20,8 +18,6 @@ #include <asm/system_misc.h> #include <asm/traps.h> #include <asm/kprobes.h> -#include <linux/uaccess.h> -#include <asm/cpufeature.h> #define CREATE_TRACE_POINTS #include "trace-events-emulation.h" @@ -63,6 +59,7 @@ struct insn_emulation { static LIST_HEAD(insn_emulation); static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); +static DEFINE_MUTEX(insn_emulation_mutex); static void register_emulation_hooks(struct insn_emulation_ops *ops) { @@ -178,6 +175,9 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) struct insn_emulation *insn; insn = kzalloc(sizeof(*insn), GFP_KERNEL); + if (!insn) + return; + insn->ops = ops; insn->min = INSN_UNDEF; @@ -204,14 +204,14 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; - struct insn_emulation *insn = (struct insn_emulation *) table->data; + struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode); enum insn_emulation_mode prev_mode = insn->current_mode; - table->data = &insn->current_mode; + mutex_lock(&insn_emulation_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write || prev_mode == insn->current_mode) @@ -224,27 +224,21 @@ static int emulation_proc_handler(struct ctl_table *table, int write, update_insn_emulation_mode(insn, INSN_UNDEF); } ret: - table->data = insn; + mutex_unlock(&insn_emulation_mutex); return ret; } -static struct ctl_table ctl_abi[] = { - { - .procname = "abi", - .mode = 0555, - }, - { } -}; - -static void __init register_insn_emulation_sysctl(struct ctl_table *table) +static void __init register_insn_emulation_sysctl(void) { unsigned long flags; int i = 0; struct insn_emulation *insn; struct ctl_table *insns_sysctl, *sysctl; - insns_sysctl = kzalloc(sizeof(*sysctl) * (nr_insn_emulated + 1), - GFP_KERNEL); + insns_sysctl = kcalloc(nr_insn_emulated + 1, sizeof(*sysctl), + GFP_KERNEL); + if (!insns_sysctl) + return; raw_spin_lock_irqsave(&insn_emulation_lock, flags); list_for_each_entry(insn, &insn_emulation, node) { @@ -254,7 +248,7 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) sysctl->maxlen = sizeof(int); sysctl->procname = insn->ops->name; - sysctl->data = insn; + sysctl->data = &insn->current_mode; sysctl->extra1 = &insn->min; sysctl->extra2 = &insn->max; sysctl->proc_handler = emulation_proc_handler; @@ -262,8 +256,7 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) } raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - table->child = insns_sysctl; - register_sysctl_table(table); + register_sysctl("abi", insns_sysctl); } /* @@ -285,9 +278,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ do { \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ __asm__ __volatile__( \ - " mov %w3, %w7\n" \ + " mov %w3, %w6\n" \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -298,19 +291,13 @@ do { \ "2:\n" 
\ " mov %w1, %w2\n" \ "3:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "4: mov %w0, %w6\n" \ - " b 3b\n" \ - " .popsection" \ - _ASM_EXTABLE(0b, 4b) \ - _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE_UACCESS_ERR(0b, 3b, %w0) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" ((unsigned long)addr), "i" (-EAGAIN), \ - "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ @@ -379,6 +366,7 @@ static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) static int swp_handler(struct pt_regs *regs, u32 instr) { u32 destreg, data, type, address = 0; + const void __user *user_ptr; int rn, rt2, res = 0; perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc); @@ -410,7 +398,8 @@ static int swp_handler(struct pt_regs *regs, u32 instr) aarch32_insn_extract_reg_num(instr, A32_RT2_OFFSET), data); /* Check access in reasonable access range for both SWP and SWPB */ - if (!access_ok(VERIFY_WRITE, (address & ~3), 4)) { + user_ptr = (const void __user *)(unsigned long)(address & ~3); + if (!access_ok(user_ptr, 4)) { pr_debug("SWP{B} emulation: access to 0x%08x not allowed!\n", address); goto fault; @@ -431,12 +420,12 @@ ret: pr_warn_ratelimited("\"%s\" (%ld) uses obsolete SWP{B} instruction at 0x%llx\n", current->comm, (unsigned long)current->pid, regs->pc); - regs->pc += 4; + arm64_skip_faulting_instruction(regs, 4); return 0; fault: pr_debug("SWP{B} emulation: access caused memory abort!\n"); - arm64_notify_segfault(regs, address); + arm64_notify_segfault(address); return 0; } @@ -449,8 +438,8 @@ static struct undef_hook swp_hooks[] = { { .instr_mask = 0x0fb00ff0, .instr_val = 0x01000090, - .pstate_mask = COMPAT_PSR_MODE_MASK, - .pstate_val = COMPAT_PSR_MODE_USR, + .pstate_mask = PSR_AA32_MODE_MASK, + .pstate_val = PSR_AA32_MODE_USR, .fn = swp_handler }, { } @@ -512,16 +501,16 @@ ret: pr_warn_ratelimited("\"%s\" (%ld) uses deprecated CP15 Barrier instruction at 0x%llx\n", current->comm, (unsigned long)current->pid, regs->pc); - regs->pc += 4; + arm64_skip_faulting_instruction(regs, 4); return 0; } static int cp15_barrier_set_hw_mode(bool enable) { if (enable) - config_sctlr_el1(0, SCTLR_EL1_CP15BEN); + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_CP15BEN); else - config_sctlr_el1(SCTLR_EL1_CP15BEN, 0); + sysreg_clear_set(sctlr_el1, SCTLR_EL1_CP15BEN, 0); return 0; } @@ -529,15 +518,15 @@ static struct undef_hook cp15_barrier_hooks[] = { { .instr_mask = 0x0fff0fdf, .instr_val = 0x0e070f9a, - .pstate_mask = COMPAT_PSR_MODE_MASK, - .pstate_val = COMPAT_PSR_MODE_USR, + .pstate_mask = PSR_AA32_MODE_MASK, + .pstate_val = PSR_AA32_MODE_USR, .fn = cp15barrier_handler, }, { .instr_mask = 0x0fff0fff, .instr_val = 0x0e070f95, - .pstate_mask = COMPAT_PSR_MODE_MASK, - .pstate_val = COMPAT_PSR_MODE_USR, + .pstate_mask = PSR_AA32_MODE_MASK, + .pstate_val = PSR_AA32_MODE_USR, .fn = cp15barrier_handler, }, { } @@ -556,9 +545,9 @@ static int setend_set_hw_mode(bool enable) return -EINVAL; if (enable) - config_sctlr_el1(SCTLR_EL1_SED, 0); + sysreg_clear_set(sctlr_el1, SCTLR_EL1_SED, 0); else - config_sctlr_el1(0, SCTLR_EL1_SED); + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_SED); return 0; } @@ -570,10 +559,10 @@ static int compat_setend_handler(struct pt_regs *regs, u32 big_endian) if (big_endian) { insn = "setend be"; - regs->pstate |= COMPAT_PSR_E_BIT; + regs->pstate |= PSR_AA32_E_BIT; } else { insn = "setend le"; - 
regs->pstate &= ~COMPAT_PSR_E_BIT; + regs->pstate &= ~PSR_AA32_E_BIT; } trace_instruction_emulation(insn, regs->pc); @@ -586,14 +575,14 @@ static int compat_setend_handler(struct pt_regs *regs, u32 big_endian) static int a32_setend_handler(struct pt_regs *regs, u32 instr) { int rc = compat_setend_handler(regs, (instr >> 9) & 1); - regs->pc += 4; + arm64_skip_faulting_instruction(regs, 4); return rc; } static int t16_setend_handler(struct pt_regs *regs, u32 instr) { int rc = compat_setend_handler(regs, (instr >> 3) & 1); - regs->pc += 2; + arm64_skip_faulting_instruction(regs, 2); return rc; } @@ -601,16 +590,16 @@ static struct undef_hook setend_hooks[] = { { .instr_mask = 0xfffffdff, .instr_val = 0xf1010000, - .pstate_mask = COMPAT_PSR_MODE_MASK, - .pstate_val = COMPAT_PSR_MODE_USR, + .pstate_mask = PSR_AA32_MODE_MASK, + .pstate_val = PSR_AA32_MODE_USR, .fn = a32_setend_handler, }, { /* Thumb mode */ - .instr_mask = 0x0000fff7, + .instr_mask = 0xfffffff7, .instr_val = 0x0000b650, - .pstate_mask = (COMPAT_PSR_T_BIT | COMPAT_PSR_MODE_MASK), - .pstate_val = (COMPAT_PSR_T_BIT | COMPAT_PSR_MODE_USR), + .pstate_mask = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK), + .pstate_val = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR), .fn = t16_setend_handler, }, {} @@ -624,7 +613,8 @@ static struct insn_emulation_ops setend_ops = { }; /* - * Invoked as late_initcall, since not needed before init spawned. + * Invoked as core_initcall, which guarantees that the instruction + * emulation is ready for userspace. */ static int __init armv8_deprecated_init(void) { @@ -635,7 +625,7 @@ static int __init armv8_deprecated_init(void) register_insn_emulation(&cp15_barrier_ops); if (IS_ENABLED(CONFIG_SETEND_EMULATION)) { - if(system_supports_mixed_endian_el0()) + if (system_supports_mixed_endian_el0()) register_insn_emulation(&setend_ops); else pr_info("setend instruction emulation is not supported on this system\n"); @@ -644,7 +634,7 @@ static int __init armv8_deprecated_init(void) cpuhp_setup_state_nocalls(CPUHP_AP_ARM64_ISNDEP_STARTING, "arm64/isndep:starting", run_all_insn_set_hw_mode, NULL); - register_insn_emulation_sysctl(ctl_abi); + register_insn_emulation_sysctl(); return 0; } diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 71bf088f1e4b..1197e7679882 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -1,34 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/asm-offsets.c * * Copyright (C) 1995-2003 Russell King * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
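The SWP, CP15 barrier and SETEND emulations above are all keyed off undef_hook tables: a trap is routed to a handler when the faulting instruction and the saved PSTATE both match under their respective masks. A self-contained sketch of that dispatch, using the A32 SETEND pattern from setend_hooks[] and the PSR_AA32 user-mode values; the table and handler here are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

struct undef_hook {
	uint32_t instr_mask;
	uint32_t instr_val;
	uint64_t pstate_mask;
	uint64_t pstate_val;
	int (*fn)(uint32_t instr);
};

static int setend_handler(uint32_t instr)
{
	/* Bit 9 of the A32 SETEND encoding selects the endianness. */
	printf("emulating SETEND %s\n", (instr >> 9) & 1 ? "BE" : "LE");
	return 0;
}

#define PSR_AA32_MODE_MASK	0x0000001f	/* AArch32 M[4:0] */
#define PSR_AA32_MODE_USR	0x00000010

static const struct undef_hook hooks[] = {
	{ 0xfffffdff, 0xf1010000, PSR_AA32_MODE_MASK, PSR_AA32_MODE_USR, setend_handler },
};

static int call_undef_hook(uint32_t instr, uint64_t pstate)
{
	for (unsigned int i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++) {
		const struct undef_hook *h = &hooks[i];

		if ((instr & h->instr_mask) == h->instr_val &&
		    (pstate & h->pstate_mask) == h->pstate_val)
			return h->fn(instr);
	}
	return -1;	/* no emulation registered: deliver SIGILL */
}

int main(void)
{
	/* A32 "setend be" is 0xf1010200 (the E bit set in the instruction). */
	return call_undef_hook(0xf1010200, PSR_AA32_MODE_USR);
}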
*/ +#include <linux/arm_sdei.h> #include <linux/sched.h> +#include <linux/kexec.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/kvm_host.h> +#include <linux/preempt.h> #include <linux/suspend.h> #include <asm/cpufeature.h> +#include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> +#include <asm/signal32.h> #include <asm/smp_plat.h> #include <asm/suspend.h> -#include <asm/vdso_datapage.h> #include <linux/kbuild.h> #include <linux/arm-smccc.h> @@ -36,24 +29,37 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); + DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif +#ifdef CONFIG_SHADOW_CALL_STACK + DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); +#endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); +#ifdef CONFIG_STACKPROTECTOR + DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); +#endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); + DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user)); +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user)); +#endif +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel)); +#endif +#ifdef CONFIG_ARM64_MTE + DEFINE(THREAD_MTE_CTRL, offsetof(struct task_struct, thread.mte_ctrl)); +#endif BLANK(); DEFINE(S_X0, offsetof(struct pt_regs, regs[0])); - DEFINE(S_X1, offsetof(struct pt_regs, regs[1])); DEFINE(S_X2, offsetof(struct pt_regs, regs[2])); - DEFINE(S_X3, offsetof(struct pt_regs, regs[3])); DEFINE(S_X4, offsetof(struct pt_regs, regs[4])); - DEFINE(S_X5, offsetof(struct pt_regs, regs[5])); DEFINE(S_X6, offsetof(struct pt_regs, regs[6])); - DEFINE(S_X7, offsetof(struct pt_regs, regs[7])); DEFINE(S_X8, offsetof(struct pt_regs, regs[8])); DEFINE(S_X10, offsetof(struct pt_regs, regs[10])); DEFINE(S_X12, offsetof(struct pt_regs, regs[12])); @@ -65,19 +71,22 @@ int main(void) DEFINE(S_X24, offsetof(struct pt_regs, regs[24])); DEFINE(S_X26, offsetof(struct pt_regs, regs[26])); DEFINE(S_X28, offsetof(struct pt_regs, regs[28])); + DEFINE(S_FP, offsetof(struct pt_regs, regs[29])); DEFINE(S_LR, offsetof(struct pt_regs, regs[30])); DEFINE(S_SP, offsetof(struct pt_regs, sp)); -#ifdef CONFIG_COMPAT - DEFINE(S_COMPAT_SP, offsetof(struct pt_regs, compat_sp)); -#endif DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); - DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); - DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); + DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); + BLANK(); +#ifdef CONFIG_COMPAT + DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, 
uc.uc_mcontext.arm_r0)); + DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); BLANK(); +#endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); @@ -87,57 +96,42 @@ int main(void) BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE); BLANK(); - DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); BLANK(); - DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); - DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); - DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); - DEFINE(CLOCK_COARSE_RES, LOW_RES_NSEC); - DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); - BLANK(); - DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); - DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); - DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec)); - DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); - DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec)); - DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); - DEFINE(VDSO_XTIME_CRS_NSEC, offsetof(struct vdso_data, xtime_coarse_nsec)); - DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); - DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec)); - DEFINE(VDSO_TB_SEQ_COUNT, offsetof(struct vdso_data, tb_seq_count)); - DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); - DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult)); - DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); - DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); - DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); - DEFINE(VDSO_USE_SYSCALL, offsetof(struct vdso_data, use_syscall)); - BLANK(); - DEFINE(TVAL_TV_SEC, offsetof(struct timeval, tv_sec)); - DEFINE(TVAL_TV_USEC, offsetof(struct timeval, tv_usec)); - DEFINE(TSPEC_TV_SEC, offsetof(struct timespec, tv_sec)); - DEFINE(TSPEC_TV_NSEC, offsetof(struct timespec, tv_nsec)); - BLANK(); - DEFINE(TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); - DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); - BLANK(); - DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); + DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT); + DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending)); + BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); -#ifdef CONFIG_KVM_ARM_HOST + DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); + DEFINE(FTR_OVR_MASK_OFFSET, offsetof(struct arm64_ftr_override, mask)); + BLANK(); +#ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); - DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); - DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); - DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs)); - DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2])); - DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); + DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); + DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); + DEFINE(CPU_USER_PT_REGS, 
offsetof(struct kvm_cpu_context, regs)); + DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); + DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1])); + DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); + DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1])); + DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1])); + DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1])); + DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); + DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); + DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2)); + DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2)); + DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); + DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); + DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); + DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); + DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); + DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); #endif #ifdef CONFIG_CPU_PM - DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx)); DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask)); DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff)); @@ -148,11 +142,44 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - + DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS, offsetof(struct arm_smccc_1_2_regs, a0)); + DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS, offsetof(struct arm_smccc_1_2_regs, a2)); + DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS, offsetof(struct arm_smccc_1_2_regs, a4)); + DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS, offsetof(struct arm_smccc_1_2_regs, a6)); + DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS, offsetof(struct arm_smccc_1_2_regs, a8)); + DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS, offsetof(struct arm_smccc_1_2_regs, a10)); + DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS, offsetof(struct arm_smccc_1_2_regs, a12)); + DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS, offsetof(struct arm_smccc_1_2_regs, a14)); + DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS, offsetof(struct arm_smccc_1_2_regs, a16)); BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif +#ifdef CONFIG_ARM_SDE_INTERFACE + DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs)); + DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority)); +#endif +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia)); +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia)); +#endif + BLANK(); +#endif +#ifdef CONFIG_KEXEC_CORE + DEFINE(KIMAGE_ARCH_DTB_MEM, offsetof(struct kimage, arch.dtb_mem)); + 
DEFINE(KIMAGE_ARCH_EL2_VECTORS, offsetof(struct kimage, arch.el2_vectors)); + DEFINE(KIMAGE_ARCH_ZERO_PAGE, offsetof(struct kimage, arch.zero_page)); + DEFINE(KIMAGE_ARCH_PHYS_OFFSET, offsetof(struct kimage, arch.phys_offset)); + DEFINE(KIMAGE_ARCH_TTBR1, offsetof(struct kimage, arch.ttbr1)); + DEFINE(KIMAGE_HEAD, offsetof(struct kimage, head)); + DEFINE(KIMAGE_START, offsetof(struct kimage, start)); + BLANK(); +#endif return 0; } diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 380f2e2fbed5..97c42be71338 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -1,22 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 cacheinfo support * * Copyright (C) 2015 ARM Ltd. * All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/acpi.h> #include <linux/cacheinfo.h> #include <linux/of.h> @@ -27,6 +17,15 @@ #define CLIDR_CTYPE(clidr, level) \ (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) +int cache_line_size(void) +{ + if (coherency_max_size != 0) + return coherency_max_size; + + return cache_line_size_of_cpu(); +} +EXPORT_SYMBOL_GPL(cache_line_size); + static inline enum cache_type get_cache_type(int level) { u64 clidr; @@ -44,9 +43,10 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { - unsigned int ctype, level, leaves, of_level; + unsigned int ctype, level, leaves; + int fw_level; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); for (level = 1, leaves = 0; level <= MAX_CACHE_LEVEL; level++) { @@ -59,15 +59,22 @@ static int __init_cache_level(unsigned int cpu) leaves += (ctype == CACHE_TYPE_SEPARATE) ? 
2 : 1; } - of_level = of_find_last_cache_level(cpu); - if (level < of_level) { + if (acpi_disabled) + fw_level = of_find_last_cache_level(cpu); + else + fw_level = acpi_find_last_cache_level(cpu); + + if (fw_level < 0) + return fw_level; + + if (level < fw_level) { /* * some external caches not specified in CLIDR_EL1 * the information may be available in the device tree * only unified external caches are considered here */ - leaves += (of_level - level); - level = of_level; + leaves += (fw_level - level); + level = fw_level; } this_cpu_ci->num_levels = level; @@ -75,7 +82,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; @@ -94,6 +101,3 @@ static int __populate_cache_leaves(unsigned int cpu) } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c new file mode 100644 index 000000000000..5edec2f49ec9 --- /dev/null +++ b/arch/arm64/kernel/compat_alignment.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-only +// based on arch/arm/mm/alignment.c + +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/perf_event.h> +#include <linux/uaccess.h> + +#include <asm/exception.h> +#include <asm/ptrace.h> +#include <asm/traps.h> + +/* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 + * + * Speed optimisations and better fault handling by Russell King. + */ +#define CODING_BITS(i) (i & 0x0e000000) + +#define LDST_P_BIT(i) (i & (1 << 24)) /* Preindex */ +#define LDST_U_BIT(i) (i & (1 << 23)) /* Add offset */ +#define LDST_W_BIT(i) (i & (1 << 21)) /* Writeback */ +#define LDST_L_BIT(i) (i & (1 << 20)) /* Load */ + +#define LDST_P_EQ_U(i) ((((i) ^ ((i) >> 1)) & (1 << 23)) == 0) + +#define LDSTHD_I_BIT(i) (i & (1 << 22)) /* double/half-word immed */ + +#define RN_BITS(i) ((i >> 16) & 15) /* Rn */ +#define RD_BITS(i) ((i >> 12) & 15) /* Rd */ +#define RM_BITS(i) (i & 15) /* Rm */ + +#define REGMASK_BITS(i) (i & 0xffff) + +#define BAD_INSTR 0xdeadc0de + +/* Thumb-2 32 bit format per ARMv7 DDI0406A A6.3, either f800h,e800h,f800h */ +#define IS_T32(hi16) \ + (((hi16) & 0xe000) == 0xe000 && ((hi16) & 0x1800)) + +union offset_union { + unsigned long un; + signed long sn; +}; + +#define TYPE_ERROR 0 +#define TYPE_FAULT 1 +#define TYPE_LDST 2 +#define TYPE_DONE 3 + +static void +do_alignment_finish_ldst(unsigned long addr, u32 instr, struct pt_regs *regs, + union offset_union offset) +{ + if (!LDST_U_BIT(instr)) + offset.un = -offset.un; + + if (!LDST_P_BIT(instr)) + addr += offset.un; + + if (!LDST_P_BIT(instr) || LDST_W_BIT(instr)) + regs->regs[RN_BITS(instr)] = addr; +} + +static int +do_alignment_ldrdstrd(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd = RD_BITS(instr); + unsigned int rd2; + int load; + + if ((instr & 0xfe000000) == 0xe8000000) { + /* ARMv7 Thumb-2 32-bit LDRD/STRD */ + rd2 = (instr >> 8) & 0xf; + load = !!(LDST_L_BIT(instr)); + } else if (((rd & 1) == 1) || (rd == 14)) { + return TYPE_ERROR; + } else { + load = ((instr & 0xf0) == 0xd0); + rd2 = rd + 1; + } + + if (load) { + unsigned int val, val2; + + if (get_user(val, (u32 __user *)addr) || + get_user(val2, (u32 __user *)(addr + 4))) + return TYPE_FAULT; + regs->regs[rd] = val; + regs->regs[rd2] = val2; + } 
else { + if (put_user(regs->regs[rd], (u32 __user *)addr) || + put_user(regs->regs[rd2], (u32 __user *)(addr + 4))) + return TYPE_FAULT; + } + return TYPE_LDST; +} + +/* + * LDM/STM alignment handler. + * + * There are 4 variants of this instruction: + * + * B = rn pointer before instruction, A = rn pointer after instruction + * ------ increasing address -----> + * | | r0 | r1 | ... | rx | | + * PU = 01 B A + * PU = 11 B A + * PU = 00 A B + * PU = 10 A B + */ +static int +do_alignment_ldmstm(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd, rn, nr_regs, regbits; + unsigned long eaddr, newaddr; + unsigned int val; + + /* count the number of registers in the mask to be transferred */ + nr_regs = hweight16(REGMASK_BITS(instr)) * 4; + + rn = RN_BITS(instr); + newaddr = eaddr = regs->regs[rn]; + + if (!LDST_U_BIT(instr)) + nr_regs = -nr_regs; + newaddr += nr_regs; + if (!LDST_U_BIT(instr)) + eaddr = newaddr; + + if (LDST_P_EQ_U(instr)) /* U = P */ + eaddr += 4; + + for (regbits = REGMASK_BITS(instr), rd = 0; regbits; + regbits >>= 1, rd += 1) + if (regbits & 1) { + if (LDST_L_BIT(instr)) { + if (get_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + if (rd < 15) + regs->regs[rd] = val; + else + regs->pc = val; + } else { + /* + * The PC register has a bias of +8 in ARM mode + * and +4 in Thumb mode. This means that a read + * of the value of PC should account for this. + * Since Thumb does not permit STM instructions + * to refer to PC, just add 8 here. + */ + val = (rd < 15) ? regs->regs[rd] : regs->pc + 8; + if (put_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + } + eaddr += 4; + } + + if (LDST_W_BIT(instr)) + regs->regs[rn] = newaddr; + + return TYPE_DONE; +} + +/* + * Convert Thumb multi-word load/store instruction forms to equivalent ARM + * instructions so we can reuse ARM userland alignment fault fixups for Thumb. + * + * This implementation was initially based on the algorithm found in + * gdb/sim/arm/thumbemu.c. It is basically just a code reduction of same + * to convert only Thumb ld/st instruction forms to equivalent ARM forms. + * + * NOTES: + * 1. Comments below refer to ARM ARM DDI0100E Thumb Instruction sections. + * 2. If for some reason we're passed an non-ld/st Thumb instruction to + * decode, we return 0xdeadc0de. This should never happen under normal + * circumstances but if it does, we've got other problems to deal with + * elsewhere and we obviously can't fix those problems here. + */ + +static unsigned long thumb2arm(u16 tinstr) +{ + u32 L = (tinstr & (1<<11)) >> 11; + + switch ((tinstr & 0xf800) >> 11) { + /* 6.6.1 Format 1: */ + case 0xc000 >> 11: /* 7.1.51 STMIA */ + case 0xc800 >> 11: /* 7.1.25 LDMIA */ + { + u32 Rn = (tinstr & (7<<8)) >> 8; + u32 W = ((L<<Rn) & (tinstr&255)) ? 
0 : 1<<21; + + return 0xe8800000 | W | (L<<20) | (Rn<<16) | + (tinstr&255); + } + + /* 6.6.1 Format 2: */ + case 0xb000 >> 11: /* 7.1.48 PUSH */ + case 0xb800 >> 11: /* 7.1.47 POP */ + if ((tinstr & (3 << 9)) == 0x0400) { + static const u32 subset[4] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe92d4000, /* STMDB sp!,{registers,lr} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + 0xe8bd8000 /* LDMIA sp!,{registers,pc} */ + }; + return subset[(L<<1) | ((tinstr & (1<<8)) >> 8)] | + (tinstr & 255); /* register_list */ + } + fallthrough; /* for illegal instruction case */ + + default: + return BAD_INSTR; + } +} + +/* + * Convert Thumb-2 32 bit LDM, STM, LDRD, STRD to equivalent instruction + * handlable by ARM alignment handler, also find the corresponding handler, + * so that we can reuse ARM userland alignment fault fixups for Thumb. + * + * @pinstr: original Thumb-2 instruction; returns new handlable instruction + * @regs: register context. + * @poffset: return offset from faulted addr for later writeback + * + * NOTES: + * 1. Comments below refer to ARMv7 DDI0406A Thumb Instruction sections. + * 2. Register name Rt from ARMv7 is same as Rd from ARMv6 (Rd is Rt) + */ +static void * +do_alignment_t32_to_handler(u32 *pinstr, struct pt_regs *regs, + union offset_union *poffset) +{ + u32 instr = *pinstr; + u16 tinst1 = (instr >> 16) & 0xffff; + u16 tinst2 = instr & 0xffff; + + switch (tinst1 & 0xffe0) { + /* A6.3.5 Load/Store multiple */ + case 0xe880: /* STM/STMIA/STMEA,LDM/LDMIA, PUSH/POP T2 */ + case 0xe8a0: /* ...above writeback version */ + case 0xe900: /* STMDB/STMFD, LDMDB/LDMEA */ + case 0xe920: /* ...above writeback version */ + /* no need offset decision since handler calculates it */ + return do_alignment_ldmstm; + + case 0xf840: /* POP/PUSH T3 (single register) */ + if (RN_BITS(instr) == 13 && (tinst2 & 0x09ff) == 0x0904) { + u32 L = !!(LDST_L_BIT(instr)); + const u32 subset[2] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + }; + *pinstr = subset[L] | (1<<RD_BITS(instr)); + return do_alignment_ldmstm; + } + /* Else fall through for illegal instruction case */ + break; + + /* A6.3.6 Load/store double, STRD/LDRD(immed, lit, reg) */ + case 0xe860: + case 0xe960: + case 0xe8e0: + case 0xe9e0: + poffset->un = (tinst2 & 0xff) << 2; + fallthrough; + + case 0xe940: + case 0xe9c0: + return do_alignment_ldrdstrd; + + /* + * No need to handle load/store instructions up to word size + * since ARMv6 and later CPUs can perform unaligned accesses. 
+ */ + default: + break; + } + return NULL; +} + +static int alignment_get_arm(struct pt_regs *regs, __le32 __user *ip, u32 *inst) +{ + __le32 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le32_to_cpu(instr); + return 0; +} + +static int alignment_get_thumb(struct pt_regs *regs, __le16 __user *ip, u16 *inst) +{ + __le16 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le16_to_cpu(instr); + return 0; +} + +int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) +{ + union offset_union offset; + unsigned long instrptr; + int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs); + unsigned int type; + u32 instr = 0; + u16 tinstr = 0; + int isize = 4; + int thumb2_32b = 0; + int fault; + + instrptr = instruction_pointer(regs); + + if (compat_thumb_mode(regs)) { + __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); + + fault = alignment_get_thumb(regs, ptr, &tinstr); + if (!fault) { + if (IS_T32(tinstr)) { + /* Thumb-2 32-bit */ + u16 tinst2; + fault = alignment_get_thumb(regs, ptr + 1, &tinst2); + instr = ((u32)tinstr << 16) | tinst2; + thumb2_32b = 1; + } else { + isize = 2; + instr = thumb2arm(tinstr); + } + } + } else { + fault = alignment_get_arm(regs, (__le32 __user *)instrptr, &instr); + } + + if (fault) + return 1; + + switch (CODING_BITS(instr)) { + case 0x00000000: /* 3.13.4 load/store instruction extensions */ + if (LDSTHD_I_BIT(instr)) + offset.un = (instr & 0xf00) >> 4 | (instr & 15); + else + offset.un = regs->regs[RM_BITS(instr)]; + + if ((instr & 0x001000f0) == 0x000000d0 || /* LDRD */ + (instr & 0x001000f0) == 0x000000f0) /* STRD */ + handler = do_alignment_ldrdstrd; + else + return 1; + break; + + case 0x08000000: /* ldm or stm, or thumb-2 32bit instruction */ + if (thumb2_32b) { + offset.un = 0; + handler = do_alignment_t32_to_handler(&instr, regs, &offset); + } else { + offset.un = 0; + handler = do_alignment_ldmstm; + } + break; + + default: + return 1; + } + + type = handler(addr, instr, regs); + + if (type == TYPE_ERROR || type == TYPE_FAULT) + return 1; + + if (type == TYPE_LDST) + do_alignment_finish_ldst(addr, instr, regs, offset); + + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->pc); + arm64_skip_faulting_instruction(regs, isize); + + return 0; +} diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 65f42d257414..6b752fe89745 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -1,42 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * CPU reset routines * * Copyright (C) 2001 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> #include <asm/sysreg.h> #include <asm/virt.h> .text -.pushsection .idmap.text, "ax" +.pushsection .idmap.text, "awx" /* - * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for - * cpu_soft_restart. + * cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) * - * @el2_switch: Flag to indicate a swich to EL2 is needed. + * @el2_switch: Flag to indicate a switch to EL2 is needed. * @entry: Location to jump to for soft reset. - * arg0: First argument passed to @entry. 
- * arg1: Second argument passed to @entry. - * arg2: Third argument passed to @entry. + * arg0: First argument passed to @entry. (relocation list) + * arg1: Second argument passed to @entry.(physical kernel entry) + * arg2: Third argument passed to @entry. (physical dtb address) * * Put the CPU into the same state as it would be if it had been reset, and * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -ENTRY(__cpu_soft_restart) - /* Clear sctlr_el1 flags. */ - mrs x12, sctlr_el1 - ldr x13, =SCTLR_ELx_FLAGS - bic x12, x12, x13 +SYM_TYPED_FUNC_START(cpu_soft_restart) + mov_q x12, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround + /* + * either disable EL1&0 translation regime or disable EL2&0 translation + * regime if HCR_EL2.E2H == 1 + */ msr sctlr_el1, x12 isb @@ -44,11 +43,11 @@ ENTRY(__cpu_soft_restart) mov x0, #HVC_SOFT_RESTART hvc #0 // no return -1: mov x18, x1 // entry +1: mov x8, x1 // entry mov x0, x2 // arg0 mov x1, x3 // arg1 mov x2, x4 // arg2 - br x18 -ENDPROC(__cpu_soft_restart) + br x8 +SYM_FUNC_END(cpu_soft_restart) .popsection diff --git a/arch/arm64/kernel/cpu-reset.h b/arch/arm64/kernel/cpu-reset.h deleted file mode 100644 index 6c2b1b4f57c9..000000000000 --- a/arch/arm64/kernel/cpu-reset.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * CPU reset routines - * - * Copyright (C) 2015 Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _ARM64_CPU_RESET_H -#define _ARM64_CPU_RESET_H - -#include <asm/virt.h> - -void __cpu_soft_restart(unsigned long el2_switch, unsigned long entry, - unsigned long arg0, unsigned long arg1, unsigned long arg2); - -static inline void __noreturn cpu_soft_restart(unsigned long el2_switch, - unsigned long entry, unsigned long arg0, unsigned long arg1, - unsigned long arg2) -{ - typeof(__cpu_soft_restart) *restart; - - el2_switch = el2_switch && !is_kernel_in_hyp_mode() && - is_hyp_mode_available(); - restart = (void *)__pa_symbol(__cpu_soft_restart); - - cpu_install_idmap(); - restart(el2_switch, entry, arg0, arg1, arg2); - unreachable(); -} - -#endif diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 0e27f86ee709..89ac00084f38 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -1,84 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Contains CPU specific errata definitions * * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <linux/arm-smccc.h> #include <linux/types.h> +#include <linux/cpu.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpufeature.h> +#include <asm/kvm_asm.h> +#include <asm/smp_plat.h> static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) { + const struct arm64_midr_revidr *fix; + u32 midr = read_cpuid_id(), revidr; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + if (!is_midr_in_range(midr, &entry->midr_range)) + return false; + + midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; + revidr = read_cpuid(REVIDR_EL1); + for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) + if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) + return false; + + return true; +} + +static bool __maybe_unused +is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); +} + +static bool __maybe_unused +is_kryo_midr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u32 model; + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, - entry->midr_range_min, - entry->midr_range_max); + + model = read_cpuid_id(); + model &= MIDR_IMPLEMENTOR_MASK | (0xf00 << MIDR_PARTNUM_SHIFT) | + MIDR_ARCHITECTURE_MASK; + + return model == entry->midr_range.model; } static bool -has_mismatched_cache_line_size(const struct arm64_cpu_capabilities *entry, - int scope) +has_mismatched_cache_type(const struct arm64_cpu_capabilities *entry, + int scope) { + u64 mask = arm64_ftr_reg_ctrel0.strict_mask; + u64 sys = arm64_ftr_reg_ctrel0.sys_val & mask; + u64 ctr_raw, ctr_real; + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return (read_cpuid_cachetype() & arm64_ftr_reg_ctrel0.strict_mask) != - (arm64_ftr_reg_ctrel0.sys_val & arm64_ftr_reg_ctrel0.strict_mask); + + /* + * We want to make sure that all the CPUs in the system expose + * a consistent CTR_EL0 to make sure that applications behaves + * correctly with migration. + * + * If a CPU has CTR_EL0.IDC but does not advertise it via CTR_EL0 : + * + * 1) It is safe if the system doesn't support IDC, as CPU anyway + * reports IDC = 0, consistent with the rest. + * + * 2) If the system has IDC, it is still safe as we trap CTR_EL0 + * access on this CPU via the ARM64_HAS_CACHE_IDC capability. + * + * So, we need to make sure either the raw CTR_EL0 or the effective + * CTR_EL0 matches the system's copy to allow a secondary CPU to boot. + */ + ctr_raw = read_cpuid_cachetype() & mask; + ctr_real = read_cpuid_effective_cachetype() & mask; + + return (ctr_real != sys) && (ctr_raw != sys); } -static int cpu_enable_trap_ctr_access(void *__unused) +static void +cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) { - /* Clear SCTLR_EL1.UCT */ - config_sctlr_el1(SCTLR_EL1_UCT, 0); - return 0; + u64 mask = arm64_ftr_reg_ctrel0.strict_mask; + bool enable_uct_trap = false; + + /* Trap CTR_EL0 access on this CPU, only if it has a mismatch */ + if ((read_cpuid_cachetype() & mask) != + (arm64_ftr_reg_ctrel0.sys_val & mask)) + enable_uct_trap = true; + + /* ... 
or if the system is affected by an erratum */ + if (cap->capability == ARM64_WORKAROUND_1542419) + enable_uct_trap = true; + + if (enable_uct_trap) + sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -#define MIDR_RANGE(model, min, max) \ - .def_scope = SCOPE_LOCAL_CPU, \ - .matches = is_affected_midr_range, \ - .midr_model = model, \ - .midr_range_min = min, \ - .midr_range_max = max +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static bool +has_cortex_a76_erratum_1463225(const struct arm64_cpu_capabilities *entry, + int scope) +{ + return is_affected_midr_range_list(entry, scope) && is_kernel_in_hyp_mode(); +} +#endif -#define MIDR_ALL_VERSIONS(model) \ - .def_scope = SCOPE_LOCAL_CPU, \ - .matches = is_affected_midr_range, \ - .midr_model = model, \ - .midr_range_min = 0, \ - .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK) +static void __maybe_unused +cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0); +} -const struct arm64_cpu_capabilities arm64_errata[] = { +static DEFINE_RAW_SPINLOCK(reg_user_mask_modification); +static void __maybe_unused +cpu_clear_bf16_from_user_emulation(const struct arm64_cpu_capabilities *__unused) +{ + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); + if (!regp) + return; + + raw_spin_lock(®_user_mask_modification); + if (regp->user_mask & ID_AA64ISAR1_EL1_BF16_MASK) + regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; + raw_spin_unlock(®_user_mask_modification); +} + +#define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max) \ + .matches = is_affected_midr_range, \ + .midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max) + +#define CAP_MIDR_ALL_VERSIONS(model) \ + .matches = is_affected_midr_range, \ + .midr_range = MIDR_ALL_VERSIONS(model) + +#define MIDR_FIXED(rev, revidr_mask) \ + .fixed_revs = (struct arm64_midr_revidr[]){{ (rev), (revidr_mask) }, {}} + +#define ERRATA_MIDR_RANGE(model, v_min, r_min, v_max, r_max) \ + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ + CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max) + +#define CAP_MIDR_RANGE_LIST(list) \ + .matches = is_affected_midr_range_list, \ + .midr_range_list = list + +/* Errata affecting a range of revisions of given model variant */ +#define ERRATA_MIDR_REV_RANGE(m, var, r_min, r_max) \ + ERRATA_MIDR_RANGE(m, var, r_min, var, r_max) + +/* Errata affecting a single variant/revision of a model */ +#define ERRATA_MIDR_REV(model, var, rev) \ + ERRATA_MIDR_RANGE(model, var, rev, var, rev) + +/* Errata affecting all variants/revisions of a given a model */ +#define ERRATA_MIDR_ALL_VERSIONS(model) \ + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ + CAP_MIDR_ALL_VERSIONS(model) + +/* Errata affecting a list of midr ranges, with same work around */ +#define ERRATA_MIDR_RANGE_LIST(midr_list) \ + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ + CAP_MIDR_RANGE_LIST(midr_list) + +static const __maybe_unused struct midr_range tx2_family_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + {}, +}; + +static bool __maybe_unused +needs_tx2_tvm_workaround(const struct arm64_cpu_capabilities *entry, + int scope) +{ + int i; + + if (!is_affected_midr_range_list(entry, scope) || + !is_hyp_mode_available()) + return false; + + for_each_possible_cpu(i) { + if (MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0) != 0) + return true; + } + + return false; +} + +static bool __maybe_unused +has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, + int scope) +{ + u32 midr = 
read_cpuid_id(); + bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); + const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range(midr, &range) && has_dic; +} + +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI +static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 + { + ERRATA_MIDR_REV(MIDR_QCOM_FALKOR_V1, 0, 0) + }, + { + .midr_range.model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1286807 + { + ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + }, + { + /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ + ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441007 + { + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441009 + { + /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + }, +#endif + {}, +}; +#endif + +#ifdef CONFIG_CAVIUM_ERRATUM_23154 +static const struct midr_range cavium_erratum_23154_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_THUNDERX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_81XX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_83XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_98XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_96XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXN), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXMM), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXO), + {}, +}; +#endif + +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +const struct midr_range cavium_erratum_27456_cpus[] = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, +}; +#endif + +#ifdef CONFIG_CAVIUM_ERRATUM_30115 +static const struct midr_range cavium_erratum_30115_cpus[] = { + /* Cavium ThunderX, T88 pass 1.x - 2.2 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 2), + /* Cavium ThunderX, T81 pass 1.0 - 1.2 */ + MIDR_REV_RANGE(MIDR_THUNDERX_81XX, 0, 0, 2), + /* Cavium ThunderX, T83 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_83XX, 0, 0), + {}, +}; +#endif + +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 +static const struct arm64_cpu_capabilities qcom_erratum_1003_list[] = { + { + ERRATA_MIDR_REV(MIDR_QCOM_FALKOR_V1, 0, 0), + }, + { + .midr_range.model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE +static const struct midr_range workaround_clean_cache[] = { #if defined(CONFIG_ARM64_ERRATUM_826319) || \ defined(CONFIG_ARM64_ERRATUM_827319) || \ defined(CONFIG_ARM64_ERRATUM_824069) + /* Cortex-A53 r0p[012]: ARM errata 826319, 827319, 824069 */ + MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_819472 + /* Cortex-A53 r0p[01] : ARM errata 819472 */ + MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 1), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_1418040 +/* + * - 1188873 affects r0p0 to r2p0 + * - 1418040 affects r0p0 to r3p1 + */ +static const struct midr_range erratum_1418040_list[] = { + /* Cortex-A76 r0p0 to r3p1 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), + /* Neoverse-N1 r0p0 to r3p1 */ + MIDR_RANGE(MIDR_NEOVERSE_N1, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_845719 +static const struct midr_range erratum_845719_list[] = { + /* Cortex-A53 r0p[01234] */ + 
MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), + /* Brahma-B53 r0p[0] */ + MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + /* Kryo2XX Silver rAp4 */ + MIDR_REV(MIDR_QCOM_KRYO_2XX_SILVER, 0xa, 0x4), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_843419 +static const struct arm64_cpu_capabilities erratum_843419_list[] = { { - /* Cortex-A53 r0p[012] */ - .desc = "ARM errata 826319, 827319, 824069", - .capability = ARM64_WORKAROUND_CLEAN_CACHE, - MIDR_RANGE(MIDR_CORTEX_A53, 0x00, 0x02), - .enable = cpu_enable_cache_maint_trap, + /* Cortex-A53 r0p[01234] */ + .matches = is_affected_midr_range, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), + MIDR_FIXED(0x4, BIT(8)), }, + { + /* Brahma-B53 r0p[0] */ + .matches = is_affected_midr_range, + ERRATA_MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + }, + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT +static const struct midr_range erratum_speculative_at_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_1165522 + /* Cortex A76 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1319367 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1530923 + /* Cortex A55 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static const struct midr_range erratum_1463225[] = { + /* Cortex-A76 r0p0 - r3p1 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE +static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2139208 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2119858 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE +static const struct midr_range tsb_flush_fail_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2067961 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2054223 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE +static struct midr_range trbe_write_out_of_range_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2253138 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), #endif -#ifdef CONFIG_ARM64_ERRATUM_819472 +#ifdef CONFIG_ARM64_ERRATUM_2224489 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +#ifdef CONFIG_ARM64_ERRATUM_1742098 +static struct midr_range broken_aarch32_aes[] = { + MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +const struct arm64_cpu_capabilities arm64_errata[] = { +#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { - /* Cortex-A53 r0p[01] */ - .desc = "ARM errata 819472", + .desc = "ARM errata 826319, 827319, 824069, or 819472", .capability = ARM64_WORKAROUND_CLEAN_CACHE, - MIDR_RANGE(MIDR_CORTEX_A53, 0x00, 0x01), - .enable = cpu_enable_cache_maint_trap, + ERRATA_MIDR_RANGE_LIST(workaround_clean_cache), + .cpu_enable = cpu_enable_cache_maint_trap, }, 
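The MIDR list and range macros above all reduce to the same test: does this CPU's MIDR_EL1 match a given model and fall inside a (variant, revision) window? A small self-contained illustration of that check, using the architectural MIDR_EL1 field layout (Revision [3:0], PartNum [15:4], Architecture [19:16], Variant [23:20], Implementer [31:24]); midr_in_range() here is a simplified stand-in, not the kernel's is_midr_in_range():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* implementer + architecture + partnum identify the model */
#define MIDR_MODEL_MASK	0xff0ffff0u

static uint32_t make_midr(uint32_t impl, uint32_t part, uint32_t var, uint32_t rev)
{
	return (impl << 24) | (var << 20) | (0xf << 16) | (part << 4) | rev;
}

static bool midr_in_range(uint32_t midr, uint32_t model,
			  uint32_t v_min, uint32_t r_min,
			  uint32_t v_max, uint32_t r_max)
{
	/* variant:revision packed so the window compares lexicographically */
	uint32_t vr = (((midr >> 20) & 0xf) << 4) | (midr & 0xf);

	if ((midr & MIDR_MODEL_MASK) != (model & MIDR_MODEL_MASK))
		return false;

	return vr >= ((v_min << 4) | r_min) && vr <= ((v_max << 4) | r_max);
}

int main(void)
{
	/* Cortex-A53: implementer 0x41 (Arm), part number 0xd03 */
	uint32_t model = make_midr(0x41, 0xd03, 0, 0);

	/* ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4) covers r0p0..r0p4 */
	printf("r0p2 affected: %d\n",
	       midr_in_range(make_midr(0x41, 0xd03, 0, 2), model, 0, 0, 0, 4));
	printf("r1p0 affected: %d\n",
	       midr_in_range(make_midr(0x41, 0xd03, 1, 0), model, 0, 0, 0, 4));
	return 0;
}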
#endif #ifdef CONFIG_ARM64_ERRATUM_832075 @@ -86,9 +446,9 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Cortex-A57 r0p0 - r1p2 */ .desc = "ARM erratum 832075", .capability = ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE, - MIDR_RANGE(MIDR_CORTEX_A57, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, 2)), + ERRATA_MIDR_RANGE(MIDR_CORTEX_A57, + 0, 0, + 1, 2), }, #endif #ifdef CONFIG_ARM64_ERRATUM_834220 @@ -96,87 +456,72 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Cortex-A57 r0p0 - r1p2 */ .desc = "ARM erratum 834220", .capability = ARM64_WORKAROUND_834220, - MIDR_RANGE(MIDR_CORTEX_A57, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, 2)), + ERRATA_MIDR_RANGE(MIDR_CORTEX_A57, + 0, 0, + 1, 2), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_843419 + { + .desc = "ARM erratum 843419", + .capability = ARM64_WORKAROUND_843419, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = cpucap_multi_entry_cap_matches, + .match_list = erratum_843419_list, }, #endif #ifdef CONFIG_ARM64_ERRATUM_845719 { - /* Cortex-A53 r0p[01234] */ .desc = "ARM erratum 845719", .capability = ARM64_WORKAROUND_845719, - MIDR_RANGE(MIDR_CORTEX_A53, 0x00, 0x04), + ERRATA_MIDR_RANGE_LIST(erratum_845719_list), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_23154 { - /* Cavium ThunderX, pass 1.x */ - .desc = "Cavium erratum 23154", + .desc = "Cavium errata 23154 and 38545", .capability = ARM64_WORKAROUND_CAVIUM_23154, - MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_RANGE_LIST(cavium_erratum_23154_cpus), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 { - /* Cavium ThunderX, T88 pass 1.x - 2.1 */ - .desc = "Cavium erratum 27456", - .capability = ARM64_WORKAROUND_CAVIUM_27456, - MIDR_RANGE(MIDR_THUNDERX, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, 1)), - }, - { - /* Cavium ThunderX, T81 pass 1.0 */ .desc = "Cavium erratum 27456", .capability = ARM64_WORKAROUND_CAVIUM_27456, - MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), + ERRATA_MIDR_RANGE_LIST(cavium_erratum_27456_cpus), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_30115 { - /* Cavium ThunderX, T88 pass 1.x - 2.2 */ - .desc = "Cavium erratum 30115", - .capability = ARM64_WORKAROUND_CAVIUM_30115, - MIDR_RANGE(MIDR_THUNDERX, 0x00, - (1 << MIDR_VARIANT_SHIFT) | 2), - }, - { - /* Cavium ThunderX, T81 pass 1.0 - 1.2 */ .desc = "Cavium erratum 30115", .capability = ARM64_WORKAROUND_CAVIUM_30115, - MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02), - }, - { - /* Cavium ThunderX, T83 pass 1.0 */ - .desc = "Cavium erratum 30115", - .capability = ARM64_WORKAROUND_CAVIUM_30115, - MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00), + ERRATA_MIDR_RANGE_LIST(cavium_erratum_30115_cpus), }, #endif { - .desc = "Mismatched cache line size", - .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE, - .matches = has_mismatched_cache_line_size, - .def_scope = SCOPE_LOCAL_CPU, - .enable = cpu_enable_trap_ctr_access, + .desc = "Mismatched cache type (CTR_EL0)", + .capability = ARM64_MISMATCHED_CACHE_TYPE, + .matches = has_mismatched_cache_type, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .cpu_enable = cpu_enable_trap_ctr_access, }, #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 { - .desc = "Qualcomm Technologies Falkor erratum 1003", + .desc = "Qualcomm Technologies Falkor/Kryo erratum 1003", .capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003, - MIDR_RANGE(MIDR_QCOM_FALKOR_V1, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(0, 0)), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = cpucap_multi_entry_cap_matches, + .match_list = qcom_erratum_1003_list, }, #endif -#ifdef 
CONFIG_QCOM_FALKOR_ERRATUM_1009 +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI { - .desc = "Qualcomm Technologies Falkor erratum 1009", + .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009", .capability = ARM64_WORKAROUND_REPEAT_TLBI, - MIDR_RANGE(MIDR_QCOM_FALKOR_V1, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(0, 0)), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = cpucap_multi_entry_cap_matches, + .match_list = arm64_repeat_tlbi_list, }, #endif #ifdef CONFIG_ARM64_ERRATUM_858921 @@ -184,39 +529,200 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Cortex-A73 all versions */ .desc = "ARM erratum 858921", .capability = ARM64_WORKAROUND_858921, - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif { - } -}; + .desc = "Spectre-v2", + .capability = ARM64_SPECTRE_V2, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v2, + .cpu_enable = spectre_v2_enable_mitigation, + }, +#ifdef CONFIG_RANDOMIZE_BASE + { + /* Must come after the Spectre-v2 entry */ + .desc = "Spectre-v3a", + .capability = ARM64_SPECTRE_V3A, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v3a, + .cpu_enable = spectre_v3a_enable_mitigation, + }, +#endif + { + .desc = "Spectre-v4", + .capability = ARM64_SPECTRE_V4, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v4, + .cpu_enable = spectre_v4_enable_mitigation, + }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, +#ifdef CONFIG_ARM64_ERRATUM_1418040 + { + .desc = "ARM erratum 1418040", + .capability = ARM64_WORKAROUND_1418040, + ERRATA_MIDR_RANGE_LIST(erratum_1418040_list), + /* + * We need to allow affected CPUs to come in late, but + * also need the non-affected CPUs to be able to come + * in at any point in time. Wonderful. 
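Each arm64_errata[] entry above pairs a detection predicate (.matches, often one of the MIDR helpers) with an optional .cpu_enable hook that applies the workaround on matching CPUs. A conceptual sketch of that table walk, with simplified stand-in types rather than the kernel's cpufeature framework:

#include <stdbool.h>
#include <stdio.h>

struct cap_entry {
	const char *desc;
	bool (*matches)(void);		/* run while bringing a CPU up */
	void (*cpu_enable)(void);	/* applied when matches() is true */
};

static bool affected_cpu(void)     { return true; }
static void apply_workaround(void) { puts("workaround enabled"); }

static const struct cap_entry errata[] = {
	{ "example erratum", affected_cpu, apply_workaround },
	{ NULL, NULL, NULL },		/* empty terminator, like the {} entry above */
};

static void enable_matching(const struct cap_entry *caps)
{
	for (; caps->matches; caps++)
		if (caps->matches() && caps->cpu_enable)
			caps->cpu_enable();
}

int main(void)
{
	enable_matching(errata);
	return 0;
}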
+ */ + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT + { + .desc = "ARM errata 1165522, 1319367, or 1530923", + .capability = ARM64_WORKAROUND_SPECULATIVE_AT, + ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1463225 + { + .desc = "ARM erratum 1463225", + .capability = ARM64_WORKAROUND_1463225, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_cortex_a76_erratum_1463225, + .midr_range_list = erratum_1463225, + }, +#endif +#ifdef CONFIG_CAVIUM_TX2_ERRATUM_219 + { + .desc = "Cavium ThunderX2 erratum 219 (KVM guest sysreg trapping)", + .capability = ARM64_WORKAROUND_CAVIUM_TX2_219_TVM, + ERRATA_MIDR_RANGE_LIST(tx2_family_cpus), + .matches = needs_tx2_tvm_workaround, + }, + { + .desc = "Cavium ThunderX2 erratum 219 (PRFM removal)", + .capability = ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM, + ERRATA_MIDR_RANGE_LIST(tx2_family_cpus), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1542419 + { + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1542419 (kernel portion)", + .capability = ARM64_WORKAROUND_1542419, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_neoverse_n1_erratum_1542419, + .cpu_enable = cpu_enable_trap_ctr_access, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1508412 + { + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1508412 (kernel portion)", + .capability = ARM64_WORKAROUND_1508412, + ERRATA_MIDR_RANGE(MIDR_CORTEX_A77, + 0, 0, + 1, 0), + }, +#endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + { + /* + * The erratum work around is handled within the TRBE + * driver and can be applied per-cpu. So, we can allow + * a late CPU to come online with this erratum. + */ + .desc = "ARM erratum 2119858 or 2139208", + .capability = ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_overwrite_fill_mode_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE + { + .desc = "ARM erratum 2067961 or 2054223", + .capability = ARM64_WORKAROUND_TSB_FLUSH_FAILURE, + ERRATA_MIDR_RANGE_LIST(tsb_flush_fail_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + { + .desc = "ARM erratum 2253138 or 2224489", + .capability = ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2064142 + { + .desc = "ARM erratum 2064142", + .capability = ARM64_WORKAROUND_2064142, -/* - * The CPU Errata work arounds are detected and applied at boot time - * and the related information is freed soon after. If the new CPU requires - * an errata not detected at boot, fail this CPU. 
- */ -void verify_local_cpu_errata_workarounds(void) -{ - const struct arm64_cpu_capabilities *caps = arm64_errata; - - for (; caps->matches; caps++) - if (!cpus_have_cap(caps->capability) && - caps->matches(caps, SCOPE_LOCAL_CPU)) { - pr_crit("CPU%d: Requires work around for %s, not detected" - " at boot time\n", - smp_processor_id(), - caps->desc ? : "an erratum"); - cpu_die_early(); - } -} + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2457168 + { + .desc = "ARM erratum 2457168", + .capability = ARM64_WORKAROUND_2457168, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, -void update_cpu_errata_workarounds(void) -{ - update_cpu_capabilities(arm64_errata, "enabling workaround for"); -} + /* Cortex-A510 r0p0-r1p1 */ + CAP_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2038923 + { + .desc = "ARM erratum 2038923", + .capability = ARM64_WORKAROUND_2038923, -void __init enable_errata_workarounds(void) -{ - enable_cpu_capabilities(arm64_errata); -} + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1902691 + { + .desc = "ARM erratum 1902691", + .capability = ARM64_WORKAROUND_1902691, + + /* Cortex-A510 r0p0 - r0p1 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1742098 + { + .desc = "ARM erratum 1742098", + .capability = ARM64_WORKAROUND_1742098, + CAP_MIDR_RANGE_LIST(broken_aarch32_aes), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2658417 + { + .desc = "ARM erratum 2658417", + .capability = ARM64_WORKAROUND_2658417, + /* Cortex-A510 r0p0 - r1p1 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), + .cpu_enable = cpu_clear_bf16_from_user_emulation, + }, +#endif + { + } +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index d16978213c5b..e133011f64b5 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * CPU kernel entry/exit control * * Copyright (C) 2013 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/acpi.h> @@ -26,18 +15,20 @@ #include <asm/smp_plat.h> extern const struct cpu_operations smp_spin_table_ops; +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL extern const struct cpu_operations acpi_parking_protocol_ops; +#endif extern const struct cpu_operations cpu_psci_ops; -const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; +static const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { +static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, &cpu_psci_ops, NULL, }; -static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +static const struct cpu_operations *const acpi_supported_cpu_ops[] __initconst = { #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL &acpi_parking_protocol_ops, #endif @@ -47,7 +38,7 @@ static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { static const struct cpu_operations * __init cpu_get_ops(const char *name) { - const struct cpu_operations **ops; + const struct cpu_operations *const *ops; ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops; @@ -85,6 +76,7 @@ static const char *__init cpu_read_enable_method(int cpu) pr_err("%pOF: missing enable-method property\n", dn); } + of_node_put(dn); } else { enable_method = acpi_get_enable_method(cpu); if (!enable_method) { @@ -104,7 +96,7 @@ static const char *__init cpu_read_enable_method(int cpu) /* * Read a cpu's enable method and record it in cpu_ops. */ -int __init cpu_read_ops(int cpu) +int __init init_cpu_ops(int cpu) { const char *enable_method = cpu_read_enable_method(cpu); @@ -119,3 +111,8 @@ int __init cpu_read_ops(int cpu) return 0; } + +const struct cpu_operations *get_cpu_ops(int cpu) +{ + return cpu_ops[cpu]; +} diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 21e2c95d24e7..b3f37e2209ad 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1,48 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * A note for the weary kernel hacker: the code here is confusing and hard to + * follow! That's partly because it's solving a nasty problem, but also because + * there's a little bit of over-abstraction that tends to obscure what's going + * on behind a maze of helper functions and macros. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * The basic problem is that hardware folks have started gluing together CPUs + * with distinct architectural features; in some cases even creating SoCs where + * user-visible instructions are available only on a subset of the available + * cores. We try to address this by snapshotting the feature registers of the + * boot CPU and comparing these with the feature registers of each secondary + * CPU when bringing them up. If there is a mismatch, then we update the + * snapshot state to indicate the lowest-common denominator of the feature, + * known as the "safe" value. This snapshot state can be queried to view the + * "sanitised" value of a feature register. 
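The cpu_ops changes above constify the enable-method tables and hide cpu_ops[] behind get_cpu_ops(); the lookup itself is a simple name match over a NULL-terminated table of operation structures. A minimal sketch of that pattern, with struct my_cpu_ops standing in for the kernel's cpu_operations:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct my_cpu_ops {
	const char *name;
	int (*cpu_boot)(unsigned int cpu);
};

static int psci_boot(unsigned int cpu)       { (void)cpu; return 0; }
static int spin_table_boot(unsigned int cpu) { (void)cpu; return 0; }

static const struct my_cpu_ops psci_ops       = { "psci", psci_boot };
static const struct my_cpu_ops spin_table_ops = { "spin-table", spin_table_boot };

/* like dt_supported_cpu_ops[]: const pointers, NULL-terminated */
static const struct my_cpu_ops *const supported_ops[] = {
	&spin_table_ops,
	&psci_ops,
	NULL,
};

static const struct my_cpu_ops *get_ops(const char *name)
{
	const struct my_cpu_ops *const *ops;

	for (ops = supported_ops; *ops; ops++)
		if (!strcmp(name, (*ops)->name))
			return *ops;

	return NULL;
}

int main(void)
{
	/* "psci" / "spin-table" mirror the devicetree enable-method strings */
	printf("%s\n", get_ops("psci") ? "found psci" : "missing");
	return 0;
}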
* - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * The sanitised register values are used to decide which capabilities we + * have in the system. These may be in the form of traditional "hwcaps" + * advertised to userspace or internal "cpucaps" which are used to configure + * things like alternative patching and static keys. While a feature mismatch + * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch + * may prevent a CPU from being onlined at all. + * + * Some implementation details worth remembering: + * + * - Mismatched features are *always* sanitised to a "safe" value, which + * usually indicates that the feature is not supported. + * + * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" + * warning when onlining an offending CPU and the kernel will be tainted + * with TAINT_CPU_OUT_OF_SPEC. + * + * - Features marked as FTR_VISIBLE have their sanitised value visible to + * userspace. FTR_VISIBLE features in registers that are only visible + * to EL0 by trapping *must* have a corresponding HWCAP so that late + * onlining of CPUs cannot lead to features disappearing at runtime. + * + * - A "feature" is typically a 4-bit register field. A "capability" is the + * high-level description derived from the sanitised field value. + * + * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID + * scheme for fields in ID registers") to understand when feature fields + * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). + * + * - KVM exposes its own view of the feature registers to guest operating + * systems regardless of FTR_VISIBLE. This is typically driven from the + * sanitised register values to allow virtual CPUs to be migrated between + * arbitrary physical CPUs, but some features not present on the host are + * also advertised and emulated. Look at sys_reg_descs[] for the gory + * details. + * + * - If the arm64_ftr_bits[] for a register has a missing field, then this + * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). + * This is stronger than FTR_HIDDEN and can be used to hide features from + * KVM guests. 
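A toy model of the "lowest common denominator" sanitisation the comment describes, for one unsigned FTR_LOWER_SAFE field; the real arbitration also covers signed fields, FTR_EXACT/FTR_HIGHER_SAFE handling and per-field strictness, so this only illustrates the basic idea:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ftr_state {
	uint8_t safe_val;	/* system-wide sanitised value */
	bool    mismatch;	/* would warn/taint if the field is FTR_STRICT */
};

static void update_ftr(struct ftr_state *s, uint8_t cpu_val)
{
	if (cpu_val != s->safe_val)
		s->mismatch = true;
	if (cpu_val < s->safe_val)	/* FTR_LOWER_SAFE: keep the minimum */
		s->safe_val = cpu_val;
}

int main(void)
{
	/* boot CPU reports 2, a secondary reports 1 -> sanitised value is 1 */
	struct ftr_state atomics = { .safe_val = 2, .mismatch = false };

	update_ftr(&atomics, 1);
	printf("safe=%u mismatch=%d\n", atomics.safe_val, atomics.mismatch);
	return 0;
}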
*/ #define pr_fmt(fmt) "CPU features: " fmt #include <linux/bsearch.h> #include <linux/cpumask.h> +#include <linux/crash_dump.h> #include <linux/sort.h> #include <linux/stop_machine.h> +#include <linux/sysfs.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/kasan.h> +#include <linux/percpu.h> + #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> +#include <asm/fpsimd.h> +#include <asm/hwcap.h> +#include <asm/insn.h> +#include <asm/kvm_host.h> #include <asm/mmu_context.h> +#include <asm/mte.h> #include <asm/processor.h> +#include <asm/smp.h> #include <asm/sysreg.h> #include <asm/traps.h> +#include <asm/vectors.h> #include <asm/virt.h> -unsigned long elf_hwcap __read_mostly; -EXPORT_SYMBOL_GPL(elf_hwcap); +/* Kernel representation of AT_HWCAP and AT_HWCAP2 */ +static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\ COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\ - COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\ - COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\ - COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\ + COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\ COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; @@ -50,28 +106,38 @@ unsigned int compat_elf_hwcap2 __read_mostly; DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcaps); +static struct arm64_cpu_capabilities const __ro_after_init *cpu_hwcaps_ptrs[ARM64_NCAPS]; -static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) -{ - /* file-wide pr_fmt adds "CPU features: " prefix */ - pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); - return 0; -} +DECLARE_BITMAP(boot_capabilities, ARM64_NCAPS); -static struct notifier_block cpu_hwcaps_notifier = { - .notifier_call = dump_cpu_hwcaps -}; +bool arm64_use_ng_mappings = false; +EXPORT_SYMBOL(arm64_use_ng_mappings); + +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + +/* + * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs + * support it? + */ +static bool __read_mostly allow_mismatched_32bit_el0; + +/* + * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have + * seen at least one CPU capable of 32-bit EL0. + */ +DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); + +/* + * Mask of CPUs supporting 32-bit EL0. + * Only valid if arm64_mismatched_32bit_el0 is enabled. 
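The elf_hwcap bitmap above is what userspace eventually sees through the ELF auxiliary vector, so a program can test for a capability before using the matching instructions. A hedged userspace example, assuming an arm64 toolchain that provides <sys/auxv.h> and the arm64 HWCAP_* constants:

#include <stdio.h>
#include <sys/auxv.h>		/* getauxval(), AT_HWCAP, AT_HWCAP2 */
#ifdef __aarch64__
#include <asm/hwcap.h>		/* HWCAP_AES, HWCAP_ATOMICS, ... */
#endif

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

#ifdef HWCAP_ATOMICS
	if (hwcap & HWCAP_ATOMICS)
		puts("LSE atomics available");
#endif
#ifdef HWCAP_AES
	if (hwcap & HWCAP_AES)
		puts("AES instructions available");
#endif
	printf("AT_HWCAP = %#lx, AT_HWCAP2 = %#lx\n",
	       hwcap, getauxval(AT_HWCAP2));
	return 0;
}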
+ */ +static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly; -static int __init register_cpu_hwcaps_dumper(void) +void dump_cpu_features(void) { - atomic_notifier_chain_register(&panic_notifier_list, - &cpu_hwcaps_notifier); - return 0; + /* file-wide pr_fmt adds "CPU features: " prefix */ + pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); } -__initcall(register_cpu_hwcaps_dumper); - -DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcap_keys); #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -97,174 +163,434 @@ EXPORT_SYMBOL(cpu_hwcap_keys); .width = 0, \ } -/* meta feature for alternatives */ -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); +static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); +static bool __system_matches_cap(unsigned int n); /* * NOTE: Any changes to the visibility of features should be kept in * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_DPB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 
4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - /* Linux doesn't care about the EL3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0), + 
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_SHIFT, 4, ID_AA64MMFR0_TGRAN16_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_BIGENDEL0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0), + /* + * Page size not being supported at Stage-2 is not fatal. You + * just give up KVM if PAGE_SIZE isn't supported there. Go fix + * your favourite nesting hypervisor. + * + * There is a small corner case where the hypervisor explicitly + * advertises a given granule size at Stage-2 (value 2) on some + * vCPUs, and uses the fallback to Stage-1 (value 0) for other + * vCPUs. Although this is not forbidden by the architecture, it + * indicates that the hypervisor is being silly (or buggy). + * + * We make no effort to cope with this and pretend that if these + * fields are inconsistent across vCPUs, then it isn't worth + * trying to bring KVM up. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1), + /* + * We already refuse to boot CPUs that don't support our configured + * page size, so we can only detect mismatches for a page size other + * than the one we're currently using. Unfortunately, SoCs like this + * exist in the wild so, even though we don't like it, we'll have to go + * along with it and treat them as non-strict. 
+ */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI), + + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0), /* Linux shouldn't care about secure memory */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_SNSMEM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_BIGENDEL_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_ASID_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0), /* * Differing PARange is fine as long as all peripherals and memory are mapped * within the minimum PARange of all CPUs */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_PARANGE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_LOR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_HPD_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_VHE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_VMIDBITS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_HADBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_IESB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LSM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, 
FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_ctr[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0), /* CWG */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), /* ERG */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1), /* DminLine */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1), /* * Linux can handle differing I-cache policies. Userspace JITs will * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. 
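Illustration (not from this patch): the *minLine fields mentioned above are exactly what user space consumes. A minimal AArch64 user-space sketch, using the architectural field offsets for IminLine and DminLine; all names are local to the example:

#include <stdint.h>
#include <stdio.h>

/* CTR_EL0 is readable from EL0 either directly or, when the kernel traps
 * the access, via emulation that returns the system-wide safe value. */
static inline uint64_t read_ctr_el0(void)
{
	uint64_t ctr;

	asm volatile("mrs %0, ctr_el0" : "=r" (ctr));
	return ctr;
}

int main(void)
{
	uint64_t ctr = read_ctr_el0();
	unsigned int iminline = ctr & 0xf;         /* log2(words), a word is 4 bytes */
	unsigned int dminline = (ctr >> 16) & 0xf;

	printf("I-cache line: %u bytes, D-cache line: %u bytes\n",
	       4U << iminline, 4U << dminline);
	return 0;
}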
*/ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT), /* L1Ip */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* IminLine */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0), ARM64_FTR_END, }; +static struct arm64_ftr_override __ro_after_init no_override = { }; + struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { .name = "SYS_CTR_EL0", - .ftr_bits = ftr_ctr + .ftr_bits = ftr_ctr, + .override = &no_override, }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 28, 4, 0xf), /* InnerShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 24, 4, 0), /* FCSE */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 16, 4, 0), /* TCM */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 12, 4, 0), /* ShareLvl */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 8, 4, 0xf), /* OuterShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 4, 4, 0), /* PMSA */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 0, 4, 0), /* VMSA */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_INNERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_FCSE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_AUXREG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_TCM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_SHARELVL_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_OUTERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_PMSA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_BRPS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0), /* * We can instantiate multiple PMU instances with different levels * of support. 
*/ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPROUND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSHVEC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSQRT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDIVIDE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPTRAP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPDP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_FPSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_SIMD_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDFMAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDINT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_SIMDLS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPDNAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_FPFTZ_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 4, 4, 0), /* FPMisc */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 0, 4, 0), /* SIMDMisc */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_FPMISC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_SIMDMISC_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_gmid[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0), ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_isar0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_COPROC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_CMPBRANCH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITFIELD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITCOUNT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
ID_ISAR0_SWAP_SHIFT, 4, 0), + ARM64_FTR_END, +}; static const struct arm64_ftr_bits ftr_id_isar5[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_AES_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_ISAR5_SEVL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SEVL_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_mmfr4[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 4, 4, 0), /* ac2 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_AC2_SHIFT, 4, 0), + + /* + * SpecSEI = 1 indicates that the PE might generate an SError on an + * external abort on speculative read. It is safer to assume that an + * SError might be generated than that it will not be. Hence it has been + * classified as FTR_HIGHER_SAFE.
+ */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_SPECSEI_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SWP_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_PSR_M_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_BARRIER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SMC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WRITEBACK_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WITHSHIFTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_UNPRIV_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_mmfr5[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_ETS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar6[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_JSCVT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 12, 4, 0), /* State3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 8, 4, 0), /* State2 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 4, 4, 0), /* State1 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 0, 4, 0), /* State0 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRT_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SEC_FRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GENTIMER_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRTUALIZATION_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_MPROGMOD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SECURITY_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_PROGMOD_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0), + 
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), + /* [31:28] TraceFilt */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_PERFMON_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MPROFDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPSDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPDBG_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_dfr1[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_MTPMU_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_zcr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, + ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_WIDTH, 0), /* LEN */ + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_smcr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, + SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0), /* LEN */ ARM64_FTR_END, }; @@ -272,7 +598,7 @@ static const struct arm64_ftr_bits ftr_id_dfr0[] = { * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. 
Covers the following 32bit registers: - * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-3], id_mmfr[1-3] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -296,13 +622,28 @@ static const struct arm64_ftr_bits ftr_raz[] = { ARM64_FTR_END, }; -#define ARM64_FTR_REG(id, table) { \ - .sys_id = id, \ - .reg = &(struct arm64_ftr_reg){ \ - .name = #id, \ - .ftr_bits = &((table)[0]), \ +#define __ARM64_FTR_REG_OVERRIDE(id_str, id, table, ovr) { \ + .sys_id = id, \ + .reg = &(struct arm64_ftr_reg){ \ + .name = id_str, \ + .override = (ovr), \ + .ftr_bits = &((table)[0]), \ }} +#define ARM64_FTR_REG_OVERRIDE(id, table, ovr) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, ovr) + +#define ARM64_FTR_REG(id, table) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) + +struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64isar1_override; +struct arm64_ftr_override __ro_after_init id_aa64isar2_override; + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; @@ -310,7 +651,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), - ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), @@ -318,22 +659,32 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ - ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), + ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), /* Op1 = 0, CRn = 0, CRm = 3 */ - ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0), + ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), + ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), + ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), + ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ - ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), - ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_raz), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0, + &id_aa64pfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, + &id_aa64pfr1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, + &id_aa64zfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, + &id_aa64smfr0_override), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -341,13 +692,24 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, 
ftr_id_aa64isar0), - ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, + &id_aa64isar1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, + &id_aa64isar2_override), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), - ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, + &id_aa64mmfr1_override), ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + /* Op1 = 0, CRn = 1, CRm = 2 */ + ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr), + + /* Op1 = 1, CRn = 0, CRm = 0 */ + ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), + /* Op1 = 3, CRn = 0, CRm = 0 */ { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 }, ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid), @@ -362,16 +724,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp) } /* - * get_arm64_ftr_reg - Lookup a feature register entry using its - * sys_reg() encoding. With the array arm64_ftr_regs sorted in the - * ascending order of sys_id , we use binary search to find a matching + * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using + * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the + * ascending order of sys_id, we use binary search to find a matching * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is upto the caller to decide * the impact of a failure. */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; @@ -385,6 +747,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) return NULL; } +/* + * get_arm64_ftr_reg - Looks up a feature register entry using + * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). + * + * returns - Upon success, matching ftr_reg entry for id. + * - NULL on failure but with an WARN_ON(). + */ +struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +{ + struct arm64_ftr_reg *reg; + + reg = get_arm64_ftr_reg_nowarn(sys_id); + + /* + * Requesting a non-existent register search is an error. Warn + * and let the caller handle it. + */ + WARN_ON(!reg); + return reg; +} + static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { @@ -405,10 +788,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, ret = ftrp->safe_val; break; case FTR_LOWER_SAFE: - ret = new < cur ? new : cur; + ret = min(new, cur); break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (!cur || !new) + break; + fallthrough; case FTR_HIGHER_SAFE: - ret = new > cur ? new : cur; + ret = max(new, cur); break; default: BUG(); @@ -419,11 +806,52 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, static void __init sort_ftr_regs(void) { - int i; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { + const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; + const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; + unsigned int j = 0; + + /* + * Features here must be sorted in descending order with respect + * to their shift values and should not overlap with each other. 
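Illustration (not from this patch): a stand-alone sketch of how the safe-value policies above combine the field values of two CPUs: LOWER_SAFE takes the minimum, HIGHER_SAFE the maximum, HIGHER_OR_ZERO_SAFE treats a zero on either CPU as the safe result, and EXACT falls back to the declared safe value on mismatch. Enum and function names are local to the example.

#include <stdio.h>

enum ftr_type { EXACT, LOWER_SAFE, HIGHER_SAFE, HIGHER_OR_ZERO_SAFE };

/* Combine one feature field from two CPUs into a system-wide safe value. */
static long long safe_value(enum ftr_type type, long long new, long long cur,
			    long long safe_val)
{
	switch (type) {
	case LOWER_SAFE:
		return new < cur ? new : cur;	/* the lower value is safe */
	case HIGHER_OR_ZERO_SAFE:
		if (!new || !cur)
			return 0;		/* a zero on either CPU wins */
		/* fallthrough */
	case HIGHER_SAFE:
		return new > cur ? new : cur;	/* the higher value is safe */
	default:
		return safe_val;		/* EXACT: use the declared default */
	}
}

int main(void)
{
	/* Two CPUs advertising field values 3 and 0 (e.g. a CWG-style field). */
	printf("LOWER_SAFE          -> %lld\n", safe_value(LOWER_SAFE, 3, 0, 0));
	printf("HIGHER_SAFE         -> %lld\n", safe_value(HIGHER_SAFE, 3, 0, 0));
	printf("HIGHER_OR_ZERO_SAFE -> %lld\n",
	       safe_value(HIGHER_OR_ZERO_SAFE, 3, 0, 0));
	return 0;
}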
+ */ + for (; ftr_bits->width != 0; ftr_bits++, j++) { + unsigned int width = ftr_reg->ftr_bits[j].width; + unsigned int shift = ftr_reg->ftr_bits[j].shift; + unsigned int prev_shift; + + WARN((shift + width) > 64, + "%s has invalid feature at shift %d\n", + ftr_reg->name, shift); + + /* + * Skip the first feature. There is nothing to + * compare against for now. + */ + if (j == 0) + continue; + + prev_shift = ftr_reg->ftr_bits[j - 1].shift; + WARN((shift + width) > prev_shift, + "%s has feature overlap at shift %d\n", + ftr_reg->name, shift); + } - /* Check that the array is sorted so that we can do the binary search */ - for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++) - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + /* + * Skip the first register. There is nothing to + * compare against for now. + */ + if (i == 0) + continue; + /* + * Registers here must be sorted in ascending order with respect + * to sys_id for subsequent binary search in get_arm64_ftr_reg() + * to work correctly. + */ + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); + } } /* @@ -432,7 +860,7 @@ static void __init sort_ftr_regs(void) * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. */ -static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) +static void init_cpu_ftr_reg(u32 sys_reg, u64 new) { u64 val = 0; u64 strict_mask = ~0x0ULL; @@ -442,11 +870,45 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); - BUG_ON(!reg); + if (!reg) + return; - for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { + for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); s64 ftr_new = arm64_ftr_value(ftrp, new); + s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val); + + if ((ftr_mask & reg->override->mask) == ftr_mask) { + s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new); + char *str = NULL; + + if (ftr_ovr != tmp) { + /* Unsafe, remove the override */ + reg->override->mask &= ~ftr_mask; + reg->override->val &= ~ftr_mask; + tmp = ftr_ovr; + str = "ignoring override"; + } else if (ftr_new != tmp) { + /* Override was valid */ + ftr_new = tmp; + str = "forced"; + } else if (ftr_ovr == tmp) { + /* Override was the safe value */ + str = "already set"; + } + + if (str) + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, tmp); + } else if ((ftr_mask & reg->override->val) == ftr_mask) { + reg->override->val &= ~ftr_mask; + pr_warn("%s[%d:%d]: impossible override, ignored\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift); + } val = arm64_ftr_set_value(ftrp, val, ftr_new); @@ -468,6 +930,57 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) reg->user_mask = user_mask; } +extern const struct arm64_cpu_capabilities arm64_errata[]; +static const struct arm64_cpu_capabilities arm64_features[]; + +static void __init +init_cpu_hwcaps_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) +{ + for (; caps->matches; caps++) { + if (WARN(caps->capability >= ARM64_NCAPS, + "Invalid capability %d\n", caps->capability)) + continue; + if (WARN(cpu_hwcaps_ptrs[caps->capability], + "Duplicate entry for capability %d\n", + caps->capability)) + continue; + cpu_hwcaps_ptrs[caps->capability] = caps; + } +} + +static void __init init_cpu_hwcaps_indirect_list(void) +{ + 
init_cpu_hwcaps_indirect_list_from_array(arm64_features); + init_cpu_hwcaps_indirect_list_from_array(arm64_errata); +} + +static void __init setup_boot_cpu_capabilities(void); + +static void init_32bit_cpu_features(struct cpuinfo_32bit *info) +{ + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); + init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); +} + void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ @@ -480,31 +993,52 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); + init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); + init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); + init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + init_32bit_cpu_features(&info->aarch32); + + if (IS_ENABLED(CONFIG_ARM64_SVE) && + id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_zcr = read_zcr_features(); + 
init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); + vec_init_vq_map(ARM64_VEC_SVE); } + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + info->reg_smcr = read_smcr_features(); + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr); + vec_init_vq_map(ARM64_VEC_SME); + } + + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); + + /* + * Initialize the indirect array of CPU hwcaps capabilities pointers + * before we handle the boot CPU below. + */ + init_cpu_hwcaps_indirect_list(); + + /* + * Detect and enable early CPU capabilities based on the boot CPU, + * after we have initialised the CPU feature infrastructure. + */ + setup_boot_cpu_capabilities(); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -528,7 +1062,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); - BUG_ON(!regp); + if (!regp) + return 0; + update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; @@ -537,6 +1073,112 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) return 1; } +static void relax_cpu_ftr_reg(u32 sys_id, int field) +{ + const struct arm64_ftr_bits *ftrp; + struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); + + if (!regp) + return; + + for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { + if (ftrp->shift == field) { + regp->strict_mask &= ~arm64_ftr_mask(ftrp); + break; + } + } + + /* Bogus field? */ + WARN_ON(!ftrp->width); +} + +static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + static bool boot_cpu_32bit_regs_overridden = false; + + if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden) + return; + + if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0)) + return; + + boot->aarch32 = info->aarch32; + init_32bit_cpu_features(&boot->aarch32); + boot_cpu_32bit_regs_overridden = true; +} + +static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, + struct cpuinfo_32bit *boot) +{ + int taint = 0; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * If we don't have AArch32 at EL1, then relax the strictness of + * EL1-dependent register fields to avoid spurious sanity check fails. 
+ */ + if (!id_aa64pfr0_32bit_el1(pfr0)) { + relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_SMC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRT_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SEC_FRAC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRTUALIZATION_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SECURITY_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_PROGMOD_SHIFT); + } + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + info->reg_id_dfr0, boot->reg_id_dfr0); + taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, + info->reg_id_dfr1, boot->reg_id_dfr1); + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + info->reg_id_isar0, boot->reg_id_isar0); + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + info->reg_id_isar1, boot->reg_id_isar1); + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + info->reg_id_isar2, boot->reg_id_isar2); + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + info->reg_id_isar3, boot->reg_id_isar3); + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + info->reg_id_isar4, boot->reg_id_isar4); + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, + info->reg_id_isar6, boot->reg_id_isar6); + + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + info->reg_id_mmfr0, boot->reg_id_mmfr0); + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + info->reg_id_mmfr1, boot->reg_id_mmfr1); + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + info->reg_id_mmfr2, boot->reg_id_mmfr2); + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + info->reg_id_mmfr3, boot->reg_id_mmfr3); + taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, + info->reg_id_mmfr4, boot->reg_id_mmfr4); + taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, + info->reg_id_mmfr5, boot->reg_id_mmfr5); + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + info->reg_id_pfr0, boot->reg_id_pfr0); + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + info->reg_id_pfr1, boot->reg_id_pfr1); + taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, + info->reg_id_pfr2, boot->reg_id_pfr2); + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + info->reg_mvfr0, boot->reg_mvfr0); + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + info->reg_mvfr1, boot->reg_mvfr1); + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + info->reg_mvfr2, boot->reg_mvfr2); + + return taint; +} + /* * Update system wide CPU feature registers with the values from a * non-boot CPU. Also performs SANITY checks to make sure that there @@ -586,6 +1228,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, + info->reg_id_aa64isar2, boot->reg_id_aa64isar2); /* * Differing PARange support is fine as long as all peripherals and @@ -599,60 +1243,68 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); - /* - * EL3 is not our concern. - * ID_AA64PFR1 is currently RES0. 
- */ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); - /* - * If we have AArch32, we care about 32-bit features for compat. - * If the system doesn't support AArch32, don't update them. - */ - if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, - info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, - info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, - info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, - info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, - info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, - info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, - info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, + info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, + info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + + if (IS_ENABLED(CONFIG_ARM64_SVE) && + id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_zcr = read_zcr_features(); + taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, + info->reg_zcr, boot->reg_zcr); + + /* Probe vector lengths */ + if (!system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SVE); + } + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + info->reg_smcr = read_smcr_features(); /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, - info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, - info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, - info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, - info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, - info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, - info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, - info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, - info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, - info->reg_mvfr2, boot->reg_mvfr2); + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu, + info->reg_smcr, boot->reg_smcr); + + /* Probe vector lengths */ + if (!system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SME); + } + + /* + * The kernel uses the LDGM/STGM instructions and the number of tags + * they read/write depends on the GMID_EL1.BS field. Check that the + * value is the same on all CPUs. 
+ */ + if (IS_ENABLED(CONFIG_ARM64_MTE) && + id_aa64pfr1_mte(info->reg_id_aa64pfr1)) { + taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, + info->reg_gmid, boot->reg_gmid); + } + + /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + * + * This relies on a sanitised view of the AArch64 ID registers + * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. + */ + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + lazy_init_32bit_cpu_features(info, boot); + taint |= update_32bit_cpu_features(cpu, &info->aarch32, + &boot->aarch32); } /* @@ -669,40 +1321,51 @@ u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); - /* We shouldn't get a request for an unsupported register */ - BUG_ON(!regp); + if (!regp) + return 0; return regp->sys_val; } +EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ - case r: return read_sysreg_s(r) + case r: val = read_sysreg_s(r); break; /* * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated. * Read the system register on the current CPU */ -static u64 __read_sysreg_by_encoding(u32 sys_id) +u64 __read_sysreg_by_encoding(u32 sys_id) { + struct arm64_ftr_reg *regp; + u64 val; + switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); + read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); + read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); + read_sysreg_case(SYS_ID_MMFR4_EL1); + read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); read_sysreg_case(SYS_ID_ISAR3_EL1); read_sysreg_case(SYS_ID_ISAR4_EL1); read_sysreg_case(SYS_ID_ISAR5_EL1); + read_sysreg_case(SYS_ID_ISAR6_EL1); read_sysreg_case(SYS_MVFR0_EL1); read_sysreg_case(SYS_MVFR1_EL1); read_sysreg_case(SYS_MVFR2_EL1); read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); + read_sysreg_case(SYS_ID_AA64ZFR0_EL1); + read_sysreg_case(SYS_ID_AA64SMFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); @@ -710,6 +1373,7 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); + read_sysreg_case(SYS_ID_AA64ISAR2_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -719,32 +1383,118 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) BUG(); return 0; } + + regp = get_arm64_ftr_reg(sys_id); + if (regp) { + val &= ~regp->override->mask; + val |= (regp->override->val & regp->override->mask); + } + + return val; } #include <linux/irqchip/arm-gic-v3.h> static bool +has_always(const struct arm64_cpu_capabilities *entry, int scope) +{ + return true; +} + +static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field(reg, entry->field_pos, entry->sign); + int val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); return val >= entry->min_field_value; } -static bool -has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +static u64 +read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope) { - u64 val; - 
WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); if (scope == SCOPE_SYSTEM) - val = read_sanitised_ftr_reg(entry->sys_reg); + return read_sanitised_ftr_reg(entry->sys_reg); else - val = __read_sysreg_by_encoding(entry->sys_reg); + return __read_sysreg_by_encoding(entry->sys_reg); +} + +static bool +has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + int mask; + struct arm64_ftr_reg *regp; + u64 val = read_scoped_sysreg(entry, scope); + + regp = get_arm64_ftr_reg(entry->sys_reg); + if (!regp) + return false; + + mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask, + entry->field_pos, + entry->field_width); + if (!mask) + return false; + + return feature_matches(val, entry); +} +static bool +has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 val = read_scoped_sysreg(entry, scope); return feature_matches(val, entry); } +const struct cpumask *system_32bit_el0_cpumask(void) +{ + if (!system_supports_32bit_el0()) + return cpu_none_mask; + + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + return cpu_32bit_el0_mask; + + return cpu_possible_mask; +} + +static int __init parse_32bit_el0_param(char *str) +{ + allow_mismatched_32bit_el0 = true; + return 0; +} +early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param); + +static ssize_t aarch32_el0_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct cpumask *mask = system_32bit_el0_cpumask(); + + return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask)); +} +static const DEVICE_ATTR_RO(aarch32_el0); + +static int __init aarch32_el0_sysfs_init(void) +{ + if (!allow_mismatched_32bit_el0) + return 0; + + return device_create_file(cpu_subsys.dev_root, &dev_attr_aarch32_el0); +} +device_initcall(aarch32_el0_sysfs_init); + +static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return allow_mismatched_32bit_el0; + + if (scope == SCOPE_SYSTEM) + pr_info("detected: 32-bit EL0 Support\n"); + + return true; +} + static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; @@ -765,45 +1515,641 @@ static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int _ u32 midr = read_cpuid_id(); /* Cavium ThunderX pass 1.x and 2.x */ - return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, + return midr_is_cpu_model_range(midr, MIDR_THUNDERX, MIDR_CPU_VAR_REV(0, 0), MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); } +static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) +{ + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_signed_field(pfr0, + ID_AA64PFR0_EL1_FP_SHIFT) < 0; +} + +static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, + int scope) +{ + u64 ctr; + + if (scope == SCOPE_SYSTEM) + ctr = arm64_ftr_reg_ctrel0.sys_val; + else + ctr = read_cpuid_effective_cachetype(); + + return ctr & BIT(CTR_EL0_IDC_SHIFT); +} + +static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused) +{ + /* + * If the CPU exposes raw CTR_EL0.IDC = 0, while effectively + * CTR_EL0.IDC = 1 (from CLIDR values), we need to trap accesses + * to the CTR_EL0 on this CPU and emulate it with the real/safe + * value. 
+ */ + if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))) + sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); +} + +static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, + int scope) +{ + u64 ctr; + + if (scope == SCOPE_SYSTEM) + ctr = arm64_ftr_reg_ctrel0.sys_val; + else + ctr = read_cpuid_cachetype(); + + return ctr & BIT(CTR_EL0_DIC_SHIFT); +} + +static bool __maybe_unused +has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) +{ + /* + * Kdump isn't guaranteed to power off all secondary CPUs, so CNP + * may share TLB entries with a CPU stuck in the crashed + * kernel. + */ + if (is_kdump_kernel()) + return false; + + if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) + return false; + + return has_cpuid_feature(entry, scope); +} + +/* + * This check is triggered during the early boot before the cpufeature + * is initialised. Checking the status on the local CPU allows the boot + * CPU to detect the need for non-global mappings and thus avoid a + * pagetable rewrite after all the CPUs are booted. This check will be + * run on each individual CPU anyway, allowing us to reach a consistent + * state once the SMP CPUs are up and thus make the switch to non-global + * mappings if required. + */ +bool kaslr_requires_kpti(void) +{ + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return false; + + /* + * E0PD does a similar job to KPTI so can be used instead + * where available. + */ + if (IS_ENABLED(CONFIG_ARM64_E0PD)) { + u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); + if (cpuid_feature_extract_unsigned_field(mmfr2, + ID_AA64MMFR2_EL1_E0PD_SHIFT)) + return false; + } + + /* + * Systems affected by Cavium erratum 27456 are incompatible + * with KPTI. + */ + if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { + extern const struct midr_range cavium_erratum_27456_cpus[]; + + if (is_midr_in_range_list(read_cpuid_id(), + cavium_erratum_27456_cpus)) + return false; + } + + return kaslr_offset() > 0; +} + +static bool __meltdown_safe = true; +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int scope) +{ + /* List of CPUs that are not vulnerable and don't need KPTI */ + static const struct midr_range kpti_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ } + }; + char const *str = "kpti command line option"; + bool meltdown_safe; + + meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + + /* Defer to CPU feature registers */ + if (has_cpuid_feature(entry, scope)) + meltdown_safe = true; + + if (!meltdown_safe) + __meltdown_safe = false; + + /* + * For reasons that aren't entirely clear, enabling KPTI on Cavium + * ThunderX leads to apparent I-cache corruption of kernel text, which + * ends as well as you might imagine. Don't even try.
We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. + */ + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + str = "ARM64_WORKAROUND_CAVIUM_27456"; + __kpti_forced = -1; + } + + /* Useful for KASLR robustness */ + if (kaslr_requires_kpti()) { + if (!__kpti_forced) { + str = "KASLR"; + __kpti_forced = 1; + } + } + + if (cpu_mitigations_off() && !__kpti_forced) { + str = "mitigations=off"; + __kpti_forced = -1; + } + + if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + pr_info_once("kernel page table isolation disabled by kernel configuration\n"); + return false; + } + + /* Forced? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by %s\n", + __kpti_forced > 0 ? "ON" : "OFF", str); + return __kpti_forced > 0; + } + + return !meltdown_safe; +} + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) + +extern +void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags); + +static phys_addr_t kpti_ng_temp_alloc; + +static phys_addr_t kpti_ng_pgd_alloc(int shift) +{ + kpti_ng_temp_alloc -= PAGE_SIZE; + return kpti_ng_temp_alloc; +} + +static void +kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +{ + typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); + extern kpti_remap_fn idmap_kpti_install_ng_mappings; + kpti_remap_fn *remap_fn; + + int cpu = smp_processor_id(); + int levels = CONFIG_PGTABLE_LEVELS; + int order = order_base_2(levels); + u64 kpti_ng_temp_pgd_pa = 0; + pgd_t *kpti_ng_temp_pgd; + u64 alloc = 0; + + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + + /* + * We don't need to rewrite the page-tables if either we've done + * it already or we have KASLR enabled and therefore have not + * created any global mappings at all. + */ + if (arm64_use_ng_mappings) + return; + + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + + if (!cpu) { + alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); + kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); + + // + // Create a minimal page table hierarchy that permits us to map + // the swapper page tables temporarily as we traverse them. + // + // The physical pages are laid out as follows: + // + // +--------+-/-------+-/------ +-\\--------+ + // : PTE[] : | PMD[] : | PUD[] : || PGD[] : + // +--------+-\-------+-\------ +-//--------+ + // ^ + // The first page is mapped into this hierarchy at a PMD_SHIFT + // aligned virtual address, so that we can manipulate the PTE + // level entries while the mapping is active. The first entry + // covers the PTE[] page itself, the remaining entries are free + // to be used as a ad-hoc fixmap. 
+ // + create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc), + KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, + kpti_ng_pgd_alloc, 0); + } + + cpu_install_idmap(); + remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); + cpu_uninstall_idmap(); + + if (!cpu) { + free_pages(alloc, order); + arm64_use_ng_mappings = true; + } +} +#else +static void +kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +{ +} +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 1 : -1; + return 0; +} +early_param("kpti", parse_kpti); + +#ifdef CONFIG_ARM64_HW_AFDBM +static inline void __cpu_enable_hw_dbm(void) +{ + u64 tcr = read_sysreg(tcr_el1) | TCR_HD; + + write_sysreg(tcr, tcr_el1); + isb(); + local_flush_tlb_all(); +} + +static bool cpu_has_broken_dbm(void) +{ + /* List of CPUs which have broken DBM support. */ + static const struct midr_range cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_1024718 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2051678 + MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), +#endif + {}, + }; + + return is_midr_in_range_list(read_cpuid_id(), cpus); +} + +static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) +{ + return has_cpuid_feature(cap, SCOPE_LOCAL_CPU) && + !cpu_has_broken_dbm(); +} + +static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap) +{ + if (cpu_can_use_dbm(cap)) + __cpu_enable_hw_dbm(); +} + +static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, + int __unused) +{ + static bool detected = false; + /* + * DBM is a non-conflicting feature. i.e, the kernel can safely + * run a mix of CPUs with and without the feature. So, we + * unconditionally enable the capability to allow any late CPU + * to use the feature. We only enable the control bits on the + * CPU, if it actually supports. + * + * We have to make sure we print the "feature" detection only + * when at least one CPU actually uses it. So check if this CPU + * can actually use it and print the message exactly once. + * + * This is safe as all CPUs (including secondary CPUs - due to the + * LOCAL_CPU scope - and the hotplugged CPUs - via verification) + * goes through the "matches" check exactly once. Also if a CPU + * matches the criteria, it is guaranteed that the CPU will turn + * the DBM on, as the capability is unconditionally enabled. + */ + if (!detected && cpu_can_use_dbm(cap)) { + detected = true; + pr_info("detected: Hardware dirty bit management\n"); + } + + return true; +} + +#endif + +#ifdef CONFIG_ARM64_AMU_EXTN + +/* + * The "amu_cpus" cpumask only signals that the CPU implementation for the + * flagged CPUs supports the Activity Monitors Unit (AMU) but does not provide + * information regarding all the events that it supports. When a CPU bit is + * set in the cpumask, the user of this feature can only rely on the presence + * of the 4 fixed counters for that CPU. But this does not guarantee that the + * counters are enabled or access to these counters is enabled by code + * executed at higher exception levels (firmware). 
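
As an aside, the bookkeeping described here is just a per-CPU mask. Below is a tiny standalone analogue of cpu_has_amu_feat()/cpumask_set_cpu() using a plain 64-bit bitmask; the names and the fixed CPU limit are illustrative assumptions, not the kernel's cpumask implementation.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_MAX_CPUS	64	/* illustration only */

static uint64_t amu_cpu_mask;

/* cpumask_set_cpu() analogue, called when a CPU reports AMU support. */
static void demo_mark_amu_cpu(unsigned int cpu)
{
	if (cpu < DEMO_MAX_CPUS)
		amu_cpu_mask |= 1ull << cpu;
}

/* cpumask_test_cpu() analogue, i.e. what cpu_has_amu_feat() answers. */
static bool demo_cpu_has_amu(unsigned int cpu)
{
	return cpu < DEMO_MAX_CPUS && (amu_cpu_mask & (1ull << cpu));
}
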
+ */ +static struct cpumask amu_cpus __read_mostly; + +bool cpu_has_amu_feat(int cpu) +{ + return cpumask_test_cpu(cpu, &amu_cpus); +} + +int get_cpu_with_amu_feat(void) +{ + return cpumask_any(&amu_cpus); +} + +static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) +{ + if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) { + pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n", + smp_processor_id()); + cpumask_set_cpu(smp_processor_id(), &amu_cpus); + + /* 0 reference values signal broken/disabled counters */ + if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168)) + update_freq_counters_refs(); + } +} + +static bool has_amu(const struct arm64_cpu_capabilities *cap, + int __unused) +{ + /* + * The AMU extension is a non-conflicting feature: the kernel can + * safely run a mix of CPUs with and without support for the + * activity monitors extension. Therefore, unconditionally enable + * the capability to allow any late CPU to use the feature. + * + * With this feature unconditionally enabled, the cpu_enable + * function will be called for all CPUs that match the criteria, + * including secondary and hotplugged, marking this feature as + * present on that respective CPU. The enable function will also + * print a detection message. + */ + + return true; +} +#else +int get_cpu_with_amu_feat(void) +{ + return nr_cpu_ids; +} +#endif + static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); } -static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, - int __unused) +static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) { - phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); + /* + * Copy register values that aren't redirected by hardware. + * + * Before code patching, we only set tpidr_el1, all CPUs need to copy + * this value to tpidr_el2 before we patch the code. Once we've done + * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to + * do anything here. + */ + if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN)) + write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); +} +#ifdef CONFIG_ARM64_PAN +static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) +{ /* - * Activate the lower HYP offset only if: - * - the idmap doesn't clash with it, - * - the kernel is not running at EL2. + * We modify PSTATE. This won't work from irq context as the PSTATE + * is discarded once we return from the exception. */ - return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); + WARN_ON_ONCE(in_interrupt()); + + sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); + set_pstate_pan(1); } +#endif /* CONFIG_ARM64_PAN */ -static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) +#ifdef CONFIG_ARM64_RAS_EXTN +static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) { - u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + /* Firmware may have left a deferred SError in this register. */ + write_sysreg_s(0, SYS_DISR_EL1); +} +#endif /* CONFIG_ARM64_RAS_EXTN */ - return cpuid_feature_extract_signed_field(pfr0, - ID_AA64PFR0_FP_SHIFT) < 0; +#ifdef CONFIG_ARM64_PTR_AUTH +static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope) +{ + int boot_val, sec_val; + + /* We don't expect to be called with SCOPE_SYSTEM */ + WARN_ON(scope == SCOPE_SYSTEM); + /* + * The ptr-auth feature levels are not intercompatible with lower + * levels. 
Hence we must match ptr-auth feature level of the secondary + * CPUs with that of the boot CPU. The level of boot cpu is fetched + * from the sanitised register whereas direct register read is done for + * the secondary CPUs. + * The sanitised feature state is guaranteed to match that of the + * boot CPU as a mismatched secondary CPU is parked before it gets + * a chance to update the state, with the capability. + */ + boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg), + entry->field_pos, entry->sign); + if (scope & SCOPE_BOOT_CPU) + return boot_val >= entry->min_field_value; + /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ + sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), + entry->field_pos, entry->sign); + return (sec_val >= entry->min_field_value) && (sec_val == boot_val); +} + +static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, + int scope) +{ + bool api = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); + bool apa = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope); + bool apa3 = has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope); + + return apa || apa3 || api; +} + +static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + bool gpi = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); + bool gpa = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5); + bool gpa3 = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3); + + return gpa || gpa3 || gpi; +} +#endif /* CONFIG_ARM64_PTR_AUTH */ + +#ifdef CONFIG_ARM64_E0PD +static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) +{ + if (this_cpu_has_cap(ARM64_HAS_E0PD)) + sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); +} +#endif /* CONFIG_ARM64_E0PD */ + +#ifdef CONFIG_ARM64_PSEUDO_NMI +static bool enable_pseudo_nmi; + +static int __init early_enable_pseudo_nmi(char *p) +{ + return strtobool(p, &enable_pseudo_nmi); +} +early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); + +static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, + int scope) +{ + return enable_pseudo_nmi && has_useable_gicv3_cpuif(entry, scope); +} +#endif + +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. + * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + +#ifdef CONFIG_ARM64_MTE +static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); + + mte_cpu_setup(); + + /* + * Clear the tags in the zero page. This needs to be done via the + * linear map which has the Tagged attribute. 
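
As an aside, the matching rule spelled out at the start of this hunk can be restated compactly: the boot CPU only has to meet the minimum level, while a secondary must meet the minimum and report exactly the boot CPU's level. A standalone restatement follows; the function names are illustrative, and in the real code the levels come from the ID_AA64ISAR1_EL1/ID_AA64ISAR2_EL1 fields named above.

#include <stdbool.h>

static bool ptrauth_boot_cpu_ok(int boot_level, int min_level)
{
	return boot_level >= min_level;
}

static bool ptrauth_secondary_cpu_ok(int secondary_level, int boot_level,
				     int min_level)
{
	/* A lower or higher level than the boot CPU counts as a mismatch. */
	return secondary_level >= min_level && secondary_level == boot_level;
}
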
+ */ + if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags)) + mte_clear_page_tags(lm_alias(empty_zero_page)); + + kasan_init_hw_tags_cpu(); +} +#endif /* CONFIG_ARM64_MTE */ + +static void elf_hwcap_fixup(void) +{ +#ifdef CONFIG_ARM64_ERRATUM_1742098 + if (cpus_have_const_cap(ARM64_WORKAROUND_1742098)) + compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; +#endif /* ARM64_ERRATUM_1742098 */ +} + +#ifdef CONFIG_KVM +static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) +{ + return kvm_get_mode() == KVM_MODE_PROTECTED; +} +#endif /* CONFIG_KVM */ + +static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP); +} + +/* Internal helper functions to match cpu capability type */ +static bool +cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU); +} + +static bool +cpucap_late_cpu_permitted(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU); +} + +static bool +cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } static const struct arm64_cpu_capabilities arm64_features[] = { { + .capability = ARM64_ALWAYS_BOOT, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_always, + }, + { + .capability = ARM64_ALWAYS_SYSTEM, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_always, + }, + { .desc = "GIC system register CPU interface", .capability = ARM64_HAS_SYSREG_GIC_CPUIF, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, + .field_pos = ID_AA64PFR0_EL1_GIC_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = 1, + }, + { + .desc = "Enhanced Counter Virtualization", + .capability = ARM64_HAS_ECV, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR0_EL1, + .field_pos = ID_AA64MMFR0_EL1_ECV_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, }, @@ -811,81 +2157,104 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Privileged Access Never", .capability = ARM64_HAS_PAN, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR1_EL1, - .field_pos = ID_AA64MMFR1_PAN_SHIFT, + .field_pos = ID_AA64MMFR1_EL1_PAN_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 1, - .enable = cpu_enable_pan, + .cpu_enable = cpu_enable_pan, }, #endif /* CONFIG_ARM64_PAN */ -#if defined(CONFIG_AS_LSE) && defined(CONFIG_ARM64_LSE_ATOMICS) +#ifdef CONFIG_ARM64_EPAN + { + .desc = "Enhanced Privileged Access Never", + .capability = ARM64_HAS_EPAN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR1_EL1, + .field_pos = ID_AA64MMFR1_EL1_PAN_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = 3, + }, +#endif /* CONFIG_ARM64_EPAN */ +#ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, + .field_pos = ID_AA64ISAR0_EL1_ATOMIC_SHIFT, + .field_width = 4, .sign = FTR_UNSIGNED, .min_field_value = 2, }, -#endif 
/* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ +#endif /* CONFIG_ARM64_LSE_ATOMICS */ { .desc = "Software prefetching using PRFM", .capability = ARM64_HAS_NO_HW_PREFETCH, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .matches = has_no_hw_prefetch, }, -#ifdef CONFIG_ARM64_UAO - { - .desc = "User Access Override", - .capability = ARM64_HAS_UAO, - .def_scope = SCOPE_SYSTEM, - .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_UAO_SHIFT, - .min_field_value = 1, - /* - * We rely on stop_machine() calling uao_thread_switch() to set - * UAO immediately after patching. - */ - }, -#endif /* CONFIG_ARM64_UAO */ -#ifdef CONFIG_ARM64_PAN - { - .capability = ARM64_ALT_PAN_NOT_UAO, - .def_scope = SCOPE_SYSTEM, - .matches = cpufeature_pan_not_uao, - }, -#endif /* CONFIG_ARM64_PAN */ { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = runs_at_el2, + .cpu_enable = cpu_copy_el2regs, }, { - .desc = "32-bit EL0 Support", - .capability = ARM64_HAS_32BIT_EL0, - .def_scope = SCOPE_SYSTEM, + .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_32bit_el0, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_EL0_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_ELx_32BIT_64BIT, + }, +#ifdef CONFIG_KVM + { + .desc = "32-bit EL1 Support", + .capability = ARM64_HAS_32BIT_EL1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .field_pos = ID_AA64PFR0_EL1_EL1_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_ELx_32BIT_64BIT, }, { - .desc = "Reduced HYP mapping offset", - .capability = ARM64_HYP_OFFSET_LOW, - .def_scope = SCOPE_SYSTEM, - .matches = hyp_offset_low, + .desc = "Protected KVM", + .capability = ARM64_KVM_PROTECTED_MODE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = is_kvm_protected_mode, + }, +#endif + { + .desc = "Kernel page table isolation (KPTI)", + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, + /* + * The ID feature fields below are used to indicate that + * the CPU doesn't need KPTI. See unmap_kernel_at_el0 for + * more details. 
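
As an aside, every entry in this table drives the same generic test: read the named sys_reg, pull out field_width bits at field_pos with the declared signedness, and require at least min_field_value. A standalone sketch of that comparison follows; the helper names are illustrative, and the sign handling relies on an arithmetic right shift, as provided by mainstream compilers.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int extract_field(uint64_t reg, unsigned int shift,
			 unsigned int width, bool is_signed)
{
	uint64_t hi = reg << (64 - shift - width);

	return is_signed ? (int)((int64_t)hi >> (64 - width))
			 : (int)(hi >> (64 - width));
}

static bool feature_matches(uint64_t reg, unsigned int shift,
			    unsigned int width, bool is_signed, int min_value)
{
	/* Signed fields use 0xf (i.e. -1) for "not implemented". */
	return extract_field(reg, shift, width, is_signed) >= min_value;
}

int main(void)
{
	uint64_t fake_reg = 0xf;	/* 4-bit field at bits [3:0] reads 0xf */

	printf("unsigned match: %d\n", feature_matches(fake_reg, 0, 4, false, 1));
	printf("signed match:   %d\n", feature_matches(fake_reg, 0, 4, true, 0));
	return 0;
}
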
+ */ + .sys_reg = SYS_ID_AA64PFR0_EL1, + .field_pos = ID_AA64PFR0_EL1_CSV3_SHIFT, + .field_width = 4, + .min_field_value = 1, + .matches = unmap_kernel_at_el0, + .cpu_enable = kpti_install_ng_mappings, }, { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, .min_field_value = 0, .matches = has_no_fpsimd, }, @@ -893,64 +2262,600 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Data cache clean to Point of Persistence", .capability = ARM64_HAS_DCPOP, - .def_scope = SCOPE_SYSTEM, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT, + .field_width = 4, + .min_field_value = 1, + }, + { + .desc = "Data cache clean to Point of Deep Persistence", + .capability = ARM64_HAS_DCPODP, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT, + .field_width = 4, + .min_field_value = 2, + }, +#endif +#ifdef CONFIG_ARM64_SVE + { + .desc = "Scalable Vector Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SVE, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_SVE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_SVE_IMP, + .matches = has_cpuid_feature, + .cpu_enable = sve_kernel_enable, + }, +#endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_RAS_EXTN + { + .desc = "RAS Extension Support", + .capability = ARM64_HAS_RAS_EXTN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_RAS_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_RAS_IMP, + .cpu_enable = cpu_clear_disr, + }, +#endif /* CONFIG_ARM64_RAS_EXTN */ +#ifdef CONFIG_ARM64_AMU_EXTN + { + /* + * The feature is enabled by default if CONFIG_ARM64_AMU_EXTN=y. + * Therefore, don't provide .desc as we don't want the detection + * message to be shown until at least one CPU is detected to + * support the feature. 
+ */ + .capability = ARM64_HAS_AMU_EXTN, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .matches = has_amu, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL1_AMU_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR0_EL1_AMU_IMP, + .cpu_enable = cpu_amu_enable, + }, +#endif /* CONFIG_ARM64_AMU_EXTN */ + { + .desc = "Data cache clean to the PoU not required for I/D coherence", + .capability = ARM64_HAS_CACHE_IDC, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cache_idc, + .cpu_enable = cpu_emulate_effective_ctr, + }, + { + .desc = "Instruction cache invalidation not required for I/D coherence", + .capability = ARM64_HAS_CACHE_DIC, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cache_dic, + }, + { + .desc = "Stage-2 Force Write-Back", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_STAGE2_FWB, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_EL1_FWB_SHIFT, + .field_width = 4, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, + { + .desc = "ARMv8.4 Translation Table Level", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_ARMv8_4_TTL, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_EL1_TTL_SHIFT, + .field_width = 4, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, + { + .desc = "TLB range maintenance instructions", + .capability = ARM64_HAS_TLB_RANGE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR0_EL1, + .field_pos = ID_AA64ISAR0_EL1_TLB_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = ID_AA64ISAR0_EL1_TLB_RANGE, + }, +#ifdef CONFIG_ARM64_HW_AFDBM + { + /* + * Since we turn this on always, we don't want the user to + * think that the feature is available when it may not be. + * So hide the description. 
+ * + * .desc = "Hardware pagetable Dirty Bit Management", + * + */ + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .capability = ARM64_HW_DBM, + .sys_reg = SYS_ID_AA64MMFR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR1_EL1_HAFDBS_SHIFT, + .field_width = 4, + .min_field_value = 2, + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, +#endif + { + .desc = "CRC32 instructions", + .capability = ARM64_HAS_CRC32, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR0_EL1, + .field_pos = ID_AA64ISAR0_EL1_CRC32_SHIFT, + .field_width = 4, + .min_field_value = 1, + }, + { + .desc = "Speculative Store Bypassing Safe (SSBS)", + .capability = ARM64_SSBS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_EL1_SSBS_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = ID_AA64PFR1_EL1_SSBS_IMP, + }, +#ifdef CONFIG_ARM64_CNP + { + .desc = "Common not Private translations", + .capability = ARM64_HAS_CNP, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_useable_cnp, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_EL1_CnP_SHIFT, + .field_width = 4, + .min_field_value = 1, + .cpu_enable = cpu_enable_cnp, + }, +#endif + { + .desc = "Speculation barrier (SB)", + .capability = ARM64_HAS_SB, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .field_pos = ID_AA64ISAR1_EL1_SB_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, .min_field_value = 1, }, +#ifdef CONFIG_ARM64_PTR_AUTH + { + .desc = "Address authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_APA_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_APA_PAuth, + .matches = has_address_auth_cpucap, + }, + { + .desc = "Address authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_EL1_APA3_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR2_EL1_APA3_PAuth, + .matches = has_address_auth_cpucap, + }, + { + .desc = "Address authentication (IMP DEF algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_IMP_DEF, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_API_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_API_PAuth, + .matches = has_address_auth_cpucap, + }, + { + .capability = ARM64_HAS_ADDRESS_AUTH, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_metacap, + }, + { + .desc = "Generic authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_GPA_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_GPA_IMP, + .matches = has_cpuid_feature, + }, + { + .desc = "Generic authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = 
ID_AA64ISAR2_EL1_GPA3_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR2_EL1_GPA3_IMP, + .matches = has_cpuid_feature, + }, + { + .desc = "Generic authentication (IMP DEF algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_IMP_DEF, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_GPI_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64ISAR1_EL1_GPI_IMP, + .matches = has_cpuid_feature, + }, + { + .capability = ARM64_HAS_GENERIC_AUTH, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_generic_auth, + }, +#endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_PSEUDO_NMI + { + /* + * Depends on having GICv3 + */ + .desc = "IRQ priority masking", + .capability = ARM64_HAS_IRQ_PRIO_MASKING, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = can_use_gic_priorities, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .field_pos = ID_AA64PFR0_EL1_GIC_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = 1, + }, +#endif +#ifdef CONFIG_ARM64_E0PD + { + .desc = "E0PD", + .capability = ARM64_HAS_E0PD, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_width = 4, + .field_pos = ID_AA64MMFR2_EL1_E0PD_SHIFT, + .matches = has_cpuid_feature, + .min_field_value = 1, + .cpu_enable = cpu_enable_e0pd, + }, +#endif + { + .desc = "Random Number Generator", + .capability = ARM64_HAS_RNG, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR0_EL1, + .field_pos = ID_AA64ISAR0_EL1_RNDR_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = 1, + }, +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, +#ifdef CONFIG_ARM64_BTI_KERNEL + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, +#else + .type = ARM64_CPUCAP_SYSTEM_FEATURE, #endif + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_EL1_BT_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_BT_IMP, + .sign = FTR_UNSIGNED, + }, +#endif +#ifdef CONFIG_ARM64_MTE + { + .desc = "Memory Tagging Extension", + .capability = ARM64_MTE, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_EL1_MTE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_MTE_MTE2, + .sign = FTR_UNSIGNED, + .cpu_enable = cpu_enable_mte, + }, + { + .desc = "Asymmetric MTE Tag Check Fault", + .capability = ARM64_MTE_ASYMM, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_EL1_MTE_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_MTE_MTE3, + .sign = FTR_UNSIGNED, + }, +#endif /* CONFIG_ARM64_MTE */ + { + .desc = "RCpc load-acquire (LDAPR)", + .capability = ARM64_HAS_LDAPR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_EL1_LRCPC_SHIFT, + .field_width = 4, + .matches = has_cpuid_feature, + .min_field_value = 1, + }, +#ifdef CONFIG_ARM64_SME + { + .desc = "Scalable Matrix Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR1_EL1_SME_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64PFR1_EL1_SME_IMP, + .matches = has_cpuid_feature, + .cpu_enable = sme_kernel_enable, + }, + /* 
FA64 should be sorted after the base SME capability */ + { + .desc = "FA64", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME_FA64, + .sys_reg = SYS_ID_AA64SMFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64SMFR0_EL1_FA64_SHIFT, + .field_width = 1, + .min_field_value = ID_AA64SMFR0_EL1_FA64_IMP, + .matches = has_cpuid_feature, + .cpu_enable = fa64_kernel_enable, + }, +#endif /* CONFIG_ARM64_SME */ + { + .desc = "WFx with timeout", + .capability = ARM64_HAS_WFXT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_EL1_WFxT_SHIFT, + .field_width = 4, + .matches = has_cpuid_feature, + .min_field_value = ID_AA64ISAR2_EL1_WFxT_IMP, + }, + { + .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality", + .capability = ARM64_HAS_TIDCP1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64MMFR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR1_EL1_TIDCP1_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64MMFR1_EL1_TIDCP1_IMP, + .matches = has_cpuid_feature, + .cpu_enable = cpu_trap_el0_impdef, + }, {}, }; -#define HWCAP_CAP(reg, field, s, min_value, type, cap) \ - { \ - .desc = #cap, \ - .def_scope = SCOPE_SYSTEM, \ - .matches = has_cpuid_feature, \ - .sys_reg = reg, \ - .field_pos = field, \ - .sign = s, \ - .min_field_value = min_value, \ - .hwcap_type = type, \ - .hwcap = cap, \ +#define HWCAP_CPUID_MATCH(reg, field, width, s, min_value) \ + .matches = has_user_cpuid_feature, \ + .sys_reg = reg, \ + .field_pos = field, \ + .field_width = width, \ + .sign = s, \ + .min_field_value = min_value, + +#define __HWCAP_CAP(name, cap_type, cap) \ + .desc = name, \ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, \ + .hwcap_type = cap_type, \ + .hwcap = cap, \ + +#define HWCAP_CAP(reg, field, width, s, min_value, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + HWCAP_CPUID_MATCH(reg, field, width, s, min_value) \ + } + +#define HWCAP_MULTI_CAP(list, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + .matches = cpucap_multi_entry_cap_matches, \ + .match_list = list, \ } +#define HWCAP_CAP_MATCH(match, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + .matches = match, \ + } + +#ifdef CONFIG_ARM64_PTR_AUTH +static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_APA_SHIFT, + 4, FTR_UNSIGNED, + ID_AA64ISAR1_EL1_APA_PAuth) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_APA3_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_APA3_PAuth) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_API_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_API_PAuth) + }, + {}, +}; + +static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPA_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPA_IMP) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_GPA3_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_GPA3_IMP) + }, + { + HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPI_SHIFT, + 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPI_IMP) + }, + {}, +}; +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, 
CAP_HWCAP, HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_DP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_TS_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_FP_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_FP_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, 
ID_AA64PFR0_EL1_DIT_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_SB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_BF16), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_EBF16), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_EL1_AT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), +#ifdef CONFIG_ARM64_SVE + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_SVE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR0_EL1_SVE_IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SVEver_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BitPerm_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SHA3_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SM4_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_I8MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F32MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F64MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_SSBS_SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_BT_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_BT_IMP, CAP_HWCAP, 
KERNEL_HWCAP_BTI), +#endif +#ifdef CONFIG_ARM64_PTR_AUTH + HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), + HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), +#endif +#ifdef CONFIG_ARM64_MTE + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_MTE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_MTE_MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_MTE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_MTE_MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), +#endif /* CONFIG_ARM64_MTE */ + HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV), + HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP), + HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_WFxT_IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), +#ifdef CONFIG_ARM64_SME + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_EL1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_EL1_SME_IMP, CAP_HWCAP, KERNEL_HWCAP_SME), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_FA64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I16I64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F64F64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I8I32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_B16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F32F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), +#endif /* CONFIG_ARM64_SME */ {}, }; +#ifdef CONFIG_COMPAT +static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope) +{ + /* + * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available, + * in line with that of arm32 as in vfp_init(). We make sure that the + * check is future proof, by making sure value is non-zero. 
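
As an aside, everything in these tables ultimately reaches user space as AT_HWCAP/AT_HWCAP2 bits in the auxiliary vector. A minimal consumer is sketched below; it assumes an arm64 build with glibc's getauxval() and reasonably recent uapi headers for the HWCAP2_MTE bit.

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>	/* HWCAP_ and HWCAP2_ bit definitions (arm64 uapi) */

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	printf("AES: %s\n", (hwcap & HWCAP_AES) ? "yes" : "no");
	printf("SVE: %s\n", (hwcap & HWCAP_SVE) ? "yes" : "no");
	printf("MTE: %s\n", (hwcap2 & HWCAP2_MTE) ? "yes" : "no");
	return 0;
}
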
+ */ + u32 mvfr1; + + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1); + else + mvfr1 = read_sysreg_s(SYS_MVFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDSP_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDINT_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDLS_SHIFT); +} +#endif + static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON), + HWCAP_CAP(SYS_MVFR1_EL1, MVFR1_SIMDFMAC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), + /* Arm v8 mandates MVFR0.FPDP == {0, 2}. So, piggy back on this for the presence of VFP support */ + HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), + HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 4, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), + HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, 4, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), #endif {}, }; -static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) +static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: - elf_hwcap |= cap->hwcap; + cpu_set_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -973,7 +2878,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) switch (cap->hwcap_type) { case CAP_HWCAP: - rc = (elf_hwcap & cap->hwcap) != 0; + rc = cpu_have_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -991,68 +2896,164 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) return rc; } -static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) +static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ - elf_hwcap |= HWCAP_CPUID; + cpu_set_named_feature(CPUID); for (; hwcaps->matches; hwcaps++) - if (hwcaps->matches(hwcaps, hwcaps->def_scope)) + if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps))) cap_set_elf_hwcap(hwcaps); } -void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, - const char *info) +static void update_cpu_capabilities(u16 scope_mask) { - for (; caps->matches; caps++) { - if (!caps->matches(caps, caps->def_scope)) + int i; + const struct arm64_cpu_capabilities *caps; + + scope_mask &= 
ARM64_CPUCAP_SCOPE_MASK; + for (i = 0; i < ARM64_NCAPS; i++) { + caps = cpu_hwcaps_ptrs[i]; + if (!caps || !(caps->type & scope_mask) || + cpus_have_cap(caps->capability) || + !caps->matches(caps, cpucap_default_scope(caps))) continue; - if (!cpus_have_cap(caps->capability) && caps->desc) - pr_info("%s %s\n", info, caps->desc); + if (caps->desc) + pr_info("detected: %s\n", caps->desc); cpus_set_cap(caps->capability); + + if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU)) + set_bit(caps->capability, boot_capabilities); } } /* + * Enable all the available capabilities on this CPU. The capabilities + * with BOOT_CPU scope are handled separately and hence skipped here. + */ +static int cpu_enable_non_boot_scope_capabilities(void *__unused) +{ + int i; + u16 non_boot_scope = SCOPE_ALL & ~SCOPE_BOOT_CPU; + + for_each_available_cap(i) { + const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[i]; + + if (WARN_ON(!cap)) + continue; + + if (!(cap->type & non_boot_scope)) + continue; + + if (cap->cpu_enable) + cap->cpu_enable(cap); + } + return 0; +} + +/* * Run through the enabled capabilities and enable() it on all active * CPUs */ -void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) +static void __init enable_cpu_capabilities(u16 scope_mask) { - for (; caps->matches; caps++) { - unsigned int num = caps->capability; + int i; + const struct arm64_cpu_capabilities *caps; + bool boot_scope; + scope_mask &= ARM64_CPUCAP_SCOPE_MASK; + boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); + + for (i = 0; i < ARM64_NCAPS; i++) { + unsigned int num; + + caps = cpu_hwcaps_ptrs[i]; + if (!caps || !(caps->type & scope_mask)) + continue; + num = caps->capability; if (!cpus_have_cap(num)) continue; - /* Ensure cpus_have_const_cap(num) works */ - static_branch_enable(&cpu_hwcap_keys[num]); - - if (caps->enable) { + if (boot_scope && caps->cpu_enable) /* - * Use stop_machine() as it schedules the work allowing - * us to modify PSTATE, instead of on_each_cpu() which - * uses an IPI, giving us a PSTATE that disappears when - * we return. + * Capabilities with SCOPE_BOOT_CPU scope are finalised + * before any secondary CPU boots. Thus, each secondary + * will enable the capability as appropriate via + * check_local_cpu_capabilities(). The only exception is + * the boot CPU, for which the capability must be + * enabled here. This approach avoids costly + * stop_machine() calls for this case. */ - stop_machine(caps->enable, NULL, cpu_online_mask); - } + caps->cpu_enable(caps); } + + /* + * For all non-boot scope capabilities, use stop_machine() + * as it schedules the work allowing us to modify PSTATE, + * instead of on_each_cpu() which uses an IPI, giving us a + * PSTATE that disappears when we return. + */ + if (!boot_scope) + stop_machine(cpu_enable_non_boot_scope_capabilities, + NULL, cpu_online_mask); } /* - * Flag to indicate if we have computed the system wide - * capabilities based on the boot time active CPUs. This - * will be used to determine if a new booting CPU should - * go through the verification process to make sure that it - * supports the system capabilities, without using a hotplug - * notifier. + * Run through the list of capabilities to check for conflicts. + * If the system has already detected a capability, take necessary + * action on this CPU. 
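
As an aside, the scope plumbing above reduces to a bitwise filter on the capability type, applied once per pass. The sketch below restates that filter with made-up scope values and a simplified capability struct; it is not the kernel's layout.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_SCOPE_LOCAL_CPU	(1u << 0)	/* illustrative values only */
#define DEMO_SCOPE_SYSTEM	(1u << 1)
#define DEMO_SCOPE_BOOT_CPU	(1u << 2)

struct demo_cap {
	uint16_t type;		/* scope bits, as carried in ->type */
	bool matched;		/* result of the ->matches() callback */
	bool detected;		/* analogue of the cpus_set_cap() state */
};

static void demo_update_caps(struct demo_cap *caps, int n, uint16_t scope_mask)
{
	for (int i = 0; i < n; i++) {
		if (!(caps[i].type & scope_mask))
			continue;	/* not considered in this pass */
		if (caps[i].detected || !caps[i].matched)
			continue;
		caps[i].detected = true;
	}
}
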
*/ -static bool sys_caps_initialised; - -static inline void set_sys_caps_initialised(void) +static void verify_local_cpu_caps(u16 scope_mask) { - sys_caps_initialised = true; + int i; + bool cpu_has_cap, system_has_cap; + const struct arm64_cpu_capabilities *caps; + + scope_mask &= ARM64_CPUCAP_SCOPE_MASK; + + for (i = 0; i < ARM64_NCAPS; i++) { + caps = cpu_hwcaps_ptrs[i]; + if (!caps || !(caps->type & scope_mask)) + continue; + + cpu_has_cap = caps->matches(caps, SCOPE_LOCAL_CPU); + system_has_cap = cpus_have_cap(caps->capability); + + if (system_has_cap) { + /* + * Check if the new CPU misses an advertised feature, + * which is not safe to miss. + */ + if (!cpu_has_cap && !cpucap_late_cpu_optional(caps)) + break; + /* + * We have to issue cpu_enable() irrespective of + * whether the CPU has it or not, as it is enabeld + * system wide. It is upto the call back to take + * appropriate action on this CPU. + */ + if (caps->cpu_enable) + caps->cpu_enable(caps); + } else { + /* + * Check if the CPU has this capability if it isn't + * safe to have when the system doesn't. + */ + if (cpu_has_cap && !cpucap_late_cpu_permitted(caps)) + break; + } + } + + if (i < ARM64_NCAPS) { + pr_crit("CPU%d: Detected conflict for capability %d (%s), System: %d, CPU: %d\n", + smp_processor_id(), caps->capability, + caps->desc, system_has_cap, cpu_has_cap); + + if (cpucap_panic_on_conflict(caps)) + cpu_panic_kernel(); + else + cpu_die_early(); + } } /* @@ -1061,12 +3062,13 @@ static inline void set_sys_caps_initialised(void) */ static void check_early_cpu_features(void) { - verify_cpu_run_el(); verify_cpu_asid_bits(); + + verify_local_cpu_caps(SCOPE_BOOT_CPU); } static void -verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) @@ -1077,23 +3079,76 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } } -static void -verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +static void verify_local_elf_hwcaps(void) { - for (; caps->matches; caps++) { - if (!cpus_have_cap(caps->capability)) - continue; - /* - * If the new CPU misses an advertised feature, we cannot proceed - * further, park the cpu. 
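
As an aside, the conflict handling in verify_local_cpu_caps() above amounts to two asymmetric rules, keyed off the OPTIONAL_FOR_LATE_CPU and PERMITTED_FOR_LATE_CPU type flags. A compact restatement, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

static bool late_cpu_acceptable(bool system_has_cap, bool cpu_has_cap,
				bool optional_for_late_cpu,
				bool permitted_for_late_cpu)
{
	/* Missing a system-wide capability is only tolerated if optional. */
	if (system_has_cap && !cpu_has_cap)
		return optional_for_late_cpu;
	/* Having an extra capability is only tolerated if permitted. */
	if (!system_has_cap && cpu_has_cap)
		return permitted_for_late_cpu;
	return true;
}

int main(void)
{
	/* A late CPU lacking a mandatory system capability is parked. */
	printf("accept: %d\n", late_cpu_acceptable(true, false, false, false));
	return 0;
}
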
- */ - if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { - pr_crit("CPU%d: missing feature: %s\n", - smp_processor_id(), caps->desc); - cpu_die_early(); - } - if (caps->enable) - caps->enable(NULL); + __verify_local_elf_hwcaps(arm64_elf_hwcaps); + + if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1))) + __verify_local_elf_hwcaps(compat_elf_hwcaps); +} + +static void verify_sve_features(void) +{ + u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); + u64 zcr = read_zcr_features(); + + unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK; + unsigned int len = zcr & ZCR_ELx_LEN_MASK; + + if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SVE)) { + pr_crit("CPU%d: SVE: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + /* Add checks on other ZCR bits here if necessary */ +} + +static void verify_sme_features(void) +{ + u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + u64 smcr = read_smcr_features(); + + unsigned int safe_len = safe_smcr & SMCR_ELx_LEN_MASK; + unsigned int len = smcr & SMCR_ELx_LEN_MASK; + + if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SME)) { + pr_crit("CPU%d: SME: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + /* Add checks on other SMCR bits here if necessary */ +} + +static void verify_hyp_capabilities(void) +{ + u64 safe_mmfr1, mmfr0, mmfr1; + int parange, ipa_max; + unsigned int safe_vmid_bits, vmid_bits; + + if (!IS_ENABLED(CONFIG_KVM)) + return; + + safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + + /* Verify VMID bits */ + safe_vmid_bits = get_vmid_bits(safe_mmfr1); + vmid_bits = get_vmid_bits(mmfr1); + if (vmid_bits < safe_vmid_bits) { + pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + /* Verify IPA range */ + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + if (ipa_max < get_kvm_ipa_limit()) { + pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); + cpu_die_early(); } } @@ -1107,11 +3162,22 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) */ static void verify_local_cpu_capabilities(void) { - verify_local_cpu_errata_workarounds(); - verify_local_cpu_features(arm64_features); - verify_local_elf_hwcaps(arm64_elf_hwcaps); - if (system_supports_32bit_el0()) - verify_local_elf_hwcaps(compat_elf_hwcaps); + /* + * The capabilities with SCOPE_BOOT_CPU are checked from + * check_early_cpu_features(), as they need to be verified + * on all secondary CPUs. + */ + verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU); + verify_local_elf_hwcaps(); + + if (system_supports_sve()) + verify_sve_features(); + + if (system_supports_sme()) + verify_sme_features(); + + if (is_hyp_mode_available()) + verify_hyp_capabilities(); } void check_local_cpu_capabilities(void) @@ -1124,96 +3190,180 @@ void check_local_cpu_capabilities(void) /* * If we haven't finalised the system capabilities, this CPU gets - * a chance to update the errata work arounds. + * a chance to update the errata work arounds and local features. * Otherwise, this CPU should verify that it has all the system * advertised capabilities. 
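
As an aside, the vector-length verification above is what lets user space treat the reported SVE vector length as uniform across CPUs. One way to observe it is via prctl(); the sketch assumes the PR_SVE_GET_VL interface and mask from linux/prctl.h (the fallback values below are an assumption for older headers).

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SVE_GET_VL
#define PR_SVE_GET_VL		51	/* assumed value, from linux/prctl.h */
#define PR_SVE_VL_LEN_MASK	0xffff
#endif

int main(void)
{
	int ret = prctl(PR_SVE_GET_VL);

	if (ret < 0)
		perror("PR_SVE_GET_VL");	/* no SVE support or old kernel */
	else
		printf("SVE vector length: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
	return 0;
}
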
*/ - if (!sys_caps_initialised) - update_cpu_errata_workarounds(); + if (!system_capabilities_finalized()) + update_cpu_capabilities(SCOPE_LOCAL_CPU); else verify_local_cpu_capabilities(); } -static void __init setup_feature_capabilities(void) +static void __init setup_boot_cpu_capabilities(void) { - update_cpu_capabilities(arm64_features, "detected feature:"); - enable_cpu_capabilities(arm64_features); + /* Detect capabilities with either SCOPE_BOOT_CPU or SCOPE_LOCAL_CPU */ + update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); + /* Enable the SCOPE_BOOT_CPU capabilities alone right away */ + enable_cpu_capabilities(SCOPE_BOOT_CPU); } -DEFINE_STATIC_KEY_FALSE(arm64_const_caps_ready); -EXPORT_SYMBOL(arm64_const_caps_ready); - -static void __init mark_const_caps_ready(void) +bool this_cpu_has_cap(unsigned int n) { - static_branch_enable(&arm64_const_caps_ready); + if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; + + if (cap) + return cap->matches(cap, SCOPE_LOCAL_CPU); + } + + return false; } +EXPORT_SYMBOL_GPL(this_cpu_has_cap); /* - * Check if the current CPU has a given feature capability. - * Should be called from non-preemptible context. + * This helper function is used in a narrow window when, + * - The system wide safe registers are set with all the SMP CPUs and, + * - The SYSTEM_FEATURE cpu_hwcaps may not have been set. + * In all other cases cpus_have_{const_}cap() should be used. */ -static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, - unsigned int cap) +static bool __maybe_unused __system_matches_cap(unsigned int n) { - const struct arm64_cpu_capabilities *caps; + if (n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; - if (WARN_ON(preemptible())) - return false; + if (cap) + return cap->matches(cap, SCOPE_SYSTEM); + } + return false; +} - for (caps = cap_array; caps->desc; caps++) - if (caps->capability == cap && caps->matches) - return caps->matches(caps, SCOPE_LOCAL_CPU); +void cpu_set_feature(unsigned int num) +{ + set_bit(num, elf_hwcap); +} - return false; +bool cpu_have_feature(unsigned int num) +{ + return test_bit(num, elf_hwcap); } +EXPORT_SYMBOL_GPL(cpu_have_feature); -extern const struct arm64_cpu_capabilities arm64_errata[]; +unsigned long cpu_get_elf_hwcap(void) +{ + /* + * We currently only populate the first 32 bits of AT_HWCAP. Please + * note that for userspace compatibility we guarantee that bits 62 + * and 63 will always be returned as 0. + */ + return elf_hwcap[0]; +} -bool this_cpu_has_cap(unsigned int cap) +unsigned long cpu_get_elf_hwcap2(void) { - return (__this_cpu_has_cap(arm64_features, cap) || - __this_cpu_has_cap(arm64_errata, cap)); + return elf_hwcap[1]; +} + +static void __init setup_system_capabilities(void) +{ + /* + * We have finalised the system-wide safe feature + * registers, finalise the capabilities that depend + * on it. Also enable all the available capabilities, + * that are not enabled already. 
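
As an aside, cpu_set_feature() and the two accessors above imply a simple storage model: a single bitmap whose first word is exported as AT_HWCAP and whose second word is exported as AT_HWCAP2. The sketch below assumes 64-bit words and that KERNEL_HWCAP_* values index straight into the bitmap, which matches the code shown but is otherwise an assumption.

#include <stdint.h>
#include <stdio.h>

#define DEMO_BITS_PER_WORD	64

static uint64_t demo_hwcap[2];		/* analogue of elf_hwcap */

static void demo_set_feature(unsigned int num)
{
	demo_hwcap[num / DEMO_BITS_PER_WORD] |=
		1ull << (num % DEMO_BITS_PER_WORD);
}

int main(void)
{
	demo_set_feature(0);	/* a word-0 feature, surfaced via AT_HWCAP */
	demo_set_feature(64);	/* a word-1 feature, surfaced via AT_HWCAP2 */

	printf("AT_HWCAP=%#llx AT_HWCAP2=%#llx\n",
	       (unsigned long long)demo_hwcap[0],
	       (unsigned long long)demo_hwcap[1]);
	return 0;
}
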
+ */ + update_cpu_capabilities(SCOPE_SYSTEM); + enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); } void __init setup_cpu_features(void) { u32 cwg; - int cls; - /* Set the CPU feature capabilies */ - setup_feature_capabilities(); - enable_errata_workarounds(); - mark_const_caps_ready(); + setup_system_capabilities(); setup_elf_hwcaps(arm64_elf_hwcaps); - if (system_supports_32bit_el0()) + if (system_supports_32bit_el0()) { setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + } + + if (system_uses_ttbr0_pan()) + pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); - /* Advertise that we have computed the system capabilities */ - set_sys_caps_initialised(); + sve_setup(); + sme_setup(); + minsigstksz_setup(); /* * Check for sane CTR_EL0.CWG value. */ cwg = cache_type_cwg(); - cls = cache_line_size(); if (!cwg) - pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", - cls); - if (L1_CACHE_BYTES < cls) - pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", - L1_CACHE_BYTES, cls); + pr_warn("No Cache Writeback Granule information, assuming %d\n", + ARCH_DMA_MINALIGN); } -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) +static int enable_mismatched_32bit_el0(unsigned int cpu) +{ + /* + * The first 32-bit-capable CPU we detected and so can no longer + * be offlined by userspace. -1 indicates we haven't yet onlined + * a 32-bit-capable CPU. + */ + static int lucky_winner = -1; + + struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); + bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + + if (cpu_32bit) { + cpumask_set_cpu(cpu, cpu_32bit_el0_mask); + static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); + } + + if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit) + return 0; + + if (lucky_winner >= 0) + return 0; + + /* + * We've detected a mismatch. We need to keep one of our CPUs with + * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting + * every CPU in the system for a 32-bit task. + */ + lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask, + cpu_active_mask); + get_cpu_device(lucky_winner)->offline_disabled = true; + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n", + cpu, lucky_winner); + return 0; +} + +static int __init init_32bit_el0_mask(void) +{ + if (!allow_mismatched_32bit_el0) + return 0; + + if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL)) + return -ENOMEM; + + return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/mismatched_32bit_el0:online", + enable_mismatched_32bit_el0, NULL); +} +subsys_initcall_sync(init_32bit_el0_mask); + +static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { - return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); } /* * We emulate only the following system register space. - * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7] + * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7] * See Table C5-6 System instruction encodings for System register accesses, * ARMv8 ARM(ARM DDI 0487A.f) for more details. 
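
As an aside, the register space described here is exactly what lets unprivileged code read the sanitised ID registers: the access undefs at EL0, and the hook below emulates it with the system-wide safe value. A minimal user-space probe, assuming an arm64 toolchain whose assembler accepts the symbolic register name (older binutils may need the S3_0_C0_C6_0 encoding instead):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t isar0;

	/* Traps to EL1; emulate_mrs() below returns the sanitised value. */
	asm volatile("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));

	printf("ID_AA64ISAR0_EL1 (sanitised): %#018llx\n",
	       (unsigned long long)isar0);
	return 0;
}
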
*/ @@ -1223,7 +3373,7 @@ static inline bool __attribute_const__ is_emulated(u32 id) sys_reg_CRn(id) == 0x0 && sys_reg_Op1(id) == 0x0 && (sys_reg_CRm(id) == 0 || - ((sys_reg_CRm(id) >= 4) && (sys_reg_CRm(id) <= 7)))); + ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7)))); } /* @@ -1260,7 +3410,7 @@ static int emulate_sys_reg(u32 id, u64 *valp) if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); - regp = get_arm64_ftr_reg(id); + regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else @@ -1272,31 +3422,36 @@ static int emulate_sys_reg(u32 id, u64 *valp) return 0; } -static int emulate_mrs(struct pt_regs *regs, u32 insn) +int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt) { int rc; - u32 sys_reg, dst; u64 val; + rc = emulate_sys_reg(sys_reg, &val); + if (!rc) { + pt_regs_write_reg(regs, rt, val); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } + return rc; +} + +static int emulate_mrs(struct pt_regs *regs, u32 insn) +{ + u32 sys_reg, rt; + /* * sys_reg values are defined as used in mrs/msr instruction. * shift the imm value to get the encoding. */ sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5; - rc = emulate_sys_reg(sys_reg, &val); - if (!rc) { - dst = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); - pt_regs_write_reg(regs, dst, val); - regs->pc += 4; - } - - return rc; + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + return do_emulate_mrs(regs, sys_reg, rt); } static struct undef_hook mrs_hook = { - .instr_mask = 0xfff00000, - .instr_val = 0xd5300000, - .pstate_mask = COMPAT_PSR_MODE_MASK, + .instr_mask = 0xffff0000, + .instr_val = 0xd5380000, + .pstate_mask = PSR_AA32_MODE_MASK, .pstate_val = PSR_MODE_EL0t, .fn = emulate_mrs, }; @@ -1308,3 +3463,29 @@ static int __init enable_mrs_emulation(void) } core_initcall(enable_mrs_emulation); + +enum mitigation_state arm64_get_meltdown_state(void) +{ + if (__meltdown_safe) + return SPECTRE_UNAFFECTED; + + if (arm64_kernel_unmapped_at_el0()) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, + char *buf) +{ + switch (arm64_get_meltdown_state()) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: PTI\n"); + + default: + return sprintf(buf, "Vulnerable\n"); + } +} diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index fd691087dc9a..4150e308e99c 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 CPU idle arch support * * Copyright (C) 2014 ARM Ltd. * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/acpi.h> @@ -14,46 +11,66 @@ #include <linux/cpu_pm.h> #include <linux/of.h> #include <linux/of_device.h> +#include <linux/psci.h> + +#ifdef CONFIG_ACPI -#include <asm/cpuidle.h> -#include <asm/cpu_ops.h> +#include <acpi/processor.h> -int arm_cpuidle_init(unsigned int cpu) +#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) + +static int psci_acpi_cpu_init_idle(unsigned int cpu) { - int ret = -EOPNOTSUPP; + int i, count; + struct acpi_lpi_state *lpi; + struct acpi_processor *pr = per_cpu(processors, cpu); - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_suspend && - cpu_ops[cpu]->cpu_init_idle) - ret = cpu_ops[cpu]->cpu_init_idle(cpu); + if (unlikely(!pr || !pr->flags.has_lpi)) + return -EINVAL; - return ret; -} + /* + * If the PSCI cpu_suspend function hook has not been initialized + * idle states must not be enabled, so bail out + */ + if (!psci_ops.cpu_suspend) + return -EOPNOTSUPP; -/** - * arm_cpuidle_suspend() - function to enter a low-power idle state - * @arg: argument to pass to CPU suspend operations - * - * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU - * operations back-end error code otherwise. - */ -int arm_cpuidle_suspend(int index) -{ - int cpu = smp_processor_id(); + count = pr->power.count - 1; + if (count <= 0) + return -ENODEV; - return cpu_ops[cpu]->cpu_suspend(index); -} + for (i = 0; i < count; i++) { + u32 state; -#ifdef CONFIG_ACPI + lpi = &pr->power.lpi_states[i + 1]; + /* + * Only bits[31:0] represent a PSCI power_state while + * bits[63:32] must be 0x0 as per ARM ACPI FFH Specification + */ + state = lpi->address; + if (!psci_power_state_is_valid(state)) { + pr_warn("Invalid PSCI power state %#x\n", state); + return -EINVAL; + } + } -#include <acpi/processor.h> + return 0; +} int acpi_processor_ffh_lpi_probe(unsigned int cpu) { - return arm_cpuidle_init(cpu); + return psci_acpi_cpu_init_idle(cpu); } int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { - return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); + u32 state = lpi->address; + + if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(psci_cpu_suspend_enter, + lpi->index, state); + else + return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, + lpi->index, state); } #endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 311885962830..28d4f442b0bc 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -1,24 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Record and handle CPU attributes. * * Copyright (C) 2014 ARM Ltd. - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <asm/arch_timer.h> #include <asm/cache.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpufeature.h> +#include <asm/fpsimd.h> #include <linux/bitops.h> #include <linux/bug.h> @@ -42,70 +33,125 @@ DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data); static struct cpuinfo_arm64 boot_cpu_data; -static char *icache_policy_str[] = { - [0 ... ICACHE_POLICY_PIPT] = "RESERVED/UNKNOWN", - [ICACHE_POLICY_VIPT] = "VIPT", - [ICACHE_POLICY_PIPT] = "PIPT", - [ICACHE_POLICY_VPIPT] = "VPIPT", -}; +static inline const char *icache_policy_str(int l1ip) +{ + switch (l1ip) { + case CTR_EL0_L1Ip_VPIPT: + return "VPIPT"; + case CTR_EL0_L1Ip_VIPT: + return "VIPT"; + case CTR_EL0_L1Ip_PIPT: + return "PIPT"; + default: + return "RESERVED/UNKNOWN"; + } +} unsigned long __icache_flags; static const char *const hwcap_str[] = { - "fp", - "asimd", - "evtstrm", - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - "atomics", - "fphp", - "asimdhp", - "cpuid", - "asimdrdm", - "jscvt", - "fcma", - "lrcpc", - "dcpop", - NULL + [KERNEL_HWCAP_FP] = "fp", + [KERNEL_HWCAP_ASIMD] = "asimd", + [KERNEL_HWCAP_EVTSTRM] = "evtstrm", + [KERNEL_HWCAP_AES] = "aes", + [KERNEL_HWCAP_PMULL] = "pmull", + [KERNEL_HWCAP_SHA1] = "sha1", + [KERNEL_HWCAP_SHA2] = "sha2", + [KERNEL_HWCAP_CRC32] = "crc32", + [KERNEL_HWCAP_ATOMICS] = "atomics", + [KERNEL_HWCAP_FPHP] = "fphp", + [KERNEL_HWCAP_ASIMDHP] = "asimdhp", + [KERNEL_HWCAP_CPUID] = "cpuid", + [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm", + [KERNEL_HWCAP_JSCVT] = "jscvt", + [KERNEL_HWCAP_FCMA] = "fcma", + [KERNEL_HWCAP_LRCPC] = "lrcpc", + [KERNEL_HWCAP_DCPOP] = "dcpop", + [KERNEL_HWCAP_SHA3] = "sha3", + [KERNEL_HWCAP_SM3] = "sm3", + [KERNEL_HWCAP_SM4] = "sm4", + [KERNEL_HWCAP_ASIMDDP] = "asimddp", + [KERNEL_HWCAP_SHA512] = "sha512", + [KERNEL_HWCAP_SVE] = "sve", + [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm", + [KERNEL_HWCAP_DIT] = "dit", + [KERNEL_HWCAP_USCAT] = "uscat", + [KERNEL_HWCAP_ILRCPC] = "ilrcpc", + [KERNEL_HWCAP_FLAGM] = "flagm", + [KERNEL_HWCAP_SSBS] = "ssbs", + [KERNEL_HWCAP_SB] = "sb", + [KERNEL_HWCAP_PACA] = "paca", + [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_DCPODP] = "dcpodp", + [KERNEL_HWCAP_SVE2] = "sve2", + [KERNEL_HWCAP_SVEAES] = "sveaes", + [KERNEL_HWCAP_SVEPMULL] = "svepmull", + [KERNEL_HWCAP_SVEBITPERM] = "svebitperm", + [KERNEL_HWCAP_SVESHA3] = "svesha3", + [KERNEL_HWCAP_SVESM4] = "svesm4", + [KERNEL_HWCAP_FLAGM2] = "flagm2", + [KERNEL_HWCAP_FRINT] = "frint", + [KERNEL_HWCAP_SVEI8MM] = "svei8mm", + [KERNEL_HWCAP_SVEF32MM] = "svef32mm", + [KERNEL_HWCAP_SVEF64MM] = "svef64mm", + [KERNEL_HWCAP_SVEBF16] = "svebf16", + [KERNEL_HWCAP_I8MM] = "i8mm", + [KERNEL_HWCAP_BF16] = "bf16", + [KERNEL_HWCAP_DGH] = "dgh", + [KERNEL_HWCAP_RNG] = "rng", + [KERNEL_HWCAP_BTI] = "bti", + [KERNEL_HWCAP_MTE] = "mte", + [KERNEL_HWCAP_ECV] = "ecv", + [KERNEL_HWCAP_AFP] = "afp", + [KERNEL_HWCAP_RPRES] = "rpres", + [KERNEL_HWCAP_MTE3] = "mte3", + [KERNEL_HWCAP_SME] = "sme", + [KERNEL_HWCAP_SME_I16I64] = "smei16i64", + [KERNEL_HWCAP_SME_F64F64] = "smef64f64", + [KERNEL_HWCAP_SME_I8I32] = "smei8i32", + [KERNEL_HWCAP_SME_F16F32] = "smef16f32", + [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", + [KERNEL_HWCAP_SME_F32F32] = "smef32f32", + [KERNEL_HWCAP_SME_FA64] = "smefa64", + [KERNEL_HWCAP_WFXT] = "wfxt", + [KERNEL_HWCAP_EBF16] = "ebf16", + [KERNEL_HWCAP_SVE_EBF16] = "sveebf16", }; #ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) static const char *const compat_hwcap_str[] = { - "swp", - "half", - "thumb", - "26bit", - "fastmult", - "fpa", - "vfp", - "edsp", - 
"java", - "iwmmxt", - "crunch", - "thumbee", - "neon", - "vfpv3", - "vfpv3d16", - "tls", - "vfpv4", - "idiva", - "idivt", - "vfpd32", - "lpae", - "evtstrm", - NULL + [COMPAT_KERNEL_HWCAP(SWP)] = "swp", + [COMPAT_KERNEL_HWCAP(HALF)] = "half", + [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb", + [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult", + [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(VFP)] = "vfp", + [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp", + [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(NEON)] = "neon", + [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3", + [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(TLS)] = "tls", + [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4", + [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva", + [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt", + [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae", + [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm", }; +#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x) static const char *const compat_hwcap2_str[] = { - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - NULL + [COMPAT_KERNEL_HWCAP2(AES)] = "aes", + [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull", + [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1", + [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2", + [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32", }; #endif /* CONFIG_COMPAT */ @@ -141,17 +187,26 @@ static int c_show(struct seq_file *m, void *v) seq_puts(m, "Features\t:"); if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; compat_hwcap_str[j]; j++) - if (compat_elf_hwcap & (1 << j)) + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. 
+ */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + seq_printf(m, " %s", compat_hwcap_str[j]); + } + } - for (j = 0; compat_hwcap2_str[j]; j++) + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) if (compat_elf_hwcap2 & (1 << j)) seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ } else { - for (j = 0; hwcap_str[j]; j++) - if (elf_hwcap & (1 << j)) + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) seq_printf(m, " %s", hwcap_str[j]); } seq_puts(m, "\n"); @@ -213,7 +268,7 @@ static struct kobj_type cpuregs_kobj_type = { struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \ \ if (info->reg_midr) \ - return sprintf(buf, "0x%016x\n", info->reg_##_field); \ + return sprintf(buf, "0x%016llx\n", info->reg_##_field); \ else \ return 0; \ } \ @@ -221,6 +276,7 @@ static struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, @@ -233,6 +289,16 @@ static const struct attribute_group cpuregs_attr_group = { .name = "identification" }; +static struct attribute *sme_cpuregs_id_attrs[] = { + &cpuregs_attr_smidr_el1.attr, + NULL +}; + +static const struct attribute_group sme_cpuregs_attr_group = { + .attrs = sme_cpuregs_id_attrs, + .name = "identification" +}; + static int cpuid_cpu_online(unsigned int cpu) { int rc; @@ -250,6 +316,8 @@ static int cpuid_cpu_online(unsigned int cpu) rc = sysfs_create_group(&info->kobj, &cpuregs_attr_group); if (rc) kobject_del(&info->kobj); + if (system_supports_sme()) + rc = sysfs_merge_group(&info->kobj, &sme_cpuregs_attr_group); out: return rc; } @@ -288,31 +356,67 @@ static int __init cpuinfo_regs_init(void) } return 0; } +device_initcall(cpuinfo_regs_init); + static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) { unsigned int cpu = smp_processor_id(); u32 l1ip = CTR_L1IP(info->reg_ctr); switch (l1ip) { - case ICACHE_POLICY_PIPT: + case CTR_EL0_L1Ip_PIPT: break; - case ICACHE_POLICY_VPIPT: + case CTR_EL0_L1Ip_VPIPT: set_bit(ICACHEF_VPIPT, &__icache_flags); break; + case CTR_EL0_L1Ip_VIPT: default: - /* Fallthrough */ - case ICACHE_POLICY_VIPT: /* Assume aliasing */ set_bit(ICACHEF_ALIASING, &__icache_flags); + break; } - pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu); + pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str(l1ip), cpu); +} + +static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info) +{ + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); } static void 
__cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); - info->reg_ctr = read_cpuid_cachetype(); + /* + * Use the effective value of the CTR_EL0 than the raw value + * exposed by the CPU. CTR_EL0.IDC field value must be interpreted + * with the CLIDR_EL1 fields to avoid triggering false warnings + * when there is a mismatch across the CPUs. Keep track of the + * effective value of the CTR_EL0 in our internal records for + * accurate sanity check and feature enablement. + */ + info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); @@ -321,32 +425,20 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); + info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); - /* Update the 32bit ID registers only if AArch32 is implemented */ - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); - } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + info->reg_gmid = read_cpuid(GMID_EL1); + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + __cpuinfo_store_cpu_32bit(&info->aarch32); cpuinfo_detect_icache_policy(info); } @@ -366,5 +458,3 @@ void __init cpuinfo_store_boot_cpu(void) boot_cpu_data = *info; init_cpu_features(&boot_cpu_data); } - -device_initcall(cpuinfo_regs_init); diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c new file mode 100644 index 000000000000..2b65aae332ce --- /dev/null +++ b/arch/arm64/kernel/crash_core.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Linaro. + * Copyright (C) Huawei Futurewei Technologies. 
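The cpuregs sysfs attributes touched above (midr_el1/revidr_el1 with the corrected 0x%016llx format, plus smidr_el1 merged in on SME systems) are read back from userspace as plain hex files. A small sketch, assuming the usual /sys/devices/system/cpu/cpuN/regs/identification layout:

#include <stdio.h>

int main(void)
{
        unsigned long long midr;
        FILE *f = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", "r");

        if (!f) {
                perror("midr_el1");
                return 1;
        }
        if (fscanf(f, "%llx", &midr) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);

        /* MIDR_EL1 layout: implementer in [31:24], part number in [15:4]. */
        printf("implementer %#llx, part %#llx\n",
               (midr >> 24) & 0xff, (midr >> 4) & 0xfff);
        return 0;
}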
+ */ + +#include <linux/crash_core.h> +#include <asm/cpufeature.h> +#include <asm/memory.h> +#include <asm/pgtable-hwdef.h> + +static inline u64 get_tcr_el1_t1sz(void); + +static inline u64 get_tcr_el1_t1sz(void) +{ + return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; +} + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_NUMBER(VA_BITS); + /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ + vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); + vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); + vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); + vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); + vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); + vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", + kimage_voffset); + vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", + PHYS_OFFSET); + vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n", + get_tcr_el1_t1sz()); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", + system_supports_address_auth() ? + ptrauth_kernel_pac_mask() : 0); +} diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index f46d57c31443..670e4ce81822 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -1,36 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Routines for doing kexec-based kdump * * Copyright (C) 2017 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/crash_dump.h> #include <linux/errno.h> #include <linux/io.h> -#include <linux/memblock.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/memory.h> -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. 
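arch_crash_save_vmcoreinfo() above exports these values as flat "NUMBER(name)=value" lines in the vmcoreinfo note, which is why the comment stresses that VMCOREINFO_NUMBER() prints decimal while the explicit strings use 0x%lx. A hedged consumer-side sketch (not part of the patch) of pulling one field back out of such a text blob:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Scan a vmcoreinfo text blob for "NUMBER(key)=value" and return the value. */
static long long vmcoreinfo_number(const char *buf, const char *key)
{
        char needle[64];
        const char *p;

        snprintf(needle, sizeof(needle), "NUMBER(%s)=", key);
        p = strstr(buf, needle);
        if (!p)
                return -1;
        return strtoll(p + strlen(needle), NULL, 0);    /* base 0 copes with 0x... */
}

int main(void)
{
        const char *sample =
                "NUMBER(VA_BITS)=48\n"
                "NUMBER(TCR_EL1_T1SZ)=0x10\n"
                "KERNELOFFSET=1a2b3c\n";

        printf("VA_BITS=%lld TCR_EL1_T1SZ=%lld\n",
               vmcoreinfo_number(sample, "VA_BITS"),
               vmcoreinfo_number(sample, "TCR_EL1_T1SZ"));
        return 0;
}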
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -41,14 +24,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); @@ -58,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, /** * elfcorehdr_read - read from ELF core header * @buf: buffer where the data is placed - * @csize: number of bytes to read + * @count: number of bytes to read * @ppos: address in the memory * * This function reads @count bytes from elf core header which exists @@ -67,5 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index c7ef99904934..3da09778267e 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARMv8 single-step debug support and mdscr context switching. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> */ @@ -30,14 +19,16 @@ #include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/system_misc.h> +#include <asm/traps.h> /* Determine debug architecture. */ u8 debug_monitors_arch(void) { return cpuid_feature_extract_unsigned_field(read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1), - ID_AA64DFR0_DEBUGVER_SHIFT); + ID_AA64DFR0_EL1_DebugVer_SHIFT); } /* @@ -46,9 +37,9 @@ u8 debug_monitors_arch(void) static void mdscr_write(u32 mdscr) { unsigned long flags; - local_dbg_save(flags); + flags = local_daif_save(); write_sysreg(mdscr, mdscr_el1); - local_dbg_restore(flags); + local_daif_restore(flags); } NOKPROBE_SYMBOL(mdscr_write); @@ -133,12 +124,13 @@ NOKPROBE_SYMBOL(disable_debug_monitors); */ static int clear_os_lock(unsigned int cpu) { + write_sysreg(0, osdlr_el1); write_sysreg(0, oslar_el1); isb(); return 0; } -static int debug_monitors_init(void) +static int __init debug_monitors_init(void) { return cpuhp_setup_state(CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, "arm64/debug_monitors:starting", @@ -149,58 +141,85 @@ postcore_initcall(debug_monitors_init); /* * Single step API and exception handling. 
*/ -static void set_regs_spsr_ss(struct pt_regs *regs) +static void set_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate |= DBG_SPSR_SS; } -NOKPROBE_SYMBOL(set_regs_spsr_ss); +NOKPROBE_SYMBOL(set_user_regs_spsr_ss); -static void clear_regs_spsr_ss(struct pt_regs *regs) +static void clear_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate &= ~DBG_SPSR_SS; } -NOKPROBE_SYMBOL(clear_regs_spsr_ss); +NOKPROBE_SYMBOL(clear_user_regs_spsr_ss); -/* EL1 Single Step Handler hooks */ -static LIST_HEAD(step_hook); -static DEFINE_SPINLOCK(step_hook_lock); +#define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs) +#define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs) -void register_step_hook(struct step_hook *hook) +static DEFINE_SPINLOCK(debug_hook_lock); +static LIST_HEAD(user_step_hook); +static LIST_HEAD(kernel_step_hook); + +static void register_debug_hook(struct list_head *node, struct list_head *list) { - spin_lock(&step_hook_lock); - list_add_rcu(&hook->node, &step_hook); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_add_rcu(node, list); + spin_unlock(&debug_hook_lock); + } -void unregister_step_hook(struct step_hook *hook) +static void unregister_debug_hook(struct list_head *node) { - spin_lock(&step_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_del_rcu(node); + spin_unlock(&debug_hook_lock); synchronize_rcu(); } +void register_user_step_hook(struct step_hook *hook) +{ + register_debug_hook(&hook->node, &user_step_hook); +} + +void unregister_user_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + +void register_kernel_step_hook(struct step_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_step_hook); +} + +void unregister_kernel_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + /* * Call registered single step handlers * There is no Syndrome info to check for determining the handler. * So we call all the registered handlers, until the right handler is * found which returns zero. */ -static int call_step_hook(struct pt_regs *regs, unsigned int esr) +static int call_step_hook(struct pt_regs *regs, unsigned long esr) { struct step_hook *hook; + struct list_head *list; int retval = DBG_HOOK_ERROR; - rcu_read_lock(); + list = user_mode(regs) ? &user_step_hook : &kernel_step_hook; - list_for_each_entry_rcu(hook, &step_hook, node) { + /* + * Since single-step exception disables interrupt, this function is + * entirely not preemptible, and we can use rcu list safely here. 
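With the hook list split into user and kernel flavours above, a kernel-side consumer registers against the new API along these lines (a sketch, not part of the patch; the handler claims only steps it armed itself and lets other hooks see the rest):

#include <linux/init.h>
#include <linux/printk.h>
#include <linux/types.h>
#include <asm/debug-monitors.h>
#include <asm/ptrace.h>

static bool my_step_armed;      /* set by whoever called kernel_enable_single_step() */

static int my_step_handler(struct pt_regs *regs, unsigned long esr)
{
        if (!my_step_armed)
                return DBG_HOOK_ERROR;          /* not ours: try the next hook */

        my_step_armed = false;
        pr_debug("stepped to pc=%llx\n", regs->pc);
        kernel_disable_single_step();
        return DBG_HOOK_HANDLED;
}

static struct step_hook my_step_hook = {
        .fn = my_step_handler,
};

static int __init my_step_init(void)
{
        register_kernel_step_hook(&my_step_hook);
        return 0;
}
late_initcall(my_step_init);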
+ */ + list_for_each_entry_rcu(hook, list, node) { retval = hook->fn(regs, esr); if (retval == DBG_HOOK_HANDLED) break; } - rcu_read_unlock(); - return retval; } NOKPROBE_SYMBOL(call_step_hook); @@ -208,12 +227,6 @@ NOKPROBE_SYMBOL(call_step_hook); static void send_user_sigtrap(int si_code) { struct pt_regs *regs = current_pt_regs(); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = si_code, - .si_addr = (void __user *)instruction_pointer(regs), - }; if (WARN_ON(!user_mode(regs))) return; @@ -221,10 +234,11 @@ static void send_user_sigtrap(int si_code) if (interrupts_enabled(regs)) local_irq_enable(); - force_sig_info(SIGTRAP, &info, current); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } -static int single_step_handler(unsigned long addr, unsigned int esr, +static int single_step_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { bool handler_found = false; @@ -236,10 +250,6 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; -#endif if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) handler_found = true; @@ -266,61 +276,61 @@ static int single_step_handler(unsigned long addr, unsigned int esr, } NOKPROBE_SYMBOL(single_step_handler); -/* - * Breakpoint handler is re-entrant as another breakpoint can - * hit within breakpoint handler, especically in kprobes. - * Use reader/writer locks instead of plain spinlock. - */ -static LIST_HEAD(break_hook); -static DEFINE_SPINLOCK(break_hook_lock); +static LIST_HEAD(user_break_hook); +static LIST_HEAD(kernel_break_hook); -void register_break_hook(struct break_hook *hook) +void register_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_add_rcu(&hook->node, &break_hook); - spin_unlock(&break_hook_lock); + register_debug_hook(&hook->node, &user_break_hook); } -void unregister_break_hook(struct break_hook *hook) +void unregister_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&break_hook_lock); - synchronize_rcu(); + unregister_debug_hook(&hook->node); } -static int call_break_hook(struct pt_regs *regs, unsigned int esr) +void register_kernel_break_hook(struct break_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_break_hook); +} + +void unregister_kernel_break_hook(struct break_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + +static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; - int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + struct list_head *list; + int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(hook, &break_hook, node) - if ((esr & hook->esr_mask) == hook->esr_val) + list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; + + /* + * Since brk exception disables interrupt, this function is + * entirely not preemptible, and we can use rcu list safely here. + */ + list_for_each_entry_rcu(hook, list, node) { + unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + + if ((comment & ~hook->mask) == hook->imm) fn = hook->fn; - rcu_read_unlock(); + } return fn ? 
fn(regs, esr) : DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); -static int brk_handler(unsigned long addr, unsigned int esr, +static int brk_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { - bool handler_found = false; - -#ifdef CONFIG_KPROBES - if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - } -#endif - if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; + if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return 0; - if (!handler_found && user_mode(regs)) { + if (user_mode(regs)) { send_user_sigtrap(TRAP_BRKPT); - } else if (!handler_found) { + } else { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } @@ -368,15 +378,13 @@ int aarch32_break_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(aarch32_break_handler); -static int __init debug_traps_init(void) +void __init debug_traps_init(void) { hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, TRAP_TRACE, "single-step handler"); hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "ptrace BRK handler"); - return 0; + TRAP_BRKPT, "BRK handler"); } -arch_initcall(debug_traps_init); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) @@ -385,17 +393,26 @@ void user_rewind_single_step(struct task_struct *task) * If single step is active for this thread, then set SPSR.SS * to 1 to avoid returning to the active-pending state. */ - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) set_regs_spsr_ss(task_pt_regs(task)); } NOKPROBE_SYMBOL(user_rewind_single_step); void user_fastforward_single_step(struct task_struct *task) { - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) clear_regs_spsr_ss(task_pt_regs(task)); } +void user_regs_reset_single_step(struct user_pt_regs *regs, + struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) + set_user_regs_spsr_ss(regs); + else + clear_user_regs_spsr_ss(regs); +} + /* Kernel API */ void kernel_enable_single_step(struct pt_regs *regs) { diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 4e6ad355bd05..61a87fa1c305 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -1,94 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * EFI entry point. * * Copyright (C) 2013, 2014 Red Hat, Inc. * Author: Mark Salter <msalter@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/linkage.h> #include <linux/init.h> #include <asm/assembler.h> -#define EFI_LOAD_ERROR 0x8000000000000001 - __INIT +SYM_CODE_START(efi_enter_kernel) /* - * We arrive here from the EFI boot manager with: - * - * * CPU in little-endian mode - * * MMU on with identity-mapped RAM - * * Icache and Dcache on - * - * We will most likely be running from some place other than where - * we want to be. The kernel image wants to be placed at TEXT_OFFSET - * from start of RAM. - */ -ENTRY(entry) - /* - * Create a stack frame to save FP/LR with extra space - * for image_addr variable passed to efi_entry(). 
+ * efi_pe_entry() will have copied the kernel image if necessary and we + * end up here with device tree address in x1 and the kernel entry + * point stored in x0. Save those values in registers which are + * callee preserved. */ - stp x29, x30, [sp, #-32]! - mov x29, sp + ldr w2, =primary_entry_offset + add x19, x0, x2 // relocated Image entrypoint + mov x20, x1 // DTB address /* - * Call efi_entry to do the real work. - * x0 and x1 are already set up by firmware. Current runtime - * address of image is calculated and passed via *image_addr. - * - * unsigned long efi_entry(void *handle, - * efi_system_table_t *sys_table, - * unsigned long *image_addr) ; - */ - adr_l x8, _text - add x2, sp, 16 - str x8, [x2] - bl efi_entry - cmn x0, #1 - b.eq efi_load_fail - - /* - * efi_entry() will have copied the kernel image if necessary and we - * return here with device tree address in x0 and the kernel entry - * point stored at *image_addr. Save those values in registers which - * are callee preserved. - */ - mov x20, x0 // DTB address - ldr x0, [sp, #16] // relocated _text address - ldr w21, =stext_offset - add x21, x0, x21 - - /* - * Calculate size of the kernel Image (same for original and copy). - */ - adr_l x1, _text - adr_l x2, _edata - sub x1, x2, x1 - - /* - * Flush the copied Image to the PoC, and ensure it is not shadowed by + * Clean the copied Image to the PoC, and ensure it is not shadowed by * stale icache entries from before relocation. */ - bl __flush_dcache_area + ldr w1, =kernel_size + add x1, x0, x1 + bl dcache_clean_poc ic ialluis /* - * Ensure that the rest of this function (in the original Image) is - * visible when the caches are disabled. The I-cache can't have stale - * entries for the VA range of the current image, so no maintenance is - * necessary. + * Clean the remainder of this routine to the PoC + * so that we can safely disable the MMU and caches. */ - adr x0, entry - adr x1, entry_end - sub x1, x1, x0 - bl __flush_dcache_area - + adr x0, 0f + adr x1, 3f + bl dcache_clean_poc +0: /* Turn off Dcache and MMU */ mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -96,6 +47,7 @@ ENTRY(entry) mrs x0, sctlr_el2 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el2, x0 isb b 2f @@ -103,6 +55,7 @@ ENTRY(entry) mrs x0, sctlr_el1 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el1, x0 isb 2: @@ -111,12 +64,6 @@ ENTRY(entry) mov x1, xzr mov x2, xzr mov x3, xzr - br x21 - -efi_load_fail: - mov x0, #EFI_LOAD_ERROR - ldp x29, x30, [sp], #32 - ret - -entry_end: -ENDPROC(entry) + br x19 +3: +SYM_CODE_END(efi_enter_kernel) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 613fc3000677..28d8a5dca5f1 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -1,54 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2013 - 2017 Linaro, Ltd. * Copyright (C) 2013, 2014 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/pe.h> #include <linux/sizes.h> + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. 
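The claim in this comment is easy to verify: the ccmp pseudo-NOP emitted just below assembles to 0xfa405a4d, and the first two bytes of its little-endian image are 0x4d 0x5a, i.e. ASCII "MZ". A throwaway host-side check (not part of the patch; assumes a little-endian host, like the Image itself):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned int insn = 0xfa405a4dU;        /* ccmp x18, #0, #0xd, pl */
        unsigned char bytes[sizeof(insn)];

        memcpy(bytes, &insn, sizeof(bytes));    /* in-memory instruction bytes */
        printf("%c%c %#x %#x\n", bytes[0], bytes[1], bytes[2], bytes[3]);
        return 0;                               /* prints: MZ 0x40 0xfa */
}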
+ */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . - .L_head .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase - .long SZ_4K // SectionAlignment + .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment .short 0 // MajorOperatingSystemVersion .short 0 // MinorOperatingSystemVersion - .short 0 // MajorImageVersion - .short 0 // MinorImageVersion + .short LINUX_EFISTUB_MAJOR_VERSION // MajorImageVersion + .short LINUX_EFISTUB_MINOR_VERSION // MinorImageVersion .short 0 // MajorSubsystemVersion .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -57,7 +72,7 @@ extra_header_fields: .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) 
/ 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -67,17 +82,17 @@ extra_header_fields: .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -89,9 +104,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -101,7 +116,7 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI /* @@ -117,21 +132,21 @@ section_table: __INITRODATA .align 2 -efi_debug_table: +.Lefi_debug_table: // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY .long 0 // Characteristics .long 0 // TimeDateStamp .short 0 // MajorVersion .short 0 // MinorVersion .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData + .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long efi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - .L_head // FileOffset - .set efi_debug_table_size, . - efi_debug_table + .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -140,16 +155,12 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry #endif - /* - * EFI will load .text onwards at the 4k section alignment - * described in the PE/COFF header. To ensure that instruction - * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that .text is - * placed at a 4k boundary in the Image to begin with. - */ - .align 12 -efi_header_end: + .balign SEGMENT_ALIGN +.Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S new file mode 100644 index 000000000000..67babd5f04c2 --- /dev/null +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__efi_rt_asm_wrapper) + stp x29, x30, [sp, #-112]! + mov x29, sp + + /* + * Register x18 is designated as the 'platform' register by the AAPCS, + * which means firmware running at the same exception level as the OS + * (such as UEFI) should never touch it. 
+ */ + stp x1, x18, [sp, #16] + + /* + * Preserve all callee saved registers and record the stack pointer + * value in a per-CPU variable so we can recover from synchronous + * exceptions occurring while running the firmware routines. + */ + stp x19, x20, [sp, #32] + stp x21, x22, [sp, #48] + stp x23, x24, [sp, #64] + stp x25, x26, [sp, #80] + stp x27, x28, [sp, #96] + + adr_this_cpu x8, __efi_rt_asm_recover_sp, x9 + str x29, [x8] + + /* + * We are lucky enough that no EFI runtime services take more than + * 5 arguments, so all are passed in registers rather than via the + * stack. + */ + mov x8, x0 + mov x0, x2 + mov x1, x3 + mov x2, x4 + mov x3, x5 + mov x4, x6 + blr x8 + + ldp x1, x2, [sp, #16] + cmp x2, x18 + ldp x29, x30, [sp], #112 + b.ne 0f + ret +0: + /* + * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a + * shadow stack pointer, which we need to restore before returning to + * potentially instrumented code. This is safe because the wrapper is + * called with preemption disabled and a separate shadow stack is used + * for interrupts. + */ + mov x18, x2 + b efi_handle_corrupted_x18 // tail call +SYM_FUNC_END(__efi_rt_asm_wrapper) + +SYM_FUNC_START(__efi_rt_asm_recover) + ldr_this_cpu x8, __efi_rt_asm_recover_sp, x9 + mov sp, x8 + + ldp x0, x18, [sp, #16] + ldp x19, x20, [sp, #32] + ldp x21, x22, [sp, #48] + ldp x23, x24, [sp, #64] + ldp x25, x26, [sp, #80] + ldp x27, x28, [sp, #96] + ldp x29, x30, [sp], #112 + + b efi_handle_runtime_exception +SYM_FUNC_END(__efi_rt_asm_recover) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 82cd07592519..ee53f2a0aa03 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -1,21 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Extensible Firmware Interface * * Based on Extensible Firmware Interface Specification version 2.4 * * Copyright (C) 2013, 2014 Linaro Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/efi.h> #include <linux/init.h> +#include <linux/percpu.h> #include <asm/efi.h> +static bool region_is_misaligned(const efi_memory_desc_t *md) +{ + if (PAGE_SIZE == EFI_PAGE_SIZE) + return false; + return !PAGE_ALIGNED(md->phys_addr) || + !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT); +} + /* * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be * executable, everything else can be mapped with the XN bits @@ -29,14 +34,22 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) if (type == EFI_MEMORY_MAPPED_IO) return PROT_DEVICE_nGnRE; - if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), - "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + if (region_is_misaligned(md)) { + static bool __initdata code_is_misaligned; + /* - * If the region is not aligned to the page size of the OS, we - * can not use strict permissions, since that would also affect - * the mapping attributes of the adjacent regions. + * Regions that are not aligned to the OS page size cannot be + * mapped with strict permissions, as those might interfere + * with the permissions that are needed by the adjacent + * region's mapping. However, if we haven't encountered any + * misaligned runtime code regions so far, we can safely use + * non-executable permissions for non-code regions. */ - return pgprot_val(PAGE_KERNEL_EXEC); + code_is_misaligned |= (type == EFI_RUNTIME_SERVICES_CODE); + + return code_is_misaligned ? 
pgprot_val(PAGE_KERNEL_EXEC) + : pgprot_val(PAGE_KERNEL); + } /* R-- */ if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == @@ -48,7 +61,9 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) return pgprot_val(PAGE_KERNEL_ROX); /* RW- */ - if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + if (((attr & (EFI_MEMORY_RP | EFI_MEMORY_WP | EFI_MEMORY_XP)) == + EFI_MEMORY_XP) || + type != EFI_RUNTIME_SERVICES_CODE) return pgprot_val(PAGE_KERNEL); /* RWX */ @@ -56,7 +71,8 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) } /* we will fill this structure from the stub, so don't put it in .bss */ -struct screen_info screen_info __section(.data); +struct screen_info screen_info __section(".data"); +EXPORT_SYMBOL(screen_info); int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { @@ -64,19 +80,16 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); - if (!PAGE_ALIGNED(md->phys_addr) || - !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) { - /* - * If the end address of this region is not aligned to page - * size, the mapping is rounded up, and may end up sharing a - * page frame with the next UEFI memory region. If we create - * a block entry now, we may need to split it again when mapping - * the next region, and support for that is going to be removed - * from the MMU routines. So avoid block mappings altogether in - * that case. - */ + /* + * If this region is not aligned to the page size used by the OS, the + * mapping will be rounded outwards, and may end up sharing a page + * frame with an adjacent runtime memory region. Given that the page + * table descriptor covering the shared page will be rewritten when the + * adjacent region gets mapped, we must avoid block mappings here so we + * don't have to worry about splitting them when that happens. + */ + if (region_is_misaligned(md)) page_mappings_only = true; - } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, @@ -84,11 +97,10 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) return 0; } -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; - pte_t pte = *ptep; + pte_t pte = READ_ONCE(*ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -104,6 +116,9 @@ int __init efi_set_mapping_permissions(struct mm_struct *mm, BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && md->type != EFI_RUNTIME_SERVICES_DATA); + if (region_is_misaligned(md)) + return 0; + /* * Calling apply_to_page_range() is only safe on regions that are * guaranteed to be mapped down to pages. 
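create_mapping_protection() above reduces to a small decision table over the region type and the EFI_MEMORY_RO/WP/RP/XP attribute bits; note that the new RW- check deliberately ignores XP when RP or WP is also set. A standalone restatement of that table (not part of the patch; attribute values as used by include/linux/efi.h):

#include <stdio.h>

#define EFI_MEMORY_WP   0x0000000000001000ULL
#define EFI_MEMORY_RP   0x0000000000002000ULL
#define EFI_MEMORY_XP   0x0000000000004000ULL
#define EFI_MEMORY_RO   0x0000000000020000ULL

/* Mirrors the order of checks in create_mapping_protection(). */
static const char *efi_mapping_perms(unsigned long long attr, int is_code)
{
        if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) ==
            (EFI_MEMORY_XP | EFI_MEMORY_RO))
                return "R--";
        if (attr & EFI_MEMORY_RO)
                return "R-X";
        if (((attr & (EFI_MEMORY_RP | EFI_MEMORY_WP | EFI_MEMORY_XP)) ==
             EFI_MEMORY_XP) || !is_code)
                return "RW-";
        return "RWX";
}

int main(void)
{
        printf("code, XP only       -> %s\n", efi_mapping_perms(EFI_MEMORY_XP, 1));
        printf("code, RO only       -> %s\n", efi_mapping_perms(EFI_MEMORY_RO, 1));
        printf("data, no attributes -> %s\n", efi_mapping_perms(0, 0));
        printf("code, no attributes -> %s\n", efi_mapping_perms(0, 1));
        return 0;
}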
Since we are only called @@ -124,3 +139,34 @@ bool efi_poweroff_required(void) { return efi_enabled(EFI_RUNTIME_SERVICES); } + +asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) +{ + pr_err_ratelimited(FW_BUG "register x18 corrupted by EFI %s\n", f); + return s; +} + +asmlinkage DEFINE_PER_CPU(u64, __efi_rt_asm_recover_sp); + +asmlinkage efi_status_t __efi_rt_asm_recover(void); + +asmlinkage efi_status_t efi_handle_runtime_exception(const char *f) +{ + pr_err(FW_BUG "Synchronous exception occurred in EFI runtime service %s()\n", f); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return EFI_ABORTED; +} + +bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg) +{ + /* Check whether the exception occurred while running the firmware */ + if (current_work() != &efi_rts_work.work || regs->pc >= TASK_SIZE_64) + return false; + + pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + dump_stack(); + + regs->pc = (u64)__efi_rt_asm_recover; + return true; +} diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c new file mode 100644 index 000000000000..27ef7ad3ffd2 --- /dev/null +++ b/arch/arm64/kernel/elfcore.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/coredump.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/cpufeature.h> +#include <asm/mte.h> + +#define for_each_mte_vma(vmi, vma) \ + if (system_supports_mte()) \ + for_each_vma(vmi, vma) \ + if (vma->vm_flags & VM_MTE) + +static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + return vma_pages(vma) * MTE_PAGE_TAG_STORAGE; +} + +/* Derived from dump_user_range(); start/end must be page-aligned */ +static int mte_dump_tag_range(struct coredump_params *cprm, + unsigned long start, unsigned long end) +{ + int ret = 1; + unsigned long addr; + void *tags = NULL; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + + /* + * get_dump_page() returns NULL when encountering an empty + * page table entry that would otherwise have been filled with + * the zero page. Skip the equivalent tag dump which would + * have been all zeros. + */ + if (!page) { + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + /* + * Pages mapped in user space as !pte_access_permitted() (e.g. + * PROT_EXEC only) may not have the PG_mte_tagged flag set. 
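mte_vma_tag_dump_size() above sizes the PT_AARCH64_MEMTAG_MTE payload as MTE_PAGE_TAG_STORAGE bytes per page. A back-of-the-envelope sketch of where that number comes from (not part of the patch; assumes 4K pages and the architectural 16-byte MTE granule with 4-bit tags):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long granule = 16;                             /* MTE_GRANULE_SIZE */
        unsigned long tags_per_page = page_size / granule;
        unsigned long bytes_per_page = tags_per_page / 2;       /* two 4-bit tags per byte */

        printf("tag storage per page: %lu bytes\n", bytes_per_page);    /* 128 */
        return 0;
}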
+ */ + if (!test_bit(PG_mte_tagged, &page->flags)) { + put_page(page); + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + if (!tags) { + tags = mte_allocate_tag_storage(); + if (!tags) { + put_page(page); + ret = 0; + break; + } + } + + mte_save_page_tags(page_address(page), tags); + put_page(page); + if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) { + mte_free_tag_storage(tags); + ret = 0; + break; + } + } + + if (tags) + mte_free_tag_storage(tags); + + return ret; +} + +Elf_Half elf_core_extra_phdrs(void) +{ + struct vm_area_struct *vma; + int vma_count = 0; + VMA_ITERATOR(vmi, current->mm, 0); + + for_each_mte_vma(vmi, vma) + vma_count++; + + return vma_count; +} + +int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); + + for_each_mte_vma(vmi, vma) { + struct elf_phdr phdr; + + phdr.p_type = PT_AARCH64_MEMTAG_MTE; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = mte_vma_tag_dump_size(vma); + phdr.p_memsz = vma->vm_end - vma->vm_start; + offset += phdr.p_filesz; + phdr.p_flags = 0; + phdr.p_align = 0; + + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + return 0; + } + + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + struct vm_area_struct *vma; + size_t data_size = 0; + VMA_ITERATOR(vmi, current->mm, 0); + + for_each_mte_vma(vmi, vma) + data_size += mte_vma_tag_dump_size(vma); + + return data_size; +} + +int elf_core_write_extra_data(struct coredump_params *cprm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); + + for_each_mte_vma(vmi, vma) { + if (vma->vm_flags & VM_DONTDUMP) + continue; + + if (!mte_dump_tag_range(cprm, vma->vm_start, vma->vm_end)) + return 0; + } + + return 1; +} diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c new file mode 100644 index 000000000000..27369fa1c032 --- /dev/null +++ b/arch/arm64/kernel/entry-common.c @@ -0,0 +1,887 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Exception handling code + * + * Copyright (C) 2019 ARM Ltd. + */ + +#include <linux/context_tracking.h> +#include <linux/kasan.h> +#include <linux/linkage.h> +#include <linux/lockdep.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/thread_info.h> + +#include <asm/cpufeature.h> +#include <asm/daifflags.h> +#include <asm/esr.h> +#include <asm/exception.h> +#include <asm/irq_regs.h> +#include <asm/kprobes.h> +#include <asm/mmu.h> +#include <asm/processor.h> +#include <asm/sdei.h> +#include <asm/stacktrace.h> +#include <asm/sysreg.h> +#include <asm/system_misc.h> + +/* + * Handle IRQ/context state management when entering from kernel mode. + * Before this function is called it is not safe to call regular kernel code, + * intrumentable code, or any code which may trigger an exception. + * + * This is intended to match the logic in irqentry_enter(), handling the kernel + * mode transitions only. 
+ */ +static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +{ + regs->exit_rcu = false; + + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + lockdep_hardirqs_off(CALLER_ADDR0); + ct_irq_enter(); + trace_hardirqs_off_finish(); + + regs->exit_rcu = true; + return; + } + + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); +} + +static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +{ + __enter_from_kernel_mode(regs); + mte_check_tfsr_entry(); + mte_disable_tco_entry(current); +} + +/* + * Handle IRQ/context state management when exiting to kernel mode. + * After this function returns it is not safe to call regular kernel code, + * intrumentable code, or any code which may trigger an exception. + * + * This is intended to match the logic in irqentry_exit(), handling the kernel + * mode transitions only, and with preemption handled elsewhere. + */ +static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (interrupts_enabled(regs)) { + if (regs->exit_rcu) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + ct_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + trace_hardirqs_on(); + } else { + if (regs->exit_rcu) + ct_irq_exit(); + } +} + +static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +{ + mte_check_tfsr_exit(); + __exit_to_kernel_mode(regs); +} + +/* + * Handle IRQ/context state management when entering from user mode. + * Before this function is called it is not safe to call regular kernel code, + * intrumentable code, or any code which may trigger an exception. + */ +static __always_inline void __enter_from_user_mode(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + trace_hardirqs_off_finish(); + mte_disable_tco_entry(current); +} + +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(); +} + +/* + * Handle IRQ/context state management when exiting to user mode. + * After this function returns it is not safe to call regular kernel code, + * intrumentable code, or any code which may trigger an exception. + */ +static __always_inline void __exit_to_user_mode(void) +{ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + user_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +static __always_inline void prepare_exit_to_user_mode(struct pt_regs *regs) +{ + unsigned long flags; + + local_daif_mask(); + + flags = read_thread_flags(); + if (unlikely(flags & _TIF_WORK_MASK)) + do_notify_resume(regs, flags); +} + +static __always_inline void exit_to_user_mode(struct pt_regs *regs) +{ + prepare_exit_to_user_mode(regs); + mte_check_tfsr_exit(); + __exit_to_user_mode(); +} + +asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) +{ + exit_to_user_mode(regs); +} + +/* + * Handle IRQ/context state management when entering an NMI from user/kernel + * mode. Before this function is called it is not safe to call regular kernel + * code, intrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_enter_nmi(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + ct_nmi_enter(); + + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); +} + +/* + * Handle IRQ/context state management when exiting an NMI from user/kernel + * mode. 
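For orientation, the EL1 fault handlers further down this file (beyond this excerpt) pair the enter/exit helpers above in a fixed shape; a condensed sketch, hedged in that the real handlers add per-vector details:

static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
{
        unsigned long far = read_sysreg(far_el1);

        enter_from_kernel_mode(regs);   /* lockdep/RCU/tracing accounting on entry */
        local_daif_inherit(regs);       /* re-enable whatever the interrupted context had */
        do_mem_abort(far, esr, regs);
        local_daif_mask();
        exit_to_kernel_mode(regs);      /* mirror image of the entry accounting */
}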
After this function returns it is not safe to call regular kernel + * code, intrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_exit_nmi(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + } + + ct_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + +/* + * Handle IRQ/context state management when entering a debug exception from + * kernel mode. Before this function is called it is not safe to call regular + * kernel code, intrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + lockdep_hardirqs_off(CALLER_ADDR0); + ct_nmi_enter(); + + trace_hardirqs_off_finish(); +} + +/* + * Handle IRQ/context state management when exiting a debug exception from + * kernel mode. After this function returns it is not safe to call regular + * kernel code, intrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + } + + ct_nmi_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +#define need_irq_preemption() \ + (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) +#else +#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) +#endif + +static void __sched arm64_preempt_schedule_irq(void) +{ + if (!need_irq_preemption()) + return; + + /* + * Note: thread_info::preempt_count includes both thread_info::count + * and thread_info::need_resched, and is not equivalent to + * preempt_count(). + */ + if (READ_ONCE(current_thread_info()->preempt_count) != 0) + return; + + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. 
+ */ + if (system_capabilities_finalized()) + preempt_schedule_irq(); +} + +static void do_interrupt_handler(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + if (on_thread_stack()) + call_on_irq_stack(regs, handler); + else + handler(regs); + + set_irq_regs(old_regs); +} + +extern void (*handle_arch_irq)(struct pt_regs *); +extern void (*handle_arch_fiq)(struct pt_regs *); + +static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, + unsigned long esr) +{ + arm64_enter_nmi(regs); + + console_verbose(); + + pr_crit("Unhandled %s exception on CPU%d, ESR 0x%016lx -- %s\n", + vector, smp_processor_id(), esr, + esr_get_class_string(esr)); + + __show_regs(regs); + panic("Unhandled exception"); +} + +#define UNHANDLED(el, regsize, vector) \ +asmlinkage void noinstr el##_##regsize##_##vector##_handler(struct pt_regs *regs) \ +{ \ + const char *desc = #regsize "-bit " #el " " #vector; \ + __panic_unhandled(regs, desc, read_sysreg(esr_el1)); \ +} + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); + +static void cortex_a76_erratum_1463225_svc_handler(void) +{ + u32 reg, val; + + if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) + return; + + if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) + return; + + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); + reg = read_sysreg(mdscr_el1); + val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; + write_sysreg(val, mdscr_el1); + asm volatile("msr daifclr, #8"); + isb(); + + /* We will have taken a single-step exception by this point */ + + write_sysreg(reg, mdscr_el1); + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); +} + +static __always_inline bool +cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) + return false; + + /* + * We've taken a dummy step exception from the kernel to ensure + * that interrupts are re-enabled on the syscall path. Return back + * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions + * masked so that we can safely restore the mdscr and get on with + * handling the syscall. 
+ */ + regs->pstate |= PSR_D_BIT; + return true; +} +#else /* CONFIG_ARM64_ERRATUM_1463225 */ +static void cortex_a76_erratum_1463225_svc_handler(void) { } +static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_ARM64_ERRATUM_1463225 */ + +UNHANDLED(el1t, 64, sync) +UNHANDLED(el1t, 64, irq) +UNHANDLED(el1t, 64, fiq) +UNHANDLED(el1t, 64, error) + +static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_mem_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_sp_pc_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_undefinstr(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_bti(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + arm64_enter_el1_dbg(regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) + do_debug_exception(far, esr, regs); + arm64_exit_el1_dbg(regs); +} + +static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_fpac(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_DABT_CUR: + case ESR_ELx_EC_IABT_CUR: + el1_abort(regs, esr); + break; + /* + * We don't handle ESR_ELx_EC_SP_ALIGN, since we will have hit a + * recursive exception when trying to push the initial pt_regs. 
+ */ + case ESR_ELx_EC_PC_ALIGN: + el1_pc(regs, esr); + break; + case ESR_ELx_EC_SYS64: + case ESR_ELx_EC_UNKNOWN: + el1_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el1_bti(regs, esr); + break; + case ESR_ELx_EC_BREAKPT_CUR: + case ESR_ELx_EC_SOFTSTP_CUR: + case ESR_ELx_EC_WATCHPT_CUR: + case ESR_ELx_EC_BRK64: + el1_dbg(regs, esr); + break; + case ESR_ELx_EC_FPAC: + el1_fpac(regs, esr); + break; + default: + __panic_unhandled(regs, "64-bit el1h sync", esr); + } +} + +static __always_inline void __el1_pnmi(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + arm64_enter_nmi(regs); + do_interrupt_handler(regs, handler); + arm64_exit_nmi(regs); +} + +static __always_inline void __el1_irq(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + enter_from_kernel_mode(regs); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + arm64_preempt_schedule_irq(); + + exit_to_kernel_mode(regs); +} +static void noinstr el1_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + __el1_pnmi(regs, handler); + else + __el1_irq(regs, handler); +} + +asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); +} + +static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_mem_abort(far, esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + /* + * We've taken an instruction abort from userspace and not yet + * re-enabled IRQs. If the address is a kernel address, apply + * BP hardening prior to enabling IRQs and pre-emption. 
+ */ + if (!is_ttbr0_addr(far)) + arm64_apply_bp_hardening(); + + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_mem_abort(far, esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_fpsimd_acc(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sve_acc(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sme_acc(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_fpsimd_exc(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sysinstr(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + if (!is_ttbr0_addr(instruction_pointer(regs))) + arm64_apply_bp_hardening(); + + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sp_pc_abort(far, esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sp_pc_abort(regs->sp, esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_undefinstr(regs, esr); + exit_to_user_mode(regs); +} + +static void noinstr el0_bti(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_bti(regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + bad_el0_sync(regs, 0, esr); + exit_to_user_mode(regs); +} + +static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) +{ + /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ + unsigned long far = read_sysreg(far_el1); + + enter_from_user_mode(regs); + do_debug_exception(far, esr, regs); + local_daif_restore(DAIF_PROCCTX); + exit_to_user_mode(regs); +} + +static void noinstr el0_svc(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + do_el0_svc(regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_fpac(regs, esr); + exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_SVC64: + el0_svc(regs); + break; + case ESR_ELx_EC_DABT_LOW: + el0_da(regs, esr); + break; + case ESR_ELx_EC_IABT_LOW: + el0_ia(regs, esr); + break; + case ESR_ELx_EC_FP_ASIMD: + el0_fpsimd_acc(regs, esr); + break; + case ESR_ELx_EC_SVE: + el0_sve_acc(regs, esr); + break; + case ESR_ELx_EC_SME: + el0_sme_acc(regs, esr); + break; + case ESR_ELx_EC_FP_EXC64: + 
el0_fpsimd_exc(regs, esr); + break; + case ESR_ELx_EC_SYS64: + case ESR_ELx_EC_WFx: + el0_sys(regs, esr); + break; + case ESR_ELx_EC_SP_ALIGN: + el0_sp(regs, esr); + break; + case ESR_ELx_EC_PC_ALIGN: + el0_pc(regs, esr); + break; + case ESR_ELx_EC_UNKNOWN: + el0_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; + case ESR_ELx_EC_BREAKPT_LOW: + case ESR_ELx_EC_SOFTSTP_LOW: + case ESR_ELx_EC_WATCHPT_LOW: + case ESR_ELx_EC_BRK64: + el0_dbg(regs, esr); + break; + case ESR_ELx_EC_FPAC: + el0_fpac(regs, esr); + break; + default: + el0_inv(regs, esr); + } +} + +static void noinstr el0_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + enter_from_user_mode(regs); + + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (regs->pc & BIT(55)) + arm64_apply_bp_hardening(); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + exit_to_user_mode(regs); +} + +static void noinstr __el0_irq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el0t_64_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +static void noinstr __el0_fiq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +static void noinstr __el0_error_handler_common(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + enter_from_user_mode(regs); + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); + local_daif_restore(DAIF_PROCCTX); + exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} + +#ifdef CONFIG_COMPAT +static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_cp15instr(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_svc_compat(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + do_el0_svc_compat(regs); + exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_SVC32: + el0_svc_compat(regs); + break; + case ESR_ELx_EC_DABT_LOW: + el0_da(regs, esr); + break; + case ESR_ELx_EC_IABT_LOW: + el0_ia(regs, esr); + break; + case ESR_ELx_EC_FP_ASIMD: + el0_fpsimd_acc(regs, esr); + break; + case ESR_ELx_EC_FP_EXC32: + el0_fpsimd_exc(regs, esr); + break; + case ESR_ELx_EC_PC_ALIGN: + el0_pc(regs, esr); + break; + case ESR_ELx_EC_UNKNOWN: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_CP14_64: + el0_undef(regs, esr); + break; + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + el0_cp15(regs, esr); + break; + case ESR_ELx_EC_BREAKPT_LOW: + case ESR_ELx_EC_SOFTSTP_LOW: + case ESR_ELx_EC_WATCHPT_LOW: + case ESR_ELx_EC_BKPT32: + el0_dbg(regs, esr); + break; + default: + el0_inv(regs, esr); + } +} + +asmlinkage void noinstr el0t_32_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} +#else /* CONFIG_COMPAT */ +UNHANDLED(el0t, 32, sync) 
+UNHANDLED(el0t, 32, irq) +UNHANDLED(el0t, 32, fiq) +UNHANDLED(el0t, 32, error) +#endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_VMAP_STACK +asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + arm64_enter_nmi(regs); + panic_bad_stack(regs, esr, far); +} +#endif /* CONFIG_VMAP_STACK */ + +#ifdef CONFIG_ARM_SDE_INTERFACE +asmlinkage noinstr unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + unsigned long ret; + + /* + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. + * + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. + */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); + + arm64_enter_nmi(regs); + ret = do_sdei_event(regs, arg); + arm64_exit_nmi(regs); + + return ret; +} +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..229436f33df5 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * FP/SIMD state saving and restoring * * Copyright (C) 2012 ARM Ltd. * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/linkage.h> @@ -27,17 +16,109 @@ * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_save_state) +SYM_FUNC_START(fpsimd_save_state) fpsimd_save x0, 8 ret -ENDPROC(fpsimd_save_state) +SYM_FUNC_END(fpsimd_save_state) /* * Load the FP registers. 
* * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_load_state) +SYM_FUNC_START(fpsimd_load_state) fpsimd_restore x0, 8 ret -ENDPROC(fpsimd_load_state) +SYM_FUNC_END(fpsimd_load_state) + +#ifdef CONFIG_ARM64_SVE + +/* + * Save the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Save FFR if non-zero + */ +SYM_FUNC_START(sve_save_state) + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(sve_save_state) + +/* + * Load the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Restore FFR if non-zero + */ +SYM_FUNC_START(sve_load_state) + sve_load 0, x1, x2, 4 + ret +SYM_FUNC_END(sve_load_state) + +SYM_FUNC_START(sve_get_vl) + _sve_rdvl 0, 1 + ret +SYM_FUNC_END(sve_get_vl) + +SYM_FUNC_START(sve_set_vq) + sve_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sve_set_vq) + +/* + * Zero all SVE registers but the first 128-bits of each vector + * + * VQ must already be configured by caller, any further updates of VQ + * will need to ensure that the register state remains valid. + * + * x0 = include FFR? + * x1 = VQ - 1 + */ +SYM_FUNC_START(sve_flush_live) + cbz x1, 1f // A VQ-1 of 0 is 128 bits so no extra Z state + sve_flush_z +1: sve_flush_p + tbz x0, #0, 2f + sve_flush_ffr +2: ret +SYM_FUNC_END(sve_flush_live) + +#endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +SYM_FUNC_START(sme_get_vl) + _sme_rdsvl 0, 1 + ret +SYM_FUNC_END(sme_get_vl) + +SYM_FUNC_START(sme_set_vq) + sme_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sme_set_vq) + +/* + * Save the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_save_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_save_za 0, x1, 12 + ret +SYM_FUNC_END(za_save_state) + +/* + * Load the SME state + * + * x0 - pointer to buffer for state + */ +SYM_FUNC_START(za_load_state) + _sme_rdsvl 1, 1 // x1 = VL/8 + sme_load_za 0, x1, 12 + ret +SYM_FUNC_END(za_load_state) + +#endif /* CONFIG_ARM64_SME */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index e1be42e11ff5..795344ab4ec4 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -1,19 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/entry-ftrace.S * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/ftrace.h> #include <asm/insn.h> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +/* + * Due to -fpatchable-function-entry=2, the compiler has placed two NOPs before + * the regular function prologue. For an enabled callsite, ftrace_init_nop() and + * ftrace_make_call() have patched those NOPs to: + * + * MOV X9, LR + * BL <entry> + * + * ... where <entry> is either ftrace_caller or ftrace_regs_caller. + * + * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are + * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to + * clobber. + * + * We save the callsite's context into a pt_regs before invoking any ftrace + * callbacks. So that we can get a sensible backtrace, we create a stack record + * for the callsite and the ftrace entry assembly. 
This is not sufficient for + * reliable stacktrace: until we create the callsite stack record, its caller + * is missing from the LR and existing chain of frame records. + */ + .macro ftrace_regs_entry, allregs=0 + /* Make room for pt_regs, plus a callee frame */ + sub sp, sp, #(PT_REGS_SIZE + 16) + + /* Save function arguments (and x9 for simplicity) */ + stp x0, x1, [sp, #S_X0] + stp x2, x3, [sp, #S_X2] + stp x4, x5, [sp, #S_X4] + stp x6, x7, [sp, #S_X6] + stp x8, x9, [sp, #S_X8] + + /* Optionally save the callee-saved registers, always save the FP */ + .if \allregs == 1 + stp x10, x11, [sp, #S_X10] + stp x12, x13, [sp, #S_X12] + stp x14, x15, [sp, #S_X14] + stp x16, x17, [sp, #S_X16] + stp x18, x19, [sp, #S_X18] + stp x20, x21, [sp, #S_X20] + stp x22, x23, [sp, #S_X22] + stp x24, x25, [sp, #S_X24] + stp x26, x27, [sp, #S_X26] + stp x28, x29, [sp, #S_X28] + .else + str x29, [sp, #S_FP] + .endif + + /* Save the callsite's SP and LR */ + add x10, sp, #(PT_REGS_SIZE + 16) + stp x9, x10, [sp, #S_LR] + + /* Save the PC after the ftrace callsite */ + str x30, [sp, #S_PC] + + /* Create a frame record for the callsite above pt_regs */ + stp x29, x9, [sp, #PT_REGS_SIZE] + add x29, sp, #PT_REGS_SIZE + + /* Create our frame record within pt_regs. */ + stp x29, x30, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + .endm + +SYM_CODE_START(ftrace_regs_caller) + bti c + ftrace_regs_entry 1 + b ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + bti c + ftrace_regs_entry 0 + b ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) + sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) + mov x1, x9 // parent_ip (callsite's LR) + ldr_l x2, function_trace_op // op + mov x3, sp // regs + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + bl ftrace_stub + +/* + * At the callsite x0-x8 and x19-x30 were live. Any C code will have preserved + * x19-x29 per the AAPCS, and we created frame records upon entry, so we need + * to restore x0-x8, x29, and x30. 
+ */ + /* Restore function arguments */ + ldp x0, x1, [sp] + ldp x2, x3, [sp, #S_X2] + ldp x4, x5, [sp, #S_X4] + ldp x6, x7, [sp, #S_X6] + ldr x8, [sp, #S_X8] + + /* Restore the callsite's FP, LR, PC */ + ldr x29, [sp, #S_FP] + ldr x30, [sp, #S_LR] + ldr x9, [sp, #S_PC] + + /* Restore the callsite's SP */ + add sp, sp, #PT_REGS_SIZE + 16 + + ret x9 +SYM_CODE_END(ftrace_common) + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + /* * Gcc with -pg will put the following code in the beginning of each function: * mov x0, x30 @@ -79,7 +188,6 @@ .macro mcount_get_lr reg ldr \reg, [x29] ldr \reg, [\reg, #8] - mcount_adjust_addr \reg, \reg .endm .macro mcount_get_lr_addr reg @@ -96,7 +204,7 @@ * - tracer function to probe instrumented function's entry, * - ftrace_graph_caller to set up an exit hook */ -ENTRY(_mcount) +SYM_FUNC_START(_mcount) mcount_enter ldr_l x2, ftrace_trace_function @@ -108,13 +216,8 @@ ENTRY(_mcount) mcount_get_lr x1 // function's lr (= parent's pc) blr x2 // (*ftrace_trace_function)(pc, lr); -#ifndef CONFIG_FUNCTION_GRAPH_TRACER -skip_ftrace_call: // return; - mcount_exit // } -#else - mcount_exit // return; - // } -skip_ftrace_call: +skip_ftrace_call: // } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER ldr_l x2, ftrace_graph_return cmp x0, x2 // if ((ftrace_graph_return b.ne ftrace_graph_caller // != ftrace_stub) @@ -123,10 +226,11 @@ skip_ftrace_call: adr_l x0, ftrace_graph_entry_stub // != ftrace_graph_entry_stub)) cmp x0, x2 b.ne ftrace_graph_caller // ftrace_graph_caller(); - - mcount_exit #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -ENDPROC(_mcount) + mcount_exit +SYM_FUNC_END(_mcount) +EXPORT_SYMBOL(_mcount) +NOKPROBE(_mcount) #else /* CONFIG_DYNAMIC_FTRACE */ /* @@ -135,9 +239,11 @@ ENDPROC(_mcount) * and later on, NOP to branch to ftrace_caller() when enabled or branch to * NOP when disabled per-function base. */ -ENTRY(_mcount) +SYM_FUNC_START(_mcount) ret -ENDPROC(_mcount) +SYM_FUNC_END(_mcount) +EXPORT_SYMBOL(_mcount) +NOKPROBE(_mcount) /* * void ftrace_caller(unsigned long return_address) @@ -148,51 +254,27 @@ ENDPROC(_mcount) * - tracer function to probe instrumented function's entry, * - ftrace_graph_caller to set up an exit hook */ -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) mcount_enter mcount_get_pc0 x0 // function's pc mcount_get_lr x1 // function's lr - .global ftrace_call -ftrace_call: // tracer(pc, lr); +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) // tracer(pc, lr); nop // This will be replaced with "bl xxx" // where xxx can be any kind of tracer. #ifdef CONFIG_FUNCTION_GRAPH_TRACER - .global ftrace_graph_call -ftrace_graph_call: // ftrace_graph_caller(); +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) // ftrace_graph_caller(); nop // If enabled, this will be replaced // "b ftrace_graph_caller" #endif mcount_exit -ENDPROC(ftrace_caller) +SYM_FUNC_END(ftrace_caller) #endif /* CONFIG_DYNAMIC_FTRACE */ -ENTRY(ftrace_stub) - ret -ENDPROC(ftrace_stub) - #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* save return value regs*/ - .macro save_return_regs - sub sp, sp, #64 - stp x0, x1, [sp] - stp x2, x3, [sp, #16] - stp x4, x5, [sp, #32] - stp x6, x7, [sp, #48] - .endm - - /* restore return value regs*/ - .macro restore_return_regs - ldp x0, x1, [sp] - ldp x2, x3, [sp, #16] - ldp x4, x5, [sp, #32] - ldp x6, x7, [sp, #48] - add sp, sp, #64 - .endm - /* * void ftrace_graph_caller(void) * @@ -202,28 +284,51 @@ ENDPROC(ftrace_stub) * the call stack in order to intercept instrumented function's return path * and run return_to_handler() later on its exit. 
*/ -ENTRY(ftrace_graph_caller) - mcount_get_lr_addr x0 // pointer to function's saved lr - mcount_get_pc x1 // function's pc +SYM_FUNC_START(ftrace_graph_caller) + mcount_get_pc x0 // function's pc + mcount_get_lr_addr x1 // pointer to function's saved lr mcount_get_parent_fp x2 // parent's fp - bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp) + bl prepare_ftrace_return // prepare_ftrace_return(pc, &lr, fp) mcount_exit -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +SYM_TYPED_FUNC_START(ftrace_stub) + ret +SYM_FUNC_END(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * void return_to_handler(void) * * Run ftrace_return_to_handler() before going back to parent. - * @fp is checked against the value passed by ftrace_graph_caller() - * only when HAVE_FUNCTION_GRAPH_FP_TEST is enabled. + * @fp is checked against the value passed by ftrace_graph_caller(). */ -ENTRY(return_to_handler) - save_return_regs +SYM_CODE_START(return_to_handler) + /* save return value regs */ + sub sp, sp, #64 + stp x0, x1, [sp] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + mov x0, x29 // parent's fp bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); mov x30, x0 // restore the original return address - restore_return_regs + + /* restore return value regs */ + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + add sp, sp, #64 + ret -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e1c59d4008a8..e28137d64b76 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1,88 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Low-level exception handling code * * Copyright (C) 2012 ARM Ltd. * Authors: Catalin Marinas <catalin.marinas@arm.com> * Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/arm-smccc.h> #include <linux/init.h> #include <linux/linkage.h> #include <asm/alternative.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> +#include <asm/asm_pointer_auth.h> +#include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/errno.h> #include <asm/esr.h> #include <asm/irq.h> #include <asm/memory.h> +#include <asm/mmu.h> +#include <asm/processor.h> #include <asm/ptrace.h> +#include <asm/scs.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> -/* - * Context tracking subsystem. Used to instrument transitions - * between user and kernel mode. 
- */ - .macro ct_user_exit, syscall = 0 -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_exit - .if \syscall == 1 + .macro clear_gp_regs + .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 + mov x\n, xzr + .endr + .endm + + .macro kernel_ventry, el:req, ht:req, regsize:req, label:req + .align 7 +.Lventry_start\@: + .if \el == 0 /* - * Save/restore needed during syscalls. Restore syscall arguments from - * the values already saved on stack during kernel_entry. + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. */ - ldp x0, x1, [sp] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] + b .Lskip_tramp_vectors_cleanup\@ + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif +.Lskip_tramp_vectors_cleanup\@: .endif -#endif - .endm - - .macro ct_user_enter -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_enter -#endif - .endm - -/* - * Bad Abort numbers - *----------------- - */ -#define BAD_SYNC 0 -#define BAD_IRQ 1 -#define BAD_FIQ 2 -#define BAD_ERROR 3 - .macro kernel_ventry label - .align 7 - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. - * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + * Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT) + * should always be zero. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b \label + b el\el\ht\()_\regsize\()_\label 0: /* @@ -91,7 +74,7 @@ * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -114,7 +97,103 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif - b \label + b el\el\ht\()_\regsize\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? + .endm + + .macro tramp_alias, dst, sym, tmp + mov_q \dst, TRAMP_VALIAS + adr_l \tmp, \sym + add \dst, \dst, \tmp + adr_l \tmp, .entry.tramp.text + sub \dst, \dst, \tmp + .endm + + /* + * This macro corrupts x0-x3. It is the caller's duty to save/restore + * them if required. + */ + .macro apply_ssbd, state, tmp1, tmp2 +alternative_cb ARM64_ALWAYS_SYSTEM, spectre_v4_patch_fw_mitigation_enable + b .L__asm_ssbd_skip\@ // Patched to NOP +alternative_cb_end + ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 + cbz \tmp2, .L__asm_ssbd_skip\@ + ldr \tmp2, [tsk, #TSK_TI_FLAGS] + tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 + mov w1, #\state +alternative_cb ARM64_ALWAYS_SYSTEM, smccc_patch_fw_mitigation_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end +.L__asm_ssbd_skip\@: + .endm + + /* Check for MTE asynchronous tag check faults */ + .macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr +#ifdef CONFIG_ARM64_MTE + .arch_extension lse +alternative_if_not ARM64_MTE + b 1f +alternative_else_nop_endif + /* + * Asynchronous tag check faults are only possible in ASYNC (2) or + * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is + * set, so skip the check if it is unset. 
+ */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + mrs_s \tmp, SYS_TFSRE0_EL1 + tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f + /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] +1: +#endif + .endm + + /* Clear the MTE asynchronous tag check faults */ + .macro clear_mte_async_tcf thread_sctlr +#ifdef CONFIG_ARM64_MTE +alternative_if ARM64_MTE + /* See comment in check_mte_async_tcf above. */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + dsb ish + msr_s SYS_TFSRE0_EL1, xzr +1: +alternative_else_nop_endif +#endif + .endm + + .macro mte_set_gcr, mte_ctrl, tmp +#ifdef CONFIG_ARM64_MTE + ubfx \tmp, \mte_ctrl, #MTE_CTRL_GCR_USER_EXCL_SHIFT, #16 + orr \tmp, \tmp, #SYS_GCR_EL1_RRND + msr_s SYS_GCR_EL1, \tmp +#endif + .endm + + .macro mte_set_kernel_gcr, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + mov \tmp, KERNEL_GCR_EL1 + msr_s SYS_GCR_EL1, \tmp +1: +#endif + .endm + + .macro mte_set_user_gcr, tsk, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + ldr \tmp, [\tsk, #THREAD_MTE_CTRL] + + mte_set_gcr \tmp, \tmp2 +1: +#endif .endm .macro kernel_entry, el, regsize = 64 @@ -138,30 +217,74 @@ stp x28, x29, [sp, #16 * 14] .if \el == 0 + clear_gp_regs mrs x21, sp_el0 - ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug - disable_step_tsk x19, x20 // exceptions when scheduling. + ldr_this_cpu tsk, __entry_task, x20 + msr sp_el0, tsk + + /* + * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions + * when scheduling. + */ + ldr x19, [tsk, #TSK_TI_FLAGS] + disable_step_tsk x19, x20 + + /* Check for asynchronous tag check faults in user space */ + ldr x0, [tsk, THREAD_SCTLR_USER] + check_mte_async_tcf x22, x23, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * Enable IA for in-kernel PAC if the task had it disabled. Although + * this could be implemented with an unconditional MRS which would avoid + * a load, this was measured to be slower on Cortex-A75 and Cortex-A76. + * + * Install the kernel IA key only if IA was enabled in the task. If IA + * was disabled on kernel exit then we would have left the kernel IA + * installed so there is no need to install it again. + */ + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23 + b 2f +1: + mrs x0, sctlr_el1 + orr x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif +#endif + + apply_ssbd 1, x22, x23 + + mte_set_kernel_gcr x22, x23 + + /* + * Any non-self-synchronizing system register updates required for + * kernel entry should be placed before this point. 
+ */ +alternative_if ARM64_MTE + isb + b 1f +alternative_else_nop_endif +alternative_if ARM64_HAS_ADDRESS_AUTH + isb +alternative_else_nop_endif +1: - mov x29, xzr // fp pointed to user-space + scs_load tsk .else - add x21, sp, #S_FRAME_SIZE - get_thread_info tsk - /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ - ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] - str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #TASK_SIZE_64 - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ + add x21, sp, #PT_REGS_SIZE + get_current_task tsk .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * For exceptions from EL0, create a final frame record. + * For exceptions from EL1, create a synthetic frame record so the + * interrupted code shows up in the backtrace. */ .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] @@ -171,28 +294,9 @@ add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if ARM64_HAS_PAN - b 1f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_entry_el\el alternative_else_nop_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - __uaccess_ttbr0_disable x21 -1: #endif stp x22, x23, [sp, #S_PC] @@ -203,16 +307,20 @@ alternative_else_nop_endif str w21, [sp, #S_SYSCALLNO] .endif - /* - * Set sp_el0 to current thread_info. - */ - .if \el == 0 - msr sp_el0, tsk - .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI + /* Save pmr */ +alternative_if ARM64_HAS_IRQ_PRIO_MASKING + mrs_s x20, SYS_ICC_PMR_EL1 + str x20, [sp, #S_PMR_SAVE] + mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, x20 +alternative_else_nop_endif +#endif /* * Registers that may be useful after this macro is invoked: * + * x20 - ICC_PMR_EL1 * x21 - aborted SP * x22 - aborted PC * x23 - aborted PSTATE @@ -221,64 +329,75 @@ alternative_else_nop_endif .macro kernel_exit, el .if \el != 0 - /* Restore the task's original addr_limit. */ - ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - - /* No need to restore UAO, it will be restored from SPSR_EL1 */ + disable_daif .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI + /* Restore pmr */ +alternative_if ARM64_HAS_IRQ_PRIO_MASKING + ldr x20, [sp, #S_PMR_SAVE] + msr_s SYS_ICC_PMR_EL1, x20 + mrs_s x21, SYS_ICC_CTLR_EL1 + tbz x21, #6, .L__skip_pmr_sync\@ // Check for ICC_CTLR_EL1.PMHE + dsb sy // Ensure priority change is seen by redistributor +.L__skip_pmr_sync\@: +alternative_else_nop_endif +#endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR - .if \el == 0 - ct_user_enter - .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. 
- */ -alternative_if ARM64_HAS_PAN - b 2f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_exit_el\el alternative_else_nop_endif - - .if \el != 0 - tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set - .endif - - __uaccess_ttbr0_enable x0 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). - */ - post_ttbr0_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: #endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif +alternative_else_nop_endif +#endif +3: + scs_save tsk + + /* Ignore asynchronous tag check faults in the uaccess routines */ + ldr x0, [tsk, THREAD_SCTLR_USER] + clear_mte_async_tcf x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * IA was enabled for in-kernel PAC. Disable it now if needed, or + * alternatively install the user's IA. All other per-task keys and + * SCTLR bits were updated on task switch. + * + * No kernel C function calls after this. + */ + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_user tsk, x0, x1, x2 + b 2f 1: + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: alternative_else_nop_endif #endif + + mte_set_user_gcr tsk, x0, x1 + + apply_ssbd 0, x0, x1 .endif msr elr_el1, x21 // set up the return data @@ -298,64 +417,80 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] + + .if \el == 0 +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp - eret // return to kernel + add sp, sp, #PT_REGS_SIZE // restore sp + eret +alternative_else_nop_endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + bne 4f + msr far_el1, x29 + tramp_alias x30, tramp_exit_native, x29 + br x30 +4: + tramp_alias x30, tramp_exit_compat, x29 + br x30 +#endif + .else + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + + /* Ensure any device/NC reads complete */ + alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 + + eret + .endif + sb .endm - .macro irq_stack_entry - mov x19, sp // preserve the original sp +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +SYM_CODE_START_LOCAL(__swpan_entry_el1) + mrs x21, ttbr0_el1 + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR +SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el1) /* - * Compare sp with the base of the task stack. 
- * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, - * and should switch to the irq stack. + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. */ - ldr x25, [tsk, TSK_STACK] - eor x25, x25, x19 - and x25, x25, #~(THREAD_SIZE - 1) - cbnz x25, 9998f - - ldr_this_cpu x25, irq_stack_ptr, x26 - mov x26, #IRQ_STACK_SIZE - add x26, x25, x26 - - /* switch to the irq stack */ - mov sp, x26 -9998: - .endm +SYM_CODE_START_LOCAL(__swpan_exit_el1) + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + __uaccess_ttbr0_enable x0, x1 +1: and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + ret +SYM_CODE_END(__swpan_exit_el1) +SYM_CODE_START_LOCAL(__swpan_exit_el0) + __uaccess_ttbr0_enable x0, x1 /* - * x19 should be preserved between irq_stack_entry and - * irq_stack_exit. + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). */ - .macro irq_stack_exit - mov sp, x19 - .endm + b post_ttbr_update_workaround +SYM_CODE_END(__swpan_exit_el0) +#endif -/* - * These are the registers used in the syscall handler, and allow us to - * have in theory up to 7 arguments to a function - x0 to x6. - * - * x7 is reserved for the system call number in 32-bit mode. - */ -wsc_nr .req w25 // number of system calls -wscno .req w26 // syscall number -xscno .req x26 // syscall number (zero-extended) -stbl .req x27 // syscall table pointer +/* GPRs used by entry code */ tsk .req x28 // current thread_info -/* - * Interrupt handling. - */ - .macro irq_handler - ldr_l x1, handle_arch_irq - mov x0, sp - irq_stack_entry - blr x1 - irq_stack_exit - .endm - .text /* @@ -364,53 +499,47 @@ tsk .req x28 // current thread_info .pushsection ".entry.text", "ax" .align 11 -ENTRY(vectors) - kernel_ventry el1_sync_invalid // Synchronous EL1t - kernel_ventry el1_irq_invalid // IRQ EL1t - kernel_ventry el1_fiq_invalid // FIQ EL1t - kernel_ventry el1_error_invalid // Error EL1t - - kernel_ventry el1_sync // Synchronous EL1h - kernel_ventry el1_irq // IRQ EL1h - kernel_ventry el1_fiq_invalid // FIQ EL1h - kernel_ventry el1_error_invalid // Error EL1h - - kernel_ventry el0_sync // Synchronous 64-bit EL0 - kernel_ventry el0_irq // IRQ 64-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 - kernel_ventry el0_error_invalid // Error 64-bit EL0 - -#ifdef CONFIG_COMPAT - kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 - kernel_ventry el0_irq_compat // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 -#else - kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 - kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid // Error 32-bit EL0 -#endif -END(vectors) +SYM_CODE_START(vectors) + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1t + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0 + kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0 + kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0 + kernel_ventry 0, 
t, 64, error // Error 64-bit EL0 + + kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0 + kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0 + kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0 + kernel_ventry 0, t, 32, error // Error 32-bit EL0 +SYM_CODE_END(vectors) #ifdef CONFIG_VMAP_STACK +SYM_CODE_START_LOCAL(__bad_stack) /* * We detected an overflow in kernel_ventry, which switched to the * overflow stack. Stash the exception regs, and head to our overflow * handler. */ -__bad_stack: + /* Restore the original x0 value */ mrs x0, tpidrro_el0 /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. */ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ @@ -419,489 +548,271 @@ __bad_stack: /* Time to die */ bl handle_bad_stack ASM_BUG() +SYM_CODE_END(__bad_stack) #endif /* CONFIG_VMAP_STACK */ -/* - * Invalid mode handlers - */ - .macro inv_entry, el, reason, regsize = 64 + + .macro entry_handler el:req, ht:req, regsize:req, label:req +SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) kernel_entry \el, \regsize mov x0, sp - mov x1, #\reason - mrs x2, esr_el1 - bl bad_mode - ASM_BUG() + bl el\el\ht\()_\regsize\()_\label\()_handler + .if \el == 0 + b ret_to_user + .else + b ret_to_kernel + .endif +SYM_CODE_END(el\el\ht\()_\regsize\()_\label) .endm -el0_sync_invalid: - inv_entry 0, BAD_SYNC -ENDPROC(el0_sync_invalid) - -el0_irq_invalid: - inv_entry 0, BAD_IRQ -ENDPROC(el0_irq_invalid) - -el0_fiq_invalid: - inv_entry 0, BAD_FIQ -ENDPROC(el0_fiq_invalid) - -el0_error_invalid: - inv_entry 0, BAD_ERROR -ENDPROC(el0_error_invalid) - -#ifdef CONFIG_COMPAT -el0_fiq_invalid_compat: - inv_entry 0, BAD_FIQ, 32 -ENDPROC(el0_fiq_invalid_compat) +/* + * Early exception handlers + */ + entry_handler 1, t, 64, sync + entry_handler 1, t, 64, irq + entry_handler 1, t, 64, fiq + entry_handler 1, t, 64, error + + entry_handler 1, h, 64, sync + entry_handler 1, h, 64, irq + entry_handler 1, h, 64, fiq + entry_handler 1, h, 64, error + + entry_handler 0, t, 64, sync + entry_handler 0, t, 64, irq + entry_handler 0, t, 64, fiq + entry_handler 0, t, 64, error + + entry_handler 0, t, 32, sync + entry_handler 0, t, 32, irq + entry_handler 0, t, 32, fiq + entry_handler 0, t, 32, error + +SYM_CODE_START_LOCAL(ret_to_kernel) + kernel_exit 1 +SYM_CODE_END(ret_to_kernel) -el0_error_invalid_compat: - inv_entry 0, BAD_ERROR, 32 -ENDPROC(el0_error_invalid_compat) +SYM_CODE_START_LOCAL(ret_to_user) + ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step + enable_step_tsk x19, x2 +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + bl stackleak_erase_on_task_stack #endif + kernel_exit 0 +SYM_CODE_END(ret_to_user) -el1_sync_invalid: - inv_entry 1, BAD_SYNC -ENDPROC(el1_sync_invalid) - -el1_irq_invalid: - inv_entry 1, BAD_IRQ -ENDPROC(el1_irq_invalid) - -el1_fiq_invalid: - inv_entry 1, BAD_FIQ -ENDPROC(el1_fiq_invalid) + .popsection // .entry.text -el1_error_invalid: - inv_entry 1, BAD_ERROR -ENDPROC(el1_error_invalid) + // Move from tramp_pg_dir to swapper_pg_dir + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #TRAMP_SWAPPER_OFFSET + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 +alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 + /* ASID already in \tmp[63:48] */ + movk \tmp, 
#:abs_g2_nc:(TRAMP_VALIAS >> 12) + movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12) + /* 2MB boundary containing the vectors, so we nobble the walk cache */ + movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12) + isb + tlbi vae1, \tmp + dsb nsh +alternative_else_nop_endif +#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ + .endm -/* - * EL1 mode handlers. - */ - .align 6 -el1_sync: - kernel_entry 1 - mrs x1, esr_el1 // read the syndrome register - lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 - b.eq el1_da - cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 - b.eq el1_ia - cmp x24, #ESR_ELx_EC_SYS64 // configurable trap - b.eq el1_undef - cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception - b.eq el1_sp_pc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el1_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1 - b.eq el1_undef - cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 - b.ge el1_dbg - b el1_inv - -el1_ia: + // Move from swapper_pg_dir to tramp_pg_dir + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #TRAMP_SWAPPER_OFFSET + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp /* - * Fall through to the Data abort case + * We avoid running the post_ttbr_update_workaround here because + * it's only needed by Cavium ThunderX, which requires KPTI to be + * disabled. */ -el1_da: - /* - * Data abort handling - */ - mrs x3, far_el1 - enable_dbg - // re-enable interrupts if they were enabled in the aborted context - tbnz x23, #7, 1f // PSR_I_BIT - enable_irq -1: - clear_address_tag x0, x3 - mov x2, sp // struct pt_regs - bl do_mem_abort + .endm - // disable interrupts before pulling preserved data off the stack - disable_irq - kernel_exit 1 -el1_sp_pc: - /* - * Stack or PC alignment exception handling - */ - mrs x0, far_el1 - enable_dbg - mov x2, sp - bl do_sp_pc_abort - ASM_BUG() -el1_undef: - /* - * Undefined instruction - */ - enable_dbg - mov x0, sp - bl do_undefinstr - ASM_BUG() -el1_dbg: + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RELOCATABLE + ldr \dst, .L__tramp_data_\var + .ifndef .L__tramp_data_\var + .pushsection ".entry.tramp.rodata", "a", %progbits + .align 3 +.L__tramp_data_\var: + .quad \var + .popsection + .endif +#else /* - * Debug exception handling + * As !RELOCATABLE implies !RANDOMIZE_BASE the address is always a + * compile time constant (and hence not secret and not worth hiding). + * + * As statically allocated kernel code and data always live in the top + * 47 bits of the address space we can sign-extend bit 47 and avoid an + * instruction to load the upper 16 bits (which must be 0xFFFF). 
*/ - cmp x24, #ESR_ELx_EC_BRK64 // if BRK64 - cinc x24, x24, eq // set bit '0' - tbz x24, #0, el1_inv // EL1 only - mrs x0, far_el1 - mov x2, sp // struct pt_regs - bl do_debug_exception - kernel_exit 1 -el1_inv: - // TODO: add support for undefined instructions in kernel mode - enable_dbg - mov x0, sp - mov x2, x1 - mov x1, #BAD_SYNC - bl bad_mode - ASM_BUG() -ENDPROC(el1_sync) - - .align 6 -el1_irq: - kernel_entry 1 - enable_dbg -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off + movz \dst, :abs_g2_s:\var + movk \dst, :abs_g1_nc:\var + movk \dst, :abs_g0_nc:\var #endif + .endm - irq_handler +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 -#ifdef CONFIG_PREEMPT - ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count - cbnz w24, 1f // preempt count != 0 - ldr x0, [tsk, #TSK_TI_FLAGS] // get flags - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? - bl el1_preempt + .macro tramp_ventry, vector_start, regsize, kpti, bhb + .align 7 1: -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif - kernel_exit 1 -ENDPROC(el1_irq) - -#ifdef CONFIG_PREEMPT -el1_preempt: - mov x24, lr -1: bl preempt_schedule_irq // irq en/disable is done inside - ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS - tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? - ret x24 -#endif + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif -/* - * EL0 mode handlers. - */ - .align 6 -el0_sync: - kernel_entry 0 - mrs x25, esr_el1 // read the syndrome register - lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state - b.eq el0_svc - cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0 - b.eq el0_da - cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0 - b.eq el0_ia - cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access - b.eq el0_fpsimd_acc - cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception - b.eq el0_fpsimd_exc - cmp x24, #ESR_ELx_EC_SYS64 // configurable trap - b.eq el0_sys - cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0 - b.eq el0_undef - cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0 - b.ge el0_dbg - b el0_inv - -#ifdef CONFIG_COMPAT - .align 6 -el0_sync_compat: - kernel_entry 0, 32 - mrs x25, esr_el1 // read the syndrome register - lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state - b.eq el0_svc_compat - cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0 - b.eq el0_da - cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0 - b.eq el0_ia - cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access - b.eq el0_fpsimd_acc - cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception - b.eq el0_fpsimd_exc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0 - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0 - b.ge el0_dbg - b el0_inv -el0_svc_compat: + .if \bhb == BHB_MITIGATION_LOOP /* - * 
AArch32 syscall handling + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. */ - adrp stbl, compat_sys_call_table // load compat syscall table pointer - mov wscno, w7 // syscall number in w7 (r7) - mov wsc_nr, #__NR_compat_syscalls - b el0_svc_naked - - .align 6 -el0_irq_compat: - kernel_entry 0, 32 - b el0_irq_naked -#endif + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP -el0_da: - /* - * Data abort handling - */ - mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq - ct_user_exit - clear_address_tag x0, x26 - mov x1, x25 - mov x2, sp - bl do_mem_abort - b ret_to_user -el0_ia: - /* - * Instruction abort handling - */ - mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq - ct_user_exit - mov x0, x26 - mov x1, x25 - mov x2, sp - bl do_mem_abort - b ret_to_user -el0_fpsimd_acc: - /* - * Floating Point or Advanced SIMD access - */ - enable_dbg - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_fpsimd_acc - b ret_to_user -el0_fpsimd_exc: - /* - * Floating Point or Advanced SIMD exception - */ - enable_dbg - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_fpsimd_exc - b ret_to_user -el0_sp_pc: - /* - * Stack or PC alignment exception handling - */ - mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq - ct_user_exit - mov x0, x26 - mov x1, x25 - mov x2, sp - bl do_sp_pc_abort - b ret_to_user -el0_undef: - /* - * Undefined instruction - */ - // enable interrupts before calling the main handler - enable_dbg_and_irq - ct_user_exit - mov x0, sp - bl do_undefinstr - b ret_to_user -el0_sys: + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + + .if \kpti == 1 /* - * System instructions, for trapped cache maintenance instructions + * Defend against branch aliasing attacks by pushing a dummy + * entry onto the return stack and using a RET instruction to + * enter the full-fat kernel vectors. */ - enable_dbg_and_irq - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_sysinstr - b ret_to_user -el0_dbg: + bl 2f + b . +2: + tramp_map_kernel x30 +alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 + tramp_data_read_var x30, vectors +alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM + prfm plil1strm, [x30, #(1b - \vector_start)] +alternative_else_nop_endif + + msr vbar_el1, x30 + isb + .else + adr_l x30, vectors + .endif // \kpti == 1 + + .if \bhb == BHB_MITIGATION_FW /* - * Debug exception handling + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. */ - tbnz x24, #0, el0_inv // EL0 only - mrs x0, far_el1 - mov x1, x25 - mov x2, sp - bl do_debug_exception - enable_dbg - ct_user_exit - b ret_to_user -el0_inv: - enable_dbg - ct_user_exit - mov x0, sp - mov x1, #BAD_SYNC - mov x2, x25 - bl bad_el0_sync - b ret_to_user -ENDPROC(el0_sync) - - .align 6 -el0_irq: - kernel_entry 0 -el0_irq_naked: - enable_dbg -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW - ct_user_exit - irq_handler + add x30, x30, #(1b - \vector_start + 4) + ret +.org 1b + 128 // Did we overflow the ventry slot? 
+ .endm -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif - b ret_to_user -ENDPROC(el0_irq) + .macro tramp_exit, regsize = 64 + tramp_data_read_var x30, this_cpu_vector + get_this_cpu_offset x29 + ldr x30, [x30, x29] -/* - * This is the fast syscall return path. We do as little as possible here, - * and this includes saving x0 back into the kernel stack. - */ -ret_fast_syscall: - disable_irq // disable interrupts - str x0, [sp, #S_X0] // returned x0 - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing - and x2, x1, #_TIF_SYSCALL_WORK - cbnz x2, ret_fast_syscall_trace - and x2, x1, #_TIF_WORK_MASK - cbnz x2, work_pending - enable_step_tsk x1, x2 - kernel_exit 0 -ret_fast_syscall_trace: - enable_irq // enable interrupts - b __sys_trace_return_skipped // we already saved x0 + msr vbar_el1, x30 + ldr lr, [sp, #S_LR] + tramp_unmap_kernel x29 + .if \regsize == 64 + mrs x29, far_el1 + .endif + add sp, sp, #PT_REGS_SIZE // restore sp + eret + sb + .endm + .macro generate_tramp_vector, kpti, bhb +.Lvector_start\@: + .space 0x400 + + .rept 4 + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb + .endr + .endm + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* - * Ok, we need to do extra processing, enter the slow path. - */ -work_pending: - mov x0, sp // 'regs' - bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step - b finish_ret_to_user -/* - * "slow" syscall return path. + * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. */ -ret_to_user: - disable_irq // disable interrupts - ldr x1, [tsk, #TSK_TI_FLAGS] - and x2, x1, #_TIF_WORK_MASK - cbnz x2, work_pending -finish_ret_to_user: - enable_step_tsk x1, x2 - kernel_exit 0 -ENDPROC(ret_to_user) + .pushsection ".entry.tramp.text", "ax" + .align 11 +SYM_CODE_START_NOALIGN(tramp_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE +SYM_CODE_END(tramp_vectors) + +SYM_CODE_START(tramp_exit_native) + tramp_exit +SYM_CODE_END(tramp_exit_native) + +SYM_CODE_START(tramp_exit_compat) + tramp_exit 32 +SYM_CODE_END(tramp_exit_compat) + .popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* - * SVC handler. + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. */ - .align 6 -el0_svc: - adrp stbl, sys_call_table // load syscall table pointer - mov wscno, w8 // syscall number in w8 - mov wsc_nr, #__NR_syscalls -el0_svc_naked: // compat entry point - stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number - enable_dbg_and_irq - ct_user_exit 1 - - ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks - tst x16, #_TIF_SYSCALL_WORK - b.ne __sys_trace - cmp wscno, wsc_nr // check upper syscall limit - b.hs ni_sys - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table - blr x16 // call sys_* routine - b ret_fast_syscall -ni_sys: - mov x0, sp - bl do_ni_syscall - b ret_fast_syscall -ENDPROC(el0_svc) - - /* - * This is the really slow path. We're going to be doing context - * switches, and waiting for our parent to respond. 
- */ -__sys_trace: - cmp wscno, #NO_SYSCALL // user-issued syscall(-1)? - b.ne 1f - mov x0, #-ENOSYS // set default errno if so - str x0, [sp, #S_X0] -1: mov x0, sp - bl syscall_trace_enter - cmp w0, #NO_SYSCALL // skip the syscall? - b.eq __sys_trace_return_skipped - mov wscno, w0 // syscall number (possibly new) - mov x1, sp // pointer to regs - cmp wscno, wsc_nr // check upper syscall limit - b.hs __ni_sys_trace - ldp x0, x1, [sp] // restore the syscall args - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table - blr x16 // call sys_* routine - -__sys_trace_return: - str x0, [sp, #S_X0] // save returned x0 -__sys_trace_return_skipped: - mov x0, sp - bl syscall_trace_exit - b ret_to_user - -__ni_sys_trace: - mov x0, sp - bl do_ni_syscall - b __sys_trace_return + .macro generate_el1_vector, bhb +.Lvector_start\@: + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, 0, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, 0, \bhb + .endr + .endm - .popsection // .entry.text +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ + .pushsection ".entry.text", "ax" + .align 11 +SYM_CODE_START(__bp_harden_el1_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ +SYM_CODE_END(__bp_harden_el1_vectors) + .popsection -/* - * Special system call wrappers. - */ -ENTRY(sys_rt_sigreturn_wrapper) - mov x0, sp - b sys_rt_sigreturn -ENDPROC(sys_rt_sigreturn_wrapper) /* * Register switch for AArch64. The callee-saved registers need to be saved @@ -911,7 +822,7 @@ ENDPROC(sys_rt_sigreturn_wrapper) * Previous and next are guaranteed not to be the same. * */ -ENTRY(cpu_switch_to) +SYM_FUNC_START(cpu_switch_to) mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -932,19 +843,235 @@ ENTRY(cpu_switch_to) ldr lr, [x8] mov sp, x9 msr sp_el0, x1 + ptrauth_keys_install_kernel x1, x8, x9, x10 + scs_save x0 + scs_load x1 ret -ENDPROC(cpu_switch_to) +SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) /* * This is how we return from a fork. */ -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 blr x19 -1: get_thread_info tsk +1: get_current_task tsk + mov x0, sp + bl asm_exit_to_user_mode b ret_to_user -ENDPROC(ret_from_fork) +SYM_CODE_END(ret_from_fork) NOKPROBE(ret_from_fork) + +/* + * void call_on_irq_stack(struct pt_regs *regs, + * void (*func)(struct pt_regs *)); + * + * Calls func(regs) using this CPU's irq stack and shadow irq stack. + */ +SYM_FUNC_START(call_on_irq_stack) +#ifdef CONFIG_SHADOW_CALL_STACK + stp scs_sp, xzr, [sp, #-16]! + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x17 +#endif + /* Create a frame record to save our LR and SP (implicit in FP) */ + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + + ldr_this_cpu x16, irq_stack_ptr, x17 + mov x15, #IRQ_STACK_SIZE + add x16, x16, x15 + + /* Move to the new stack and call the function there */ + mov sp, x16 + blr x1 + + /* + * Restore the SP from the FP, and restore the FP and LR from the frame + * record. + */ + mov sp, x29 + ldp x29, x30, [sp], #16 +#ifdef CONFIG_SHADOW_CALL_STACK + ldp scs_sp, xzr, [sp], #16 +#endif + ret +SYM_FUNC_END(call_on_irq_stack) +NOKPROBE(call_on_irq_stack) + +#ifdef CONFIG_ARM_SDE_INTERFACE + +#include <asm/sdei.h> +#include <uapi/linux/arm_sdei.h> + +.macro sdei_handler_exit exit_mode + /* On success, this call never returns... */ + cmp \exit_mode, #SDEI_EXIT_SMC + b.ne 99f + smc #0 + b . +99: hvc #0 + b . +.endm + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * The regular SDEI entry point may have been unmapped along with the rest of + * the kernel. This trampoline restores the kernel mapping to make the x1 memory + * argument accessible. + * + * This clobbers x4, __sdei_handler() will restore this from firmware's + * copy. + */ +.pushsection ".entry.tramp.text", "ax" +SYM_CODE_START(__sdei_asm_entry_trampoline) + mrs x4, ttbr1_el1 + tbz x4, #USER_ASID_BIT, 1f + + tramp_map_kernel tmp=x4 + isb + mov x4, xzr + + /* + * Remember whether to unmap the kernel on exit. + */ +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] + tramp_data_read_var x4, __sdei_asm_handler + br x4 +SYM_CODE_END(__sdei_asm_entry_trampoline) +NOKPROBE(__sdei_asm_entry_trampoline) + +/* + * Make the exit call and restore the original ttbr1_el1 + * + * x0 & x1: setup for the exit API call + * x2: exit_mode + * x4: struct sdei_registered_event argument from registration time. + */ +SYM_CODE_START(__sdei_asm_exit_trampoline) + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] + cbnz x4, 1f + + tramp_unmap_kernel tmp=x4 + +1: sdei_handler_exit exit_mode=x2 +SYM_CODE_END(__sdei_asm_exit_trampoline) +NOKPROBE(__sdei_asm_exit_trampoline) +.popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + +/* + * Software Delegated Exception entry point. + * + * x0: Event number + * x1: struct sdei_registered_event argument from registration time. + * x2: interrupted PC + * x3: interrupted PSTATE + * x4: maybe clobbered by the trampoline + * + * Firmware has preserved x0->x17 for us, we must save/restore the rest to + * follow SMC-CC. We save (or retrieve) all the registers as the handler may + * want them. 
+ */ +SYM_CODE_START(__sdei_asm_handler) + stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] + stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] + stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] + stp x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4] + stp x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5] + stp x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6] + stp x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7] + stp x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8] + stp x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9] + stp x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10] + stp x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11] + stp x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12] + stp x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13] + stp x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14] + mov x4, sp + stp lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR] + + mov x19, x1 + +#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK) + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] +#endif + +#ifdef CONFIG_VMAP_STACK + /* + * entry.S may have been using sp as a scratch register, find whether + * this is a normal or critical event and switch to the appropriate + * stack for this CPU. + */ + cbnz w4, 1f + ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 + b 2f +1: ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6 +2: mov x6, #SDEI_STACK_SIZE + add x5, x5, x6 + mov sp, x5 +#endif + +#ifdef CONFIG_SHADOW_CALL_STACK + /* Use a separate shadow call stack for normal and critical events */ + cbnz w4, 3f + ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 + b 4f +3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 +4: +#endif + + /* + * We may have interrupted userspace, or a guest, or exit-from or + * return-to either of these. We can't trust sp_el0, restore it. + */ + mrs x28, sp_el0 + ldr_this_cpu dst=x0, sym=__entry_task, tmp=x1 + msr sp_el0, x0 + + /* If we interrupted the kernel point to the previous stack/frame. */ + and x0, x3, #0xc + mrs x1, CurrentEL + cmp x0, x1 + csel x29, x29, xzr, eq // fp, or zero + csel x4, x2, xzr, eq // elr, or zero + + stp x29, x4, [sp, #-16]! + mov x29, sp + + add x0, x19, #SDEI_EVENT_INTREGS + mov x1, x19 + bl __sdei_handler + + msr sp_el0, x28 + /* restore regs >x17 that we clobbered */ + mov x4, x19 // keep x4 for __sdei_asm_exit_trampoline + ldp x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14] + ldp x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9] + ldp lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR] + mov sp, x1 + + mov x1, x0 // address to complete_and_resume + /* x0 = (x0 <= SDEI_EV_FAILED) ? + * EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME + */ + cmp x0, #SDEI_EV_FAILED + mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE + mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME + csel x0, x2, x3, ls + + ldr_l x2, sdei_exit_mode + +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + sdei_handler_exit exit_mode=x2 +alternative_else_nop_endif + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 + br x5 +#endif +SYM_CODE_END(__sdei_asm_handler) +NOKPROBE(__sdei_asm_handler) +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/entry32.S b/arch/arm64/kernel/entry32.S deleted file mode 100644 index f332d5d1f6b4..000000000000 --- a/arch/arm64/kernel/entry32.S +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Compat system call wrappers - * - * Copyright (C) 2012 ARM Ltd. 
- * Authors: Will Deacon <will.deacon@arm.com> - * Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/linkage.h> -#include <linux/const.h> - -#include <asm/assembler.h> -#include <asm/asm-offsets.h> -#include <asm/errno.h> -#include <asm/page.h> - -/* - * System call wrappers for the AArch32 compatibility layer. - */ - -ENTRY(compat_sys_sigreturn_wrapper) - mov x0, sp - b compat_sys_sigreturn -ENDPROC(compat_sys_sigreturn_wrapper) - -ENTRY(compat_sys_rt_sigreturn_wrapper) - mov x0, sp - b compat_sys_rt_sigreturn -ENDPROC(compat_sys_rt_sigreturn_wrapper) - -ENTRY(compat_sys_statfs64_wrapper) - mov w3, #84 - cmp w1, #88 - csel w1, w3, w1, eq - b compat_sys_statfs64 -ENDPROC(compat_sys_statfs64_wrapper) - -ENTRY(compat_sys_fstatfs64_wrapper) - mov w3, #84 - cmp w1, #88 - csel w1, w3, w1, eq - b compat_sys_fstatfs64 -ENDPROC(compat_sys_fstatfs64_wrapper) - -/* - * Note: off_4k (w5) is always in units of 4K. If we can't do the - * requested offset because it is not page-aligned, we return -EINVAL. - */ -ENTRY(compat_sys_mmap2_wrapper) -#if PAGE_SHIFT > 12 - tst w5, #~PAGE_MASK >> 12 - b.ne 1f - lsr w5, w5, #PAGE_SHIFT - 12 -#endif - b sys_mmap_pgoff -1: mov x0, #-EINVAL - ret -ENDPROC(compat_sys_mmap2_wrapper) - -/* - * Wrappers for AArch32 syscalls that either take 64-bit parameters - * in registers or that take 32-bit parameters which require sign - * extension. - */ -ENTRY(compat_sys_pread64_wrapper) - regs_to_64 x3, x4, x5 - b sys_pread64 -ENDPROC(compat_sys_pread64_wrapper) - -ENTRY(compat_sys_pwrite64_wrapper) - regs_to_64 x3, x4, x5 - b sys_pwrite64 -ENDPROC(compat_sys_pwrite64_wrapper) - -ENTRY(compat_sys_truncate64_wrapper) - regs_to_64 x1, x2, x3 - b sys_truncate -ENDPROC(compat_sys_truncate64_wrapper) - -ENTRY(compat_sys_ftruncate64_wrapper) - regs_to_64 x1, x2, x3 - b sys_ftruncate -ENDPROC(compat_sys_ftruncate64_wrapper) - -ENTRY(compat_sys_readahead_wrapper) - regs_to_64 x1, x2, x3 - mov w2, w4 - b sys_readahead -ENDPROC(compat_sys_readahead_wrapper) - -ENTRY(compat_sys_fadvise64_64_wrapper) - mov w6, w1 - regs_to_64 x1, x2, x3 - regs_to_64 x2, x4, x5 - mov w3, w6 - b sys_fadvise64_64 -ENDPROC(compat_sys_fadvise64_64_wrapper) - -ENTRY(compat_sys_sync_file_range2_wrapper) - regs_to_64 x2, x2, x3 - regs_to_64 x3, x4, x5 - b sys_sync_file_range2 -ENDPROC(compat_sys_sync_file_range2_wrapper) - -ENTRY(compat_sys_fallocate_wrapper) - regs_to_64 x2, x2, x3 - regs_to_64 x3, x4, x5 - b sys_fallocate -ENDPROC(compat_sys_fallocate_wrapper) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5d547deb6996..23834d96d1e7 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1,35 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * FP/SIMD context switching and fault handling * * Copyright (C) 2012 ARM Ltd. 
* Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/bitmap.h> +#include <linux/bitops.h> #include <linux/bottom_half.h> +#include <linux/bug.h> +#include <linux/cache.h> +#include <linux/compat.h> +#include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> +#include <linux/ctype.h> #include <linux/kernel.h> +#include <linux/linkage.h> +#include <linux/irqflags.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/prctl.h> #include <linux/preempt.h> +#include <linux/ptrace.h> #include <linux/sched/signal.h> +#include <linux/sched/task_stack.h> #include <linux/signal.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/sysctl.h> +#include <linux/swab.h> +#include <asm/esr.h> +#include <asm/exception.h> #include <asm/fpsimd.h> +#include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/neon.h> +#include <asm/processor.h> #include <asm/simd.h> +#include <asm/sigcontext.h> +#include <asm/sysreg.h> +#include <asm/traps.h> +#include <asm/virt.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -39,6 +53,8 @@ #define FPEXC_IDF (1 << 7) /* + * (Note: in this discussion, statements about FPSIMD apply equally to SVE.) + * * In order to reduce the number of times the FPSIMD state is needlessly saved * and restored, we need to keep track of two things: * (a) for each task, we need to remember which CPU was the last one to have @@ -47,7 +63,7 @@ * been loaded into its FPSIMD registers most recently, or whether it has * been used to perform kernel mode NEON in the meantime. * - * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to + * For (a), we add a fpsimd_cpu field to thread_struct, which gets updated to * the id of the current CPU every time the state is loaded onto a CPU. For (b), * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the * address of the userland FPSIMD state of the task that was loaded onto the CPU @@ -56,31 +72,36 @@ * With this in place, we no longer have to restore the next FPSIMD state right * when switching between tasks. Instead, we can defer this check to userland * resume, at which time we verify whether the CPU's fpsimd_last_state and the - * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we + * task's fpsimd_cpu are still mutually in sync. If this is the case, we * can omit the FPSIMD restore. * * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to * indicate whether or not the userland FPSIMD state of the current task is * present in the registers. The flag is set unless the FPSIMD registers of this * CPU currently contain the most recent userland FPSIMD state of the current - * task. + * task. 
If the task is behaving as a VMM, then this is will be managed by + * KVM which will clear it to indicate that the vcpu FPSIMD state is currently + * loaded on the CPU, allowing the state to be saved if a FPSIMD-aware + * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and + * flag the register state as invalid. * * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may * save the task's FPSIMD context back to task_struct from softirq context. * To prevent this from racing with the manipulation of the task's FPSIMD state * from task context and thereby corrupting the state, it is necessary to * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with local_bh_disable() unless softirqs are already masked. + * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to + * run but prevent them to use FPSIMD. * * For a certain task, the sequence may look something like this: - * - the task gets scheduled in; if both the task's fpsimd_state.cpu field + * - the task gets scheduled in; if both the task's fpsimd_cpu field * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is * cleared, otherwise it is set; * * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's * userland FPSIMD state is copied from memory to the registers, the task's - * fpsimd_state.cpu field is set to the id of the current CPU, the current + * fpsimd_cpu field is set to the id of the current CPU, the current * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the * TIF_FOREIGN_FPSTATE flag is cleared; * @@ -97,12 +118,1357 @@ * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); +struct fpsimd_last_state_struct { + struct user_fpsimd_state *st; + void *sve_state; + void *za_state; + u64 *svcr; + unsigned int sve_vl; + unsigned int sme_vl; +}; + +static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); + +__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { +#ifdef CONFIG_ARM64_SVE + [ARM64_VEC_SVE] = { + .type = ARM64_VEC_SVE, + .name = "SVE", + .min_vl = SVE_VL_MIN, + .max_vl = SVE_VL_MIN, + .max_virtualisable_vl = SVE_VL_MIN, + }, +#endif +#ifdef CONFIG_ARM64_SME + [ARM64_VEC_SME] = { + .type = ARM64_VEC_SME, + .name = "SME", + }, +#endif +}; + +static unsigned int vec_vl_inherit_flag(enum vec_type type) +{ + switch (type) { + case ARM64_VEC_SVE: + return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +struct vl_config { + int __default_vl; /* Default VL for tasks */ +}; + +static struct vl_config vl_config[ARM64_VEC_MAX]; + +static inline int get_default_vl(enum vec_type type) +{ + return READ_ONCE(vl_config[type].__default_vl); +} + +#ifdef CONFIG_ARM64_SVE + +static inline int get_sve_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SVE); +} + +static inline void set_default_vl(enum vec_type type, int val) +{ + WRITE_ONCE(vl_config[type].__default_vl, val); +} + +static inline void set_sve_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SVE, val); +} + +static void __percpu *efi_sve_state; + +#else /* ! 
CONFIG_ARM64_SVE */ + +/* Dummy declaration for code that will be optimised out: */ +extern void __percpu *efi_sve_state; + +#endif /* ! CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +static int get_sme_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SME); +} + +static void set_sme_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SME, val); +} + +static void sme_free(struct task_struct *); + +#else + +static inline void sme_free(struct task_struct *t) { } + +#endif + +DEFINE_PER_CPU(bool, fpsimd_context_busy); +EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); + +static void fpsimd_bind_task_to_cpu(void); + +static void __get_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, true); + + WARN_ON(busy); +} + +/* + * Claim ownership of the CPU FPSIMD context for use by the calling context. + * + * The caller may freely manipulate the FPSIMD context metadata until + * put_cpu_fpsimd_context() is called. + * + * The double-underscore version must only be called if you know the task + * can't be preempted. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + */ +static void get_cpu_fpsimd_context(void) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); + __get_cpu_fpsimd_context(); +} + +static void __put_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, false); + + WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ +} + +/* + * Release the CPU FPSIMD context. + * + * Must be called from a context in which get_cpu_fpsimd_context() was + * previously called, with no call to put_cpu_fpsimd_context() in the + * meantime. + */ +static void put_cpu_fpsimd_context(void) +{ + __put_cpu_fpsimd_context(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); +} + +static bool have_cpu_fpsimd_context(void) +{ + return !preemptible() && __this_cpu_read(fpsimd_context_busy); +} + +unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) +{ + return task->thread.vl[type]; +} + +void task_set_vl(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl[type] = vl; +} + +unsigned int task_get_vl_onexec(const struct task_struct *task, + enum vec_type type) +{ + return task->thread.vl_onexec[type]; +} + +void task_set_vl_onexec(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl_onexec[type] = vl; +} + +/* + * TIF_SME controls whether a task can use SME without trapping while + * in userspace, when TIF_SME is set then we must have storage + * alocated in sve_state and za_state to store the contents of both ZA + * and the SVE registers for both streaming and non-streaming modes. + * + * If both SVCR.ZA and SVCR.SM are disabled then at any point we + * may disable TIF_SME and reenable traps. + */ + + +/* + * TIF_SVE controls whether a task can use SVE without trapping while + * in userspace, and also (together with TIF_SME) the way a task's + * FPSIMD/SVE state is stored in thread_struct. + * + * The kernel uses this flag to track whether a user task is actively + * using SVE, and therefore whether full SVE register state needs to + * be tracked. 
If not, the cheaper FPSIMD context handling code can + * be used instead of the more costly SVE equivalents. + * + * * TIF_SVE or SVCR.SM set: + * + * The task can execute SVE instructions while in userspace without + * trapping to the kernel. + * + * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the + * corresponding Zn), P0-P15 and FFR are encoded in + * task->thread.sve_state, formatted appropriately for vector + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. + * + * task->thread.sve_state must point to a valid buffer at least + * sve_state_size(task) bytes in size. + * + * During any syscall, the kernel may optionally clear TIF_SVE and + * discard the vector state except for the FPSIMD subset. + * + * * TIF_SVE clear: + * + * An attempt by the user task to execute an SVE instruction causes + * do_sve_acc() to be called, which does some preparation and then + * sets TIF_SVE. + * + * When stored, FPSIMD registers V0-V31 are encoded in + * task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are + * logically zero but not stored anywhere; P0-P15 and FFR are not + * stored and have unspecified values from userspace's point of + * view. For hygiene purposes, the kernel zeroes them on next use, + * but userspace is discouraged from relying on this. + * + * task->thread.sve_state does not need to be non-NULL, valid or any + * particular size: it must not be dereferenced. + * + * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state + * irrespective of whether TIF_SVE is clear or set, since these are + * not vector length dependent. + */ + +/* + * Update current's FPSIMD/SVE registers from thread_struct. + * + * This function should be called only when the FPSIMD/SVE state in + * thread_struct is known to be up to date, when preparing to enter + * userspace. + */ +static void task_fpsimd_load(void) +{ + bool restore_sve_regs = false; + bool restore_ffr; + + WARN_ON(!system_supports_fpsimd()); + WARN_ON(!have_cpu_fpsimd_context()); + + /* Check if we should restore SVE first */ + if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) { + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + restore_sve_regs = true; + restore_ffr = true; + } + + /* Restore SME, override SVE register configuration if needed */ + if (system_supports_sme()) { + unsigned long sme_vl = task_get_sme_vl(current); + + /* Ensure VL is set up for restoring data */ + if (test_thread_flag(TIF_SME)) + sme_set_vq(sve_vq_from_vl(sme_vl) - 1); + + write_sysreg_s(current->thread.svcr, SYS_SVCR); + + if (thread_za_enabled(¤t->thread)) + za_load_state(current->thread.za_state); + + if (thread_sm_enabled(¤t->thread)) { + restore_sve_regs = true; + restore_ffr = system_supports_fa64(); + } + } + + if (restore_sve_regs) + sve_load_state(sve_pffr(¤t->thread), + ¤t->thread.uw.fpsimd_state.fpsr, + restore_ffr); + else + fpsimd_load_state(¤t->thread.uw.fpsimd_state); +} + +/* + * Ensure FPSIMD/SVE storage in memory for the loaded context is up to + * date with respect to the CPU registers. Note carefully that the + * current context is the context last bound to the CPU stored in + * last, if KVM is involved this may be the guest VM context rather + * than the host thread for the VM pointed to by current. This means + * that we must always reference the state storage via last rather + * than via current, other than the TIF_ flags which KVM will + * carefully maintain for us. 
+ */ +static void fpsimd_save(void) +{ + struct fpsimd_last_state_struct const *last = + this_cpu_ptr(&fpsimd_last_state); + /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ + bool save_sve_regs = false; + bool save_ffr; + unsigned int vl; + + WARN_ON(!system_supports_fpsimd()); + WARN_ON(!have_cpu_fpsimd_context()); + + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + return; + + if (test_thread_flag(TIF_SVE)) { + save_sve_regs = true; + save_ffr = true; + vl = last->sve_vl; + } + + if (system_supports_sme()) { + u64 *svcr = last->svcr; + + *svcr = read_sysreg_s(SYS_SVCR); + + if (*svcr & SVCR_ZA_MASK) + za_save_state(last->za_state); + + /* If we are in streaming mode override regular SVE. */ + if (*svcr & SVCR_SM_MASK) { + save_sve_regs = true; + save_ffr = system_supports_fa64(); + vl = last->sme_vl; + } + } + + if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) { + /* Get the configured VL from RDVL, will account for SM */ + if (WARN_ON(sve_get_vl() != vl)) { + /* + * Can't save the user regs, so current would + * re-enter user with corrupt state. + * There's no way to recover, so kill it: + */ + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + return; + } + + sve_save_state((char *)last->sve_state + + sve_ffr_offset(vl), + &last->st->fpsr, save_ffr); + } else { + fpsimd_save_state(last->st); + } +} + +/* + * All vector length selection from userspace comes through here. + * We're on a slow path, so some sanity-checks are included. + * If things go wrong there's a bug somewhere, but try to fall back to a + * safe choice. + */ +static unsigned int find_supported_vector_length(enum vec_type type, + unsigned int vl) +{ + struct vl_info *info = &vl_info[type]; + int bit; + int max_vl = info->max_vl; + + if (WARN_ON(!sve_vl_valid(vl))) + vl = info->min_vl; + + if (WARN_ON(!sve_vl_valid(max_vl))) + max_vl = info->min_vl; + + if (vl > max_vl) + vl = max_vl; + if (vl < info->min_vl) + vl = info->min_vl; + + bit = find_next_bit(info->vq_map, SVE_VQ_MAX, + __vq_to_bit(sve_vq_from_vl(vl))); + return sve_vl_from_vq(__bit_to_vq(bit)); +} + +#if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) + +static int vec_proc_do_default_vl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct vl_info *info = table->extra1; + enum vec_type type = info->type; + int ret; + int vl = get_default_vl(type); + struct ctl_table tmp_table = { + .data = &vl, + .maxlen = sizeof(vl), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + /* Writing -1 has the special meaning "set to max": */ + if (vl == -1) + vl = info->max_vl; + + if (!sve_vl_valid(vl)) + return -EINVAL; + + set_default_vl(type, find_supported_vector_length(type, vl)); + return 0; +} + +static struct ctl_table sve_default_vl_table[] = { + { + .procname = "sve_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SVE], + }, + { } +}; + +static int __init sve_sysctl_init(void) +{ + if (system_supports_sve()) + if (!register_sysctl("abi", sve_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ +static int __init sve_sysctl_init(void) { return 0; } +#endif /* ! 
(CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ + +#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) +static struct ctl_table sme_default_vl_table[] = { + { + .procname = "sme_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SME], + }, + { } +}; + +static int __init sme_sysctl_init(void) +{ + if (system_supports_sme()) + if (!register_sysctl("abi", sme_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ +static int __init sme_sysctl_init(void) { return 0; } +#endif /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ + +#define ZREG(sve_state, vq, n) ((char *)(sve_state) + \ + (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET)) + +#ifdef CONFIG_CPU_BIG_ENDIAN +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + u64 a = swab64(x); + u64 b = swab64(x >> 64); + + return ((__uint128_t)a << 64) | b; +} +#else +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + return x; +} +#endif + +#define arm64_le128_to_cpu(x) arm64_cpu_to_le128(x) + +static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, + unsigned int vq) +{ + unsigned int i; + __uint128_t *p; + + for (i = 0; i < SVE_NUM_ZREGS; ++i) { + p = (__uint128_t *)ZREG(sst, vq, i); + *p = arm64_cpu_to_le128(fst->vregs[i]); + } +} + +/* + * Transfer the FPSIMD state in task->thread.uw.fpsimd_state to + * task->thread.sve_state. + * + * Task can be a non-runnable task, or current. In the latter case, + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + * task->thread.uw.fpsimd_state must be up to date before calling this + * function. + */ +static void fpsimd_to_sve(struct task_struct *task) +{ + unsigned int vq; + void *sst = task->thread.sve_state; + struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; + + if (!system_supports_sve()) + return; + + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); + __fpsimd_to_sve(sst, fst, vq); +} + +/* + * Transfer the SVE state in task->thread.sve_state to + * task->thread.uw.fpsimd_state. + * + * Task can be a non-runnable task, or current. In the latter case, + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + * task->thread.sve_state must be up to date before calling this function. + */ +static void sve_to_fpsimd(struct task_struct *task) +{ + unsigned int vq, vl; + void const *sst = task->thread.sve_state; + struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; + unsigned int i; + __uint128_t const *p; + + if (!system_supports_sve()) + return; + + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); + for (i = 0; i < SVE_NUM_ZREGS; ++i) { + p = (__uint128_t const *)ZREG(sst, vq, i); + fst->vregs[i] = arm64_le128_to_cpu(*p); + } +} + +#ifdef CONFIG_ARM64_SVE +/* + * Call __sve_free() directly only if you know task can't be scheduled + * or preempted. 
+ */ +static void __sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; +} + +static void sve_free(struct task_struct *task) +{ + WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + + __sve_free(task); +} + +/* + * Return how many bytes of memory are required to store the full SVE + * state for task, given task's currently configured vector length. + */ +size_t sve_state_size(struct task_struct const *task) +{ + unsigned int vl = 0; + + if (system_supports_sve()) + vl = task_get_sve_vl(task); + if (system_supports_sme()) + vl = max(vl, task_get_sme_vl(task)); + + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +} + +/* + * Ensure that task->thread.sve_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.sve_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability: except in the + * do_sve_acc() case, there is no ABI requirement to hide stale data + * written previously be task. + */ +void sve_alloc(struct task_struct *task, bool flush) +{ + if (task->thread.sve_state) { + if (flush) + memset(task->thread.sve_state, 0, + sve_state_size(task)); + return; + } + + /* This is a small allocation (maximum ~8KB) and Should Not Fail. */ + task->thread.sve_state = + kzalloc(sve_state_size(task), GFP_KERNEL); +} + + +/* + * Force the FPSIMD state shared with SVE to be updated in the SVE state + * even if the SVE state is the current active state. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_force_sync_to_sve(struct task_struct *task) +{ + fpsimd_to_sve(task); +} + +/* + * Ensure that task->thread.sve_state is up to date with respect to + * the user task, irrespective of when SVE is in use or not. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_sync_to_sve(struct task_struct *task) +{ + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) + fpsimd_to_sve(task); +} + +/* + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to + * the user task, irrespective of whether SVE is in use or not. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void sve_sync_to_fpsimd(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) + sve_to_fpsimd(task); +} + +/* + * Ensure that task->thread.sve_state is up to date with respect to + * the task->thread.uw.fpsimd_state. + * + * This should only be called by ptrace to merge new FPSIMD register + * values into a task for which SVE is currently active. + * task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + * task->thread.uw.fpsimd_state must already have been initialised with + * the new FPSIMD register values to be merged in. 
+ */ +void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +{ + unsigned int vq; + void *sst = task->thread.sve_state; + struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; + + if (!test_tsk_thread_flag(task, TIF_SVE)) + return; + + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); + + memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); + __fpsimd_to_sve(sst, fst, vq); +} + +int vec_set_vector_length(struct task_struct *task, enum vec_type type, + unsigned long vl, unsigned long flags) +{ + if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | + PR_SVE_SET_VL_ONEXEC)) + return -EINVAL; + + if (!sve_vl_valid(vl)) + return -EINVAL; + + /* + * Clamp to the maximum vector length that VL-agnostic code + * can work with. A flag may be assigned in the future to + * allow setting of larger vector lengths without confusing + * older software. + */ + if (vl > VL_ARCH_MAX) + vl = VL_ARCH_MAX; + + vl = find_supported_vector_length(type, vl); + + if (flags & (PR_SVE_VL_INHERIT | + PR_SVE_SET_VL_ONEXEC)) + task_set_vl_onexec(task, type, vl); + else + /* Reset VL to system default on next exec: */ + task_set_vl_onexec(task, type, 0); + + /* Only actually set the VL if not deferred: */ + if (flags & PR_SVE_SET_VL_ONEXEC) + goto out; + + if (vl == task_get_vl(task, type)) + goto out; + + /* + * To ensure the FPSIMD bits of the SVE vector registers are preserved, + * write any live register state back to task_struct, and convert to a + * regular FPSIMD thread. + */ + if (task == current) { + get_cpu_fpsimd_context(); + + fpsimd_save(); + } + + fpsimd_flush_task_state(task); + if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) + sve_to_fpsimd(task); + + if (system_supports_sme() && type == ARM64_VEC_SME) { + task->thread.svcr &= ~(SVCR_SM_MASK | + SVCR_ZA_MASK); + clear_thread_flag(TIF_SME); + } + + if (task == current) + put_cpu_fpsimd_context(); + + /* + * Force reallocation of task SVE and SME state to the correct + * size on next use: + */ + sve_free(task); + if (system_supports_sme() && type == ARM64_VEC_SME) + sme_free(task); + + task_set_vl(task, type, vl); + +out: + update_tsk_thread_flag(task, vec_vl_inherit_flag(type), + flags & PR_SVE_VL_INHERIT); + + return 0; +} + +/* + * Encode the current vector length and flags for return. + * This is only required for prctl(): ptrace has separate fields. + * SVE and SME use the same bits for _ONEXEC and _INHERIT. + * + * flags are as for vec_set_vector_length(). 
+ */ +static int vec_prctl_status(enum vec_type type, unsigned long flags) +{ + int ret; + + if (flags & PR_SVE_SET_VL_ONEXEC) + ret = task_get_vl_onexec(current, type); + else + ret = task_get_vl(current, type); + + if (test_thread_flag(vec_vl_inherit_flag(type))) + ret |= PR_SVE_VL_INHERIT; + + return ret; +} + +/* PR_SVE_SET_VL */ +int sve_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SVE_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sve() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SVE, flags); +} + +/* PR_SVE_GET_VL */ +int sve_get_current_vl(void) +{ + if (!system_supports_sve() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SVE, 0); +} + +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL */ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SME, 0); +} +#endif /* CONFIG_ARM64_SME */ + +static void vec_probe_vqs(struct vl_info *info, + DECLARE_BITMAP(map, SVE_VQ_MAX)) +{ + unsigned int vq, vl; + + bitmap_zero(map, SVE_VQ_MAX); + + for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) { + write_vl(info->type, vq - 1); /* self-syncing */ + + switch (info->type) { + case ARM64_VEC_SVE: + vl = sve_get_vl(); + break; + case ARM64_VEC_SME: + vl = sme_get_vl(); + break; + default: + vl = 0; + break; + } + + /* Minimum VL identified? */ + if (sve_vq_from_vl(vl) > vq) + break; + + vq = sve_vq_from_vl(vl); /* skip intervening lengths */ + set_bit(__vq_to_bit(vq), map); + } +} + +/* + * Initialise the set of known supported VQs for the boot CPU. + * This is called during kernel boot, before secondary CPUs are brought up. + */ +void __init vec_init_vq_map(enum vec_type type) +{ + struct vl_info *info = &vl_info[type]; + vec_probe_vqs(info, info->vq_map); + bitmap_copy(info->vq_partial_map, info->vq_map, SVE_VQ_MAX); +} + +/* + * If we haven't committed to the set of supported VQs yet, filter out + * those not supported by the current CPU. + * This function is called during the bring-up of early secondary CPUs only. + */ +void vec_update_vq_map(enum vec_type type) +{ + struct vl_info *info = &vl_info[type]; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + + vec_probe_vqs(info, tmp_map); + bitmap_and(info->vq_map, info->vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(info->vq_partial_map, info->vq_partial_map, tmp_map, + SVE_VQ_MAX); +} + +/* + * Check whether the current CPU supports all VQs in the committed set. + * This function is called during the bring-up of late secondary CPUs only. 
+ */ +int vec_verify_vq_map(enum vec_type type) +{ + struct vl_info *info = &vl_info[type]; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; + + vec_probe_vqs(info, tmp_map); + + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + if (bitmap_intersects(tmp_map, info->vq_map, SVE_VQ_MAX)) { + pr_warn("%s: cpu%d: Required vector length(s) missing\n", + info->name, smp_processor_id()); + return -EINVAL; + } + + if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available()) + return 0; + + /* + * For KVM, it is necessary to ensure that this CPU doesn't + * support any vector length that guests may have probed as + * unsupported. + */ + + /* Recover the set of supported VQs: */ + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + /* Find VQs supported that are not globally supported: */ + bitmap_andnot(tmp_map, tmp_map, info->vq_map, SVE_VQ_MAX); + + /* Find the lowest such VQ, if any: */ + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + return 0; /* no mismatches */ + + /* + * Mismatches above sve_max_virtualisable_vl are fine, since + * no guest is allowed to configure ZCR_EL2.LEN to exceed this: + */ + if (sve_vl_from_vq(__bit_to_vq(b)) <= info->max_virtualisable_vl) { + pr_warn("%s: cpu%d: Unsupported vector length(s) present\n", + info->name, smp_processor_id()); + return -EINVAL; + } + + return 0; +} + +static void __init sve_efi_setup(void) +{ + int max_vl = 0; + int i; + + if (!IS_ENABLED(CONFIG_EFI)) + return; + + for (i = 0; i < ARRAY_SIZE(vl_info); i++) + max_vl = max(vl_info[i].max_vl, max_vl); + + /* + * alloc_percpu() warns and prints a backtrace if this goes wrong. + * This is evidence of a crippled system and we are returning void, + * so no attempt is made to handle this situation here. + */ + if (!sve_vl_valid(max_vl)) + goto fail; + + efi_sve_state = __alloc_percpu( + SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); + if (!efi_sve_state) + goto fail; + + return; + +fail: + panic("Cannot allocate percpu memory for EFI SVE save/restore"); +} + +/* + * Enable SVE for EL1. + * Intended for use by the cpufeatures code during CPU boot. + */ +void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); + isb(); +} + +/* + * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE + * vector length. + * + * Use only if SVE is present. + * This function clobbers the SVE vector length. + */ +u64 read_zcr_features(void) +{ + u64 zcr; + unsigned int vq_max; + + /* + * Set the maximum possible VL, and write zeroes to all other + * bits to see if they stick. + */ + sve_kernel_enable(NULL); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1); + + zcr = read_sysreg_s(SYS_ZCR_EL1); + zcr &= ~(u64)ZCR_ELx_LEN_MASK; /* find sticky 1s outside LEN field */ + vq_max = sve_vq_from_vl(sve_get_vl()); + zcr |= vq_max - 1; /* set LEN field to maximum effective value */ + + return zcr; +} + +void __init sve_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SVE]; + u64 zcr; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; + + if (!system_supports_sve()) + return; + + /* + * The SVE architecture mandates support for 128-bit vectors, + * so sve_vq_map must have at least SVE_VQ_MIN set. 
+ * If something went wrong, at least try to patch it up: + */ + if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map))) + set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map); + + zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); + info->max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); + + /* + * Sanity-check that the max VL we determined through CPU features + * corresponds properly to sve_vq_map. If not, do our best: + */ + if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SVE, + info->max_vl))) + info->max_vl = find_supported_vector_length(ARM64_VEC_SVE, + info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 64. + * VL == 64 is guaranteed not to grow the signal frame. + */ + set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, 64)); + + bitmap_andnot(tmp_map, info->vq_partial_map, info->vq_map, + SVE_VQ_MAX); + + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + /* No non-virtualisable VLs found */ + info->max_virtualisable_vl = SVE_VQ_MAX; + else if (WARN_ON(b == SVE_VQ_MAX - 1)) + /* No virtualisable VLs? This is architecturally forbidden. */ + info->max_virtualisable_vl = SVE_VQ_MIN; + else /* b + 1 < SVE_VQ_MAX */ + info->max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); + + if (info->max_virtualisable_vl > info->max_vl) + info->max_virtualisable_vl = info->max_vl; + + pr_info("%s: maximum available vector length %u bytes per vector\n", + info->name, info->max_vl); + pr_info("%s: default vector length %u bytes per vector\n", + info->name, get_sve_default_vl()); + + /* KVM decides whether to support mismatched systems. Just warn here: */ + if (sve_max_virtualisable_vl() < sve_max_vl()) + pr_warn("%s: unvirtualisable vector lengths present\n", + info->name); + + sve_efi_setup(); +} + +/* + * Called from the put_task_struct() path, which cannot get here + * unless dead_task is really dead and not schedulable. + */ +void fpsimd_release_task(struct task_struct *dead_task) +{ + __sve_free(dead_task); + sme_free(dead_task); +} + +#endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +/* + * Ensure that task->thread.za_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.za_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability, the architecture + * guarantees that when ZA is enabled it will be zeroed. + */ +void sme_alloc(struct task_struct *task) +{ + if (task->thread.za_state) { + memset(task->thread.za_state, 0, za_state_size(task)); + return; + } + + /* This could potentially be up to 64K. */ + task->thread.za_state = + kzalloc(za_state_size(task), GFP_KERNEL); +} + +static void sme_free(struct task_struct *task) +{ + kfree(task->thread.za_state); + task->thread.za_state = NULL; +} + +void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Set priority for all PEs to architecturally defined minimum */ + write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, + SYS_SMPRI_EL1); + + /* Allow SME in kernel */ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); + isb(); + + /* Allow EL0 to access TPIDR2 */ + write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); + isb(); +} + +/* + * This must be called after sme_kernel_enable(), we rely on the + * feature table being sorted to ensure this. 
+ */ +void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Allow use of FA64 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, + SYS_SMCR_EL1); +} + +/* + * Read the pseudo-SMCR used by cpufeatures to identify the supported + * vector length. + * + * Use only if SME is present. + * This function clobbers the SME vector length. + */ +u64 read_smcr_features(void) +{ + u64 smcr; + unsigned int vq_max; + + sme_kernel_enable(NULL); + sme_smstart_sm(); + + /* + * Set the maximum possible VL. + */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK, + SYS_SMCR_EL1); + + smcr = read_sysreg_s(SYS_SMCR_EL1); + smcr &= ~(u64)SMCR_ELx_LEN_MASK; /* Only the LEN field */ + vq_max = sve_vq_from_vl(sve_get_vl()); + smcr |= vq_max - 1; /* set LEN field to maximum effective value */ + + sme_smstop_sm(); + + return smcr; +} + +void __init sme_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SME]; + u64 smcr; + int min_bit; + + if (!system_supports_sme()) + return; + + /* + * SME doesn't require any particular vector length be + * supported but it does require at least one. We should have + * disabled the feature entirely while bringing up CPUs but + * let's double check here. + */ + WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); + + smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); + info->max_vl = sve_vl_from_vq((smcr & SMCR_ELx_LEN_MASK) + 1); + + /* + * Sanity-check that the max VL we determined through CPU features + * corresponds properly to sme_vq_map. If not, do our best: + */ + if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SME, + info->max_vl))) + info->max_vl = find_supported_vector_length(ARM64_VEC_SME, + info->max_vl); + + WARN_ON(info->min_vl > info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 32 + * (256 bits) if there is one since this is guaranteed not to + * grow the signal frame when in streaming mode, otherwise the + * minimum available VL will be used. + */ + set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32)); + + pr_info("SME: minimum available vector length %u bytes per vector\n", + info->min_vl); + pr_info("SME: maximum available vector length %u bytes per vector\n", + info->max_vl); + pr_info("SME: default vector length %u bytes per vector\n", + get_sme_default_vl()); +} + +#endif /* CONFIG_ARM64_SME */ + +static void sve_init_regs(void) +{ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + * We are guaranteed to not be in streaming mode, we can only + * take a SVE trap when not in streaming mode and we can't be + * in streaming mode when taking a SME trap. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(true, vq_minus_one); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + } +} + +/* + * Trapped SVE access + * + * Storage is allocated for the full SVE state, the current FPSIMD + * register contents are migrated across, and the access trap is + * disabled. 
+ * + * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SVE access trap for userspace during + * ret_to_user, making an SVE access trap impossible in that case. + */ +void do_sve_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SVE, the hardware could still trap: */ + if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current, true); + if (!current->thread.sve_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + if (test_and_set_thread_flag(TIF_SVE)) + WARN_ON(1); /* SVE access shouldn't have trapped */ + + /* + * Even if the task may have used streaming mode we can only + * generate SVE access traps in normal SVE mode and + * transitioning out of streaming mode may discard any + * streaming mode state. Always clear the high bits to avoid + * any potential errors tracking what is properly initialised. + */ + sve_init_regs(); + + put_cpu_fpsimd_context(); +} + +/* + * Trapped SME access + * + * Storage is allocated for the full SVE and SME state, the current + * FPSIMD register contents are migrated to SVE if SVE is not already + * active, and the access trap is disabled. + * + * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SME access trap for userspace during + * ret_to_user, making an SME access trap impossible in that case. + */ +void do_sme_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SME, the hardware could still trap: */ + if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * If this is not a trap due to SME being disabled then something + * is being used in the wrong mode; report it as SIGILL. + */ + if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current, false); + sme_alloc(current); + if (!current->thread.sve_state || !current->thread.za_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + /* With TIF_SME userspace shouldn't generate any traps */ + if (test_and_set_thread_flag(TIF_SME)) + WARN_ON(1); + + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sme_vl(current)) - 1; + sme_set_vq(vq_minus_one); + + fpsimd_bind_task_to_cpu(); + } + + put_cpu_fpsimd_context(); +} /* * Trapped FP/ASIMD access. */ -void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs) { /* TODO: implement lazy context saving/restoring */ WARN_ON(1); @@ -111,74 +1477,128 @@ void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) /* * Raise a SIGFPE for the current process. 
*/ -void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) { - siginfo_t info; - unsigned int si_code = 0; - - if (esr & FPEXC_IOF) - si_code = FPE_FLTINV; - else if (esr & FPEXC_DZF) - si_code = FPE_FLTDIV; - else if (esr & FPEXC_OFF) - si_code = FPE_FLTOVF; - else if (esr & FPEXC_UFF) - si_code = FPE_FLTUND; - else if (esr & FPEXC_IXF) - si_code = FPE_FLTRES; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGFPE; - info.si_code = si_code; - info.si_addr = (void __user *)instruction_pointer(regs); + unsigned int si_code = FPE_FLTUNK; + + if (esr & ESR_ELx_FP_EXC_TFV) { + if (esr & FPEXC_IOF) + si_code = FPE_FLTINV; + else if (esr & FPEXC_DZF) + si_code = FPE_FLTDIV; + else if (esr & FPEXC_OFF) + si_code = FPE_FLTOVF; + else if (esr & FPEXC_UFF) + si_code = FPE_FLTUND; + else if (esr & FPEXC_IXF) + si_code = FPE_FLTRES; + } - send_sig_info(SIGFPE, &info, current); + send_sig_fault(SIGFPE, si_code, + (void __user *)instruction_pointer(regs), + current); } void fpsimd_thread_switch(struct task_struct *next) { + bool wrong_task, wrong_cpu; + if (!system_supports_fpsimd()) return; + + __get_cpu_fpsimd_context(); + + /* Save unsaved fpsimd state, if any: */ + fpsimd_save(); + /* - * Save the current FPSIMD state to memory, but only if whatever is in - * the registers is in fact the most recent userland FPSIMD state of - * 'current'. + * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's + * state. For kernel threads, FPSIMD registers are never loaded + * and wrong_task and wrong_cpu will always be true. */ - if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(&current->thread.fpsimd_state); + wrong_task = __this_cpu_read(fpsimd_last_state.st) != + &next->thread.uw.fpsimd_state; + wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); - if (next->mm) { - /* - * If we are switching to a task whose most recent userland - * FPSIMD state is already in the registers of *this* cpu, - * we can skip loading the state from memory. Otherwise, set - * the TIF_FOREIGN_FPSTATE flag so the state will be loaded - * upon the next return to userland. - */ - struct fpsimd_state *st = &next->thread.fpsimd_state; + update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, + wrong_task || wrong_cpu); - if (__this_cpu_read(fpsimd_last_state) == st - && st->cpu == smp_processor_id()) - clear_ti_thread_flag(task_thread_info(next), - TIF_FOREIGN_FPSTATE); - else - set_ti_thread_flag(task_thread_info(next), - TIF_FOREIGN_FPSTATE); - } + __put_cpu_fpsimd_context(); +} + +static void fpsimd_flush_thread_vl(enum vec_type type) +{ + int vl, supported_vl; + + /* + * Reset the task vector length as required. This is where we + * ensure that all user tasks have a valid vector length + * configured: no kernel task can become a user task without + * an exec and hence a call to this function. By the time the + * first call to this function is made, all early hardware + * probing is complete, so __sve_default_vl should be valid. + * If a bug causes this to go wrong, we make some noise and + * try to fudge thread.sve_vl to a safe value here. 
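The exec-time vector length reset handled here is also visible from user space through the SVE prctl() interface: a VL set without PR_SVE_VL_INHERIT reverts to the system default at the next exec. A minimal user-space sketch (assumes a libc whose <sys/prctl.h> exposes the SVE prctl constants, otherwise include <linux/prctl.h>; the requested 32-byte VL is illustrative and may be rounded to a supported value by the kernel):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Ask for a 32-byte (256-bit) VL and keep it across exec */
	if (prctl(PR_SVE_SET_VL, 32 | PR_SVE_VL_INHERIT) < 0)
		perror("PR_SVE_SET_VL");

	int ret = prctl(PR_SVE_GET_VL);
	if (ret >= 0)
		printf("VL: %d bytes, inherit: %s\n",
		       ret & PR_SVE_VL_LEN_MASK,
		       (ret & PR_SVE_VL_INHERIT) ? "yes" : "no");
	return 0;
}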
+ */ + vl = task_get_vl_onexec(current, type); + if (!vl) + vl = get_default_vl(type); + + if (WARN_ON(!sve_vl_valid(vl))) + vl = vl_info[type].min_vl; + + supported_vl = find_supported_vector_length(type, vl); + if (WARN_ON(supported_vl != vl)) + vl = supported_vl; + + task_set_vl(current, type, vl); + + /* + * If the task is not set to inherit, ensure that the vector + * length will be reset by a subsequent exec: + */ + if (!test_thread_flag(vec_vl_inherit_flag(type))) + task_set_vl_onexec(current, type, 0); } void fpsimd_flush_thread(void) { + void *sve_state = NULL; + void *za_state = NULL; + if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); - memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); - set_thread_flag(TIF_FOREIGN_FPSTATE); + memset(&current->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); - local_bh_enable(); + if (system_supports_sve()) { + clear_thread_flag(TIF_SVE); + + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; + + fpsimd_flush_thread_vl(ARM64_VEC_SVE); + } + + if (system_supports_sme()) { + clear_thread_flag(TIF_SME); + + /* Defer kfree() while in atomic context */ + za_state = current->thread.za_state; + current->thread.za_state = NULL; + + fpsimd_flush_thread_vl(ARM64_VEC_SME); + current->thread.svcr = 0; + } + + put_cpu_fpsimd_context(); + kfree(sve_state); + kfree(za_state); } /* @@ -190,72 +1610,201 @@ void fpsimd_preserve_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); + fpsimd_save(); + put_cpu_fpsimd_context(); +} - if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(&current->thread.fpsimd_state); +/* + * Like fpsimd_preserve_current_state(), but ensure that + * current->thread.uw.fpsimd_state is updated so that it can be copied to + * the signal frame. + */ +void fpsimd_signal_preserve_current_state(void) +{ + fpsimd_preserve_current_state(); + if (test_thread_flag(TIF_SVE)) + sve_to_fpsimd(current); +} + +/* + * Associate current's FPSIMD context with this cpu + * The caller must have ownership of the cpu FPSIMD context before calling + * this function. + */ +static void fpsimd_bind_task_to_cpu(void) +{ + struct fpsimd_last_state_struct *last = + this_cpu_ptr(&fpsimd_last_state); + + WARN_ON(!system_supports_fpsimd()); + last->st = &current->thread.uw.fpsimd_state; + last->sve_state = current->thread.sve_state; + last->za_state = current->thread.za_state; + last->sve_vl = task_get_sve_vl(current); + last->sme_vl = task_get_sme_vl(current); + last->svcr = &current->thread.svcr; + current->thread.fpsimd_cpu = smp_processor_id(); - local_bh_enable(); + /* + * Toggle SVE and SME trapping for userspace if needed; these + * are serialised by ret_to_user(). 
+ */ + if (system_supports_sme()) { + if (test_thread_flag(TIF_SME)) + sme_user_enable(); + else + sme_user_disable(); + } + + if (system_supports_sve()) { + if (test_thread_flag(TIF_SVE)) + sve_user_enable(); + else + sve_user_disable(); + } +} + +void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, + unsigned int sve_vl, void *za_state, + unsigned int sme_vl, u64 *svcr) +{ + struct fpsimd_last_state_struct *last = + this_cpu_ptr(&fpsimd_last_state); + + WARN_ON(!system_supports_fpsimd()); + WARN_ON(!in_softirq() && !irqs_disabled()); + + last->st = st; + last->svcr = svcr; + last->sve_state = sve_state; + last->za_state = za_state; + last->sve_vl = sve_vl; + last->sme_vl = sme_vl; } /* * Load the userland FPSIMD state of 'current' from memory, but only if the * FPSIMD state already held in the registers is /not/ the most recent FPSIMD - * state of 'current' + * state of 'current'. This is called when we are preparing to return to + * userspace to ensure that userspace sees a good register state. */ void fpsimd_restore_current_state(void) { - if (!system_supports_fpsimd()) + /* + * For the tasks that were created before we detected the absence of + * FP/SIMD, the TIF_FOREIGN_FPSTATE could be set via fpsimd_thread_switch(), + * e.g., init. This could then be inherited by child processes. + * If we later detect that the system doesn't support FP/SIMD, + * we must clear the flag for all the tasks to indicate that the + * FPSTATE is clean (as we can't have one) to avoid looping for ever in + * do_notify_resume(). + */ + if (!system_supports_fpsimd()) { + clear_thread_flag(TIF_FOREIGN_FPSTATE); return; + } - local_bh_disable(); + get_cpu_fpsimd_context(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { - struct fpsimd_state *st = &current->thread.fpsimd_state; - - fpsimd_load_state(st); - __this_cpu_write(fpsimd_last_state, st); - st->cpu = smp_processor_id(); + task_fpsimd_load(); + fpsimd_bind_task_to_cpu(); } - local_bh_enable(); + put_cpu_fpsimd_context(); } /* * Load an updated userland FPSIMD state for 'current' from memory and set the * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current' + * FPSIMD state of 'current'. This is used by the signal code to restore the + * register state when returning from a signal handler in FPSIMD-only cases; + * any SVE context will be discarded. */ -void fpsimd_update_current_state(struct fpsimd_state *state) +void fpsimd_update_current_state(struct user_fpsimd_state const *state) { - if (!system_supports_fpsimd()) + if (WARN_ON(!system_supports_fpsimd())) return; - local_bh_disable(); + get_cpu_fpsimd_context(); - fpsimd_load_state(state); - if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { - struct fpsimd_state *st = &current->thread.fpsimd_state; + current->thread.uw.fpsimd_state = *state; + if (test_thread_flag(TIF_SVE)) + fpsimd_to_sve(current); - __this_cpu_write(fpsimd_last_state, st); - st->cpu = smp_processor_id(); - } + task_fpsimd_load(); + fpsimd_bind_task_to_cpu(); - local_bh_enable(); + clear_thread_flag(TIF_FOREIGN_FPSTATE); + + put_cpu_fpsimd_context(); } /* * Invalidate live CPU copies of task t's FPSIMD state + * + * This function may be called with preemption enabled. The barrier() + * ensures that the assignment to fpsimd_cpu is visible to any + * preemption/softirq that could race with set_tsk_thread_flag(), so + * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared. 
+ * + * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any + * subsequent code. */ void fpsimd_flush_task_state(struct task_struct *t) { - t->thread.fpsimd_state.cpu = NR_CPUS; + t->thread.fpsimd_cpu = NR_CPUS; + /* + * If we don't support fpsimd, bail out after we have + * reset the fpsimd_cpu for this task and clear the + * FPSTATE. + */ + if (!system_supports_fpsimd()) + return; + barrier(); + set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE); + + barrier(); } -#ifdef CONFIG_KERNEL_MODE_NEON +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. + */ +static void fpsimd_flush_cpu_state(void) +{ + WARN_ON(!system_supports_fpsimd()); + __this_cpu_write(fpsimd_last_state.st, NULL); -DEFINE_PER_CPU(bool, kernel_neon_busy); -EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + set_thread_flag(TIF_FOREIGN_FPSTATE); +} + +/* + * Save the FPSIMD state to memory and invalidate cpu view. + * This function must be called with preemption disabled. + */ +void fpsimd_save_and_flush_cpu_state(void) +{ + if (!system_supports_fpsimd()) + return; + WARN_ON(preemptible()); + __get_cpu_fpsimd_context(); + fpsimd_save(); + fpsimd_flush_cpu_state(); + __put_cpu_fpsimd_context(); +} + +#ifdef CONFIG_KERNEL_MODE_NEON /* * Kernel-side NEON support functions @@ -281,20 +1830,13 @@ void kernel_neon_begin(void) BUG_ON(!may_use_simd()); - local_bh_disable(); - - __this_cpu_write(kernel_neon_busy, true); + get_cpu_fpsimd_context(); - /* Save unsaved task fpsimd state, if any: */ - if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(&current->thread.fpsimd_state); + /* Save unsaved fpsimd state, if any: */ + fpsimd_save(); /* Invalidate any task state remaining in the fpsimd regs: */ - __this_cpu_write(fpsimd_last_state, NULL); - - preempt_disable(); - - local_bh_enable(); + fpsimd_flush_cpu_state(); } EXPORT_SYMBOL(kernel_neon_begin); @@ -309,22 +1851,19 @@ EXPORT_SYMBOL(kernel_neon_begin); */ void kernel_neon_end(void) { - bool busy; - if (!system_supports_fpsimd()) return; - busy = __this_cpu_xchg(kernel_neon_busy, false); - WARN_ON(!busy); /* No matching kernel_neon_begin()? 
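For context, callers of this kernel-mode NEON API are expected to probe may_use_simd() and bracket any SIMD use with kernel_neon_begin()/kernel_neon_end(). The following sketch is illustrative only (do_fast_copy() and the memcpy() stand-ins are hypothetical, not an existing kernel function):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/neon.h>
#include <asm/simd.h>

void do_fast_copy(u8 *dst, const u8 *src, size_t len)
{
	if (may_use_simd()) {
		kernel_neon_begin();
		memcpy(dst, src, len);	/* stand-in for a NEON-optimised body */
		kernel_neon_end();
	} else {
		memcpy(dst, src, len);	/* scalar fallback */
	}
}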
*/ - - preempt_enable(); + put_cpu_fpsimd_context(); } EXPORT_SYMBOL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); +static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); +static DEFINE_PER_CPU(bool, efi_sve_state_used); +static DEFINE_PER_CPU(bool, efi_sm_state); /* * EFI runtime services support functions @@ -350,10 +1889,46 @@ void __efi_fpsimd_begin(void) WARN_ON(preemptible()); - if (may_use_simd()) + if (may_use_simd()) { kernel_neon_begin(); - else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + } else { + /* + * If !efi_sve_state, SVE can't be in use yet and doesn't need + * preserving: + */ + if (system_supports_sve() && likely(efi_sve_state)) { + char *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + u64 svcr; + + __this_cpu_write(efi_sve_state_used, true); + + if (system_supports_sme()) { + svcr = read_sysreg_s(SYS_SVCR); + + __this_cpu_write(efi_sm_state, + svcr & SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. + */ + if (!system_supports_fa64()) + ffr = !(svcr & SVCR_SM_MASK); + } + + sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), + &this_cpu_ptr(&efi_fpsimd_state)->fpsr, + ffr); + + if (system_supports_sme()) + sysreg_clear_set_s(SYS_SVCR, + SVCR_SM_MASK, 0); + + } else { + fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + } + __this_cpu_write(efi_fpsimd_state_used, true); } } @@ -366,10 +1941,43 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (__this_cpu_xchg(efi_fpsimd_state_used, false)) - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); - else + if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { kernel_neon_end(); + } else { + if (system_supports_sve() && + likely(__this_cpu_read(efi_sve_state_used))) { + char const *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + + /* + * Restore streaming mode; EFI calls are + * normal function calls so should not return in + * streaming mode. + */ + if (system_supports_sme()) { + if (__this_cpu_read(efi_sm_state)) { + sysreg_clear_set_s(SYS_SVCR, + 0, + SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. 
+ */ + if (!system_supports_fa64()) + ffr = false; + } + } + + sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), + &this_cpu_ptr(&efi_fpsimd_state)->fpsr, + ffr); + + __this_cpu_write(efi_sve_state_used, false); + } else { + fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + } + } } #endif /* CONFIG_EFI */ @@ -382,13 +1990,9 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, { switch (cmd) { case CPU_PM_ENTER: - if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(&current->thread.fpsimd_state); - this_cpu_write(fpsimd_last_state, NULL); + fpsimd_save_and_flush_cpu_state(); break; case CPU_PM_EXIT: - if (current->mm) - set_thread_flag(TIF_FOREIGN_FPSTATE); break; case CPU_PM_ENTER_FAILED: default: @@ -413,7 +2017,7 @@ static inline void fpsimd_pm_init(void) { } #ifdef CONFIG_HOTPLUG_CPU static int fpsimd_cpu_dead(unsigned int cpu) { - per_cpu(fpsimd_last_state, cpu) = NULL; + per_cpu(fpsimd_last_state.st, cpu) = NULL; return 0; } @@ -432,16 +2036,23 @@ static inline void fpsimd_hotplug_init(void) { } */ static int __init fpsimd_init(void) { - if (elf_hwcap & HWCAP_FP) { + if (cpu_have_named_feature(FP)) { fpsimd_pm_init(); fpsimd_hotplug_init(); } else { pr_notice("Floating-point is not implemented\n"); } - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); + + if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE)) + pr_notice("SME is implemented but not SVE\n"); + + sve_sysctl_init(); + sme_sysctl_init(); + return 0; } core_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace-mod.S b/arch/arm64/kernel/ftrace-mod.S deleted file mode 100644 index 00c4025be4ff..000000000000 --- a/arch/arm64/kernel/ftrace-mod.S +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .section ".text.ftrace_trampoline", "ax" - .align 3 -0: .quad 0 -__ftrace_trampoline: - ldr x16, 0b - br x16 -ENDPROC(__ftrace_trampoline) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c13b1fca0e5b..8745175f4a75 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/ftrace.c * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/ftrace.h> @@ -18,6 +15,7 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> +#include <asm/patching.h> #ifdef CONFIG_DYNAMIC_FTRACE /* @@ -58,79 +56,158 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned long pc; u32 new; - pc = (unsigned long)&ftrace_call; + pc = (unsigned long)ftrace_call; new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK); return ftrace_modify_code(pc, 0, new, false); } +static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) +{ +#ifdef CONFIG_ARM64_MODULE_PLTS + struct plt_entry *plt = mod->arch.ftrace_trampolines; + + if (addr == FTRACE_ADDR) + return &plt[FTRACE_PLT_IDX]; + if (addr == FTRACE_REGS_ADDR && + IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + return &plt[FTRACE_REGS_PLT_IDX]; +#endif + return NULL; +} + /* - * Turn on the call to ftrace_caller() in instrumented function + * Find the address the callsite must branch to in order to reach '*addr'. + * + * Due to the limited range of 'BL' instructions, modules may be placed too far + * away to branch directly and must use a PLT. + * + * Returns true when '*addr' contains a reachable target address, or has been + * modified to contain a PLT address. Returns false otherwise. */ -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, + struct module *mod, + unsigned long *addr) { unsigned long pc = rec->ip; - u32 old, new; - long offset = (long)pc - (long)addr; + long offset = (long)*addr - (long)pc; + struct plt_entry *plt; - if (offset < -SZ_128M || offset >= SZ_128M) { -#ifdef CONFIG_ARM64_MODULE_PLTS - unsigned long *trampoline; - struct module *mod; - - /* - * On kernels that support module PLTs, the offset between the - * branch instruction and its target may legally exceed the - * range of an ordinary relative 'bl' opcode. In this case, we - * need to branch via a trampoline in the module. - * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' - * retains its validity throughout the remainder of this code. - */ + /* + * When the target is within range of the 'BL' instruction, use 'addr' + * as-is and branch to that directly. + */ + if (offset >= -SZ_128M && offset < SZ_128M) + return true; + + /* + * When the target is outside of the range of a 'BL' instruction, we + * must use a PLT to reach it. We can only place PLTs for modules, and + * only when module PLT support is built-in. + */ + if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) + return false; + + /* + * 'mod' is only set at module load time, but if we end up + * dealing with an out-of-range condition, we can assume it + * is due to a module being loaded far away from the kernel. + * + * NOTE: __module_text_address() must be called with preemption + * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * retains its validity throughout the remainder of this code. + */ + if (!mod) { preempt_disable(); mod = __module_text_address(pc); preempt_enable(); + } - if (WARN_ON(!mod)) - return -EINVAL; + if (WARN_ON(!mod)) + return false; - /* - * There is only one ftrace trampoline per module. For now, - * this is not a problem since on arm64, all dynamic ftrace - * invocations are routed via ftrace_caller(). This will need - * to be revisited if support for multiple ftrace entry points - * is added in the future, but for now, the pr_err() below - * deals with a theoretical issue only. 
- */ - trampoline = (unsigned long *)mod->arch.ftrace_trampoline; - if (trampoline[0] != addr) { - if (trampoline[0] != 0) { - pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); - return -EINVAL; - } - - /* point the trampoline to our ftrace entry point */ - module_disable_ro(mod); - trampoline[0] = addr; - module_enable_ro(mod, true); - - /* update trampoline before patching in the branch */ - smp_wmb(); - } - addr = (unsigned long)&trampoline[1]; -#else /* CONFIG_ARM64_MODULE_PLTS */ - return -EINVAL; -#endif /* CONFIG_ARM64_MODULE_PLTS */ + plt = get_ftrace_plt(mod, *addr); + if (!plt) { + pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); + return false; } + *addr = (unsigned long)plt; + return true; +} + +/* + * Turn on the call to ftrace_caller() in instrumented function + */ +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; + old = aarch64_insn_gen_nop(); new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); return ftrace_modify_code(pc, old, new, true); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + + if (!ftrace_find_callable_addr(rec, NULL, &old_addr)) + return -EINVAL; + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; + + old = aarch64_insn_gen_branch_imm(pc, old_addr, + AARCH64_INSN_BRANCH_LINK); + new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); + + return ftrace_modify_code(pc, old, new, true); +} + +/* + * The compiler has inserted two NOPs before the regular function prologue. + * All instrumented functions follow the AAPCS, so x0-x8 and x19-x30 are live, + * and x9-x18 are free for our use. + * + * At runtime we want to be able to swing a single NOP <-> BL to enable or + * disable the ftrace call. The BL requires us to save the original LR value, + * so here we insert a <MOV X9, LR> over the first NOP so the instructions + * before the regular prologue are: + * + * | Compiled | Disabled | Enabled | + * +----------+------------+------------+ + * | NOP | MOV X9, LR | MOV X9, LR | + * | NOP | NOP | BL <entry> | + * + * The LR value will be recovered by ftrace_regs_entry, and restored into LR + * before returning to the regular function prologue. When a function is not + * being traced, the MOV is not harmful given x9 is not live per the AAPCS. + * + * Note: ftrace_process_locs() has pre-adjusted rec->ip to be the address of + * the BL. 
+ */ +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) +{ + unsigned long pc = rec->ip - AARCH64_INSN_SIZE; + u32 old, new; + + old = aarch64_insn_gen_nop(); + new = aarch64_insn_gen_move_reg(AARCH64_INSN_REG_9, + AARCH64_INSN_REG_LR, + AARCH64_INSN_VARIANT_64BIT); + return ftrace_modify_code(pc, old, new, true); +} +#endif + /* * Turn off the call to ftrace_caller() in instrumented function */ @@ -138,66 +215,37 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; - bool validate = true; u32 old = 0, new; - long offset = (long)pc - (long)addr; - if (offset < -SZ_128M || offset >= SZ_128M) { -#ifdef CONFIG_ARM64_MODULE_PLTS - u32 replaced; - - /* - * 'mod' is only set at module load time, but if we end up - * dealing with an out-of-range condition, we can assume it - * is due to a module being loaded far away from the kernel. - */ - if (!mod) { - preempt_disable(); - mod = __module_text_address(pc); - preempt_enable(); - - if (WARN_ON(!mod)) - return -EINVAL; - } - - /* - * The instruction we are about to patch may be a branch and - * link instruction that was redirected via a PLT entry. In - * this case, the normal validation will fail, but we can at - * least check that we are dealing with a branch and link - * instruction that points into the right module. - */ - if (aarch64_insn_read((void *)pc, &replaced)) - return -EFAULT; + new = aarch64_insn_gen_nop(); - if (!aarch64_insn_is_bl(replaced) || - !within_module(pc + aarch64_get_branch_offset(replaced), - mod)) - return -EINVAL; + /* + * When using mcount, callsites in modules may have been initialized to + * call an arbitrary module PLT (which redirects to the _mcount stub) + * rather than the ftrace PLT we'll use at runtime (which redirects to + * the ftrace trampoline). We can ignore the old PLT when initializing + * the callsite. + * + * Note: 'mod' is only set at module load time. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && + IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && mod) { + return aarch64_insn_patch_text_nosync((void *)pc, new); + } - validate = false; -#else /* CONFIG_ARM64_MODULE_PLTS */ + if (!ftrace_find_callable_addr(rec, mod, &addr)) return -EINVAL; -#endif /* CONFIG_ARM64_MODULE_PLTS */ - } else { - old = aarch64_insn_gen_branch_imm(pc, addr, - AARCH64_INSN_BRANCH_LINK); - } - new = aarch64_insn_gen_nop(); + old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); - return ftrace_modify_code(pc, old, new, validate); + return ftrace_modify_code(pc, old, new, true); } void arch_ftrace_update_code(int command) { + command |= FTRACE_MAY_SLEEP; ftrace_modify_all_code(command); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -206,16 +254,12 @@ int __init ftrace_dyn_arch_init(void) * on the way back to parent. For this purpose, this function is called * in _mcount() or ftrace_caller() to replace return address (*parent) on * the call stack to return_to_handler. - * - * Note that @frame_pointer is used only for sanity check later. 
*/ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; unsigned long old; - struct ftrace_graph_ent trace; - int err; if (unlikely(atomic_read(&current->tracing_graph_pause))) return; @@ -227,22 +271,29 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, */ old = *parent; - trace.func = self_addr; - trace.depth = current->curr_ret_stack + 1; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - return; - - err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer, NULL); - if (err == -EBUSY) - return; - else + if (!function_graph_enter(old, self_addr, frame_pointer, + (void *)frame_pointer)) { *parent = return_hooker; + } } #ifdef CONFIG_DYNAMIC_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + /* + * When DYNAMIC_FTRACE_WITH_REGS is selected, `fregs` can never be NULL + * and arch_ftrace_get_regs(fregs) will always give a non-NULL pt_regs + * in which we can safely modify the LR. + */ + struct pt_regs *regs = arch_ftrace_get_regs(fregs); + unsigned long *parent = (unsigned long *)&procedure_link_pointer(regs); + + prepare_ftrace_return(ip, parent, frame_pointer(regs)); +} +#else /* * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() * depending on @enable. */ @@ -272,5 +323,6 @@ int ftrace_disable_ftrace_graph_caller(void) { return ftrace_modify_graph_caller(false); } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0b243ecaf7ac..2196aad7b55b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Low-level CPU initialisation * Based on arch/arm/kernel/head.S @@ -6,37 +7,29 @@ * Copyright (C) 2003-2012 ARM Ltd. * Authors: Catalin Marinas <catalin.marinas@arm.com> * Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/linkage.h> #include <linux/init.h> -#include <linux/irqchip/arm-gic-v3.h> +#include <linux/pgtable.h> +#include <asm/asm_pointer_auth.h> #include <asm/assembler.h> #include <asm/boot.h> +#include <asm/bug.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <asm/cache.h> #include <asm/cputype.h> +#include <asm/el2_setup.h> #include <asm/elf.h> +#include <asm/image.h> #include <asm/kernel-pgtable.h> #include <asm/kvm_arm.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> -#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> #include <asm/thread_info.h> @@ -44,14 +37,8 @@ #include "efi-header.S" -#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET) - -#if (TEXT_OFFSET & 0xfff) != 0 -#error TEXT_OFFSET must be at least 4KB aligned -#elif (PAGE_OFFSET & 0x1fffff) != 0 +#if (PAGE_OFFSET & 0x1fffff) != 0 #error PAGE_OFFSET must be at least 2MB aligned -#elif TEXT_OFFSET > 0x1fffff -#error TEXT_OFFSET must be less than 2MB #endif /* @@ -62,44 +49,26 @@ * MMU = off, D-cache = off, I-cache = on or off, * x0 = physical address to the FDT blob. * - * This code is mostly position independent so you call this at - * __pa(PAGE_OFFSET + TEXT_OFFSET). - * * Note that the callee-saved registers are used for storing variables * that are useful before the MMU is enabled. The allocations are described * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b stext -#else - b stext // branch to kernel start, magic - .long 0 // reserved -#endif - le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + efi_signature_nop // special NOP to identity as PE/COFF executable + b primary_entry // branch to kernel start, magic + .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved - .ascii "ARM\x64" // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .ascii ARM64_IMAGE_MAGIC // Magic number + .long .Lpe_header_offset // Offset to the PE header. -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif __INIT @@ -108,32 +77,42 @@ pe_header: * primary lowlevel boot path: * * Register Scope Purpose - * x21 stext() .. start_kernel() FDT pointer passed at boot in x0 - * x23 stext() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers + * x20 primary_entry() .. __primary_switch() CPU boot mode + * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 + * x22 create_idmap() .. start_kernel() ID map VA of the DT blob + * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset + * x24 __primary_switch() linear map KASLR seed + * x25 primary_entry() .. 
start_kernel() supported VA size + * x28 create_idmap() callee preserved temp register */ -ENTRY(stext) +SYM_CODE_START(primary_entry) bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - adrp x23, __PHYS_OFFSET - and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 - bl set_cpu_boot_mode_flag - bl __create_page_tables + bl init_kernel_el // w0=cpu_boot_mode + mov x20, x0 + bl create_idmap + /* * The following calls CPU setup code, see arch/arm64/mm/proc.S for * details. * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ +#if VA_BITS > 48 + mrs_s x0, SYS_ID_AA64MMFR2_EL1 + tst x0, #0xf << ID_AA64MMFR2_EL1_VARange_SHIFT + mov x0, #VA_BITS + mov x25, #VA_BITS_MIN + csel x25, x25, x0, eq + mov x0, x25 +#endif bl __cpu_setup // initialise processor b __primary_switch -ENDPROC(stext) +SYM_CODE_END(primary_entry) /* * Preserve the arguments passed by the bootloader in x0 .. x3 */ -preserve_boot_args: +SYM_CODE_START_LOCAL(preserve_boot_args) mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of @@ -143,117 +122,196 @@ preserve_boot_args: dmb sy // needed before dc ivac with // MMU off - mov x1, #0x20 // 4 x 8 bytes - b __inval_dcache_area // tail call -ENDPROC(preserve_boot_args) + add x1, x0, #0x20 // 4 x 8 bytes + b dcache_inval_poc // tail call +SYM_CODE_END(preserve_boot_args) + +SYM_FUNC_START_LOCAL(clear_page_tables) + /* + * Clear the init page tables. + */ + adrp x0, init_pg_dir + adrp x1, init_pg_end + sub x2, x1, x0 + mov x1, xzr + b __pi_memset // tail call +SYM_FUNC_END(clear_page_tables) /* - * Macro to create a table entry to the next page. + * Macro to populate page table entries, these entries can be pointers to the next level + * or last level entries pointing to physical memory. * * tbl: page table address - * virt: virtual address - * shift: #imm page table shift - * ptrs: #imm pointers per table page + * rtbl: pointer to page table or physical memory + * index: start index to write + * eindex: end index to write - [index, eindex] written to + * flags: flags for pagetable entry to or in + * inc: increment to rtbl between each entry + * tmp1: temporary variable + * + * Preserves: tbl, eindex, flags, inc + * Corrupts: index, tmp1 + * Returns: rtbl + */ + .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 +.Lpe\@: phys_to_pte \tmp1, \rtbl + orr \tmp1, \tmp1, \flags // tmp1 = table entry + str \tmp1, [\tbl, \index, lsl #3] + add \rtbl, \rtbl, \inc // rtbl = pa next level + add \index, \index, #1 + cmp \index, \eindex + b.ls .Lpe\@ + .endm + +/* + * Compute indices of table entries from virtual address range. If multiple entries + * were needed in the previous page table level then the next page table level is assumed + * to be composed of multiple pages. (This effectively scales the end index). + * + * vstart: virtual address of start of range + * vend: virtual address of end of range - we map [vstart, vend] + * shift: shift used to transform virtual address into index + * order: #imm 2log(number of entries in page table) + * istart: index in table corresponding to vstart + * iend: index in table corresponding to vend + * count: On entry: how many extra entries were required in previous level, scales + * our end index. 
+ * On exit: returns how many extra entries required for next page table level * - * Preserves: virt - * Corrupts: tmp1, tmp2 - * Returns: tbl -> next level table page address + * Preserves: vstart, vend + * Returns: istart, iend, count */ - .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 - lsr \tmp1, \virt, #\shift - and \tmp1, \tmp1, #\ptrs - 1 // table index - add \tmp2, \tbl, #PAGE_SIZE - orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type - str \tmp2, [\tbl, \tmp1, lsl #3] - add \tbl, \tbl, #PAGE_SIZE // next level table page + .macro compute_indices, vstart, vend, shift, order, istart, iend, count + ubfx \istart, \vstart, \shift, \order + ubfx \iend, \vend, \shift, \order + add \iend, \iend, \count, lsl \order + sub \count, \iend, \istart .endm /* - * Macro to populate the PGD (and possibily PUD) for the corresponding - * block entry in the next level (tbl) for the given virtual address. + * Map memory for specified virtual address range. Each level of page table needed supports + * multiple entries. If a level requires n entries the next page table level is assumed to be + * formed from n pages. + * + * tbl: location of page table + * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) + * vstart: virtual address of start of range + * vend: virtual address of end of range - we map [vstart, vend - 1] + * flags: flags to use to map last level entries + * phys: physical address corresponding to vstart - physical memory is contiguous + * order: #imm 2log(number of entries in PGD table) * - * Preserves: tbl, next, virt - * Corrupts: tmp1, tmp2 + * If extra_shift is set, an extra level will be populated if the end address does + * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. 
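To make the index arithmetic above concrete, here is a stand-alone C model of a single compute_indices step (assuming 4K pages, i.e. 9 index bits per level; the addresses and level are illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Extract 'order' bits of 'va' starting at 'shift', like ubfx */
static uint64_t va_index(uint64_t va, unsigned int shift, unsigned int order)
{
	return (va >> shift) & ((1ULL << order) - 1);
}

int main(void)
{
	const unsigned int shift = 21, order = 9;	/* 2 MiB blocks, 4K pages */
	uint64_t vstart = 0xffff800008000000ULL;	/* illustrative range */
	uint64_t vend   = 0xffff80000a1fffffULL;	/* inclusive end */
	uint64_t count  = 0;	/* extra entries carried from the previous level */

	uint64_t istart = va_index(vstart, shift, order);
	uint64_t iend   = va_index(vend, shift, order) + (count << order);

	printf("istart=%llu iend=%llu next-level count=%llu\n",
	       (unsigned long long)istart, (unsigned long long)iend,
	       (unsigned long long)(iend - istart));
	return 0;
}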
+ * + * Temporaries: istart, iend, tmp, count, sv - these need to be different registers + * Preserves: vstart, flags + * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ - .macro create_pgd_entry, tbl, virt, tmp1, tmp2 - create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2 + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift + sub \vend, \vend, #1 + add \rtbl, \tbl, #PAGE_SIZE + mov \count, #0 + + .ifnb \extra_shift + tst \vend, #~((1 << (\extra_shift)) - 1) + b.eq .L_\@ + compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + .endif +.L_\@: + compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count + mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + #if SWAPPER_PGTABLE_LEVELS > 3 - create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2 + compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv #endif + #if SWAPPER_PGTABLE_LEVELS > 2 - create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2 + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv #endif + + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 + populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm /* - * Macro to populate block entries in the page table for the start..end - * virtual range (inclusive). + * Remap a subregion created with the map_memory macro with modified attributes + * or output address. The entire remapped region must have been covered in the + * invocation of map_memory. * - * Preserves: tbl, flags - * Corrupts: phys, start, end, pstate + * x0: last level table address (returned in first argument to map_memory) + * x1: start VA of the existing mapping + * x2: start VA of the region to update + * x3: end VA of the region to update (exclusive) + * x4: start PA associated with the region to update + * x5: attributes to set on the updated region + * x6: order of the last level mappings */ - .macro create_block_map, tbl, flags, phys, start, end - lsr \phys, \phys, #SWAPPER_BLOCK_SHIFT - lsr \start, \start, #SWAPPER_BLOCK_SHIFT - and \start, \start, #PTRS_PER_PTE - 1 // table index - orr \phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT // table entry - lsr \end, \end, #SWAPPER_BLOCK_SHIFT - and \end, \end, #PTRS_PER_PTE - 1 // table end index -9999: str \phys, [\tbl, \start, lsl #3] // store the entry - add \start, \start, #1 // next entry - add \phys, \phys, #SWAPPER_BLOCK_SIZE // next block - cmp \start, \end - b.ls 9999b - .endm +SYM_FUNC_START_LOCAL(remap_region) + sub x3, x3, #1 // make end inclusive -/* - * Setup the initial page tables. We only setup the barest amount which is - * required to get the kernel running. 
The following sections are required: - * - identity mapping to enable the MMU (low address, TTBR0) - * - first few MB of the kernel linear mapping to jump to once the MMU has - * been enabled - */ -__create_page_tables: - mov x28, lr + // Get the index offset for the start of the last level table + lsr x1, x1, x6 + bfi x1, xzr, #0, #PAGE_SHIFT - 3 - /* - * Invalidate the idmap and swapper page tables to avoid potential - * dirty cache lines being evicted. - */ - adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) - bl __inval_dcache_area + // Derive the start and end indexes into the last level table + // associated with the provided region + lsr x2, x2, x6 + lsr x3, x3, x6 + sub x2, x2, x1 + sub x3, x3, x1 - /* - * Clear the idmap and swapper page tables. - */ - adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) -1: stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - subs x1, x1, #64 - b.ne 1b + mov x1, #1 + lsl x6, x1, x6 // block size at this level - mov x7, SWAPPER_MM_MMUFLAGS + populate_entries x0, x4, x2, x3, x5, x6, x7 + ret +SYM_FUNC_END(remap_region) +SYM_FUNC_START_LOCAL(create_idmap) + mov x28, lr /* - * Create the identity mapping. + * The ID map carries a 1:1 mapping of the physical address range + * covered by the loaded image, which could be anywhere in DRAM. This + * means that the required size of the VA (== PA) space is decided at + * boot time, and could be more than the configured size of the VA + * space for ordinary kernel and user space mappings. + * + * There are three cases to consider here: + * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover + * the placement of the image. In this case, we configure one extra + * level of translation on the fly for the ID map only. (This case + * also covers 42-bit VA/52-bit PA on 64k pages). + * + * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can + * only happen when using 64k pages, in which case we need to extend + * the root level table rather than add a level. Note that we can + * treat this case as 'always extended' as long as we take care not + * to program an unsupported T0SZ value into the TCR register. + * + * - Combinations that would require two additional levels of + * translation are not supported, e.g., VA_BITS==36 on 16k pages, or + * VA_BITS==39/4k pages with 5-level paging, where the input address + * requires more than 47 or 48 bits, respectively. */ - adrp x0, idmap_pg_dir - adrp x3, __idmap_text_start // __pa(__idmap_text_start) - -#ifndef CONFIG_ARM64_VA_BITS_48 +#if (VA_BITS < 48) +#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (48 - EXTRA_SHIFT)) /* - * If VA_BITS < 48, it may be too small to allow for an ID mapping to be - * created that covers system RAM if that is located sufficiently high - * in the physical address space. So for the ID map, use an extended - * virtual range in that case, by configuring an additional translation - * level. + * If VA_BITS < 48, we have to configure an additional table level. 
* First, we have to verify our assumption that the current value of * VA_BITS was chosen such that all translation levels are fully * utilised, and that lowering T0SZ will always result in an additional @@ -262,75 +320,112 @@ __create_page_tables: #if VA_BITS != EXTRA_SHIFT #error "Mismatch between VA_BITS and page size/number of translation levels" #endif - +#else +#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) +#define EXTRA_SHIFT /* - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of __idmap_text_end. + * If VA_BITS == 48, we don't have to configure an additional + * translation level, but the top-level table has more entries. */ - adrp x5, __idmap_text_end - clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? - b.ge 1f // .. then skip additional level - - adr_l x6, idmap_t0sz - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - - create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6 -1: #endif - - create_pgd_entry x0, x3, x5, x6 - mov x5, x3 // __pa(__idmap_text_start) - adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - create_block_map x0, x7, x3, x5, x6 + adrp x0, init_idmap_pg_dir + adrp x3, _text + adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE + mov x7, SWAPPER_RX_MMUFLAGS + + map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT + + /* Remap the kernel page tables r/w in the ID map */ + adrp x1, _text + adrp x2, init_pg_dir + adrp x3, init_pg_end + bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 + mov x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region + + /* Remap the FDT after the kernel image */ + adrp x1, _text + adrp x22, _end + SWAPPER_BLOCK_SIZE + bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 + bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address + add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE + bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 + mov x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region /* - * Map the kernel image (starting with PHYS_OFFSET). + * Since the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. */ - adrp x0, swapper_pg_dir - mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) + dmb sy + + adrp x0, init_idmap_pg_dir + adrp x1, init_idmap_pg_end + bl dcache_inval_poc + ret x28 +SYM_FUNC_END(create_idmap) + +SYM_FUNC_START_LOCAL(create_kernel_mapping) + adrp x0, init_pg_dir + mov_q x5, KIMAGE_VADDR // compile time __va(_text) +#ifdef CONFIG_RELOCATABLE add x5, x5, x23 // add KASLR displacement - create_pgd_entry x0, x5, x3, x6 +#endif adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - create_block_map x0, x7, x3, x5, x6 + mov x7, SWAPPER_RW_MMUFLAGS + + map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 + + dsb ishst // sync with page table walker + ret +SYM_FUNC_END(create_kernel_mapping) /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate the idmap and swapper page - * tables again to remove any speculatively loaded cache lines. + * Initialize CPU registers with task-specific and cpu-specific context. 
+ * + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. */ - adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) - dmb sy - bl __inval_dcache_area + .macro init_cpu_task tsk, tmp1, tmp2 + msr sp_el0, \tsk - ret x28 -ENDPROC(__create_page_tables) - .ltorg + ldr \tmp1, [\tsk, #TSK_STACK] + add sp, \tmp1, #THREAD_SIZE + sub sp, sp, #PT_REGS_SIZE + + stp xzr, xzr, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + + scs_load \tsk + + adr_l \tmp1, __per_cpu_offset + ldr w\tmp2, [\tsk, #TSK_TI_CPU] + ldr \tmp1, [\tmp1, \tmp2, lsl #3] + set_this_cpu_offset \tmp1 + .endm /* * The following fragment of code is executed with the MMU enabled. * - * x0 = __PHYS_OFFSET + * x0 = __pa(KERNEL_START) */ -__primary_switched: - adrp x4, init_thread_union - add sp, x4, #THREAD_SIZE - adr_l x5, init_task - msr sp_el0, x5 // Save thread_info +SYM_FUNC_START_LOCAL(__primary_switched) + adr_l x4, init_task + init_cpu_task x4, x5, x6 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb - stp xzr, x30, [sp, #-16]! + stp x29, x30, [sp, #-16]! mov x29, sp str_l x21, __fdt_pointer, x5 // Save FDT pointer @@ -339,6 +434,9 @@ __primary_switched: sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings + mov x0, x20 + bl set_cpu_boot_mode_flag + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -347,282 +445,185 @@ __primary_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_KASAN - bl kasan_early_init +#if VA_BITS > 48 + adr_l x8, vabits_actual // Set this early so KASAN early init + str x25, [x8] // ... observes the correct value + dc civac, x8 // Make visible to booting secondaries #endif + #ifdef CONFIG_RANDOMIZE_BASE - tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? - b.ne 0f - mov x0, x21 // pass FDT address in x0 - bl kaslr_early_init // parse FDT for KASLR options - cbz x0, 0f // KASLR disabled? just proceed - orr x23, x23, x0 // record KASLR offset - ldp x29, x30, [sp], #16 // we must enable KASLR, return - ret // to __primary_switch() -0: + adrp x5, memstart_offset_seed // Save KASLR linear map seed + strh w24, [x5, :lo12:memstart_offset_seed] +#endif +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + bl kasan_early_init #endif - add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel -ENDPROC(__primary_switched) + mov x0, x21 // pass FDT address in x0 + bl early_fdt_map // Try mapping the FDT early + mov x0, x20 // pass the full boot status + bl init_feature_override // Parse cpu feature overrides + mov x0, x20 + bl finalise_el2 // Prefer VHE if possible + ldp x29, x30, [sp], #16 + bl start_kernel + ASM_BUG() +SYM_FUNC_END(__primary_switched) /* * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - .section ".idmap.text","ax" - -ENTRY(kimage_vaddr) - .quad _text - TEXT_OFFSET + .section ".idmap.text","awx" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. 
* - * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if - * booted in EL1 or EL2 respectively. + * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if + * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. + * + * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x0 if + * booted in EL1 or EL2 respectively, with the top 32 bits containing + * potential context flags. These flags are *not* stored in __boot_cpu_mode. */ -ENTRY(el2_setup) - msr SPsel, #1 // We want to use SP_EL{1,2} +SYM_FUNC_START(init_kernel_el) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 - b.eq 1f - mrs x0, sctlr_el1 -CPU_BE( orr x0, x0, #(3 << 24) ) // Set the EE and E0E bits for EL1 -CPU_LE( bic x0, x0, #(3 << 24) ) // Clear the EE and E0E bits for EL1 + b.eq init_el2 + +SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 - mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb - ret - -1: mrs x0, sctlr_el2 -CPU_BE( orr x0, x0, #(1 << 25) ) // Set the EE bit for EL2 -CPU_LE( bic x0, x0, #(1 << 25) ) // Clear the EE bit for EL2 - msr sctlr_el2, x0 - -#ifdef CONFIG_ARM64_VHE - /* - * Check for VHE being present. For the rest of the EL2 setup, - * x2 being non-zero indicates that we do have VHE, and that the - * kernel is intended to run at EL2. - */ - mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #8, #4 -#else - mov x2, xzr -#endif + mov_q x0, INIT_PSTATE_EL1 + msr spsr_el1, x0 + msr elr_el1, lr + mov w0, #BOOT_CPU_MODE_EL1 + eret - /* Hyp configuration. */ - mov x0, #HCR_RW // 64-bit EL1 - cbz x2, set_hcr - orr x0, x0, #HCR_TGE // Enable Host Extensions - orr x0, x0, #HCR_E2H -set_hcr: +SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) + mov_q x0, HCR_HOST_NVHE_FLAGS msr hcr_el2, x0 isb - /* - * Allow Non-secure EL1 and EL0 to access physical timer and counter. - * This is not necessary for VHE, since the host kernel runs in EL2, - * and EL0 accesses are configured in the later stage of boot process. - * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout - * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined - * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 - * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in - * EL2. - */ - cbnz x2, 1f - mrs x0, cnthctl_el2 - orr x0, x0, #3 // Enable EL1 physical timers - msr cnthctl_el2, x0 -1: - msr cntvoff_el2, xzr // Clear virtual offset - -#ifdef CONFIG_ARM_GIC_V3 - /* GICv3 system register access */ - mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #24, #4 - cmp x0, #1 - b.ne 3f - - mrs_s x0, SYS_ICC_SRE_EL2 - orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 - orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 - msr_s SYS_ICC_SRE_EL2, x0 - isb // Make sure SRE is now set - mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, - tbz x0, #0, 3f // and check that it sticks - msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults - -3: -#endif - - /* Populate ID registers. 
*/ - mrs x0, midr_el1 - mrs x1, mpidr_el1 - msr vpidr_el2, x0 - msr vmpidr_el2, x1 + init_el2_state -#ifdef CONFIG_COMPAT - msr hstr_el2, xzr // Disable CP15 traps to EL2 -#endif - - /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx x0, x1, #8, #4 - cmp x0, #1 - b.lt 4f // Skip if no PMU present - mrs x0, pmcr_el0 // Disable debug access traps - ubfx x0, x0, #11, #5 // to EL2 and allow access to -4: - csel x3, xzr, x0, lt // all PMU counters from EL1 - - /* Statistical profiling */ - ubfx x0, x1, #32, #4 // Check ID_AA64DFR0_EL1 PMSVer - cbz x0, 6f // Skip if SPE not present - cbnz x2, 5f // VHE? - mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - orr x3, x3, x1 // If we don't have VHE, then - b 6f // use EL1&0 translation. -5: // For VHE, use EL2 translation - orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1 -6: - msr mdcr_el2, x3 // Configure debug traps - - /* Stage-2 translation */ - msr vttbr_el2, xzr - - cbz x2, install_el2_stub - - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + /* Hypervisor stub */ + adr_l x0, __hyp_stub_vectors + msr vbar_el2, x0 isb - ret -install_el2_stub: + mov_q x1, INIT_SCTLR_EL1_MMU_OFF + /* - * When VHE is not in use, early init of EL2 and EL1 needs to be - * done here. - * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. + * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, + * making it impossible to start in nVHE mode. Is that + * compliant with the architecture? Absolutely not! */ - /* sctlr_el1 */ - mov x0, #0x0800 // Set/clear RES{1,0} bits -CPU_BE( movk x0, #0x33d0, lsl #16 ) // Set EE and E0E on BE systems -CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems - msr sctlr_el1, x0 + mrs x0, hcr_el2 + and x0, x0, #HCR_E2H + cbz x0, 1f - /* Coprocessor traps. */ - mov x0, #0x33ff - msr cptr_el2, x0 // Disable copro. traps to EL2 + /* Set a sane SCTLR_EL1, the VHE way */ + msr_s SYS_SCTLR_EL12, x1 + mov x2, #BOOT_CPU_FLAG_E2H + b 2f - /* Hypervisor stub */ - adr_l x0, __hyp_stub_vectors - msr vbar_el2, x0 - - /* spsr */ - mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, x0 +1: + msr sctlr_el1, x1 + mov x2, xzr +2: msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + mov w0, #BOOT_CPU_MODE_EL2 + orr x0, x0, x2 eret -ENDPROC(el2_setup) +SYM_FUNC_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in w0. See arch/arm64/include/asm/virt.h for more info. */ -set_cpu_boot_mode_flag: +SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f add x1, x1, #4 -1: str w0, [x1] // This CPU has booted in EL1 - dmb sy - dc ivac, x1 // Invalidate potentially stale cache line +1: str w0, [x1] // Save CPU boot mode ret -ENDPROC(set_cpu_boot_mode_flag) - -/* - * These values are written with the MMU off, but read with the MMU on. - * Writers will invalidate the corresponding address, discarding up to a - * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures - * sufficient alignment that the CWG doesn't overlap another section. - */ - .pushsection ".mmuoff.data.write", "aw" -/* - * We need to find out the CPU boot mode long after boot, so we need to - * store it in a writable variable. 
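As context for set_cpu_boot_mode_flag above: __boot_cpu_mode is a two-word record where slot 0 starts as BOOT_CPU_MODE_EL2 and slot 1 as BOOT_CPU_MODE_EL1. A CPU that booted at EL2 overwrites slot 1 and a CPU that booted at EL1 overwrites slot 0, so the two slots agree only when every CPU entered the kernel at the same exception level. The C sketch below models the convention described in asm/virt.h; the helper names are invented for illustration.

#include <stdbool.h>

#define BOOT_CPU_MODE_EL1	(0xe11)
#define BOOT_CPU_MODE_EL2	(0xe12)

/* Slot 0 defaults to EL2, slot 1 to EL1; CPUs overwrite the opposite slot. */
static unsigned int boot_cpu_mode[2] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

static void set_cpu_boot_mode_flag_model(unsigned int mode)
{
	boot_cpu_mode[mode == BOOT_CPU_MODE_EL2 ? 1 : 0] = mode;
}

/* EL2 is usable only if *every* CPU entered the kernel at EL2. */
static bool hyp_mode_available(void)
{
	return boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
	       boot_cpu_mode[1] == BOOT_CPU_MODE_EL2;
}

/* A mix of EL1 and EL2 boots leaves the two slots disagreeing. */
static bool hyp_mode_mismatched(void)
{
	return boot_cpu_mode[0] != boot_cpu_mode[1];
}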
- * - * This is not in .bss, because we set it sufficiently early that the boot-time - * zeroing of .bss would clobber it. - */ -ENTRY(__boot_cpu_mode) - .long BOOT_CPU_MODE_EL2 - .long BOOT_CPU_MODE_EL1 -/* - * The booting CPU updates the failed status @__early_cpu_boot_status, - * with MMU turned off. - */ -ENTRY(__early_cpu_boot_status) - .long 0 - - .popsection +SYM_FUNC_END(set_cpu_boot_mode_flag) /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -ENTRY(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - bl set_cpu_boot_mode_flag - mrs x0, mpidr_el1 +SYM_FUNC_START(secondary_holding_pen) + bl init_kernel_el // w0=cpu_boot_mode + mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK - and x0, x0, x1 + and x2, x2, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] - cmp x4, x0 + cmp x4, x2 b.eq secondary_startup wfe b pen -ENDPROC(secondary_holding_pen) +SYM_FUNC_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -ENTRY(secondary_entry) - bl el2_setup // Drop to EL1 - bl set_cpu_boot_mode_flag +SYM_FUNC_START(secondary_entry) + bl init_kernel_el // w0=cpu_boot_mode b secondary_startup -ENDPROC(secondary_entry) +SYM_FUNC_END(secondary_entry) -secondary_startup: +SYM_FUNC_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ + mov x20, x0 // preserve boot mode + bl finalise_el2 + bl __cpu_secondary_check52bitva +#if VA_BITS > 48 + ldr_l x0, vabits_actual +#endif bl __cpu_setup // initialise processor + adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =__secondary_switched br x8 -ENDPROC(secondary_startup) +SYM_FUNC_END(secondary_startup) -__secondary_switched: +SYM_FUNC_START_LOCAL(__secondary_switched) + mov x0, x20 + bl set_cpu_boot_mode_flag + str_l xzr, __early_cpu_boot_status, x3 adr_l x5, vectors msr vbar_el1, x5 isb adr_l x0, secondary_data - ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - mov sp, x1 ldr x2, [x0, #CPU_BOOT_TASK] - msr sp_el0, x2 - mov x29, #0 - mov x30, #0 - b secondary_start_kernel -ENDPROC(__secondary_switched) + cbz x2, __secondary_too_slow + + init_cpu_task x2, x1, x3 + +#ifdef CONFIG_ARM64_PTR_AUTH + ptrauth_keys_init_cpu x2, x3, x4, x5 +#endif + + bl secondary_start_kernel + ASM_BUG() +SYM_FUNC_END(__secondary_switched) + +SYM_FUNC_START_LOCAL(__secondary_too_slow) + wfe + wfi + b __secondary_too_slow +SYM_FUNC_END(__secondary_too_slow) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -646,6 +647,8 @@ ENDPROC(__secondary_switched) * Enable the MMU. * * x0 = SCTLR_EL1 value for turning on the MMU. + * x1 = TTBR1_EL1 value + * x2 = ID map root table address * * Returns to the caller via x30/lr. This requires the caller to be covered * by the .idmap.text section. @@ -653,102 +656,170 @@ ENDPROC(__secondary_switched) * Checks if the selected granule size is supported by the CPU. 
* If it isn't, park the CPU */ -ENTRY(__enable_mmu) - mrs x1, ID_AA64MMFR0_EL1 - ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support - update_early_cpu_boot_status 0, x1, x2 - adrp x1, idmap_pg_dir - adrp x2, swapper_pg_dir - msr ttbr0_el1, x1 // load TTBR0 - msr ttbr1_el1, x2 // load TTBR1 - isb - msr sctlr_el1, x0 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb +SYM_FUNC_START(__enable_mmu) + mrs x3, ID_AA64MMFR0_EL1 + ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support + phys_to_ttbr x2, x2 + msr ttbr0_el1, x2 // load TTBR0 + load_ttbr1 x1, x1, x3 + + set_sctlr_el1 x0 + ret -ENDPROC(__enable_mmu) +SYM_FUNC_END(__enable_mmu) + +SYM_FUNC_START(__cpu_secondary_check52bitva) +#if VA_BITS > 48 + ldr_l x0, vabits_actual + cmp x0, #52 + b.ne 2f + + mrs_s x0, SYS_ID_AA64MMFR2_EL1 + and x0, x0, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT) + cbnz x0, 2f + + update_early_cpu_boot_status \ + CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 +1: wfe + wfi + b 1b + +#endif +2: ret +SYM_FUNC_END(__cpu_secondary_check52bitva) -__no_granule_support: +SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ - update_early_cpu_boot_status CPU_STUCK_IN_KERNEL, x1, x2 + update_early_cpu_boot_status \ + CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 1: wfe wfi b 1b -ENDPROC(__no_granule_support) +SYM_FUNC_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -__relocate_kernel: +SYM_FUNC_START_LOCAL(__relocate_kernel) /* * Iterate over each entry in the relocation table, and apply the * relocations in place. */ - ldr w9, =__rela_offset // offset to reloc table - ldr w10, =__rela_size // size of reloc table - + adr_l x9, __rela_start + adr_l x10, __rela_end mov_q x11, KIMAGE_VADDR // default virtual offset add x11, x11, x23 // actual virtual offset - add x9, x9, x11 // __va(.rela) - add x10, x9, x10 // __va(.rela) + sizeof(.rela) 0: cmp x9, x10 b.hs 1f - ldp x11, x12, [x9], #24 - ldr x13, [x9, #-8] - cmp w12, #R_AARCH64_RELATIVE + ldp x12, x13, [x9], #24 + ldr x14, [x9, #-8] + cmp w13, #R_AARCH64_RELATIVE b.ne 0b - add x13, x13, x23 // relocate - str x13, [x11, x23] + add x14, x14, x23 // relocate + str x14, [x12, x23] b 0b -1: ret -ENDPROC(__relocate_kernel) + +1: +#ifdef CONFIG_RELR + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the RELA + * section, which is handled above. + * + * Excluding the least significant bit in the bitmap, each non-zero + * bit in the bitmap represents a relocation to be applied to + * a corresponding machine word that follows the base address + * word. 
The second least significant bit represents the machine + * word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, + * a single bitmap can encode up to 63 relocations in a 64-bit object. + * + * In this implementation we store the address of the next RELR table + * entry in x9, the address being relocated by the current address or + * bitmap entry in x13 and the address being relocated by the current + * bit in x14. + */ + adr_l x9, __relr_start + adr_l x10, __relr_end + +2: cmp x9, x10 + b.hs 7f + ldr x11, [x9], #8 + tbnz x11, #0, 3f // branch to handle bitmaps + add x13, x11, x23 + ldr x12, [x13] // relocate address entry + add x12, x12, x23 + str x12, [x13], #8 // adjust to start of bitmap + b 2b + +3: mov x14, x13 +4: lsr x11, x11, #1 + cbz x11, 6f + tbz x11, #0, 5f // skip bit if not set + ldr x12, [x14] // relocate bit + add x12, x12, x23 + str x12, [x14] + +5: add x14, x14, #8 // move to next bit's address + b 4b + +6: /* + * Move to the next bitmap's address. 8 is the word size, and 63 is the + * number of significant bits in a bitmap entry. + */ + add x13, x13, #(8 * 63) + b 2b + +7: #endif + ret -__primary_switch: -#ifdef CONFIG_RANDOMIZE_BASE - mov x19, x0 // preserve new SCTLR_EL1 value - mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value +SYM_FUNC_END(__relocate_kernel) #endif +SYM_FUNC_START_LOCAL(__primary_switch) + adrp x1, reserved_pg_dir + adrp x2, init_idmap_pg_dir bl __enable_mmu #ifdef CONFIG_RELOCATABLE - bl __relocate_kernel + adrp x23, KERNEL_START + and x23, x23, MIN_KIMG_ALIGN - 1 #ifdef CONFIG_RANDOMIZE_BASE - ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET - blr x8 - - /* - * If we return here, we have a KASLR displacement in x23 which we need - * to take into account by discarding the current kernel mapping and - * creating a new one. - */ - msr sctlr_el1, x20 // disable the MMU - isb - bl __create_page_tables // recreate kernel mapping - - tlbi vmalle1 // Remove any stale TLB entries - dsb nsh - - msr sctlr_el1, x19 // re-enable the MMU - isb - ic iallu // flush instructions fetched - dsb nsh // via old mapping - isb + mov x0, x22 + adrp x1, init_pg_end + mov sp, x1 + mov x29, xzr + bl __pi_kaslr_early_init + and x24, x0, #SZ_2M - 1 // capture memstart offset seed + bic x0, x0, #SZ_2M - 1 + orr x23, x23, x0 // record kernel offset +#endif +#endif + bl clear_page_tables + bl create_kernel_mapping + adrp x1, init_pg_dir + load_ttbr1 x1, x1, x2 +#ifdef CONFIG_RELOCATABLE bl __relocate_kernel #endif -#endif ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET + adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 -ENDPROC(__primary_switch) +SYM_FUNC_END(__primary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index e56d848b6466..0e1d9c3c6a93 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Hibernate low-level support * * Copyright (C) 2016 ARM Ltd. * Author: James Morse <james.morse@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
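Returning to the RELR encoding documented in the __relocate_kernel comment above, the decode loop can be modelled in plain C roughly as follows. This is an illustrative user-space sketch, with entries treated as offsets from the load address rather than link-time virtual addresses; it is not the kernel's implementation.

#include <stdint.h>

static void apply_relr(uint8_t *image, const uint64_t *relr,
		       const uint64_t *relr_end, uint64_t delta)
{
	uint64_t *where = 0;

	for (; relr < relr_end; relr++) {
		uint64_t entry = *relr;

		if (!(entry & 1)) {
			/* Address entry: relocate one word, start a bitmap run. */
			where = (uint64_t *)(image + entry);
			*where++ += delta;
		} else {
			/* Bitmap entry: bits 1..63 cover the 63 following words. */
			uint64_t *p = where;

			for (uint64_t bits = entry >> 1; bits; bits >>= 1, p++)
				if (bits & 1)
					*p += delta;

			where += 63;	/* the next bitmap starts after these words */
		}
	}
}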
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/linkage.h> #include <linux/errno.h> @@ -27,23 +16,6 @@ #include <asm/virt.h> /* - * To prevent the possibility of old and new partial table walks being visible - * in the tlb, switch the ttbr to a zero page when we invalidate the old - * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i - * Even switching to our copied tables will cause a changed output address at - * each stage of the walk. - */ -.macro break_before_make_ttbr_switch zero_page, page_table - msr ttbr1_el1, \zero_page - isb - tlbi vmalle1 - dsb nsh - msr ttbr1_el1, \page_table - isb -.endm - - -/* * Resume from hibernate * * Loads temporary page tables then restores the memory image. @@ -53,7 +25,7 @@ * Because this code has to be copied to a 'safe' page, it can't call out to * other functions by PC-relative address. Also remember that it may be * mid-way through over-writing other functions. For this reason it contains - * code from flush_icache_range() and uses the copy_page() macro. + * code from caches_clean_inval_pou() and uses the copy_page() macro. * * This 'safe' page is mapped via ttbr0, and executed from there. This function * switches to a copy of the linear map in ttbr1, performs the restore, then @@ -73,12 +45,12 @@ * x5: physical address of a zero page that remains zero after resume */ .pushsection ".hibernate_exit.text", "ax" -ENTRY(swsusp_arch_suspend_exit) +SYM_CODE_START(swsusp_arch_suspend_exit) /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page */ - break_before_make_ttbr_switch x5, x0 + break_before_make_ttbr_switch x5, x0, x6, x8 mov x21, x1 mov x30, x2 @@ -95,11 +67,12 @@ ENTRY(swsusp_arch_suspend_exit) copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 add x1, x10, #PAGE_SIZE - /* Clean the copied page to PoU - based on flush_icache_range() */ + /* Clean the copied page to PoU - based on caches_clean_inval_pou() */ raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 -2: dc cvau, x4 /* clean D line / unified line */ +2: /* clean D line / unified line */ +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 2b @@ -109,7 +82,7 @@ ENTRY(swsusp_arch_suspend_exit) dsb ish /* wait for PoU cleaning to finish */ /* switch to the restored kernels page tables */ - break_before_make_ttbr_switch x25, x21 + break_before_make_ttbr_switch x25, x21, x6, x8 ic ialluis dsb ish @@ -118,59 +91,5 @@ ENTRY(swsusp_arch_suspend_exit) cbz x24, 3f /* Do we need to re-initialise EL2? */ hvc #0 3: ret - - .ltorg -ENDPROC(swsusp_arch_suspend_exit) - -/* - * Restore the hyp stub. - * This must be done before the hibernate page is unmapped by _cpu_resume(), - * but happens before any of the hyp-stub's code is cleaned to PoC. - * - * x24: The physical address of __hyp_stub_vectors - */ -el1_sync: - msr vbar_el2, x24 - eret -ENDPROC(el1_sync) - -.macro invalid_vector label -\label: - b \label -ENDPROC(\label) -.endm - - invalid_vector el2_sync_invalid - invalid_vector el2_irq_invalid - invalid_vector el2_fiq_invalid - invalid_vector el2_error_invalid - invalid_vector el1_sync_invalid - invalid_vector el1_irq_invalid - invalid_vector el1_fiq_invalid - invalid_vector el1_error_invalid - -/* el2 vectors - switch el2 here while we restore the memory image. 
*/ - .align 11 -ENTRY(hibernate_el2_vectors) - ventry el2_sync_invalid // Synchronous EL2t - ventry el2_irq_invalid // IRQ EL2t - ventry el2_fiq_invalid // FIQ EL2t - ventry el2_error_invalid // Error EL2t - - ventry el2_sync_invalid // Synchronous EL2h - ventry el2_irq_invalid // IRQ EL2h - ventry el2_fiq_invalid // FIQ EL2h - ventry el2_error_invalid // Error EL2h - - ventry el1_sync // Synchronous 64-bit EL1 - ventry el1_irq_invalid // IRQ 64-bit EL1 - ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 - - ventry el1_sync_invalid // Synchronous 32-bit EL1 - ventry el1_irq_invalid // IRQ 32-bit EL1 - ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 -END(hibernate_el2_vectors) - +SYM_CODE_END(swsusp_arch_suspend_exit) .popsection diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 095d3c170f5d..af5df48ba915 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /*: * Hibernate support specific for ARM64 * @@ -6,39 +7,31 @@ * Ubuntu project, hibernation support for mach-dove * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) - * https://lkml.org/lkml/2010/6/18/4 - * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html - * https://patchwork.kernel.org/patch/96442/ - * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * License terms: GNU General Public License (GPL) version 2 */ #define pr_fmt(x) "hibernate: " x #include <linux/cpu.h> #include <linux/kvm_host.h> -#include <linux/mm.h> #include <linux/pm.h> #include <linux/sched.h> #include <linux/suspend.h> #include <linux/utsname.h> -#include <linux/version.h> #include <asm/barrier.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/daifflags.h> #include <asm/irqflags.h> #include <asm/kexec.h> #include <asm/memory.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/pgtable-hwdef.h> +#include <asm/mte.h> #include <asm/sections.h> #include <asm/smp.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <asm/sysreg.h> +#include <asm/trans_pgd.h> #include <asm/virt.h> /* @@ -52,10 +45,7 @@ extern int in_suspend; /* Do we need to reset el2? */ -#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) - -/* temporary el2 vectors in the __hibernate_exit_text section. */ -extern char hibernate_el2_vectors[]; +#define el2_reset_needed() (is_hyp_nvhe()) /* hyp-stub vectors, used to restore el2 during resume from hibernate. */ extern char __hyp_stub_vectors[]; @@ -166,14 +156,11 @@ int arch_hibernation_header_restore(void *addr) sleep_cpu = -EINVAL; return -EINVAL; } - if (!cpu_online(sleep_cpu)) { - pr_info("Hibernated on a CPU that is offline! 
Bringing CPU up.\n"); - ret = cpu_up(sleep_cpu); - if (ret) { - pr_err("Failed to bring hibernate-CPU up!\n"); - sleep_cpu = -EINVAL; - return ret; - } + + ret = bringup_hibernate_cpu(sleep_cpu); + if (ret) { + sleep_cpu = -EINVAL; + return ret; } resume_hdr = *hdr; @@ -182,9 +169,14 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); +static void *hibernate_page_alloc(void *arg) +{ + return (void *)get_safe_page((__force gfp_t)(unsigned long)arg); +} + /* * Copies length bytes, starting at src_start into an new page, - * perform cache maintentance, then maps it at the specified address low + * perform cache maintenance, then maps it at the specified address low * address as executable. * * This is used by hibernate to copy the code it needs to execute when @@ -195,84 +187,143 @@ EXPORT_SYMBOL(arch_hibernation_header_restore); * page system. */ static int create_safe_exec_page(void *src_start, size_t length, - unsigned long dst_addr, - phys_addr_t *phys_dst_addr, - void *(*allocator)(gfp_t mask), - gfp_t mask) + phys_addr_t *phys_dst_addr) +{ + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, + }; + + void *page = (void *)get_safe_page(GFP_ATOMIC); + phys_addr_t trans_ttbr0; + unsigned long t0sz; + int rc; + + if (!page) + return -ENOMEM; + + memcpy(page, src_start, length); + caches_clean_inval_pou((unsigned long)page, (unsigned long)page + length); + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); + if (rc) + return rc; + + cpu_install_ttbr0(trans_ttbr0, t0sz); + *phys_dst_addr = virt_to_phys(page); + + return 0; +} + +#ifdef CONFIG_ARM64_MTE + +static DEFINE_XARRAY(mte_pages); + +static int save_tags(struct page *page, unsigned long pfn) { - int rc = 0; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long dst = (unsigned long)allocator(mask); - - if (!dst) { - rc = -ENOMEM; - goto out; + void *tag_storage, *ret; + + tag_storage = mte_allocate_tag_storage(); + if (!tag_storage) + return -ENOMEM; + + mte_save_page_tags(page_address(page), tag_storage); + + ret = xa_store(&mte_pages, pfn, tag_storage, GFP_KERNEL); + if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { + mte_free_tag_storage(tag_storage); + return xa_err(ret); + } else if (WARN(ret, "swsusp: %s: Duplicate entry", __func__)) { + mte_free_tag_storage(ret); } - memcpy((void *)dst, src_start, length); - flush_icache_range(dst, dst + length); + return 0; +} - pgd = pgd_offset_raw(allocator(mask), dst_addr); - if (pgd_none(*pgd)) { - pud = allocator(mask); - if (!pud) { - rc = -ENOMEM; - goto out; - } - pgd_populate(&init_mm, pgd, pud); +static void swsusp_mte_free_storage(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + mte_free_tag_storage(tags); } + xa_unlock(&mte_pages); + + xa_destroy(&mte_pages); +} + +static int swsusp_mte_save_tags(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + int ret = 0; + int n = 0; + + if (!system_supports_mte()) + return 0; - pud = pud_offset(pgd, dst_addr); - if (pud_none(*pud)) { - pmd = allocator(mask); - if (!pmd) { - rc = -ENOMEM; - goto out; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + struct page *page = pfn_to_online_page(pfn); + + if (!page) + continue; + + if (!test_bit(PG_mte_tagged, &page->flags)) + continue; + + ret = 
save_tags(page, pfn); + if (ret) { + swsusp_mte_free_storage(); + goto out; + } + + n++; } - pud_populate(&init_mm, pud, pmd); } + pr_info("Saved %d MTE pages\n", n); - pmd = pmd_offset(pud, dst_addr); - if (pmd_none(*pmd)) { - pte = allocator(mask); - if (!pte) { - rc = -ENOMEM; - goto out; - } - pmd_populate_kernel(&init_mm, pmd, pte); +out: + return ret; +} + +static void swsusp_mte_restore_tags(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + int n = 0; + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + unsigned long pfn = xa_state.xa_index; + struct page *page = pfn_to_online_page(pfn); + + mte_restore_page_tags(page_address(page), tags); + + mte_free_tag_storage(tags); + n++; } + xa_unlock(&mte_pages); - pte = pte_offset_kernel(pmd, dst_addr); - set_pte(pte, __pte(virt_to_phys((void *)dst) | - pgprot_val(PAGE_KERNEL_EXEC))); + pr_info("Restored %d MTE pages\n", n); - /* - * Load our new page tables. A strict BBM approach requires that we - * ensure that TLBs are free of any entries that may overlap with the - * global mappings we are about to install. - * - * For a real hibernate/resume cycle TTBR0 currently points to a zero - * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI - * runtime services), while for a userspace-driven test_resume cycle it - * points to userspace page tables (and we must point it at a zero page - * ourselves). Elsewhere we only (un)install the idmap with preemption - * disabled, so T0SZ should be as required regardless. - */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - write_sysreg(virt_to_phys(pgd), ttbr0_el1); - isb(); + xa_destroy(&mte_pages); +} - *phys_dst_addr = virt_to_phys((void *)dst); +#else /* CONFIG_ARM64_MTE */ -out: - return rc; +static int swsusp_mte_save_tags(void) +{ + return 0; +} + +static void swsusp_mte_restore_tags(void) +{ } -#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) +#endif /* CONFIG_ARM64_MTE */ int swsusp_arch_suspend(void) { @@ -285,22 +336,35 @@ int swsusp_arch_suspend(void) return -EBUSY; } - local_dbg_save(flags); + flags = local_daif_save(); if (__cpu_suspend_enter(&state)) { /* make the crash dump kernel image visible/saveable */ crash_prepare_suspend(); + ret = swsusp_mte_save_tags(); + if (ret) + return ret; + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { /* Clean kernel core startup/idle code to PoC*/ - dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end); - dcache_clean_range(__idmap_text_start, __idmap_text_end); + dcache_clean_inval_poc((unsigned long)__mmuoff_data_start, + (unsigned long)__mmuoff_data_end); + dcache_clean_inval_poc((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); /* Clean kvm setup code to PoC? 
*/ - if (el2_reset_needed()) - dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); + if (el2_reset_needed()) { + dcache_clean_inval_poc( + (unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + dcache_clean_inval_poc((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); + } + + swsusp_mte_restore_tags(); /* make the crash dump kernel image protected again */ crash_post_resume(); @@ -313,144 +377,18 @@ int swsusp_arch_suspend(void) sleep_cpu = -EINVAL; __cpu_suspend_exit(); - } - - local_dbg_restore(flags); - return ret; -} - -static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) -{ - pte_t pte = *src_pte; - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_pte, pte_mkwrite(pte)); - } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Before marking this entry valid, check the pfn should - * be mapped. + * Just in case the boot kernel did turn the SSBD + * mitigation off behind our back, let's set the state + * to what we expect it to be. */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); + spectre_v4_enable_mitigation(NULL); } -} - -static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, - unsigned long end) -{ - pte_t *src_pte; - pte_t *dst_pte; - unsigned long addr = start; - dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pte) - return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmd, dst_pte); - dst_pte = pte_offset_kernel(dst_pmd, start); - - src_pte = pte_offset_kernel(src_pmd, start); - do { - _copy_pte(dst_pte, src_pte, addr); - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + local_daif_restore(flags); - return 0; -} - -static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start, - unsigned long end) -{ - pmd_t *src_pmd; - pmd_t *dst_pmd; - unsigned long next; - unsigned long addr = start; - - if (pud_none(*dst_pud)) { - dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmd) - return -ENOMEM; - pud_populate(&init_mm, dst_pud, dst_pmd); - } - dst_pmd = pmd_offset(dst_pud, start); - - src_pmd = pmd_offset(src_pud, start); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*src_pmd)) - continue; - if (pmd_table(*src_pmd)) { - if (copy_pte(dst_pmd, src_pmd, addr, next)) - return -ENOMEM; - } else { - set_pmd(dst_pmd, - __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY)); - } - } while (dst_pmd++, src_pmd++, addr = next, addr != end); - - return 0; -} - -static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start, - unsigned long end) -{ - pud_t *dst_pud; - pud_t *src_pud; - unsigned long next; - unsigned long addr = start; - - if (pgd_none(*dst_pgd)) { - dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pud) - return -ENOMEM; - pgd_populate(&init_mm, dst_pgd, dst_pud); - } - dst_pud = pud_offset(dst_pgd, start); - - src_pud = pud_offset(src_pgd, start); - do { - next = pud_addr_end(addr, end); - if (pud_none(*src_pud)) - continue; - if (pud_table(*(src_pud))) { - if (copy_pmd(dst_pud, src_pud, addr, next)) - return -ENOMEM; - } else { - set_pud(dst_pud, - __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY)); - } 
- } while (dst_pud++, src_pud++, addr = next, addr != end); - - return 0; -} - -static int copy_page_tables(pgd_t *dst_pgd, unsigned long start, - unsigned long end) -{ - unsigned long next; - unsigned long addr = start; - pgd_t *src_pgd = pgd_offset_k(start); - - dst_pgd = pgd_offset_raw(dst_pgd, start); - do { - next = pgd_addr_end(addr, end); - if (pgd_none(*src_pgd)) - continue; - if (copy_pud(dst_pgd, src_pgd, addr, next)) - return -ENOMEM; - } while (dst_pgd++, src_pgd++, addr = next, addr != end); - - return 0; + return ret; } /* @@ -461,85 +399,72 @@ static int copy_page_tables(pgd_t *dst_pgd, unsigned long start, */ int swsusp_arch_resume(void) { - int rc = 0; + int rc; void *zero_page; size_t exit_size; pgd_t *tmp_pg_dir; - phys_addr_t phys_hibernate_exit; + phys_addr_t el2_vectors; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; /* * Restoring the memory image will overwrite the ttbr1 page tables. * Create a second copy of just the linear map, and use this when * restoring. */ - tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!tmp_pg_dir) { - pr_err("Failed to allocate memory for temporary page tables.\n"); - rc = -ENOMEM; - goto out; - } - rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); + rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET, + PAGE_END); if (rc) - goto out; + return rc; /* - * We need a zero page that is zero before & after resume in order to + * We need a zero page that is zero before & after resume in order * to break before make on the ttbr1 page tables. */ zero_page = (void *)get_safe_page(GFP_ATOMIC); if (!zero_page) { pr_err("Failed to allocate zero page.\n"); - rc = -ENOMEM; - goto out; + return -ENOMEM; + } + + if (el2_reset_needed()) { + rc = trans_pgd_copy_el2_vectors(&trans_info, &el2_vectors); + if (rc) { + pr_err("Failed to setup el2 vectors\n"); + return rc; + } } - /* - * Locate the exit code in the bottom-but-one page, so that *NULL - * still has disastrous affects. - */ - hibernate_exit = (void *)PAGE_SIZE; exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; /* * Copy swsusp_arch_suspend_exit() to a safe page. This will generate * a new set of ttbr0 page tables and load them. */ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, - (unsigned long)hibernate_exit, - &phys_hibernate_exit, - (void *)get_safe_page, GFP_ATOMIC); + (phys_addr_t *)&hibernate_exit); if (rc) { pr_err("Failed to create safe executable page for hibernate_exit code.\n"); - goto out; + return rc; } /* - * The hibernate exit text contains a set of el2 vectors, that will - * be executed at el2 with the mmu off in order to reload hyp-stub. - */ - __flush_dcache_area(hibernate_exit, exit_size); - - /* * KASLR will cause the el2 vectors to be in a different location in * the resumed kernel. Load hibernate's temporary copy into el2. * * We can skip this step if we booted at EL1, or are running with VHE. 
*/ - if (el2_reset_needed()) { - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ - el2_vectors += hibernate_el2_vectors - - __hibernate_exit_text_start; /* offset */ - + if (el2_reset_needed()) __hyp_set_vectors(el2_vectors); - } hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, resume_hdr.reenter_kernel, restore_pblist, resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); -out: - return rc; + return 0; } int hibernate_resume_nonboot_cpu_disable(void) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 749f81779420..b29a311bb055 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. * * Copyright (C) 2012 ARM Limited * Author: Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hw-breakpoint: " fmt @@ -28,15 +17,14 @@ #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/smp.h> +#include <linux/uaccess.h> -#include <asm/compat.h> #include <asm/current.h> #include <asm/debug-monitors.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/cputype.h> #include <asm/system_misc.h> -#include <asm/uaccess.h> /* Breakpoint currently in use for each BRP. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); @@ -63,7 +51,7 @@ int hw_breakpoint_slots(int type) case TYPE_DATA: return get_num_wrps(); default: - pr_warning("unknown slot type: %d\n", type); + pr_warn("unknown slot type: %d\n", type); return 0; } } @@ -124,7 +112,7 @@ static u64 read_wb_reg(int reg, int n) GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to read from unknown breakpoint register %d\n", n); + pr_warn("attempt to read from unknown breakpoint register %d\n", n); } return val; @@ -139,7 +127,7 @@ static void write_wb_reg(int reg, int n, u64 val) GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to write to unknown breakpoint register %d\n", n); + pr_warn("attempt to write to unknown breakpoint register %d\n", n); } isb(); } @@ -157,7 +145,7 @@ static enum dbg_active_el debug_exception_level(int privilege) case AARCH64_BREAKPOINT_EL1: return DBG_ACTIVE_EL1; default: - pr_warning("invalid breakpoint privilege level %d\n", privilege); + pr_warn("invalid breakpoint privilege level %d\n", privilege); return -EINVAL; } } @@ -269,7 +257,7 @@ static int hw_breakpoint_control(struct perf_event *bp, * level. */ enable_debug_monitors(dbg_el); - /* Fall through */ + fallthrough; case HW_BREAKPOINT_RESTORE: /* Setup the address register. 
*/ write_wb_reg(val_reg, i, info->address); @@ -344,14 +332,13 @@ static int get_hbp_len(u8 hbp_len) /* * Check whether bp virtual address is in kernel space. */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned int len; unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - va = info->address; - len = get_hbp_len(info->ctrl.len); + va = hw->address; + len = get_hbp_len(hw->ctrl.len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -422,53 +409,53 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, /* * Construct an arch_hw_breakpoint from a perf_event. */ -static int arch_build_bp_info(struct perf_event *bp) +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* Type */ - switch (bp->attr.bp_type) { + switch (attr->bp_type) { case HW_BREAKPOINT_X: - info->ctrl.type = ARM_BREAKPOINT_EXECUTE; + hw->ctrl.type = ARM_BREAKPOINT_EXECUTE; break; case HW_BREAKPOINT_R: - info->ctrl.type = ARM_BREAKPOINT_LOAD; + hw->ctrl.type = ARM_BREAKPOINT_LOAD; break; case HW_BREAKPOINT_W: - info->ctrl.type = ARM_BREAKPOINT_STORE; + hw->ctrl.type = ARM_BREAKPOINT_STORE; break; case HW_BREAKPOINT_RW: - info->ctrl.type = ARM_BREAKPOINT_LOAD | ARM_BREAKPOINT_STORE; + hw->ctrl.type = ARM_BREAKPOINT_LOAD | ARM_BREAKPOINT_STORE; break; default: return -EINVAL; } /* Len */ - switch (bp->attr.bp_len) { + switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: - info->ctrl.len = ARM_BREAKPOINT_LEN_1; + hw->ctrl.len = ARM_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - info->ctrl.len = ARM_BREAKPOINT_LEN_2; + hw->ctrl.len = ARM_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_3: - info->ctrl.len = ARM_BREAKPOINT_LEN_3; + hw->ctrl.len = ARM_BREAKPOINT_LEN_3; break; case HW_BREAKPOINT_LEN_4: - info->ctrl.len = ARM_BREAKPOINT_LEN_4; + hw->ctrl.len = ARM_BREAKPOINT_LEN_4; break; case HW_BREAKPOINT_LEN_5: - info->ctrl.len = ARM_BREAKPOINT_LEN_5; + hw->ctrl.len = ARM_BREAKPOINT_LEN_5; break; case HW_BREAKPOINT_LEN_6: - info->ctrl.len = ARM_BREAKPOINT_LEN_6; + hw->ctrl.len = ARM_BREAKPOINT_LEN_6; break; case HW_BREAKPOINT_LEN_7: - info->ctrl.len = ARM_BREAKPOINT_LEN_7; + hw->ctrl.len = ARM_BREAKPOINT_LEN_7; break; case HW_BREAKPOINT_LEN_8: - info->ctrl.len = ARM_BREAKPOINT_LEN_8; + hw->ctrl.len = ARM_BREAKPOINT_LEN_8; break; default: return -EINVAL; @@ -479,37 +466,37 @@ static int arch_build_bp_info(struct perf_event *bp) * AArch32 also requires breakpoints of length 2 for Thumb. * Watchpoints can be of length 1, 2, 4 or 8 bytes. */ - if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { + if (hw->ctrl.type == ARM_BREAKPOINT_EXECUTE) { if (is_compat_bp(bp)) { - if (info->ctrl.len != ARM_BREAKPOINT_LEN_2 && - info->ctrl.len != ARM_BREAKPOINT_LEN_4) + if (hw->ctrl.len != ARM_BREAKPOINT_LEN_2 && + hw->ctrl.len != ARM_BREAKPOINT_LEN_4) return -EINVAL; - } else if (info->ctrl.len != ARM_BREAKPOINT_LEN_4) { + } else if (hw->ctrl.len != ARM_BREAKPOINT_LEN_4) { /* * FIXME: Some tools (I'm looking at you perf) assume * that breakpoints should be sizeof(long). This * is nonsense. For now, we fix up the parameter * but we should probably return -EINVAL instead. 
*/ - info->ctrl.len = ARM_BREAKPOINT_LEN_4; + hw->ctrl.len = ARM_BREAKPOINT_LEN_4; } } /* Address */ - info->address = bp->attr.bp_addr; + hw->address = attr->bp_addr; /* * Privilege * Note that we disallow combined EL0/EL1 breakpoints because * that would complicate the stepping code. */ - if (arch_check_bp_in_kernelspace(bp)) - info->ctrl.privilege = AARCH64_BREAKPOINT_EL1; + if (arch_check_bp_in_kernelspace(hw)) + hw->ctrl.privilege = AARCH64_BREAKPOINT_EL1; else - info->ctrl.privilege = AARCH64_BREAKPOINT_EL0; + hw->ctrl.privilege = AARCH64_BREAKPOINT_EL0; /* Enabled? */ - info->ctrl.enabled = !bp->attr.disabled; + hw->ctrl.enabled = !attr->disabled; return 0; } @@ -517,14 +504,15 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings. */ -int arch_validate_hwbkpt_settings(struct perf_event *bp) +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); int ret; u64 alignment_mask, offset; /* Build the arch_hw_breakpoint. */ - ret = arch_build_bp_info(bp); + ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; @@ -538,42 +526,47 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * that here. */ if (is_compat_bp(bp)) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) + if (hw->ctrl.len == ARM_BREAKPOINT_LEN_8) alignment_mask = 0x7; else alignment_mask = 0x3; - offset = info->address & alignment_mask; + offset = hw->address & alignment_mask; switch (offset) { case 0: /* Aligned */ break; case 1: - /* Allow single byte watchpoint. */ - if (info->ctrl.len == ARM_BREAKPOINT_LEN_1) - break; case 2: /* Allow halfword watchpoints and breakpoints. */ - if (info->ctrl.len == ARM_BREAKPOINT_LEN_2) + if (hw->ctrl.len == ARM_BREAKPOINT_LEN_2) + break; + + fallthrough; + case 3: + /* Allow single byte watchpoint. */ + if (hw->ctrl.len == ARM_BREAKPOINT_LEN_1) break; + + fallthrough; default: return -EINVAL; } } else { - if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) + if (hw->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - offset = info->address & alignment_mask; + offset = hw->address & alignment_mask; } - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; + hw->address &= ~alignment_mask; + hw->ctrl.len <<= offset; /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. */ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) + if (hw->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; @@ -624,7 +617,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned int esr, +static int breakpoint_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; @@ -708,7 +701,7 @@ NOKPROBE_SYMBOL(breakpoint_handler); * addresses. There is no straight-forward way, short of disassembling the * offending instruction, to map that address back to the watchpoint. This * function computes the distance of the memory access from the watchpoint as a - * heuristic for the likelyhood that a given access triggered the watchpoint. + * heuristic for the likelihood that a given access triggered the watchpoint. * * See Section D2.10.5 "Determining the memory location that caused a Watchpoint * exception" of ARMv8 Architecture Reference Manual for details. 
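A simplified model of the distance heuristic described above, assuming the watched range is [base, base + len): the access gets distance zero when it lands inside the range, otherwise the gap to the nearest watched byte, and the handler reports the watchpoint with the smallest distance. The helper below is illustrative only and not the kernel's get_distance_from_watchpoint().

#include <stdint.h>

static uint64_t wp_distance(uint64_t access, uint64_t wp_base, uint64_t wp_len)
{
	uint64_t wp_low  = wp_base;
	uint64_t wp_high = wp_base + wp_len - 1;

	if (access < wp_low)
		return wp_low - access;	/* access below the watched range */
	if (access > wp_high)
		return access - wp_high;	/* access above the watched range */
	return 0;			/* access hit the watched range */
}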
@@ -737,7 +730,28 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, return 0; } -static int watchpoint_handler(unsigned long addr, unsigned int esr, +static int watchpoint_report(struct perf_event *wp, unsigned long addr, + struct pt_regs *regs) +{ + int step = is_default_overflow_handler(wp); + struct arch_hw_breakpoint *info = counter_arch_bp(wp); + + info->trigger = addr; + + /* + * If we triggered a user watchpoint from a uaccess routine, then + * handle the stepping ourselves since userspace really can't help + * us with this. + */ + if (!user_mode(regs) && info->ctrl.privilege == AARCH64_BREAKPOINT_EL0) + step = 1; + else + perf_bp_event(wp, regs); + + return step; +} + +static int watchpoint_handler(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; @@ -746,7 +760,6 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; - struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; slots = this_cpu_ptr(wp_on_reg); @@ -784,25 +797,13 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, if (dist != 0) continue; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; + step = watchpoint_report(wp, addr, regs); } - if (min_dist > 0 && min_dist != -1) { - /* No exact match found. */ - wp = slots[closest_match]; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; - } + /* No exact match found? */ + if (min_dist > 0 && min_dist != -1) + step = watchpoint_report(slots[closest_match], addr, regs); + rcu_read_unlock(); if (!step) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index e1261fbaa374..2ee18c860f2a 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -1,47 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Hypervisor stub * * Copyright (C) 2012 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/init.h> #include <linux/linkage.h> -#include <linux/irqchip/arm-gic-v3.h> #include <asm/assembler.h> +#include <asm/el2_setup.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/ptrace.h> #include <asm/virt.h> +// Warning, hardcoded register allocation +// This will clobber x1 and x2, and expect x1 to contain +// the id register value as read from the HW +.macro __check_override idreg, fld, width, pass, fail + ubfx x1, x1, #\fld, #\width + cbz x1, \fail + + adr_l x1, \idreg\()_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + ubfx x2, x2, #\fld, #\width + ubfx x1, x1, #\fld, #\width + cmp x1, xzr + and x2, x2, x1 + csinv x2, x2, xzr, ne + cbnz x2, \pass + b \fail +.endm + +.macro check_override idreg, fld, pass, fail + mrs x1, \idreg\()_el1 + __check_override \idreg \fld 4 \pass \fail +.endm + .text + .pushsection .hyp.text, "ax" + .align 11 -ENTRY(__hyp_stub_vectors) +SYM_CODE_START(__hyp_stub_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t ventry el2_error_invalid // Error EL2t - ventry el2_sync_invalid // Synchronous EL2h + ventry elx_sync // Synchronous EL2h ventry el2_irq_invalid // IRQ EL2h ventry el2_fiq_invalid // FIQ EL2h ventry el2_error_invalid // Error EL2h - ventry el1_sync // Synchronous 64-bit EL1 + ventry elx_sync // Synchronous 64-bit EL1 ventry el1_irq_invalid // IRQ 64-bit EL1 ventry el1_fiq_invalid // FIQ 64-bit EL1 ventry el1_error_invalid // Error 64-bit EL1 @@ -50,16 +65,19 @@ ENTRY(__hyp_stub_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__hyp_stub_vectors) +SYM_CODE_END(__hyp_stub_vectors) .align 11 -el1_sync: +SYM_CODE_START_LOCAL(elx_sync) cmp x0, #HVC_SET_VECTORS - b.ne 2f + b.ne 1f msr vbar_el2, x1 b 9f +1: cmp x0, #HVC_FINALISE_EL2 + b.eq __finalise_el2 + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -72,17 +90,154 @@ el1_sync: beq 9f // Nothing to reset! /* Someone called kvm_call_hyp() against the hyp-stub... */ - ldr x0, =HVC_STUB_ERR + mov_q x0, HVC_STUB_ERR eret 9: mov x0, xzr eret -ENDPROC(el1_sync) +SYM_CODE_END(elx_sync) + +SYM_CODE_START_LOCAL(__finalise_el2) + check_override id_aa64pfr0 ID_AA64PFR0_EL1_SVE_SHIFT .Linit_sve .Lskip_sve + +.Linit_sve: /* SVE register access */ + mrs x0, cptr_el2 // Disable SVE traps + bic x0, x0, #CPTR_EL2_TZ + msr cptr_el2, x0 + isb + mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector + msr_s SYS_ZCR_EL2, x1 // length for EL1. + +.Lskip_sve: + check_override id_aa64pfr1 ID_AA64PFR1_EL1_SME_SHIFT .Linit_sme .Lskip_sme + +.Linit_sme: /* SME register access and priority mapping */ + mrs x0, cptr_el2 // Disable SME traps + bic x0, x0, #CPTR_EL2_TSM + msr cptr_el2, x0 + isb + + mrs x1, sctlr_el2 + orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps + msr sctlr_el2, x1 + isb + + mov x0, #0 // SMCR controls + + // Full FP in SM? + mrs_s x1, SYS_ID_AA64SMFR0_EL1 + __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_FA64_SHIFT 1 .Linit_sme_fa64 .Lskip_sme_fa64 + +.Linit_sme_fa64: + orr x0, x0, SMCR_ELx_FA64_MASK +.Lskip_sme_fa64: + + orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x0 // length for EL1. + + mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? + ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 + cbz x1, .Lskip_sme + + msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal + + mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? 
+ ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 + cbz x1, .Lskip_sme + + mrs_s x1, SYS_HCRX_EL2 + orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping + msr_s SYS_HCRX_EL2, x1 + +.Lskip_sme: + + // nVHE? No way! Give me the real thing! + // Sanity check: MMU *must* be off + mrs x1, sctlr_el2 + tbnz x1, #0, 1f + + // Needs to be VHE capable, obviously + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f + +1: mov_q x0, HVC_STUB_ERR + eret +2: + // Engage the VHE magic! + mov_q x0, HCR_HOST_VHE_FLAGS + msr hcr_el2, x0 + isb + + // Use the EL1 allocated stack, per-cpu offset + mrs x0, sp_el1 + mov sp, x0 + mrs x0, tpidr_el1 + msr tpidr_el2, x0 + + // FP configuration, vectors + mrs_s x0, SYS_CPACR_EL12 + msr cpacr_el1, x0 + mrs_s x0, SYS_VBAR_EL12 + msr vbar_el1, x0 + + // Use EL2 translations for SPE & TRBE and disable access from EL1 + mrs x0, mdcr_el2 + bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + msr mdcr_el2, x0 + + // Transfer the MM state from EL1 to EL2 + mrs_s x0, SYS_TCR_EL12 + msr tcr_el1, x0 + mrs_s x0, SYS_TTBR0_EL12 + msr ttbr0_el1, x0 + mrs_s x0, SYS_TTBR1_EL12 + msr ttbr1_el1, x0 + mrs_s x0, SYS_MAIR_EL12 + msr mair_el1, x0 + isb + + // Hack the exception return to stay at EL2 + mrs x0, spsr_el1 + and x0, x0, #~PSR_MODE_MASK + mov x1, #PSR_MODE_EL2h + orr x0, x0, x1 + msr spsr_el1, x0 + + b enter_vhe +SYM_CODE_END(__finalise_el2) + + // At the point where we reach enter_vhe(), we run with + // the MMU off (which is enforced by __finalise_el2()). + // We thus need to be in the idmap, or everything will + // explode when enabling the MMU. + + .pushsection .idmap.text, "ax" + +SYM_CODE_START_LOCAL(enter_vhe) + // Invalidate TLBs before enabling the MMU + tlbi vmalle1 + dsb nsh + isb + + // Enable the EL2 S1 MMU, as set up from EL1 + mrs_s x0, SYS_SCTLR_EL12 + set_sctlr_el1 x0 + + // Disable the EL1 S1 MMU for a good measure + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr_s SYS_SCTLR_EL12, x0 + + mov x0, xzr + + eret +SYM_CODE_END(enter_vhe) + + .popsection .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -94,6 +249,8 @@ ENDPROC(\label) invalid_vector el1_fiq_invalid invalid_vector el1_error_invalid + .popsection + /* * __hyp_set_vectors: Call this after boot to set the initial hypervisor * vectors as part of hypervisor installation. On an SMP system, this should @@ -115,15 +272,36 @@ ENDPROC(\label) * initialisation entry point. 
*/ -ENTRY(__hyp_set_vectors) +SYM_FUNC_START(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 ret -ENDPROC(__hyp_set_vectors) +SYM_FUNC_END(__hyp_set_vectors) -ENTRY(__hyp_reset_vectors) +SYM_FUNC_START(__hyp_reset_vectors) mov x0, #HVC_RESET_VECTORS hvc #0 ret -ENDPROC(__hyp_reset_vectors) +SYM_FUNC_END(__hyp_reset_vectors) + +/* + * Entry point to finalise EL2 and switch to VHE if deemed capable + * + * w0: boot mode, as returned by init_kernel_el() + */ +SYM_FUNC_START(finalise_el2) + // Need to have booted at EL2 + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + + // and still be at EL1 + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL1 + b.ne 1f + + mov x0, #HVC_FINALISE_EL2 + hvc #0 +1: + ret +SYM_FUNC_END(finalise_el2) diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c new file mode 100644 index 000000000000..a2cfbacec2bb --- /dev/null +++ b/arch/arm64/kernel/idle.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low-level idle sequences + */ + +#include <linux/cpu.h> +#include <linux/irqflags.h> + +#include <asm/barrier.h> +#include <asm/cpuidle.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> + +/* + * cpu_do_idle() + * + * Idle the processor (wait for interrupt). + * + * If the CPU supports priority masking we must do additional work to + * ensure that interrupts are not masked at the PMR (because the core will + * not wake up if we block the wake up signal in the interrupt controller). + */ +void noinstr cpu_do_idle(void) +{ + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); + + dsb(sy); + wfi(); + + arm_cpuidle_restore_irq_context(&context); +} + +/* + * This is our default idle handler. + */ +void noinstr arch_cpu_idle(void) +{ + /* + * This should do all the clock switching and wait for interrupt + * tricks + */ + cpu_do_idle(); + raw_local_irq_enable(); +} diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c new file mode 100644 index 000000000000..95133765ed29 --- /dev/null +++ b/arch/arm64/kernel/idreg-override.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Early cpufeature override framework + * + * Copyright (C) 2020 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/libfdt.h> + +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/setup.h> + +#define FTR_DESC_NAME_LEN 20 +#define FTR_DESC_FIELD_LEN 10 +#define FTR_ALIAS_NAME_LEN 30 +#define FTR_ALIAS_OPTION_LEN 116 + +static u64 __boot_status __initdata; + +struct ftr_set_desc { + char name[FTR_DESC_NAME_LEN]; + struct arm64_ftr_override *override; + struct { + char name[FTR_DESC_FIELD_LEN]; + u8 shift; + u8 width; + bool (*filter)(u64 val); + } fields[]; +}; + +#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } + +static bool __init mmfr1_vh_filter(u64 val) +{ + /* + * If we ever reach this point while running VHE, we're + * guaranteed to be on one of these funky, VHE-stuck CPUs. If + * the user was trying to force nVHE on us, proceed with + * attitude adjustment. 
+ */ + return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && + val == 0); +} + +static const struct ftr_set_desc mmfr1 __initconst = { + .name = "id_aa64mmfr1", + .override = &id_aa64mmfr1_override, + .fields = { + FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter), + {} + }, +}; + +static bool __init pfr0_sve_filter(u64 val) +{ + /* + * Disabling SVE also means disabling all the features that + * are associated with it. The easiest way to do it is just to + * override id_aa64zfr0_el1 to be 0. + */ + if (!val) { + id_aa64zfr0_override.val = 0; + id_aa64zfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr0 __initconst = { + .name = "id_aa64pfr0", + .override = &id_aa64pfr0_override, + .fields = { + FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + {} + }, +}; + +static bool __init pfr1_sme_filter(u64 val) +{ + /* + * Similarly to SVE, disabling SME also means disabling all + * the features that are associated with it. Just set + * id_aa64smfr0_el1 to 0 and don't look back. + */ + if (!val) { + id_aa64smfr0_override.val = 0; + id_aa64smfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr1 __initconst = { + .name = "id_aa64pfr1", + .override = &id_aa64pfr1_override, + .fields = { + FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), + FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + {} + }, +}; + +static const struct ftr_set_desc isar1 __initconst = { + .name = "id_aa64isar1", + .override = &id_aa64isar1_override, + .fields = { + FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL), + FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL), + FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL), + FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar2 __initconst = { + .name = "id_aa64isar2", + .override = &id_aa64isar2_override, + .fields = { + FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL), + FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc smfr0 __initconst = { + .name = "id_aa64smfr0", + .override = &id_aa64smfr0_override, + .fields = { + /* FA64 is a one bit field... 
:-/ */ + { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, }, + {} + }, +}; + +extern struct arm64_ftr_override kaslr_feature_override; + +static const struct ftr_set_desc kaslr __initconst = { + .name = "kaslr", +#ifdef CONFIG_RANDOMIZE_BASE + .override = &kaslr_feature_override, +#endif + .fields = { + FIELD("disabled", 0, NULL), + {} + }, +}; + +static const struct ftr_set_desc * const regs[] __initconst = { + &mmfr1, + &pfr0, + &pfr1, + &isar1, + &isar2, + &smfr0, + &kaslr, +}; + +static const struct { + char alias[FTR_ALIAS_NAME_LEN]; + char feature[FTR_ALIAS_OPTION_LEN]; +} aliases[] __initconst = { + { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, + { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "arm64.nosve", "id_aa64pfr0.sve=0 id_aa64pfr1.sme=0" }, + { "arm64.nosme", "id_aa64pfr1.sme=0" }, + { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nopauth", + "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " + "id_aa64isar1.api=0 id_aa64isar1.apa=0 " + "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, + { "arm64.nomte", "id_aa64pfr1.mte=0" }, + { "nokaslr", "kaslr.disabled=1" }, +}; + +static int __init find_field(const char *cmdline, + const struct ftr_set_desc *reg, int f, u64 *v) +{ + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; + int len; + + len = snprintf(opt, ARRAY_SIZE(opt), "%s.%s=", + reg->name, reg->fields[f].name); + + if (!parameqn(cmdline, opt, len)) + return -1; + + return kstrtou64(cmdline + len, 0, v); +} + +static void __init match_options(const char *cmdline) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + int f; + + if (!regs[i]->override) + continue; + + for (f = 0; strlen(regs[i]->fields[f].name); f++) { + u64 shift = regs[i]->fields[f].shift; + u64 width = regs[i]->fields[f].width ?: 4; + u64 mask = GENMASK_ULL(shift + width - 1, shift); + u64 v; + + if (find_field(cmdline, regs[i], f, &v)) + continue; + + /* + * If an override gets filtered out, advertise + * it by setting the value to the all-ones while + * clearing the mask... Yes, this is fragile. + */ + if (regs[i]->fields[f].filter && + !regs[i]->fields[f].filter(v)) { + regs[i]->override->val |= mask; + regs[i]->override->mask &= ~mask; + continue; + } + + regs[i]->override->val &= ~mask; + regs[i]->override->val |= (v << shift) & mask; + regs[i]->override->mask |= mask; + + return; + } + } +} + +static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) +{ + do { + char buf[256]; + size_t len; + int i; + + cmdline = skip_spaces(cmdline); + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++); + if (!len) + return; + + len = min(len, ARRAY_SIZE(buf) - 1); + strncpy(buf, cmdline, len); + buf[len] = 0; + + if (strcmp(buf, "--") == 0) + return; + + cmdline += len; + + match_options(buf); + + for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) + if (parameq(buf, aliases[i].alias)) + __parse_cmdline(aliases[i].feature, false); + } while (1); +} + +static __init const u8 *get_bootargs_cmdline(void) +{ + const u8 *prop; + void *fdt; + int node; + + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return NULL; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + return NULL; + + return strlen(prop) ? 
prop : NULL; +} + +static __init void parse_cmdline(void) +{ + const u8 *prop = get_bootargs_cmdline(); + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(CONFIG_CMDLINE, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); +} + +/* Keep checkers quiet */ +void init_feature_override(u64 boot_status); + +asmlinkage void __init init_feature_override(u64 boot_status) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + if (regs[i]->override) { + regs[i]->override->val = 0; + regs[i]->override->mask = 0; + } + } + + __boot_status = boot_status; + + parse_cmdline(); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + if (regs[i]->override) + dcache_clean_inval_poc((unsigned long)regs[i]->override, + (unsigned long)regs[i]->override + + sizeof(*regs[i]->override)); + } +} diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h new file mode 100644 index 000000000000..8151412653de --- /dev/null +++ b/arch/arm64/kernel/image-vars.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Linker script variables to be set after section resolution, as + * ld.lld does not like variables assigned before SECTIONS is processed. + */ +#ifndef __ARM64_KERNEL_IMAGE_VARS_H +#define __ARM64_KERNEL_IMAGE_VARS_H + +#ifndef LINKER_SCRIPT +#error This file should only be included in vmlinux.lds.S +#endif + +PROVIDE(__efistub_kernel_size = _edata - _text); +PROVIDE(__efistub_primary_entry_offset = primary_entry - _text); + +/* + * The EFI stub has its own symbol namespace prefixed by __efistub_, to + * isolate it from the kernel proper. The following symbols are legally + * accessed by the stub, so provide some aliases to make them accessible. + * Only include data symbols here, or text symbols of functions that are + * guaranteed to be safe when executed at another offset than they were + * linked at. The routines below are all implemented in assembler in a + * position independent manner + */ +PROVIDE(__efistub_memcmp = __pi_memcmp); +PROVIDE(__efistub_memchr = __pi_memchr); +PROVIDE(__efistub_strlen = __pi_strlen); +PROVIDE(__efistub_strnlen = __pi_strnlen); +PROVIDE(__efistub_strcmp = __pi_strcmp); +PROVIDE(__efistub_strncmp = __pi_strncmp); +PROVIDE(__efistub_strrchr = __pi_strrchr); +PROVIDE(__efistub_dcache_clean_poc = __pi_dcache_clean_poc); + +PROVIDE(__efistub__text = _text); +PROVIDE(__efistub__end = _end); +PROVIDE(__efistub__edata = _edata); +PROVIDE(__efistub_screen_info = screen_info); +PROVIDE(__efistub__ctype = _ctype); + +PROVIDE(__pi___memcpy = __pi_memcpy); +PROVIDE(__pi___memmove = __pi_memmove); +PROVIDE(__pi___memset = __pi_memset); + +#ifdef CONFIG_KVM + +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, to + * separate it from the kernel proper. The following symbols are legally + * accessed by it, therefore provide aliases to make them linkable. + * Do not include symbols which may not be safely accessed under hypervisor + * memory mappings. + */ + +/* Alternative callbacks for init-time patching of nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_patch_vector_branch); +KVM_NVHE_ALIAS(kvm_update_va_mask); +KVM_NVHE_ALIAS(kvm_get_kimage_voffset); +KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); +KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); +KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); +KVM_NVHE_ALIAS(alt_cb_patch_nops); + +/* Global kernel state accessed by nVHE hyp code. 
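The PROVIDE() and KVM_NVHE_ALIAS() lines in image-vars.h give an existing definition a second, prefix-namespaced symbol name at link time. A loose userspace analogy using the GCC/Clang alias attribute on ELF targets is sketched below; this is not how the kernel does it, only an illustration of two symbol names sharing one definition.

#include <stdio.h>

/* One definition, two symbol names: roughly what the linker-script
 * aliases achieve for the __efistub_/__kvm_nvhe_ namespaces. */
int counter = 42;
extern int __stub_counter __attribute__((alias("counter")));

int main(void)
{
	/* Both names resolve to the same storage. */
	printf("%d %d %d\n", counter, __stub_counter,
	       &counter == &__stub_counter);
	return 0;
}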
*/ +KVM_NVHE_ALIAS(kvm_vgic_global_state); + +/* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ +KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); + +/* Vectors installed by hyp-init on reset HVC. */ +KVM_NVHE_ALIAS(__hyp_stub_vectors); + +/* Kernel symbol used by icache_is_vpipt(). */ +KVM_NVHE_ALIAS(__icache_flags); + +/* VMID bits set by the KVM VMID allocator */ +KVM_NVHE_ALIAS(kvm_arm_vmid_bits); + +/* Static keys which are set if a vGIC trap should be handled in hyp. */ +KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); +KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); + +/* Static key checked in pmr_sync(). */ +#ifdef CONFIG_ARM64_PSEUDO_NMI +KVM_NVHE_ALIAS(gic_pmr_sync); +/* Static key checked in GIC_PRIO_IRQOFF. */ +KVM_NVHE_ALIAS(gic_nonsecure_priorities); +#endif + +/* EL2 exception handling */ +KVM_NVHE_ALIAS(__start___kvm_ex_table); +KVM_NVHE_ALIAS(__stop___kvm_ex_table); + +/* Array containing bases of nVHE per-CPU memory regions. */ +KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); + +/* PMU available static key */ +#ifdef CONFIG_HW_PERF_EVENTS +KVM_NVHE_ALIAS(kvm_arm_pmu_available); +#endif + +/* Position-independent library routines */ +KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); +KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); +KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(memset, __pi_memset); + +#ifdef CONFIG_KASAN +KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); +#endif + +/* Kernel memory sections */ +KVM_NVHE_ALIAS(__start_rodata); +KVM_NVHE_ALIAS(__end_rodata); +KVM_NVHE_ALIAS(__bss_start); +KVM_NVHE_ALIAS(__bss_stop); + +/* Hyp memory sections */ +KVM_NVHE_ALIAS(__hyp_idmap_text_start); +KVM_NVHE_ALIAS(__hyp_idmap_text_end); +KVM_NVHE_ALIAS(__hyp_text_start); +KVM_NVHE_ALIAS(__hyp_text_end); +KVM_NVHE_ALIAS(__hyp_bss_start); +KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_rodata_start); +KVM_NVHE_ALIAS(__hyp_rodata_end); + +/* pKVM static key */ +KVM_NVHE_ALIAS(kvm_protected_mode_initialized); + +#endif /* CONFIG_KVM */ + +#endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c7fcb232fe47..7bc3ba897901 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -1,27 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Linker script macros to generate Image header fields. * * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __ASM_IMAGE_H -#define __ASM_IMAGE_H +#ifndef __ARM64_KERNEL_IMAGE_H +#define __ARM64_KERNEL_IMAGE_H #ifndef LINKER_SCRIPT #error This file should only be included in vmlinux.lds.S #endif +#include <asm/image.h> + /* * There aren't any ELF relocations we can use to endian-swap values known only * at link time (e.g. 
the subtraction of two symbol addresses), so we must get @@ -47,19 +38,22 @@ sym##_lo32 = DATA_LE32((data) & 0xffffffff); \ sym##_hi32 = DATA_LE32((data) >> 32) +#define __HEAD_FLAG(field) (__HEAD_FLAG_##field << \ + ARM64_IMAGE_FLAG_##field##_SHIFT) + #ifdef CONFIG_CPU_BIG_ENDIAN -#define __HEAD_FLAG_BE 1 +#define __HEAD_FLAG_BE ARM64_IMAGE_FLAG_BE #else -#define __HEAD_FLAG_BE 0 +#define __HEAD_FLAG_BE ARM64_IMAGE_FLAG_LE #endif #define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) #define __HEAD_FLAG_PHYS_BASE 1 -#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ - (__HEAD_FLAG_PAGE_SIZE << 1) | \ - (__HEAD_FLAG_PHYS_BASE << 3)) +#define __HEAD_FLAGS (__HEAD_FLAG(BE) | \ + __HEAD_FLAG(PAGE_SIZE) | \ + __HEAD_FLAG(PHYS_BASE)) /* * These will output as part of the Image header, which should be little-endian @@ -68,54 +62,6 @@ */ #define HEAD_SYMBOLS \ DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ - DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); -#ifdef CONFIG_EFI - -__efistub_stext_offset = stext - _text; - -/* - * Prevent the symbol aliases below from being emitted into the kallsyms - * table, by forcing them to be absolute symbols (which are conveniently - * ignored by scripts/kallsyms) rather than section relative symbols. - * The distinction is only relevant for partial linking, and only for symbols - * that are defined within a section declaration (which is not the case for - * the definitions below) so the resulting values will be identical. - */ -#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) - -/* - * The EFI stub has its own symbol namespace prefixed by __efistub_, to - * isolate it from the kernel proper. The following symbols are legally - * accessed by the stub, so provide some aliases to make them accessible. - * Only include data symbols here, or text symbols of functions that are - * guaranteed to be safe when executed at another offset than they were - * linked at. The routines below are all implemented in assembler in a - * position independent manner - */ -__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); -__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); -__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); -__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); -__efistub_memset = KALLSYMS_HIDE(__pi_memset); -__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); -__efistub_strnlen = KALLSYMS_HIDE(__pi_strnlen); -__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); -__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); -__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); - -#ifdef CONFIG_KASAN -__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); -__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); -__efistub___memset = KALLSYMS_HIDE(__pi_memset); -#endif - -__efistub__text = KALLSYMS_HIDE(_text); -__efistub__end = KALLSYMS_HIDE(_end); -__efistub__edata = KALLSYMS_HIDE(_edata); -__efistub_screen_info = KALLSYMS_HIDE(screen_info); - -#endif - -#endif /* __ASM_IMAGE_H */ +#endif /* __ARM64_KERNEL_IMAGE_H */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c deleted file mode 100644 index 2718a77da165..000000000000 --- a/arch/arm64/kernel/insn.c +++ /dev/null @@ -1,1483 +0,0 @@ -/* - * Copyright (C) 2013 Huawei Ltd. - * Author: Jiang Liu <liuj97@gmail.com> - * - * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
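The __HEAD_FLAG() helper added above replaces the hand-written shifts with token pasting. Below is a standalone sketch of the same paste-and-shift idea, using the documented Image-header layout (endianness in bit 0, page size in bits 2:1, physical placement in bit 3) and showing how (PAGE_SHIFT - 10) / 2 yields 1, 2 or 3 for 4K, 16K and 64K pages. The macro names here are made up for the example.

#include <stdio.h>

#define IMG_FLAG_BE_SHIFT        0
#define IMG_FLAG_PAGE_SIZE_SHIFT 1
#define IMG_FLAG_PHYS_BASE_SHIFT 3

/* Paste-and-shift helper in the same spirit as __HEAD_FLAG(). */
#define IMG_FLAG(field, val) ((unsigned long)(val) << IMG_FLAG_##field##_SHIFT)

int main(void)
{
	for (int page_shift = 12; page_shift <= 16; page_shift += 2) {
		unsigned long flags =
			IMG_FLAG(BE, 0) |			  /* little-endian */
			IMG_FLAG(PAGE_SIZE, (page_shift - 10) / 2) | /* 1, 2 or 3 */
			IMG_FLAG(PHYS_BASE, 1);
		printf("PAGE_SHIFT=%d -> flags=0x%lx\n", page_shift, flags);
	}
	return 0;
}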
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#include <linux/bitops.h> -#include <linux/bug.h> -#include <linux/compiler.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/spinlock.h> -#include <linux/stop_machine.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -#include <asm/cacheflush.h> -#include <asm/debug-monitors.h> -#include <asm/fixmap.h> -#include <asm/insn.h> -#include <asm/kprobes.h> - -#define AARCH64_INSN_SF_BIT BIT(31) -#define AARCH64_INSN_N_BIT BIT(22) - -static int aarch64_insn_encoding_class[] = { - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, -}; - -enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) -{ - return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; -} - -/* NOP is an alias of HINT */ -bool __kprobes aarch64_insn_is_nop(u32 insn) -{ - if (!aarch64_insn_is_hint(insn)) - return false; - - switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_YIELD: - case AARCH64_INSN_HINT_WFE: - case AARCH64_INSN_HINT_WFI: - case AARCH64_INSN_HINT_SEV: - case AARCH64_INSN_HINT_SEVL: - return false; - default: - return true; - } -} - -bool aarch64_insn_is_branch_imm(u32 insn) -{ - return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || - aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)); -} - -static DEFINE_RAW_SPINLOCK(patch_lock); - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool module = !core_kernel_text(uintaddr); - struct page *page; - - if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else if (!module) - page = phys_to_page(__pa_symbol(addr)); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. 
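As the comment above notes, A64 instructions are fixed-size little-endian words, and the deleted table maps bits [28:25] to a coarse encoding class. A small self-contained illustration of both points (class names shortened, NOP used as the worked example):

#include <stdint.h>
#include <stdio.h>

/* Bits [28:25] of the decoded word select the top-level encoding class;
 * same table layout as above, names shortened. */
static const char *const insn_class[16] = {
	"UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN",
	"LDST",    "DP_REG",  "LDST",    "DP_FPSIMD",
	"DP_IMM",  "DP_IMM",  "BR_SYS",  "BR_SYS",
	"LDST",    "DP_REG",  "LDST",    "DP_FPSIMD",
};

int main(void)
{
	/* NOP (0xd503201f) as it sits in memory, least significant byte first. */
	const uint8_t bytes[4] = { 0x1f, 0x20, 0x03, 0xd5 };
	uint32_t insn = bytes[0] | bytes[1] << 8 | bytes[2] << 16 |
			((uint32_t)bytes[3] << 24);

	printf("insn=0x%08x class=%s\n", insn, insn_class[(insn >> 25) & 0xf]);
	return 0;
}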
- */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = probe_kernel_write(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - -static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn) -{ - if (aarch64_get_insn_class(insn) != AARCH64_INSN_CLS_BR_SYS) - return false; - - return aarch64_insn_is_b(insn) || - aarch64_insn_is_bl(insn) || - aarch64_insn_is_svc(insn) || - aarch64_insn_is_hvc(insn) || - aarch64_insn_is_smc(insn) || - aarch64_insn_is_brk(insn) || - aarch64_insn_is_nop(insn); -} - -bool __kprobes aarch64_insn_uses_literal(u32 insn) -{ - /* ldr/ldrsw (literal), prfm */ - - return aarch64_insn_is_ldr_lit(insn) || - aarch64_insn_is_ldrsw_lit(insn) || - aarch64_insn_is_adr_adrp(insn) || - aarch64_insn_is_prfm_lit(insn); -} - -bool __kprobes aarch64_insn_is_branch(u32 insn) -{ - /* b, bl, cb*, tb*, b.cond, br, blr */ - - return aarch64_insn_is_b(insn) || - aarch64_insn_is_bl(insn) || - aarch64_insn_is_cbz(insn) || - aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_tbz(insn) || - aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_ret(insn) || - aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_bcond(insn); -} - -/* - * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a - * Section B2.6.5 "Concurrent modification and execution of instructions": - * Concurrent modification and execution of instructions can lead to the - * resulting instruction performing any behavior that can be achieved by - * executing any sequence of instructions that can be executed from the - * same Exception level, except where the instruction before modification - * and the instruction after modification is a B, BL, NOP, BKPT, SVC, HVC, - * or SMC instruction. - */ -bool __kprobes aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn) -{ - return __aarch64_insn_hotpatch_safe(old_insn) && - __aarch64_insn_hotpatch_safe(new_insn); -} - -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The first CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == 1) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* - * aarch64_insn_patch_text_nosync() calls flush_icache_range(), - * which ends with "dsb; isb" pair guaranteeing global - * visibility. - */ - /* Notify other processors with an additional increment. 
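The patching callback above implements a simple rendezvous: the first CPU to arrive does the patching and then bumps the counter one extra time, while every other CPU spins until the counter exceeds the number of online CPUs. A userspace model of the same protocol with C11 atomics and pthreads is sketched below; the thread count and names are arbitrary.

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

#define NCPUS 4

static atomic_int cpu_count;
static int patched;

static void *patch_cb(void *arg)
{
	(void)arg;

	if (atomic_fetch_add(&cpu_count, 1) + 1 == 1) {
		patched = 1;			 /* stand-in for the real patching */
		atomic_fetch_add(&cpu_count, 1); /* extra bump releases the waiters */
	} else {
		while (atomic_load(&cpu_count) <= NCPUS)
			;			 /* cpu_relax()/isb() in the kernel */
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];

	for (int i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, patch_cb, NULL);
	for (int i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);

	printf("patched = %d\n", patched);
	return 0;
}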
*/ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -static -int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - int ret; - u32 insn; - - /* Unsafe to patch multiple instructions without synchronizaiton */ - if (cnt == 1) { - ret = aarch64_insn_read(addrs[0], &insn); - if (ret) - return ret; - - if (aarch64_insn_hotpatch_safe(insn, insns[0])) { - /* - * ARMv8 architecture doesn't guarantee all CPUs see - * the new instruction after returning from function - * aarch64_insn_patch_text_nosync(). So send IPIs to - * all other CPUs to achieve instruction - * synchronization. - */ - ret = aarch64_insn_patch_text_nosync(addrs[0], insns[0]); - kick_all_cpus_sync(); - return ret; - } - } - - return aarch64_insn_patch_text_sync(addrs, insns, cnt); -} - -static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, - u32 *maskp, int *shiftp) -{ - u32 mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_26: - mask = BIT(26) - 1; - shift = 0; - break; - case AARCH64_INSN_IMM_19: - mask = BIT(19) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_16: - mask = BIT(16) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_14: - mask = BIT(14) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_12: - mask = BIT(12) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_9: - mask = BIT(9) - 1; - shift = 12; - break; - case AARCH64_INSN_IMM_7: - mask = BIT(7) - 1; - shift = 15; - break; - case AARCH64_INSN_IMM_6: - case AARCH64_INSN_IMM_S: - mask = BIT(6) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_R: - mask = BIT(6) - 1; - shift = 16; - break; - default: - return -EINVAL; - } - - *maskp = mask; - *shiftp = shift; - - return 0; -} - -#define ADR_IMM_HILOSPLIT 2 -#define ADR_IMM_SIZE SZ_2M -#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_LOSHIFT 29 -#define ADR_IMM_HISHIFT 5 - -u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) -{ - u32 immlo, immhi, mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; - immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; - insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; - mask = ADR_IMM_SIZE - 1; - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", - type); - return 0; - } - } - - return (insn >> shift) & mask; -} - -u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, - u32 insn, u64 imm) -{ - u32 immlo, immhi, mask; - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; - imm >>= ADR_IMM_HILOSPLIT; - immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; - imm = immlo | immhi; - mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | - (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); - break; - default: - if (aarch64_get_imm_shift_mask(type, 
&mask, &shift) < 0) { - pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", - type); - return AARCH64_BREAK_FAULT; - } - } - - /* Update the immediate field. */ - insn &= ~(mask << shift); - insn |= (imm & mask) << shift; - - return insn; -} - -u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, - u32 insn) -{ - int shift; - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return 0; - } - - return (insn >> shift) & GENMASK(4, 0); -} - -static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, - u32 insn, - enum aarch64_insn_register reg) -{ - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { - pr_err("%s: unknown register encoding %d\n", __func__, reg); - return AARCH64_BREAK_FAULT; - } - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - case AARCH64_INSN_REGTYPE_RS: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~(GENMASK(4, 0) << shift); - insn |= reg << shift; - - return insn; -} - -static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, - u32 insn) -{ - u32 size; - - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: - pr_err("%s: unknown size encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~GENMASK(31, 30); - insn |= size << 30; - - return insn; -} - -static inline long branch_imm_common(unsigned long pc, unsigned long addr, - long range) -{ - long offset; - - if ((pc & 0x3) || (addr & 0x3)) { - pr_err("%s: A64 instructions must be word aligned\n", __func__); - return range; - } - - offset = ((long)addr - (long)pc); - - if (offset < -range || offset >= range) { - pr_err("%s: offset out of range\n", __func__); - return range; - } - - return offset; -} - -u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - /* - * B/BL support [-128M, 128M) offset - * ARM64 virtual address arrangement guarantees all kernel and module - * texts are within +/-128M. 
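For the B/BL case described in the comment above, the byte offset must be word-aligned and within +/-128M, and is stored as offset/4 in the low 26 bits of the instruction. A standalone sketch of that encoding follows; 0 is used as an ad-hoc error marker here instead of AARCH64_BREAK_FAULT.

#include <stdint.h>
#include <stdio.h>

static uint32_t gen_b(uint64_t pc, uint64_t target)
{
	int64_t offset = (int64_t)(target - pc);

	/* Word-aligned and within +/-128M, as branch_imm_common() checks. */
	if ((offset & 3) || offset < -(1LL << 27) || offset >= (1LL << 27))
		return 0;	/* out of range: a veneer would be needed */

	/* B opcode is 0b000101 in bits [31:26]; imm26 is the word offset. */
	return 0x14000000u | (((uint64_t)offset >> 2) & 0x03ffffffu);
}

int main(void)
{
	/* Branch forward by 8 bytes: expect 0x14000002. */
	printf("0x%08x\n",
	       (unsigned)gen_b(0xffff800010000000ull, 0xffff800010000008ull));
	return 0;
}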
- */ - offset = branch_imm_common(pc, addr, SZ_128M); - if (offset >= SZ_128M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_bl_value(); - break; - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_b_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - if (offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_COMP_ZERO: - insn = aarch64_insn_get_cbz_value(); - break; - case AARCH64_INSN_BRANCH_COMP_NONZERO: - insn = aarch64_insn_get_cbnz_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_condition cond) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - - insn = aarch64_insn_get_bcond_value(); - - if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { - pr_err("%s: unknown condition encoding %d\n", __func__, cond); - return AARCH64_BREAK_FAULT; - } - insn |= cond; - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) -{ - return aarch64_insn_get_hint_value() | op; -} - -u32 __kprobes aarch64_insn_gen_nop(void) -{ - return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); -} - -u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, - enum aarch64_insn_branch_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_br_value(); - break; - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_blr_value(); - break; - case AARCH64_INSN_BRANCH_RETURN: - insn = aarch64_insn_get_ret_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); -} - -u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register offset, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_REG_OFFSET: - insn = aarch64_insn_get_ldr_reg_value(); - break; - case AARCH64_INSN_LDST_STORE_REG_OFFSET: - insn = aarch64_insn_get_str_reg_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - insn = 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - offset); -} - -u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_register base, - int offset, - enum aarch64_insn_variant variant, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - int shift; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: - insn = aarch64_insn_get_ldp_pre_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: - insn = aarch64_insn_get_stp_pre_value(); - break; - case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: - insn = aarch64_insn_get_ldp_post_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: - insn = aarch64_insn_get_stp_post_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if ((offset & 0x3) || (offset < -256) || (offset > 252)) { - pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 2; - break; - case AARCH64_INSN_VARIANT_64BIT: - if ((offset & 0x7) || (offset < -512) || (offset > 504)) { - pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 3; - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg1); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - reg2); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, - offset >> shift); -} - -u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register state, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_EX: - insn = aarch64_insn_get_load_ex_value(); - break; - case AARCH64_INSN_LDST_STORE_EX: - insn = aarch64_insn_get_store_ex_value(); - break; - default: - pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - AARCH64_INSN_REG_ZR); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - state); -} - -static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy, - u32 insn) -{ - u32 imm_type = 0, imm_target = 0, imm_policy = 0; - - switch (type) { - case AARCH64_INSN_PRFM_TYPE_PLD: - break; - case AARCH64_INSN_PRFM_TYPE_PLI: - imm_type = BIT(0); - break; - case AARCH64_INSN_PRFM_TYPE_PST: - imm_type = BIT(1); - break; - default: - pr_err("%s: unknown prfm type encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (target) { - case AARCH64_INSN_PRFM_TARGET_L1: - break; - case AARCH64_INSN_PRFM_TARGET_L2: - imm_target = BIT(0); - break; - case 
AARCH64_INSN_PRFM_TARGET_L3: - imm_target = BIT(1); - break; - default: - pr_err("%s: unknown prfm target encoding %d\n", __func__, target); - return AARCH64_BREAK_FAULT; - } - - switch (policy) { - case AARCH64_INSN_PRFM_POLICY_KEEP: - break; - case AARCH64_INSN_PRFM_POLICY_STRM: - imm_policy = BIT(0); - break; - default: - pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); - return AARCH64_BREAK_FAULT; - } - - /* In this case, imm5 is encoded into Rt field. */ - insn &= ~GENMASK(4, 0); - insn |= imm_policy | (imm_target << 1) | (imm_type << 3); - - return insn; -} - -u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, - enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy) -{ - u32 insn = aarch64_insn_get_prfm_value(); - - insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); - - insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); -} - -u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int imm, enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_imm_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_imm_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - if (imm & ~(SZ_4K - 1)) { - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); -} - -u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int immr, int imms, - enum aarch64_insn_variant variant, - enum aarch64_insn_bitfield_type type) -{ - u32 insn; - u32 mask; - - switch (type) { - case AARCH64_INSN_BITFIELD_MOVE: - insn = aarch64_insn_get_bfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: - insn = aarch64_insn_get_ubfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_SIGNED: - insn = aarch64_insn_get_sbfm_value(); - break; - default: - pr_err("%s: unknown bitfield encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - mask = GENMASK(4, 0); - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; - mask = GENMASK(5, 0); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - if (immr & ~mask) { - pr_err("%s: invalid immr encoding %d\n", __func__, immr); - return AARCH64_BREAK_FAULT; - } - if (imms & ~mask) { - pr_err("%s: invalid imms encoding %d\n", __func__, imms); - return 
AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, - int imm, int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_movewide_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_MOVEWIDE_ZERO: - insn = aarch64_insn_get_movz_value(); - break; - case AARCH64_INSN_MOVEWIDE_KEEP: - insn = aarch64_insn_get_movk_value(); - break; - case AARCH64_INSN_MOVEWIDE_INVERSE: - insn = aarch64_insn_get_movn_value(); - break; - default: - pr_err("%s: unknown movewide encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (imm & ~(SZ_64K - 1)) { - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift != 0 && shift != 16) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn |= (shift >> 4) << 21; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); -} - -u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant, - enum aarch64_insn_data1_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA1_REVERSE_16: - insn = 
aarch64_insn_get_rev16_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_32: - insn = aarch64_insn_get_rev32_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_64: - if (variant != AARCH64_INSN_VARIANT_64BIT) { - pr_err("%s: invalid variant for reverse64 %d\n", - __func__, variant); - return AARCH64_BREAK_FAULT; - } - insn = aarch64_insn_get_rev64_value(); - break; - default: - pr_err("%s: unknown data1 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); -} - -u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_data2_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA2_UDIV: - insn = aarch64_insn_get_udiv_value(); - break; - case AARCH64_INSN_DATA2_SDIV: - insn = aarch64_insn_get_sdiv_value(); - break; - case AARCH64_INSN_DATA2_LSLV: - insn = aarch64_insn_get_lslv_value(); - break; - case AARCH64_INSN_DATA2_LSRV: - insn = aarch64_insn_get_lsrv_value(); - break; - case AARCH64_INSN_DATA2_ASRV: - insn = aarch64_insn_get_asrv_value(); - break; - case AARCH64_INSN_DATA2_RORV: - insn = aarch64_insn_get_rorv_value(); - break; - default: - pr_err("%s: unknown data2 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); -} - -u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_variant variant, - enum aarch64_insn_data3_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA3_MADD: - insn = aarch64_insn_get_madd_value(); - break; - case AARCH64_INSN_DATA3_MSUB: - insn = aarch64_insn_get_msub_value(); - break; - default: - pr_err("%s: unknown data3 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - reg1); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - reg2); -} - -u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_logic_type type) -{ - 
u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_value(); - break; - case AARCH64_INSN_LOGIC_BIC: - insn = aarch64_insn_get_bic_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_value(); - break; - case AARCH64_INSN_LOGIC_ORN: - insn = aarch64_insn_get_orn_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_value(); - break; - case AARCH64_INSN_LOGIC_EON: - insn = aarch64_insn_get_eon_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_value(); - break; - case AARCH64_INSN_LOGIC_BIC_SETFLAGS: - insn = aarch64_insn_get_bics_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -/* - * Decode the imm field of a branch, and return the byte offset as a - * signed value (so it can be used when computing a new branch - * target). - */ -s32 aarch64_get_branch_offset(u32 insn) -{ - s32 imm; - - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); - return (imm << 6) >> 4; - } - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); - return (imm << 13) >> 11; - } - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); - return (imm << 18) >> 16; - } - - /* Unhandled instruction */ - BUG(); -} - -/* - * Encode the displacement of a branch in the imm field and return the - * updated instruction. - */ -u32 aarch64_set_branch_offset(u32 insn, s32 offset) -{ - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, - offset >> 2); - - /* Unhandled instruction */ - BUG(); -} - -s32 aarch64_insn_adrp_get_offset(u32 insn) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; -} - -u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, - offset >> 12); -} - -/* - * Extract the Op/CR data from a msr/mrs instruction. 
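The (imm << 6) >> 4 idiom used by aarch64_get_branch_offset() both sign-extends the 26-bit field and converts words to bytes. Below is a small standalone check of that idiom, written to avoid the signed-overflow corner the in-kernel version tolerates, and assuming the usual arithmetic right shift for negative values.

#include <stdint.h>
#include <stdio.h>

static int32_t imm26_to_bytes(uint32_t imm26)
{
	/* Shift left unsigned so the field's sign bit lands in bit 31, then
	 * arithmetic-shift back by (6 - 2): sign extension plus a *4. */
	return (int32_t)((imm26 & 0x03ffffff) << 6) >> 4;
}

int main(void)
{
	printf("%d\n", imm26_to_bytes(0x0000002));	/* +2 words -> +8 bytes */
	printf("%d\n", imm26_to_bytes(0x3fffffe));	/* -2 words -> -8 bytes */
	return 0;
}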
- */ -u32 aarch64_insn_extract_system_reg(u32 insn) -{ - return (insn & 0x1FFFE0) >> 5; -} - -bool aarch32_insn_is_wide(u32 insn) -{ - return insn >= 0xe800; -} - -/* - * Macros/defines for extracting register numbers from instruction. - */ -u32 aarch32_insn_extract_reg_num(u32 insn, int offset) -{ - return (insn & (0xf << offset)) >> offset; -} - -#define OPC2_MASK 0x7 -#define OPC2_OFFSET 5 -u32 aarch32_insn_mcr_extract_opc2(u32 insn) -{ - return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; -} - -#define CRM_MASK 0xf -u32 aarch32_insn_mcr_extract_crm(u32 insn) -{ - return insn & CRM_MASK; -} - -static bool __kprobes __check_eq(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) != 0; -} - -static bool __kprobes __check_ne(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) == 0; -} - -static bool __kprobes __check_cs(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_cc(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_mi(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_pl(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_vs(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) != 0; -} - -static bool __kprobes __check_vc(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) == 0; -} - -static bool __kprobes __check_hi(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_ls(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_ge(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_lt(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_gt(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_le(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_al(unsigned long pstate) -{ - return true; -} - -/* - * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that - * it behaves identically to 0b1110 ("al"). - */ -pstate_check_t * const aarch32_opcode_cond_checks[16] = { - __check_eq, __check_ne, __check_cs, __check_cc, - __check_mi, __check_pl, __check_vs, __check_vc, - __check_hi, __check_ls, __check_ge, __check_lt, - __check_gt, __check_le, __check_al, __check_al -}; diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 354be2a872ae..aa7a4ec6a3ae 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/io.c * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
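The condition-check helpers above rely on the NZCV layout (N = bit 31, Z = 30, C = 29, V = 28): shifting PSTATE left by 3 lines V up under N so an XOR yields N^V, and shifting left by 1 lines Z up under N. A standalone check that the GT trick matches the textbook definition (Z clear and N == V) for all sixteen flag combinations:

#include <stdio.h>
#include <stdbool.h>

#define PSR_N_BIT 0x80000000ul
#define PSR_Z_BIT 0x40000000ul
#define PSR_V_BIT 0x10000000ul

static bool check_gt_trick(unsigned long pstate)
{
	unsigned long temp = pstate ^ (pstate << 3);	/* N ^= V */

	temp |= (pstate << 1);				/* N |= Z */
	return (temp & PSR_N_BIT) == 0;
}

static bool check_gt_plain(unsigned long pstate)
{
	bool n = pstate & PSR_N_BIT, z = pstate & PSR_Z_BIT,
	     v = pstate & PSR_V_BIT;

	return !z && (n == v);
}

int main(void)
{
	for (unsigned long nzcv = 0; nzcv < 16; nzcv++) {
		unsigned long pstate = nzcv << 28;

		if (check_gt_trick(pstate) != check_gt_plain(pstate))
			printf("mismatch at NZCV=%lx\n", nzcv);
	}
	printf("done\n");
	return 0;
}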
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/export.h> @@ -25,8 +14,7 @@ */ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) { - while (count && (!IS_ALIGNED((unsigned long)from, 8) || - !IS_ALIGNED((unsigned long)to, 8))) { + while (count && !IS_ALIGNED((unsigned long)from, 8)) { *(u8 *)to = __raw_readb(from); from++; to++; @@ -54,23 +42,22 @@ EXPORT_SYMBOL(__memcpy_fromio); */ void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) { - while (count && (!IS_ALIGNED((unsigned long)to, 8) || - !IS_ALIGNED((unsigned long)from, 8))) { - __raw_writeb(*(volatile u8 *)from, to); + while (count && !IS_ALIGNED((unsigned long)to, 8)) { + __raw_writeb(*(u8 *)from, to); from++; to++; count--; } while (count >= 8) { - __raw_writeq(*(volatile u64 *)from, to); + __raw_writeq(*(u64 *)from, to); from += 8; to += 8; count -= 8; } while (count) { - __raw_writeb(*(volatile u8 *)from, to); + __raw_writeb(*(u8 *)from, to); from++; to++; count--; diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 713561e5bcab..38dbd3828f13 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/irq.c * @@ -7,48 +8,45 @@ * Dynamic Tick Timer written by Tony Lindgren <tony@atomide.com> and * Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>. * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
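The io.c hunk above relaxes the alignment requirement so that only the __iomem pointer has to reach 8-byte alignment before the copy switches to 64-bit accesses; unaligned accesses to normal memory are fine on arm64. A userspace model of that head/bulk/tail shape, where memcpy stands in for the readq and the possibly-unaligned store, with arbitrary buffer sizes:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

static void copy_from_dev(void *to, const unsigned char *dev, size_t count)
{
	unsigned char *dst = to;

	while (count && ((uintptr_t)dev & 7)) {	/* head: align the device side */
		*dst++ = *dev++;
		count--;
	}

	while (count >= 8) {			/* bulk: 64-bit "device" reads */
		uint64_t v;

		memcpy(&v, dev, 8);		/* __raw_readq() stand-in */
		memcpy(dst, &v, 8);		/* destination may be unaligned */
		dev += 8;
		dst += 8;
		count -= 8;
	}

	while (count--)				/* tail: leftover bytes */
		*dst++ = *dev++;
}

int main(void)
{
	unsigned char src[32], dst[32] = { 0 };

	for (int i = 0; i < 32; i++)
		src[i] = (unsigned char)i;

	copy_from_dev(dst + 1, src + 3, 20);	/* both sides deliberately unaligned */
	printf("%d %d\n", dst[1], dst[20]);	/* 3 and 22 */
	return 0;
}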
*/ -#include <linux/kernel_stat.h> #include <linux/irq.h> #include <linux/memory.h> #include <linux/smp.h> +#include <linux/hardirq.h> #include <linux/init.h> #include <linux/irqchip.h> +#include <linux/kprobes.h> +#include <linux/scs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <asm/daifflags.h> +#include <asm/exception.h> +#include <asm/vmap_stack.h> +#include <asm/softirq_stack.h> -unsigned long irq_err_count; +/* Only access this in an NMI enter/exit */ +DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts); DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); -int arch_show_interrupts(struct seq_file *p, int prec) -{ - show_ipi_list(p, prec); - seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); - return 0; -} -void (*handle_arch_irq)(struct pt_regs *) = NULL; +DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); -void __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) { - if (handle_arch_irq) + int cpu; + + if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) return; - handle_arch_irq = handle_irq; + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(cpu_to_node(cpu)); } #ifdef CONFIG_VMAP_STACK @@ -58,17 +56,7 @@ static void init_irq_stacks(void) unsigned long *p; for_each_possible_cpu(cpu) { - /* - * To ensure that VMAP'd stack overflow detection works - * correctly, the IRQ stacks need to have the same - * alignment as other stacks. - */ - p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, - 0, cpu_to_node(cpu), - __builtin_return_address(0)); - + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } @@ -85,10 +73,63 @@ static void init_irq_stacks(void) } #endif +#ifndef CONFIG_PREEMPT_RT +static void ____do_softirq(struct pt_regs *regs) +{ + __do_softirq(); +} + +void do_softirq_own_stack(void) +{ + call_on_irq_stack(NULL, ____do_softirq); +} +#endif + +static void default_handle_irq(struct pt_regs *regs) +{ + panic("IRQ taken without a root IRQ handler\n"); +} + +static void default_handle_fiq(struct pt_regs *regs) +{ + panic("FIQ taken without a root FIQ handler\n"); +} + +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq; +void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq; + +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq != default_handle_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + pr_info("Root IRQ handler: %ps\n", handle_irq); + return 0; +} + +int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *)) +{ + if (handle_arch_fiq != default_handle_fiq) + return -EBUSY; + + handle_arch_fiq = handle_fiq; + pr_info("Root FIQ handler: %ps\n", handle_fiq); + return 0; +} + void __init init_IRQ(void) { init_irq_stacks(); + init_irq_scs(); irqchip_init(); - if (!handle_arch_irq) - panic("No interrupt controller found."); + + if (system_uses_irq_prio_masking()) { + /* + * Now that we have a stack for our IRQ handler, set + * the PMR/PSR pair to a consistent state. 
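set_handle_irq()/set_handle_fiq() now follow a claim-once pattern: the root handler can be installed exactly once, and later callers get -EBUSY instead of silently replacing it. A standalone sketch of that pattern from the caller's point of view; the handler names are made up, and in the kernel the callers are irqchip drivers.

#include <stdio.h>
#include <errno.h>

struct pt_regs;		/* opaque in this sketch */

static void default_handle_irq(struct pt_regs *regs) { (void)regs; /* panic() in the kernel */ }
static void my_gic_handle_irq(struct pt_regs *regs)  { (void)regs; /* driver handler */ }

static void (*handle_arch_irq)(struct pt_regs *) = default_handle_irq;

static int set_handle_irq(void (*handler)(struct pt_regs *))
{
	if (handle_arch_irq != default_handle_irq)
		return -EBUSY;		/* a root handler is already installed */
	handle_arch_irq = handler;
	return 0;
}

int main(void)
{
	printf("%d\n", set_handle_irq(my_gic_handle_irq));	/* 0 */
	printf("%d\n", set_handle_irq(my_gic_handle_irq));	/* -EBUSY (-16 on Linux) */
	return 0;
}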
+ */ + WARN_ON(read_sysreg(daif) & PSR_A_BIT); + local_daif_restore(DAIF_PROCCTX_NOIRQ); + } } diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index c2dd1ad3e648..faf88ec9c48e 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -1,53 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Huawei Ltd. * Author: Jiang Liu <liuj97@gmail.com> * * Based on arch/arm/kernel/jump_label.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kernel.h> #include <linux/jump_label.h> #include <asm/insn.h> - -#ifdef HAVE_JUMP_LABEL +#include <asm/patching.h> void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - void *addr = (void *)entry->code; + void *addr = (void *)jump_entry_code(entry); u32 insn; if (type == JUMP_LABEL_JMP) { - insn = aarch64_insn_gen_branch_imm(entry->code, - entry->target, + insn = aarch64_insn_gen_branch_imm(jump_entry_code(entry), + jump_entry_target(entry), AARCH64_INSN_BRANCH_NOLINK); } else { insn = aarch64_insn_gen_nop(); } - aarch64_insn_patch_text(&addr, &insn, 1); + aarch64_insn_patch_text_nosync(addr, insn); } - -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - /* - * We use the architected A64 NOP in arch_static_branch, so there's no - * need to patch an identical A64 NOP over the top of it here. The core - * will call arch_jump_label_transform from a module notifier if the - * NOP needs to be replaced by a branch. - */ -} - -#endif /* HAVE_JUMP_LABEL */ diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 47080c49cc7e..325455d16dbc 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/cache.h> @@ -13,74 +10,25 @@ #include <linux/mm_types.h> #include <linux/sched.h> #include <linux/types.h> +#include <linux/pgtable.h> +#include <linux/random.h> #include <asm/fixmap.h> #include <asm/kernel-pgtable.h> #include <asm/memory.h> #include <asm/mmu.h> -#include <asm/pgtable.h> #include <asm/sections.h> +#include <asm/setup.h> u64 __ro_after_init module_alloc_base; u16 __initdata memstart_offset_seed; -static __init u64 get_kaslr_seed(void *fdt) -{ - int node, len; - fdt64_t *prop; - u64 ret; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - return 0; - - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); - if (!prop || len != sizeof(u64)) - return 0; - - ret = fdt64_to_cpu(*prop); - *prop = 0; - return ret; -} - -static __init const u8 *get_cmdline(void *fdt) -{ - static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; - - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - return prop; - } -out: - return default_cmdline; -} +struct arm64_ftr_override kaslr_feature_override __initdata; -extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, - pgprot_t prot); - -/* - * This routine will be executed with the kernel mapped at its default virtual - * address, and if it returns successfully, the kernel will be remapped, and - * start_kernel() will be executed from a randomized virtual offset. The - * relocation will result in all absolute references (e.g., static variables - * containing function pointers) to be reinitialized, and zero-initialized - * .bss variables will be reset to 0. - */ -u64 __init kaslr_early_init(u64 dt_phys) +static int __init kaslr_init(void) { - void *fdt; - u64 seed, offset, mask, module_range; - const u8 *cmdline, *str; - int size; + u64 module_range; + u32 seed; /* * Set a reasonable default for module_alloc_base in case @@ -88,97 +36,58 @@ u64 __init kaslr_early_init(u64 dt_phys) */ module_alloc_base = (u64)_etext - MODULES_VSIZE; - /* - * Try to map the FDT early. If this fails, we simply bail, - * and proceed with KASLR disabled. We will make another - * attempt at mapping the FDT in setup_machine() - */ - early_fixmap_init(); - fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); - if (!fdt) - return 0; - - /* - * Retrieve (and wipe) the seed from the FDT - */ - seed = get_kaslr_seed(fdt); - if (!seed) + if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { + pr_info("KASLR disabled on command line\n"); return 0; + } - /* - * Check if 'nokaslr' appears on the command line, and - * return 0 if that is the case. - */ - cmdline = get_cmdline(fdt); - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + if (!kaslr_offset()) { + pr_warn("KASLR disabled due to lack of seed\n"); return 0; + } - /* - * OK, so we are proceeding with KASLR enabled. Calculate a suitable - * kernel image offset from the seed. Let's place the kernel in the - * lower half of the VMALLOC area (VA_BITS - 2). 
- * Even if we could randomize at page granularity for 16k and 64k pages, - * let's always round to 2 MB so we don't interfere with the ability to - * map using contiguous PTEs - */ - mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); - offset = seed & mask; - - /* use the top 16 bits to randomize the linear region */ - memstart_offset_seed = seed >> 48; + pr_info("KASLR enabled\n"); /* - * The kernel Image should not extend across a 1GB/32MB/512MB alignment - * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this - * happens, round down the KASLR offset by (1 << SWAPPER_TABLE_SHIFT). - * - * NOTE: The references to _text and _end below will already take the - * modulo offset (the physical displacement modulo 2 MB) into - * account, given that the physical placement is controlled by - * the loader, and will not change as a result of the virtual - * mapping we choose. + * KASAN without KASAN_VMALLOC does not expect the module region to + * intersect the vmalloc region, since shadow memory is allocated for + * each module at load time, whereas the vmalloc region will already be + * shadowed by KASAN zero pages. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) - offset = round_down(offset, 1 << SWAPPER_TABLE_SHIFT); + BUILD_BUG_ON((IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) && + !IS_ENABLED(CONFIG_KASAN_VMALLOC)); - if (IS_ENABLED(CONFIG_KASAN)) - /* - * KASAN does not expect the module region to intersect the - * vmalloc region, since shadow memory is allocated for each - * module at load time, whereas the vmalloc region is shadowed - * by KASAN zero pages. So keep modules out of the vmalloc - * region if KASAN is enabled. - */ - return offset; + seed = get_random_u32(); if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { /* - * Randomize the module region independently from the core - * kernel. This prevents modules from leaking any information + * Randomize the module region over a 2 GB window covering the + * kernel. This reduces the risk of modules leaking information * about the address of the kernel itself, but results in * branches between modules and the core kernel that are * resolved via PLTs. (Branches between modules will be * resolved normally.) */ - module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE; - module_alloc_base = VMALLOC_START; + module_range = SZ_2G - (u64)(_end - _stext); + module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR); } else { /* * Randomize the module region by setting module_alloc_base to * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, * _stext) . This guarantees that the resulting region still * covers [_stext, _etext], and that all relative branches can - * be resolved without veneers. + * be resolved without veneers unless this region is exhausted + * and we fall back to a larger 2GB window in module_alloc() + * when ARM64_MODULE_PLTS is enabled. 
*/ module_range = MODULES_VSIZE - (u64)(_etext - _stext); - module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; } /* use the lower 21 bits to randomize the base of the module region */ module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; module_alloc_base &= PAGE_MASK; - return offset; + return 0; } +subsys_initcall(kaslr_init) diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c new file mode 100644 index 000000000000..5ed6a585f21f --- /dev/null +++ b/arch/arm64/kernel/kexec_image.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kexec image loader + + * Copyright (C) 2018 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + */ + +#define pr_fmt(fmt) "kexec_file(Image): " fmt + +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/pe.h> +#include <linux/string.h> +#include <asm/byteorder.h> +#include <asm/cpufeature.h> +#include <asm/image.h> +#include <asm/memory.h> + +static int image_probe(const char *kernel_buf, unsigned long kernel_len) +{ + const struct arm64_image_header *h = + (const struct arm64_image_header *)(kernel_buf); + + if (!h || (kernel_len < sizeof(*h))) + return -EINVAL; + + if (memcmp(&h->magic, ARM64_IMAGE_MAGIC, sizeof(h->magic))) + return -EINVAL; + + return 0; +} + +static void *image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct arm64_image_header *h; + u64 flags, value; + bool be_image, be_kernel; + struct kexec_buf kbuf; + unsigned long text_offset, kernel_segment_number; + struct kexec_segment *kernel_segment; + int ret; + + /* + * We require a kernel with an unambiguous Image header. Per + * Documentation/arm64/booting.rst, this is the case when image_size + * is non-zero (practically speaking, since v3.17). + */ + h = (struct arm64_image_header *)kernel; + if (!h->image_size) + return ERR_PTR(-EINVAL); + + /* Check cpu features */ + flags = le64_to_cpu(h->flags); + be_image = arm64_image_flag_field(flags, ARM64_IMAGE_FLAG_BE); + be_kernel = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + if ((be_image != be_kernel) && !system_supports_mixed_endian()) + return ERR_PTR(-EINVAL); + + value = arm64_image_flag_field(flags, ARM64_IMAGE_FLAG_PAGE_SIZE); + if (((value == ARM64_IMAGE_FLAG_PAGE_SIZE_4K) && + !system_supports_4kb_granule()) || + ((value == ARM64_IMAGE_FLAG_PAGE_SIZE_64K) && + !system_supports_64kb_granule()) || + ((value == ARM64_IMAGE_FLAG_PAGE_SIZE_16K) && + !system_supports_16kb_granule())) + return ERR_PTR(-EINVAL); + + /* Load the kernel */ + kbuf.image = image; + kbuf.buf_min = 0; + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = false; + + kbuf.buffer = kernel; + kbuf.bufsz = kernel_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = le64_to_cpu(h->image_size); + text_offset = le64_to_cpu(h->text_offset); + kbuf.buf_align = MIN_KIMG_ALIGN; + + /* Adjust kernel segment with TEXT_OFFSET */ + kbuf.memsz += text_offset; + + kernel_segment_number = image->nr_segments; + + /* + * The location of the kernel segment may make it impossible to satisfy + * the other segment requirements, so we try repeatedly to find a + * location that will work. 
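The module_alloc_base calculation in kaslr_init() above scales a random 21-bit value across the chosen module window and then rounds down to a page boundary. A stand-alone sketch of that arithmetic, with a made-up window start, assuming 4K pages:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the kaslr_init() module-base arithmetic: 21 bits of the seed
 * pick a page-aligned offset inside the module window. */
static uint64_t pick_module_base(uint64_t window_start, uint64_t window_size,
				 uint32_t seed)
{
	uint64_t base = window_start;

	base += (window_size * (seed & ((1 << 21) - 1))) >> 21;
	return base & ~0xfffULL;	/* PAGE_MASK for 4K pages */
}

int main(void)
{
	uint64_t base = pick_module_base(0xffff800008000000ULL,	/* illustrative */
					 2ULL << 30, 0x123456);

	printf("module_alloc_base = 0x%llx\n", (unsigned long long)base);
	return 0;
}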
+ */ + while ((ret = kexec_add_buffer(&kbuf)) == 0) { + /* Try to load additional data */ + kernel_segment = &image->segment[kernel_segment_number]; + ret = load_other_segments(image, kernel_segment->mem, + kernel_segment->memsz, initrd, + initrd_len, cmdline); + if (!ret) + break; + + /* + * We couldn't find space for the other segments; erase the + * kernel segment and try the next available hole. + */ + image->nr_segments -= 1; + kbuf.buf_min = kernel_segment->mem + kernel_segment->memsz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + } + + if (ret) { + pr_err("Could not find any suitable kernel location!"); + return ERR_PTR(ret); + } + + kernel_segment = &image->segment[kernel_segment_number]; + kernel_segment->mem += text_offset; + kernel_segment->memsz -= text_offset; + image->start = kernel_segment->mem; + + pr_debug("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_segment->mem, kbuf.bufsz, + kernel_segment->memsz); + + return NULL; +} + +const struct kexec_file_ops kexec_image_ops = { + .probe = image_probe, + .load = image_load, +#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG + .verify_sig = kexec_kernel_verify_pe_sig, +#endif +}; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 2122cd187f19..cda9c1e9864f 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64 KGDB support * @@ -5,18 +6,6 @@ * * Copyright (C) 2013 Cavium Inc. * Author: Vijaya Kumar K <vijaya.kumar@caviumnetworks.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
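image_probe() above accepts only a buffer carrying the arm64 Image magic, and image_load() then reads text_offset, image_size and flags from the same header. A stand-alone sketch of that 64-byte header layout, following Documentation/arm64/booting.rst (little-endian host assumed for brevity; the field handling is illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 64-byte arm64 Image header; all multi-byte fields are little-endian. */
struct arm64_image_hdr {
	uint32_t code0, code1;
	uint64_t text_offset;	/* load offset from a 2 MB aligned base */
	uint64_t image_size;	/* effective image size, 0 on pre-3.17 kernels */
	uint64_t flags;		/* bit 0: BE, bits 2-1: page size, bit 3: anywhere */
	uint64_t res2, res3, res4;
	uint32_t magic;		/* "ARM\x64" */
	uint32_t res5;
};

int check_arm64_image(const void *buf, size_t len)
{
	struct arm64_image_hdr hdr;

	if (len < sizeof(hdr))
		return -1;
	memcpy(&hdr, buf, sizeof(hdr));

	if (memcmp(&hdr.magic, "ARM\x64", 4))
		return -1;	/* not an arm64 Image */
	if (!hdr.image_size)
		return -1;	/* header too old for kexec_file */

	printf("text_offset=%#llx image_size=%#llx flags=%#llx\n",
	       (unsigned long long)hdr.text_offset,
	       (unsigned long long)hdr.image_size,
	       (unsigned long long)hdr.flags);
	return 0;
}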
*/ #include <linux/bug.h> @@ -28,6 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> +#include <asm/patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -138,14 +128,25 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task) { - struct pt_regs *thread_regs; + struct cpu_context *cpu_context = &task->thread.cpu_context; /* Initialize to zero */ memset((char *)gdb_regs, 0, NUMREGBYTES); - thread_regs = task_pt_regs(task); - memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES); - /* Special case for PSTATE (check comments in asm/kgdb.h for details) */ - dbg_get_reg(33, gdb_regs + GP_REG_BYTES, thread_regs); + + gdb_regs[19] = cpu_context->x19; + gdb_regs[20] = cpu_context->x20; + gdb_regs[21] = cpu_context->x21; + gdb_regs[22] = cpu_context->x22; + gdb_regs[23] = cpu_context->x23; + gdb_regs[24] = cpu_context->x24; + gdb_regs[25] = cpu_context->x25; + gdb_regs[26] = cpu_context->x26; + gdb_regs[27] = cpu_context->x27; + gdb_regs[28] = cpu_context->x28; + gdb_regs[29] = cpu_context->fp; + + gdb_regs[31] = cpu_context->sp; + gdb_regs[32] = cpu_context->pc; } void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) @@ -231,60 +232,46 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_brk_fn) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; - kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + kgdb_handle_exception(0, SIGTRAP, 0, regs); + return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_step_brk_fn); static struct break_hook kgdb_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_DYN_DBG_BRK_IMM), - .fn = kgdb_brk_fn + .fn = kgdb_brk_fn, + .imm = KGDB_DYN_DBG_BRK_IMM, }; static struct break_hook kgdb_compiled_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_COMPILED_DBG_BRK_IMM), - .fn = kgdb_compiled_brk_fn + .fn = kgdb_compiled_brk_fn, + .imm = KGDB_COMPILED_DBG_BRK_IMM, }; static struct step_hook kgdb_step_hook = { .fn = kgdb_step_brk_fn }; -static void kgdb_call_nmi_hook(void *ignored) -{ - kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); -} - -void kgdb_roundup_cpus(unsigned long flags) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - static int __kgdb_notify(struct die_args *args, unsigned long cmd) { struct pt_regs *regs = args->regs; @@ -327,9 +314,9 @@ int kgdb_arch_init(void) if (ret != 0) return ret; - register_break_hook(&kgdb_brkpt_hook); - register_break_hook(&kgdb_compiled_brkpt_hook); - register_step_hook(&kgdb_step_hook); + register_kernel_break_hook(&kgdb_brkpt_hook); + register_kernel_break_hook(&kgdb_compiled_brkpt_hook); + register_kernel_step_hook(&kgdb_step_hook); return 0; } @@ -340,13 +327,13 @@ int 
kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_break_hook(&kgdb_brkpt_hook); - unregister_break_hook(&kgdb_compiled_brkpt_hook); - unregister_step_hook(&kgdb_step_hook); + unregister_kernel_break_hook(&kgdb_brkpt_hook); + unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook); + unregister_kernel_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } -struct kgdb_arch arch_kgdb_ops; +const struct kgdb_arch arch_kgdb_ops; int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 997e6b27ff6a..692e9d2e31e5 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -1,35 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Low-level user helpers placed in the vectors page for AArch32. + * AArch32 user helpers. * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. * * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * Copyright (C) 2012-2018 ARM Ltd. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * The kuser helpers below are mapped at a fixed address by + * aarch32_setup_additional_pages() and are provided for compatibility + * reasons with 32 bit (aarch32) applications that need them. * - * - * AArch32 user helpers. - * - * Each segment is 32-byte aligned and will be moved to the top of the high - * vector page. New segments (if ever needed) must be added in front of - * existing ones. This mechanism should be used only for things that are - * really small and justified, and not be abused freely. - * - * See Documentation/arm/kernel_user_helpers.txt for formal definitions. + * See Documentation/arm/kernel_user_helpers.rst for formal definitions. */ #include <asm/unistd.h> + .section .rodata .align 5 .globl __kuser_helper_start __kuser_helper_start: @@ -77,42 +63,3 @@ __kuser_helper_version: // 0xffff0ffc .word ((__kuser_helper_end - __kuser_helper_start) >> 5) .globl __kuser_helper_end __kuser_helper_end: - -/* - * AArch32 sigreturn code - * - * For ARM syscalls, the syscall number has to be loaded into r7. - * We do not support an OABI userspace. - * - * For Thumb syscalls, we also pass the syscall number via r7. We therefore - * need two 16-bit instructions. 
- */ - .globl __aarch32_sigret_code_start -__aarch32_sigret_code_start: - - /* - * ARM Code - */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn - - /* - * ARM code - */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn - - .globl __aarch32_sigret_code_end -__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 11121f608eb5..ce3d40120f72 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec for arm64 * * Copyright (C) Linaro. * Copyright (C) Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/interrupt.h> @@ -14,20 +11,18 @@ #include <linux/kernel.h> #include <linux/kexec.h> #include <linux/page-flags.h> +#include <linux/set_memory.h> #include <linux/smp.h> #include <asm/cacheflush.h> #include <asm/cpu_ops.h> +#include <asm/daifflags.h> #include <asm/memory.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> - -#include "cpu-reset.h" - -/* Global variables for the arm64_relocate_new_kernel routine. */ -extern const unsigned char arm64_relocate_new_kernel[]; -extern const unsigned long arm64_relocate_new_kernel_size; +#include <asm/sections.h> +#include <asm/trans_pgd.h> /** * kexec_image_info - For debugging output. @@ -44,6 +39,9 @@ static void _kexec_image_info(const char *func, int line, pr_debug(" start: %lx\n", kimage->start); pr_debug(" head: %lx\n", kimage->head); pr_debug(" nr_segments: %lu\n", kimage->nr_segments); + pr_debug(" dtb_mem: %pa\n", &kimage->arch.dtb_mem); + pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); + pr_debug(" el2_vectors: %pa\n", &kimage->arch.el2_vectors); for (i = 0; i < kimage->nr_segments; i++) { pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", @@ -69,8 +67,6 @@ void machine_kexec_cleanup(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kexec_image_info(kimage); - if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { pr_err("Can't kexec: CPUs are stuck in the kernel.\n"); return -EBUSY; @@ -80,43 +76,6 @@ int machine_kexec_prepare(struct kimage *kimage) } /** - * kexec_list_flush - Helper to flush the kimage list and source pages to PoC. - */ -static void kexec_list_flush(struct kimage *kimage) -{ - kimage_entry_t *entry; - - for (entry = &kimage->head; ; entry++) { - unsigned int flag; - void *addr; - - /* flush the list entries. */ - __flush_dcache_area(entry, sizeof(kimage_entry_t)); - - flag = *entry & IND_FLAGS; - if (flag == IND_DONE) - break; - - addr = phys_to_virt(*entry & PAGE_MASK); - - switch (flag) { - case IND_INDIRECTION: - /* Set entry point just before the new list page. 
*/ - entry = (kimage_entry_t *)addr - 1; - break; - case IND_SOURCE: - /* flush the source pages. */ - __flush_dcache_area(addr, PAGE_SIZE); - break; - case IND_DESTINATION: - break; - default: - BUG(); - } - } -} - -/** * kexec_segment_flush - Helper to flush the kimage segments to PoC. */ static void kexec_segment_flush(const struct kimage *kimage) @@ -133,11 +92,84 @@ static void kexec_segment_flush(const struct kimage *kimage) kimage->segment[i].memsz, kimage->segment[i].memsz / PAGE_SIZE); - __flush_dcache_area(phys_to_virt(kimage->segment[i].mem), - kimage->segment[i].memsz); + dcache_clean_inval_poc( + (unsigned long)phys_to_virt(kimage->segment[i].mem), + (unsigned long)phys_to_virt(kimage->segment[i].mem) + + kimage->segment[i].memsz); } } +/* Allocates pages for kexec page table */ +static void *kexec_page_alloc(void *arg) +{ + struct kimage *kimage = (struct kimage *)arg; + struct page *page = kimage_alloc_control_pages(kimage, 0); + void *vaddr = NULL; + + if (!page) + return NULL; + + vaddr = page_address(page); + memset(vaddr, 0, PAGE_SIZE); + + return vaddr; +} + +int machine_kexec_post_load(struct kimage *kimage) +{ + int rc; + pgd_t *trans_pgd; + void *reloc_code = page_to_virt(kimage->control_code_page); + long reloc_size; + struct trans_pgd_info info = { + .trans_alloc_page = kexec_page_alloc, + .trans_alloc_arg = kimage, + }; + + /* If in place, relocation is not used, only flush next kernel */ + if (kimage->head & IND_DONE) { + kexec_segment_flush(kimage); + kexec_image_info(kimage); + return 0; + } + + kimage->arch.el2_vectors = 0; + if (is_hyp_nvhe()) { + rc = trans_pgd_copy_el2_vectors(&info, + &kimage->arch.el2_vectors); + if (rc) + return rc; + } + + /* Create a copy of the linear map */ + trans_pgd = kexec_page_alloc(kimage); + if (!trans_pgd) + return -ENOMEM; + rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); + if (rc) + return rc; + kimage->arch.ttbr1 = __pa(trans_pgd); + kimage->arch.zero_page = __pa_symbol(empty_zero_page); + + reloc_size = __relocate_new_kernel_end - __relocate_new_kernel_start; + memcpy(reloc_code, __relocate_new_kernel_start, reloc_size); + kimage->arch.kern_reloc = __pa(reloc_code); + rc = trans_pgd_idmap_page(&info, &kimage->arch.ttbr0, + &kimage->arch.t0sz, reloc_code); + if (rc) + return rc; + kimage->arch.phys_offset = virt_to_phys(kimage) - (long)kimage; + + /* Flush the reloc_code in preparation for its execution. */ + dcache_clean_inval_poc((unsigned long)reloc_code, + (unsigned long)reloc_code + reloc_size); + icache_inval_pou((uintptr_t)reloc_code, + (uintptr_t)reloc_code + reloc_size); + kexec_image_info(kimage); + + return 0; +} + /** * machine_kexec - Do the kexec reboot. 
* @@ -145,8 +177,6 @@ static void kexec_segment_flush(const struct kimage *kimage) */ void machine_kexec(struct kimage *kimage) { - phys_addr_t reboot_code_buffer_phys; - void *reboot_code_buffer; bool in_kexec_crash = (kimage == kexec_crash_image); bool stuck_cpus = cpus_are_stuck_in_kernel(); @@ -157,58 +187,35 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); - reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); - - kexec_image_info(kimage); - - pr_debug("%s:%d: control_code_page: %p\n", __func__, __LINE__, - kimage->control_code_page); - pr_debug("%s:%d: reboot_code_buffer_phys: %pa\n", __func__, __LINE__, - &reboot_code_buffer_phys); - pr_debug("%s:%d: reboot_code_buffer: %p\n", __func__, __LINE__, - reboot_code_buffer); - pr_debug("%s:%d: relocate_new_kernel: %p\n", __func__, __LINE__, - arm64_relocate_new_kernel); - pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n", - __func__, __LINE__, arm64_relocate_new_kernel_size, - arm64_relocate_new_kernel_size); - - /* - * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use - * after the kernel is shut down. - */ - memcpy(reboot_code_buffer, arm64_relocate_new_kernel, - arm64_relocate_new_kernel_size); - - /* Flush the reboot_code_buffer in preparation for its execution. */ - __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); - flush_icache_range((uintptr_t)reboot_code_buffer, - arm64_relocate_new_kernel_size); - - /* Flush the kimage list and its buffers. */ - kexec_list_flush(kimage); - - /* Flush the new image if already in place. */ - if ((kimage != kexec_crash_image) && (kimage->head & IND_DONE)) - kexec_segment_flush(kimage); - pr_info("Bye!\n"); - /* Disable all DAIF exceptions. */ - asm volatile ("msr daifset, #0xf" : : : "memory"); + local_daif_mask(); /* - * cpu_soft_restart will shutdown the MMU, disable data caches, then - * transfer control to the reboot_code_buffer which contains a copy of - * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel - * uses physical addressing to relocate the new image to its final - * position and transfers control to the image entry point when the - * relocation is complete. + * Both restart and kernel_reloc will shutdown the MMU, disable data + * caches. However, restart will start new kernel or purgatory directly, + * kernel_reloc contains the body of arm64_relocate_new_kernel + * In kexec case, kimage->start points to purgatory assuming that + * kernel entry and dtb address are embedded in purgatory by + * userspace (kexec-tools). + * In kexec_file case, the kernel starts directly without purgatory. */ - - cpu_soft_restart(kimage != kexec_crash_image, - reboot_code_buffer_phys, kimage->head, kimage->start, 0); + if (kimage->head & IND_DONE) { + typeof(cpu_soft_restart) *restart; + + cpu_install_idmap(); + restart = (void *)__pa_symbol(cpu_soft_restart); + restart(is_hyp_nvhe(), kimage->start, kimage->arch.dtb_mem, + 0, 0); + } else { + void (*kernel_reloc)(struct kimage *kimage); + + if (is_hyp_nvhe()) + __hyp_set_vectors(kimage->arch.el2_vectors); + cpu_install_ttbr0(kimage->arch.ttbr0, kimage->arch.t0sz); + kernel_reloc = (void *)kimage->arch.kern_reloc; + kernel_reloc(kimage); + } BUG(); /* Should never get here. 
*/ } @@ -265,8 +272,6 @@ void arch_kexec_protect_crashkres(void) { int i; - kexec_segment_flush(kexec_crash_image); - for (i = 0; i < kexec_crash_image->nr_segments; i++) set_memory_valid( __phys_to_virt(kexec_crash_image->segment[i].mem), @@ -307,7 +312,7 @@ void crash_post_resume(void) * but does not hold any data of loaded kernel image. * * Note that all the pages in crash dump kernel memory have been initially - * marked as Reserved in kexec_reserve_crashkres_pages(). + * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded * from the hibernation iamge. crash_is_nosave() does thich check for crash @@ -324,8 +329,13 @@ bool crash_is_nosave(unsigned long pfn) /* in reserved memory? */ addr = __pfn_to_phys(pfn); - if ((addr < crashk_res.start) || (crashk_res.end < addr)) - return false; + if ((addr < crashk_res.start) || (crashk_res.end < addr)) { + if (!crashk_low_res.end) + return false; + + if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr)) + return false; + } if (!kexec_crash_image) return true; @@ -347,18 +357,7 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { page = phys_to_page(addr); - ClearPageReserved(page); free_reserved_page(page); } } #endif /* CONFIG_HIBERNATION */ - -void arch_crash_save_vmcoreinfo(void) -{ - VMCOREINFO_NUMBER(VA_BITS); - /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ - vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", - kimage_voffset); - vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", - PHYS_OFFSET); -} diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c new file mode 100644 index 000000000000..a11a6e14ba89 --- /dev/null +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kexec_file for arm64 + * + * Copyright (C) 2018 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * Most code is derived from arm64 port of kexec-tools + */ + +#define pr_fmt(fmt) "kexec_file: " fmt + +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/libfdt.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> + +const struct kexec_file_ops * const kexec_file_loaders[] = { + &kexec_image_ops, + NULL +}; + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + kvfree(image->arch.dtb); + image->arch.dtb = NULL; + + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} + +static int prepare_elf_headers(void **addr, unsigned long *sz) +{ + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + u64 i; + phys_addr_t start, end; + + nr_ranges = 2; /* for exclusion of crashkernel region */ + for_each_mem_range(i, &start, &end) + nr_ranges++; + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + for_each_mem_range(i, &start, &end) { + cmem->ranges[cmem->nr_ranges].start = start; + cmem->ranges[cmem->nr_ranges].end = end - 1; + cmem->nr_ranges++; + } + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + 
goto out; + + if (crashk_low_res.end) { + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + goto out; + } + + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); + +out: + kfree(cmem); + return ret; +} + +/* + * Tries to add the initrd and DTB to the image. If it is not possible to find + * valid locations, this function will undo changes to the image and return non + * zero. + */ +int load_other_segments(struct kimage *image, + unsigned long kernel_load_addr, + unsigned long kernel_size, + char *initrd, unsigned long initrd_len, + char *cmdline) +{ + struct kexec_buf kbuf; + void *headers, *dtb = NULL; + unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + orig_segments = image->nr_segments; + int ret = 0; + + kbuf.image = image; + /* not allocate anything below the kernel */ + kbuf.buf_min = kernel_load_addr + kernel_size; + + /* load elf core header */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out_err; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = SZ_64K; /* largest supported page size */ + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out_err; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + } + + /* load initrd */ + if (initrd) { + kbuf.buffer = initrd; + kbuf.bufsz = initrd_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = initrd_len; + kbuf.buf_align = 0; + /* within 1GB-aligned window of up to 32GB in size */ + kbuf.buf_max = round_down(kernel_load_addr, SZ_1G) + + (unsigned long)SZ_1G * 32; + kbuf.top_down = false; + + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_err; + initrd_load_addr = kbuf.mem; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, kbuf.bufsz, kbuf.memsz); + } + + /* load dtb */ + dtb = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr, + initrd_len, cmdline, 0); + if (!dtb) { + pr_err("Preparing for new dtb failed\n"); + ret = -EINVAL; + goto out_err; + } + + /* trim it */ + fdt_pack(dtb); + dtb_len = fdt_totalsize(dtb); + kbuf.buffer = dtb; + kbuf.bufsz = dtb_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = dtb_len; + /* not across 2MB boundary */ + kbuf.buf_align = SZ_2M; + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_err; + image->arch.dtb = dtb; + image->arch.dtb_mem = kbuf.mem; + + pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); + + return 0; + +out_err: + image->nr_segments = orig_segments; + kvfree(dtb); + return ret; +} diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index d05dbe658409..5a0a8f552a61 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -1,81 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014-2017 Linaro Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
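The image and segment loading above is what backs the kexec_file_load() system call on arm64. A minimal user-space sketch of staging a kernel through that path (file paths and command line are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	const char *cmdline = "console=ttyAMA0 root=/dev/vda1";
	int kernel_fd = open("/boot/Image", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline_len must include the trailing NUL. */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL) < 0) {
		perror("kexec_file_load");
		return 1;
	}

	/* The staged kernel is only started later, e.g. via kexec -e or
	 * reboot(LINUX_REBOOT_CMD_KEXEC). */
	return 0;
}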
*/ #include <linux/elf.h> +#include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sort.h> -struct plt_entry { +static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc, + enum aarch64_insn_register reg) +{ + u32 adrp, add; + + adrp = aarch64_insn_gen_adr(pc, dst, reg, AARCH64_INSN_ADR_TYPE_ADRP); + add = aarch64_insn_gen_add_sub_imm(reg, reg, dst % SZ_4K, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_ADSB_ADD); + + return (struct plt_entry){ cpu_to_le32(adrp), cpu_to_le32(add) }; +} + +struct plt_entry get_plt_entry(u64 dst, void *pc) +{ + struct plt_entry plt; + static u32 br; + + if (!br) + br = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16, + AARCH64_INSN_BRANCH_NOLINK); + + plt = __get_adrp_add_pair(dst, (u64)pc, AARCH64_INSN_REG_16); + plt.br = cpu_to_le32(br); + + return plt; +} + +static bool plt_entries_equal(const struct plt_entry *a, + const struct plt_entry *b) +{ + u64 p, q; + + /* + * Check whether both entries refer to the same target: + * do the cheapest checks first. + * If the 'add' or 'br' opcodes are different, then the target + * cannot be the same. + */ + if (a->add != b->add || a->br != b->br) + return false; + + p = ALIGN_DOWN((u64)a, SZ_4K); + q = ALIGN_DOWN((u64)b, SZ_4K); + /* - * A program that conforms to the AArch64 Procedure Call Standard - * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or - * IP1 (x17) may be inserted at any branch instruction that is - * exposed to a relocation that supports long branches. Since that - * is exactly what we are dealing with here, we are free to use x16 - * as a scratch register in the PLT veneers. + * If the 'adrp' opcodes are the same then we just need to check + * that they refer to the same 4k region. */ - __le32 mov0; /* movn x16, #0x.... */ - __le32 mov1; /* movk x16, #0x...., lsl #16 */ - __le32 mov2; /* movk x16, #0x...., lsl #32 */ - __le32 br; /* br x16 */ -}; + if (a->adrp == b->adrp && p == q) + return true; + + return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) == + (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp))); +} static bool in_init(const struct module *mod, void *loc) { return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size; } -u64 module_emit_plt_entry(struct module *mod, void *loc, const Elf64_Rela *rela, +u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, + void *loc, const Elf64_Rela *rela, Elf64_Sym *sym) { struct mod_plt_sec *pltsec = !in_init(mod, loc) ? 
&mod->arch.core : &mod->arch.init; - struct plt_entry *plt = (struct plt_entry *)pltsec->plt->sh_addr; + struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; int i = pltsec->plt_num_entries; + int j = i - 1; u64 val = sym->st_value + rela->r_addend; - /* - * MOVK/MOVN/MOVZ opcode: - * +--------+------------+--------+-----------+-------------+---------+ - * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | - * +--------+------------+--------+-----------+-------------+---------+ - * - * Rd := 0x10 (x16) - * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) - * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) - * sf := 1 (64-bit variant) - */ - plt[i] = (struct plt_entry){ - cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5), - cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), - cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), - cpu_to_le32(0xd61f0200) - }; + if (is_forbidden_offset_for_adrp(&plt[i].adrp)) + i++; + + plt[i] = get_plt_entry(val, &plt[i]); /* * Check if the entry we just created is a duplicate. Given that the * relocations are sorted, this will be the last entry we allocated. * (if one exists). */ - if (i > 0 && - plt[i].mov0 == plt[i - 1].mov0 && - plt[i].mov1 == plt[i - 1].mov1 && - plt[i].mov2 == plt[i - 1].mov2) - return (u64)&plt[i - 1]; + if (j >= 0 && plt_entries_equal(plt + i, plt + j)) + return (u64)&plt[j]; - pltsec->plt_num_entries++; - BUG_ON(pltsec->plt_num_entries > pltsec->plt_max_entries); + pltsec->plt_num_entries += i - j; + if (WARN_ON(pltsec->plt_num_entries > pltsec->plt_max_entries)) + return 0; return (u64)&plt[i]; } -#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) +#ifdef CONFIG_ARM64_ERRATUM_843419 +u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, + void *loc, u64 val) +{ + struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : + &mod->arch.init; + struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; + int i = pltsec->plt_num_entries++; + u32 br; + int rd; + + if (WARN_ON(pltsec->plt_num_entries > pltsec->plt_max_entries)) + return 0; + + if (is_forbidden_offset_for_adrp(&plt[i].adrp)) + i = pltsec->plt_num_entries++; + + /* get the destination register of the ADRP instruction */ + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, + le32_to_cpup((__le32 *)loc)); + + br = aarch64_insn_gen_branch_imm((u64)&plt[i].br, (u64)loc + 4, + AARCH64_INSN_BRANCH_NOLINK); + + plt[i] = __get_adrp_add_pair(val, (u64)&plt[i], rd); + plt[i].br = cpu_to_le32(br); + + return (u64)&plt[i]; +} +#endif + +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) static int cmp_rela(const void *a, const void *b) { @@ -102,16 +159,21 @@ static bool duplicate_rel(const Elf64_Rela *rela, int num) } static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, - Elf64_Word dstidx) + Elf64_Word dstidx, Elf_Shdr *dstsec) { unsigned int ret = 0; Elf64_Sym *s; int i; for (i = 0; i < num; i++) { + u64 min_align; + switch (ELF64_R_TYPE(rela[i].r_info)) { case R_AARCH64_JUMP26: case R_AARCH64_CALL26: + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + break; + /* * We only have to consider branch targets that resolve * to symbols that are defined in a different section. 
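The PLT entries emitted above (get_plt_entry() and the ADRP veneers) reach an arbitrary 64-bit target with an adrp/add pair into x16 followed by br x16. A small stand-alone sketch of the address split those two instructions encode (arithmetic only, no instruction encodings):

#include <stdint.h>
#include <stdio.h>

/* Veneer shape:  adrp x16, <target page>
 *                add  x16, x16, #<target & 0xfff>
 *                br   x16
 * adrp spans +/- 4 GB in 4 KB pages; the add restores the low 12 bits. */
static void adrp_add_split(uint64_t pc, uint64_t dst)
{
	int64_t page_delta = (int64_t)(dst & ~0xfffULL) - (int64_t)(pc & ~0xfffULL);
	uint64_t low12 = dst & 0xfff;

	printf("adrp immediate: %+lld pages, add immediate: #%llu\n",
	       (long long)(page_delta / 4096), (unsigned long long)low12);
}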
@@ -143,17 +205,96 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, if (rela[i].r_addend != 0 || !duplicate_rel(rela, i)) ret++; break; + case R_AARCH64_ADR_PREL_PG_HI21_NC: + case R_AARCH64_ADR_PREL_PG_HI21: + if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) || + !cpus_have_const_cap(ARM64_WORKAROUND_843419)) + break; + + /* + * Determine the minimal safe alignment for this ADRP + * instruction: the section alignment at which it is + * guaranteed not to appear at a vulnerable offset. + * + * This comes down to finding the least significant zero + * bit in bits [11:3] of the section offset, and + * increasing the section's alignment so that the + * resulting address of this instruction is guaranteed + * to equal the offset in that particular bit (as well + * as all less significant bits). This ensures that the + * address modulo 4 KB != 0xfff8 or 0xfffc (which would + * have all ones in bits [11:3]) + */ + min_align = 2ULL << ffz(rela[i].r_offset | 0x7); + + /* + * Allocate veneer space for each ADRP that may appear + * at a vulnerable offset nonetheless. At relocation + * time, some of these will remain unused since some + * ADRP instructions can be patched to ADR instructions + * instead. + */ + if (min_align > SZ_4K) + ret++; + else + dstsec->sh_addralign = max(dstsec->sh_addralign, + min_align); + break; } } + + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) && + cpus_have_const_cap(ARM64_WORKAROUND_843419)) + /* + * Add some slack so we can skip PLT slots that may trigger + * the erratum due to the placement of the ADRP instruction. + */ + ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry))); + return ret; } +static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela, + Elf64_Word dstidx) +{ + + Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info); + + if (s->st_shndx == dstidx) + return false; + + return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 || + ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26; +} + +/* Group branch PLT relas at the front end of the array. 
*/ +static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, + int numrels, Elf64_Word dstidx) +{ + int i = 0, j = numrels - 1; + + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return 0; + + while (i < j) { + if (branch_rela_needs_plt(syms, &rela[i], dstidx)) + i++; + else if (branch_rela_needs_plt(syms, &rela[j], dstidx)) + swap(rela[i], rela[j]); + else + j--; + } + + return i; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { unsigned long core_plts = 0; unsigned long init_plts = 0; Elf64_Sym *syms = NULL; + Elf_Shdr *pltsec, *tramp = NULL; int i; /* @@ -162,14 +303,17 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, */ for (i = 0; i < ehdr->e_shnum; i++) { if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt")) - mod->arch.core.plt = sechdrs + i; + mod->arch.core.plt_shndx = i; else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt")) - mod->arch.init.plt = sechdrs + i; + mod->arch.init.plt_shndx = i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, + ".text.ftrace_trampoline")) + tramp = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) syms = (Elf64_Sym *)sechdrs[i].sh_addr; } - if (!mod->arch.core.plt || !mod->arch.init.plt) { + if (!mod->arch.core.plt_shndx || !mod->arch.init.plt_shndx) { pr_err("%s: module PLT section(s) missing\n", mod->name); return -ENOEXEC; } @@ -180,7 +324,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, for (i = 0; i < ehdr->e_shnum; i++) { Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; - int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; if (sechdrs[i].sh_type != SHT_RELA) @@ -190,30 +334,45 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dstsec->sh_flags & SHF_EXECINSTR)) continue; - /* sort by type, symbol index and addend */ - sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + /* + * sort branch relocations requiring a PLT by type, symbol index + * and addend + */ + nents = partition_branch_plt_relas(syms, rels, numrels, + sechdrs[i].sh_info); + if (nents) + sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL); - if (strncmp(secstrings + dstsec->sh_name, ".init", 5) != 0) + if (!str_has_prefix(secstrings + dstsec->sh_name, ".init")) core_plts += count_plts(syms, rels, numrels, - sechdrs[i].sh_info); + sechdrs[i].sh_info, dstsec); else init_plts += count_plts(syms, rels, numrels, - sechdrs[i].sh_info); + sechdrs[i].sh_info, dstsec); } - mod->arch.core.plt->sh_type = SHT_NOBITS; - mod->arch.core.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.core.plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.core.plt->sh_size = (core_plts + 1) * sizeof(struct plt_entry); + pltsec = sechdrs + mod->arch.core.plt_shndx; + pltsec->sh_type = SHT_NOBITS; + pltsec->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + pltsec->sh_addralign = L1_CACHE_BYTES; + pltsec->sh_size = (core_plts + 1) * sizeof(struct plt_entry); mod->arch.core.plt_num_entries = 0; mod->arch.core.plt_max_entries = core_plts; - mod->arch.init.plt->sh_type = SHT_NOBITS; - mod->arch.init.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.init.plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.init.plt->sh_size = (init_plts + 1) * sizeof(struct plt_entry); + pltsec = sechdrs + mod->arch.init.plt_shndx; + pltsec->sh_type = SHT_NOBITS; + pltsec->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + pltsec->sh_addralign = L1_CACHE_BYTES; + 
pltsec->sh_size = (init_plts + 1) * sizeof(struct plt_entry); mod->arch.init.plt_num_entries = 0; mod->arch.init.plt_max_entries = init_plts; + if (tramp) { + tramp->sh_type = SHT_NOBITS; + tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + tramp->sh_addralign = __alignof__(struct plt_entry); + tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); + } + return 0; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f469e0435903..76b41e4ca9fa 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -1,25 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64 loadable module support. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> */ #include <linux/bitops.h> #include <linux/elf.h> +#include <linux/ftrace.h> #include <linux/gfp.h> #include <linux/kasan.h> #include <linux/kernel.h> @@ -32,6 +22,7 @@ void *module_alloc(unsigned long size) { + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; gfp_t gfp_mask = GFP_KERNEL; void *p; @@ -39,32 +30,41 @@ void *module_alloc(unsigned long size) if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) gfp_mask |= __GFP_NOWARN; + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + /* don't exceed the static module region - see below */ + module_alloc_end = MODULES_END; + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + MODULES_VSIZE, - gfp_mask, PAGE_KERNEL_EXEC, 0, + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) /* - * KASAN can only deal with module allocations being served - * from the reserved module region, since the remainder of - * the vmalloc region is already backed by zero shadow pages, - * and punching holes into it is non-trivial. Since the module - * region is not randomized when KASAN is enabled, it is even + * KASAN without KASAN_VMALLOC can only deal with module + * allocations being served from the reserved module region, + * since the remainder of the vmalloc region is already + * backed by zero shadow pages, and punching holes into it + * is non-trivial. Since the module region is not randomized + * when KASAN is enabled without KASAN_VMALLOC, it is even * less likely that the module region gets exhausted, so we * can simply omit this fallback in that case. 
*/ - p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, - VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, - NUMA_NO_NODE, __builtin_return_address(0)); + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + SZ_2G, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); } enum aarch64_reloc_op { @@ -95,16 +95,50 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) { s64 sval = do_reloc(op, place, val); + /* + * The ELF psABI for AArch64 documents the 16-bit and 32-bit place + * relative and absolute relocations as having a range of [-2^15, 2^16) + * or [-2^31, 2^32), respectively. However, in order to be able to + * detect overflows reliably, we have to choose whether we interpret + * such quantities as signed or as unsigned, and stick with it. + * The way we organize our address space requires a signed + * interpretation of 32-bit relative references, so let's use that + * for all R_AARCH64_PRELxx relocations. This means our upper + * bound for overflow detection should be Sxx_MAX rather than Uxx_MAX. + */ + switch (len) { case 16: *(s16 *)place = sval; - if (sval < S16_MIN || sval > U16_MAX) - return -ERANGE; + switch (op) { + case RELOC_OP_ABS: + if (sval < 0 || sval > U16_MAX) + return -ERANGE; + break; + case RELOC_OP_PREL: + if (sval < S16_MIN || sval > S16_MAX) + return -ERANGE; + break; + default: + pr_err("Invalid 16-bit data relocation (%d)\n", op); + return 0; + } break; case 32: *(s32 *)place = sval; - if (sval < S32_MIN || sval > U32_MAX) - return -ERANGE; + switch (op) { + case RELOC_OP_ABS: + if (sval < 0 || sval > U32_MAX) + return -ERANGE; + break; + case RELOC_OP_PREL: + if (sval < S32_MIN || sval > S32_MAX) + return -ERANGE; + break; + default: + pr_err("Invalid 32-bit data relocation (%d)\n", op); + return 0; + } break; case 64: *(s64 *)place = sval; @@ -197,6 +231,33 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, return 0; } +static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, + __le32 *place, u64 val) +{ + u32 insn; + + if (!is_forbidden_offset_for_adrp(place)) + return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, + AARCH64_INSN_IMM_ADR); + + /* patch ADRP to ADR if it is in range */ + if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, + AARCH64_INSN_IMM_ADR)) { + insn = le32_to_cpu(*place); + insn &= ~BIT(31); + } else { + /* out of range for ADR -> emit a veneer */ + val = module_emit_veneer_for_adrp(mod, sechdrs, place, val & ~0xfff); + if (!val) + return -ENOEXEC; + insn = aarch64_insn_gen_branch_imm((u64)place, val, + AARCH64_INSN_BRANCH_NOLINK); + } + + *place = cpu_to_le32(insn); + return 0; +} + int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -259,18 +320,21 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, /* MOVW instruction relocations. 
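Following the range discussion above: with the signed interpretation chosen for place-relative references, a 32-bit data relocation accepts [0, U32_MAX] when absolute but only [S32_MIN, S32_MAX] when PC-relative. A stand-alone illustration of that check:

#include <stdint.h>

/* Mirrors the 32-bit branch of reloc_data(): absolute values are treated
 * as unsigned, place-relative values as signed. */
int reloc32_in_range(int64_t sval, int is_abs)
{
	if (is_abs)
		return sval >= 0 && sval <= UINT32_MAX;
	return sval >= INT32_MIN && sval <= INT32_MAX;
}

/* Example: sval = 0x90000000 (2.25 GB) passes as R_AARCH64_ABS32 but is
 * rejected with -ERANGE as R_AARCH64_PREL32. */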
*/ case R_AARCH64_MOVW_UABS_G0_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, AARCH64_INSN_IMM_MOVKZ); @@ -336,14 +400,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, AARCH64_INSN_IMM_ADR); break; -#ifndef CONFIG_ARM64_ERRATUM_843419 case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; + fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21, - AARCH64_INSN_IMM_ADR); + ovf = reloc_insn_adrp(me, sechdrs, loc, val); + if (ovf && ovf != -ERANGE) + return ovf; break; -#endif case R_AARCH64_ADD_ABS_LO12_NC: case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; @@ -385,7 +449,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { - val = module_emit_plt_entry(me, loc, &rel[i], sym); + val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); + if (!val) + return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); } @@ -410,23 +476,43 @@ overflow: return -ENOEXEC; } +static inline void __init_plt(struct plt_entry *plt, unsigned long addr) +{ + *plt = get_plt_entry(addr, plt); +} + +static int module_init_ftrace_plt(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ +#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE) + const Elf_Shdr *s; + struct plt_entry *plts; + + s = find_section(hdr, sechdrs, ".text.ftrace_trampoline"); + if (!s) + return -ENOEXEC; + + plts = (void *)s->sh_addr; + + __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); + + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) + __init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR); + + mod->arch.ftrace_trampolines = plts; +#endif + return 0; +} + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *se; - const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + const Elf_Shdr *s; + s = find_section(hdr, sechdrs, ".altinstructions"); + if (s) + apply_alternatives_module((void *)s->sh_addr, s->sh_size); - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { - if (strcmp(".altinstructions", secstrs + s->sh_name) == 0) { - apply_alternatives((void *)s->sh_addr, s->sh_size); - } -#ifdef CONFIG_ARM64_MODULE_PLTS - if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && - !strcmp(".text.ftrace_trampoline", secstrs + s->sh_name)) - me->arch.ftrace_trampoline = (void *)s->sh_addr; -#endif - } - - return 0; + return module_init_ftrace_plt(hdr, sechdrs, me); } diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds deleted file mode 100644 index f7c9781a9d48..000000000000 --- a/arch/arm64/kernel/module.lds +++ /dev/null @@ -1,4 +0,0 @@ -SECTIONS { - .plt (NOLOAD) : { BYTE(0) } - .init.plt (NOLOAD) : { BYTE(0) } -} diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c new file mode 100644 index 000000000000..7467217c1eaf --- /dev/null +++ b/arch/arm64/kernel/mte.c @@ -0,0 +1,632 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 ARM Ltd. 
+ */ + +#include <linux/bitops.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/prctl.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/string.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/thread_info.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/uio.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/mte.h> +#include <asm/ptrace.h> +#include <asm/sysreg.h> + +static DEFINE_PER_CPU_READ_MOSTLY(u64, mte_tcf_preferred); + +#ifdef CONFIG_KASAN_HW_TAGS +/* + * The asynchronous and asymmetric MTE modes have the same behavior for + * store operations. This flag is set when either of these modes is enabled. + */ +DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); +EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); +#endif + +static void mte_sync_page_tags(struct page *page, pte_t old_pte, + bool check_swap, bool pte_is_tagged) +{ + if (check_swap && is_swap_pte(old_pte)) { + swp_entry_t entry = pte_to_swp_entry(old_pte); + + if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) + return; + } + + if (!pte_is_tagged) + return; + + /* + * Test PG_mte_tagged again in case it was racing with another + * set_pte_at(). + */ + if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + mte_clear_page_tags(page_address(page)); +} + +void mte_sync_tags(pte_t old_pte, pte_t pte) +{ + struct page *page = pte_page(pte); + long i, nr_pages = compound_nr(page); + bool check_swap = nr_pages == 1; + bool pte_is_tagged = pte_tagged(pte); + + /* Early out if there's nothing to do */ + if (!check_swap && !pte_is_tagged) + return; + + /* if PG_mte_tagged is set, tags have already been initialised */ + for (i = 0; i < nr_pages; i++, page++) { + if (!test_bit(PG_mte_tagged, &page->flags)) + mte_sync_page_tags(page, old_pte, check_swap, + pte_is_tagged); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); +} + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = page_address(page1); + addr2 = page_address(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + + if (!system_supports_mte() || ret) + return ret; + + /* + * If the page content is identical but at least one of the pages is + * tagged, return non-zero to avoid KSM merging. If only one of the + * pages is tagged, set_pte_at() may zero or change the tags of the + * other page via mte_sync_tags(). + */ + if (test_bit(PG_mte_tagged, &page1->flags) || + test_bit(PG_mte_tagged, &page2->flags)) + return addr1 != addr2; + + return ret; +} + +static inline void __mte_enable_kernel(const char *mode, unsigned long tcf) +{ + /* Enable MTE Sync Mode for EL1. */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCF, tcf)); + isb(); + + pr_info_once("MTE: enabled in %s mode at EL1\n", mode); +} + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_enable_kernel_sync(void) +{ + /* + * Make sure we enter this function when no PE has set + * async mode previously. + */ + WARN_ONCE(system_uses_mte_async_or_asymm_mode(), + "MTE async mode enabled system wide!"); + + __mte_enable_kernel("synchronous", SCTLR_EL1_TCF_SYNC); +} + +void mte_enable_kernel_async(void) +{ + __mte_enable_kernel("asynchronous", SCTLR_EL1_TCF_ASYNC); + + /* + * MTE async mode is set system wide by the first PE that + * executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. 
+ */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); +} + +void mte_enable_kernel_asymm(void) +{ + if (cpus_have_cap(ARM64_MTE_ASYMM)) { + __mte_enable_kernel("asymmetric", SCTLR_EL1_TCF_ASYMM); + + /* + * MTE asymm mode behaves as async mode for store + * operations. The mode is set system wide by the + * first PE that executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); + } else { + /* + * If the CPU does not support MTE asymmetric mode the + * kernel falls back on synchronous mode which is the + * default for kasan=on. + */ + mte_enable_kernel_sync(); + } +} +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_check_tfsr_el1(void) +{ + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + + if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { + /* + * Note: isb() is not required after this direct write + * because there is no indirect read subsequent to it + * (per ARM DDI 0487F.c table D13-1). + */ + write_sysreg_s(0, SYS_TFSR_EL1); + + kasan_report_async(); + } +} +#endif + +/* + * This is where we actually resolve the system and process MTE mode + * configuration into an actual value in SCTLR_EL1 that affects + * userspace. + */ +static void mte_update_sctlr_user(struct task_struct *task) +{ + /* + * This must be called with preemption disabled and can only be called + * on the current or next task since the CPU must match where the thread + * is going to run. The caller is responsible for calling + * update_sctlr_el1() later in the same preemption disabled block. + */ + unsigned long sctlr = task->thread.sctlr_user; + unsigned long mte_ctrl = task->thread.mte_ctrl; + unsigned long pref, resolved_mte_tcf; + + pref = __this_cpu_read(mte_tcf_preferred); + /* + * If there is no overlap between the system preferred and + * program requested values go with what was requested. + */ + resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; + sctlr &= ~SCTLR_EL1_TCF0_MASK; + /* + * Pick an actual setting. The order in which we check for + * set bits and map into register values determines our + * default order. + */ + if (resolved_mte_tcf & MTE_CTRL_TCF_ASYMM) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYMM); + else if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); + else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); + task->thread.sctlr_user = sctlr; +} + +static void mte_update_gcr_excl(struct task_struct *task) +{ + /* + * SYS_GCR_EL1 will be set to current->thread.mte_ctrl value by + * mte_set_user_gcr() in kernel_exit, but only if KASAN is enabled. 
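+ * Otherwise program GCR_EL1 directly here, with the exclude mask taken
+ * from the task's mte_ctrl and the RRND bit set.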
+ */ + if (kasan_hw_tags_enabled()) + return; + + write_sysreg_s( + ((task->thread.mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) | SYS_GCR_EL1_RRND, + SYS_GCR_EL1); +} + +#ifdef CONFIG_KASAN_HW_TAGS +/* Only called from assembly, silence sparse */ +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst); + +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (kasan_hw_tags_enabled()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} +#endif + +void mte_thread_init_user(void) +{ + if (!system_supports_mte()) + return; + + /* clear any pending asynchronous tag fault */ + dsb(ish); + write_sysreg_s(0, SYS_TFSRE0_EL1); + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + /* disable tag checking and reset tag generation mask */ + set_mte_ctrl(current, 0); +} + +void mte_thread_switch(struct task_struct *next) +{ + if (!system_supports_mte()) + return; + + mte_update_sctlr_user(next); + mte_update_gcr_excl(next); + + /* TCO may not have been disabled on exception entry for the current task. */ + mte_disable_tco_entry(next); + + /* + * Check if an async tag exception occurred at EL1. + * + * Note: On the context switch path we rely on the dsb() present + * in __switch_to() to guarantee that the indirect writes to TFSR_EL1 + * are synchronized before this point. + */ + isb(); + mte_check_tfsr_el1(); +} + +void mte_cpu_setup(void) +{ + u64 rgsr; + + /* + * CnP must be enabled only after the MAIR_EL1 register has been set + * up. Inconsistent MAIR_EL1 between CPUs sharing the same TLB may + * lead to the wrong memory type being used for a brief window during + * CPU power-up. + * + * CnP is not a boot feature so MTE gets enabled before CnP, but let's + * make sure that is the case. + */ + BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + + /* Normal Tagged memory type at the corresponding MAIR index */ + sysreg_clear_set(mair_el1, + MAIR_ATTRIDX(MAIR_ATTR_MASK, MT_NORMAL_TAGGED), + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_TAGGED, + MT_NORMAL_TAGGED)); + + write_sysreg_s(KERNEL_GCR_EL1, SYS_GCR_EL1); + + /* + * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then + * RGSR_EL1.SEED must be non-zero for IRG to produce + * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we + * must initialize it. + */ + rgsr = (read_sysreg(CNTVCT_EL0) & SYS_RGSR_EL1_SEED_MASK) << + SYS_RGSR_EL1_SEED_SHIFT; + if (rgsr == 0) + rgsr = 1 << SYS_RGSR_EL1_SEED_SHIFT; + write_sysreg_s(rgsr, SYS_RGSR_EL1); + + /* clear any pending tag check faults in TFSR*_EL1 */ + write_sysreg_s(0, SYS_TFSR_EL1); + write_sysreg_s(0, SYS_TFSRE0_EL1); + + local_flush_tlb_all(); +} + +void mte_suspend_enter(void) +{ + if (!system_supports_mte()) + return; + + /* + * The barriers are required to guarantee that the indirect writes + * to TFSR_EL1 are synchronized before we report the state. 
+ */ + dsb(nsh); + isb(); + + /* Report SYS_TFSR_EL1 before suspend entry */ + mte_check_tfsr_el1(); +} + +void mte_suspend_exit(void) +{ + if (!system_supports_mte()) + return; + + mte_cpu_setup(); +} + +long set_mte_ctrl(struct task_struct *task, unsigned long arg) +{ + u64 mte_ctrl = (~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) << MTE_CTRL_GCR_USER_EXCL_SHIFT; + + if (!system_supports_mte()) + return 0; + + if (arg & PR_MTE_TCF_ASYNC) + mte_ctrl |= MTE_CTRL_TCF_ASYNC; + if (arg & PR_MTE_TCF_SYNC) + mte_ctrl |= MTE_CTRL_TCF_SYNC; + + /* + * If the system supports it and both sync and async modes are + * specified then implicitly enable asymmetric mode. + * Userspace could see a mix of both sync and async anyway due + * to differing or changing defaults on CPUs. + */ + if (cpus_have_cap(ARM64_MTE_ASYMM) && + (arg & PR_MTE_TCF_ASYNC) && + (arg & PR_MTE_TCF_SYNC)) + mte_ctrl |= MTE_CTRL_TCF_ASYMM; + + task->thread.mte_ctrl = mte_ctrl; + if (task == current) { + preempt_disable(); + mte_update_sctlr_user(task); + mte_update_gcr_excl(task); + update_sctlr_el1(task->thread.sctlr_user); + preempt_enable(); + } + + return 0; +} + +long get_mte_ctrl(struct task_struct *task) +{ + unsigned long ret; + u64 mte_ctrl = task->thread.mte_ctrl; + u64 incl = (~mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK; + + if (!system_supports_mte()) + return 0; + + ret = incl << PR_MTE_TAG_SHIFT; + if (mte_ctrl & MTE_CTRL_TCF_ASYNC) + ret |= PR_MTE_TCF_ASYNC; + if (mte_ctrl & MTE_CTRL_TCF_SYNC) + ret |= PR_MTE_TCF_SYNC; + + return ret; +} + +/* + * Access MTE tags in another process' address space as given in mm. Update + * the number of tags copied. Return 0 if any tags copied, error otherwise. + * Inspired by __access_remote_vm(). + */ +static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct vm_area_struct *vma; + void __user *buf = kiov->iov_base; + size_t len = kiov->iov_len; + int ret; + int write = gup_flags & FOLL_WRITE; + + if (!access_ok(buf, len)) + return -EFAULT; + + if (mmap_read_lock_killable(mm)) + return -EIO; + + while (len) { + unsigned long tags, offset; + void *maddr; + struct page *page = NULL; + + ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, + &vma, NULL); + if (ret <= 0) + break; + + /* + * Only copy tags if the page has been mapped as PROT_MTE + * (PG_mte_tagged set). Otherwise the tags are not valid and + * not accessible to user. Moreover, an mprotect(PROT_MTE) + * would cause the existing tags to be cleared if the page + * was never mapped with PROT_MTE. 
+ */ + if (!(vma->vm_flags & VM_MTE)) { + ret = -EOPNOTSUPP; + put_page(page); + break; + } + WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags)); + + /* limit access to the end of the page */ + offset = offset_in_page(addr); + tags = min(len, (PAGE_SIZE - offset) / MTE_GRANULE_SIZE); + + maddr = page_address(page); + if (write) { + tags = mte_copy_tags_from_user(maddr + offset, buf, tags); + set_page_dirty_lock(page); + } else { + tags = mte_copy_tags_to_user(buf, maddr + offset, tags); + } + put_page(page); + + /* error accessing the tracer's buffer */ + if (!tags) + break; + + len -= tags; + buf += tags; + addr += tags * MTE_GRANULE_SIZE; + } + mmap_read_unlock(mm); + + /* return an error if no tags copied */ + kiov->iov_len = buf - kiov->iov_base; + if (!kiov->iov_len) { + /* check for error accessing the tracee's address space */ + if (ret <= 0) + return -EIO; + else + return -EFAULT; + } + + return 0; +} + +/* + * Copy MTE tags in another process' address space at 'addr' to/from tracer's + * iovec buffer. Return 0 on success. Inspired by ptrace_access_vm(). + */ +static int access_remote_tags(struct task_struct *tsk, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + if (!tsk->ptrace || (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return -EPERM; + } + + ret = __access_remote_tags(mm, addr, kiov, gup_flags); + mmput(mm); + + return ret; +} + +int mte_ptrace_copy_tags(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + struct iovec kiov; + struct iovec __user *uiov = (void __user *)data; + unsigned int gup_flags = FOLL_FORCE; + + if (!system_supports_mte()) + return -EIO; + + if (get_user(kiov.iov_base, &uiov->iov_base) || + get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + if (request == PTRACE_POKEMTETAGS) + gup_flags |= FOLL_WRITE; + + /* align addr to the MTE tag granule */ + addr &= MTE_GRANULE_MASK; + + ret = access_remote_tags(child, addr, &kiov, gup_flags); + if (!ret) + ret = put_user(kiov.iov_len, &uiov->iov_len); + + return ret; +} + +static ssize_t mte_tcf_preferred_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (per_cpu(mte_tcf_preferred, dev->id)) { + case MTE_CTRL_TCF_ASYNC: + return sysfs_emit(buf, "async\n"); + case MTE_CTRL_TCF_SYNC: + return sysfs_emit(buf, "sync\n"); + case MTE_CTRL_TCF_ASYMM: + return sysfs_emit(buf, "asymm\n"); + default: + return sysfs_emit(buf, "???\n"); + } +} + +static ssize_t mte_tcf_preferred_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 tcf; + + if (sysfs_streq(buf, "async")) + tcf = MTE_CTRL_TCF_ASYNC; + else if (sysfs_streq(buf, "sync")) + tcf = MTE_CTRL_TCF_SYNC; + else if (cpus_have_cap(ARM64_MTE_ASYMM) && sysfs_streq(buf, "asymm")) + tcf = MTE_CTRL_TCF_ASYMM; + else + return -EINVAL; + + device_lock(dev); + per_cpu(mte_tcf_preferred, dev->id) = tcf; + device_unlock(dev); + + return count; +} +static DEVICE_ATTR_RW(mte_tcf_preferred); + +static int register_mte_tcf_preferred_sysctl(void) +{ + unsigned int cpu; + + if (!system_supports_mte()) + return 0; + + for_each_possible_cpu(cpu) { + per_cpu(mte_tcf_preferred, cpu) = MTE_CTRL_TCF_ASYNC; + device_create_file(get_cpu_device(cpu), + &dev_attr_mte_tcf_preferred); + } + + return 0; +} +subsys_initcall(register_mte_tcf_preferred_sysctl); + +/* + * 
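Probe a user address range for MTE faults by reading one byte from each
+ * tag granule; the walk stops at the first tag check fault.
+ *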
Return 0 on success, the number of bytes not probed otherwise. + */ +size_t mte_probe_user_range(const char __user *uaddr, size_t size) +{ + const char __user *end = uaddr + size; + int err = 0; + char val; + + __raw_get_user(val, uaddr, err); + if (err) + return size; + + uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); + while (uaddr < end) { + /* + * A read is sufficient for mte, the caller should have probed + * for the pte write permission if required. + */ + __raw_get_user(val, uaddr, err); + if (err) + return end - uaddr; + uaddr += MTE_GRANULE_SIZE; + } + (void)val; + + return 0; +} diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 53f371ed4568..57c7c211f8c7 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -1,25 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Copyright (C) 2013 Citrix Systems * * Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com> */ +#define pr_fmt(fmt) "arm-pv: " fmt + +#include <linux/arm-smccc.h> +#include <linux/cpuhotplug.h> #include <linux/export.h> +#include <linux/io.h> #include <linux/jump_label.h> +#include <linux/printk.h> +#include <linux/psci.h> +#include <linux/reboot.h> +#include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> + #include <asm/paravirt.h> +#include <asm/pvclock-abi.h> +#include <asm/smp_plat.h> struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct pv_time_ops pv_time_ops; -EXPORT_SYMBOL_GPL(pv_time_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); + +struct pv_time_stolen_time_region { + struct pvclock_vcpu_stolen_time __rcu *kaddr; +}; + +static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +/* return stolen time in ns by asking the hypervisor */ +static u64 para_steal_clock(int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + u64 ret = 0; + + reg = per_cpu_ptr(&stolen_time_region, cpu); + + /* + * paravirt_steal_clock() may be called before the CPU + * online notification callback runs. Until the callback + * has run we just return zero. 
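+ *
+ * The mapping is published by stolen_time_cpu_online() with
+ * rcu_assign_pointer(), hence the RCU read-side access below.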
+ */ + rcu_read_lock(); + kaddr = rcu_dereference(reg->kaddr); + if (!kaddr) { + rcu_read_unlock(); + return 0; + } + + ret = le64_to_cpu(READ_ONCE(kaddr->stolen_time)); + rcu_read_unlock(); + return ret; +} + +static int stolen_time_cpu_down_prepare(unsigned int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + + reg = this_cpu_ptr(&stolen_time_region); + if (!reg->kaddr) + return 0; + + kaddr = rcu_replace_pointer(reg->kaddr, NULL, true); + synchronize_rcu(); + memunmap(kaddr); + + return 0; +} + +static int stolen_time_cpu_online(unsigned int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + struct arm_smccc_res res; + + reg = this_cpu_ptr(&stolen_time_region); + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_ST, &res); + + if (res.a0 == SMCCC_RET_NOT_SUPPORTED) + return -EINVAL; + + kaddr = memremap(res.a0, + sizeof(struct pvclock_vcpu_stolen_time), + MEMREMAP_WB); + + rcu_assign_pointer(reg->kaddr, kaddr); + + if (!reg->kaddr) { + pr_warn("Failed to map stolen time data structure\n"); + return -ENOMEM; + } + + if (le32_to_cpu(kaddr->revision) != 0 || + le32_to_cpu(kaddr->attributes) != 0) { + pr_warn_once("Unexpected revision or attributes in stolen time data\n"); + return -ENXIO; + } + + return 0; +} + +static int __init pv_time_init_stolen_time(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "hypervisor/arm/pvtime:online", + stolen_time_cpu_online, + stolen_time_cpu_down_prepare); + if (ret < 0) + return ret; + return 0; +} + +static bool __init has_pv_steal_clock(void) +{ + struct arm_smccc_res res; + + /* To detect the presence of PV time support we require SMCCC 1.1+ */ + if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_HV_PV_TIME_FEATURES, &res); + + if (res.a0 != SMCCC_RET_SUCCESS) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_FEATURES, + ARM_SMCCC_HV_PV_TIME_ST, &res); + + return (res.a0 == SMCCC_RET_SUCCESS); +} + +int __init pv_time_init(void) +{ + int ret; + + if (!has_pv_steal_clock()) + return 0; + + ret = pv_time_init_stolen_time(); + if (ret) + return ret; + + static_call_update(pv_steal_clock, para_steal_clock); + + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + + pr_info("using stolen time PV\n"); + + return 0; +} diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 000000000000..33e0fabc0b79 --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/fixmap.h> +#include <asm/insn.h> +#include <asm/kprobes.h> +#include <asm/patching.h> +#include <asm/sections.h> + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool image = is_image_text(uintaddr); + struct 
page *page; + + if (image) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. + */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + caches_clean_inval_pou((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. */ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 0e2ea1c78542..2276689b5411 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Code borrowed from powerpc/kernel/pci-common.c * * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. 
- * */ #include <linux/acpi.h> @@ -86,14 +82,29 @@ int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - if (!acpi_disabled) { - struct pci_config_window *cfg = bridge->bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct device *bus_dev = &bridge->bus->dev; + struct pci_config_window *cfg; + struct acpi_device *adev; + struct device *bus_dev; - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - } + if (acpi_disabled) + return 0; + + cfg = bridge->bus->sysdata; + + /* + * On Hyper-V there is no corresponding ACPI device for a root bridge, + * therefore ->parent is set as NULL by the driver. And set 'adev' as + * NULL in this case because there is no proper ACPI device. + */ + if (!cfg->parent) + adev = NULL; + else + adev = to_acpi_device(cfg->parent); + + bus_dev = &bridge->bus->dev; + + ACPI_COMPANION_SET(&bridge->dev, adev); + set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); return 0; } @@ -121,7 +132,7 @@ pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) struct device *dev = &root->device->dev; struct resource *bus_res = &root->secondary; u16 seg = root->segment; - struct pci_ecam_ops *ecam_ops; + const struct pci_ecam_ops *ecam_ops; struct resource cfgres; struct acpi_device *adev; struct pci_config_window *cfg; @@ -165,16 +176,16 @@ static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) /* Interface called from ACPI code to setup PCI host controller */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - int node = acpi_get_node(root->device->handle); struct acpi_pci_generic_root_info *ri; struct pci_bus *bus, *child; struct acpi_pci_root_ops *root_ops; + struct pci_host_bridge *host; - ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; - root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); + root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); if (!root_ops) { kfree(ri); return NULL; @@ -189,13 +200,21 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root_ops->release_info = pci_acpi_generic_release_info; root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = &ri->cfg->ops->pci_ops; + root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) return NULL; - pci_bus_size_bridges(bus); - pci_bus_assign_resources(bus); + /* If we must preserve the resource configuration, claim now */ + host = pci_find_host_bridge(bus); + if (host->preserve_config) + pci_bus_claim_resources(bus); + + /* + * Assign whatever was left unassigned. If we didn't claim above, + * this will reassign everything. + */ + pci_assign_unassigned_root_bus_resources(bus); list_for_each_entry(child, &bus->children, node) pcie_bus_configure_settings(child); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index bcafd7dcfe8b..65b196e3ca6c 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -1,24 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arm64 callchain support * * Copyright (C) 2015 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/perf_event.h> +#include <linux/stacktrace.h> #include <linux/uaccess.h> -#include <asm/stacktrace.h> +#include <asm/pointer_auth.h> struct frame_tail { struct frame_tail __user *fp; @@ -35,9 +25,10 @@ user_backtrace(struct frame_tail __user *tail, { struct frame_tail buftail; unsigned long err; + unsigned long lr; /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; pagefault_disable(); @@ -47,7 +38,9 @@ user_backtrace(struct frame_tail __user *tail, if (err) return NULL; - perf_callchain_store(entry, buftail.lr); + lr = ptrauth_strip_insn_pac(buftail.lr); + + perf_callchain_store(entry, lr); /* * Frame pointers should strictly progress back up the stack @@ -82,7 +75,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, unsigned long err; /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; pagefault_disable(); @@ -109,7 +102,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } @@ -123,7 +116,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct frame_tail __user *)regs->regs[29]; while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0xf)) + tail && !((unsigned long)tail & 0x7)) tail = user_backtrace(tail, entry); } else { #ifdef CONFIG_COMPAT @@ -139,51 +132,38 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, } } -/* - * Gets called by walk_stackframe() for every stackframe. This will be called - * whist unwinding the stackframe and is like a subroutine return so we use - * the PC. 
- */ -static int callchain_trace(struct stackframe *frame, void *data) +static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, frame->pc); - return 0; + return perf_callchain_store(entry, pc) == 0; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct stackframe frame; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = current->curr_ret_stack; -#endif - - walk_stackframe(current, &frame, callchain_trace, entry); + arch_stack_walk(callchain_trace, entry, current, regs); } unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + unsigned int guest_state = perf_guest_state(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 9eaef51f83ff..7b0643fe2f13 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1,22 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * PMU support + * ARMv8 PMUv3 Performance Events handling code. * * Copyright (C) 2012 ARM Limited * Author: Will Deacon <will.deacon@arm.com> * * This code is based heavily on the ARMv7 perf event code. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <asm/irq_regs.h> @@ -24,153 +13,16 @@ #include <asm/sysreg.h> #include <asm/virt.h> +#include <clocksource/arm_arch_timer.h> + #include <linux/acpi.h> +#include <linux/clocksource.h> +#include <linux/kvm_host.h> #include <linux/of.h> #include <linux/perf/arm_pmu.h> #include <linux/platform_device.h> - -/* - * ARMv8 PMUv3 Performance Events handling code. - * Common event types (some are defined in asm/perf_event.h). - */ - -/* At least one of the following is required. */ -#define ARMV8_PMUV3_PERFCTR_INST_RETIRED 0x08 -#define ARMV8_PMUV3_PERFCTR_INST_SPEC 0x1B - -/* Common architectural events. 
*/ -#define ARMV8_PMUV3_PERFCTR_LD_RETIRED 0x06 -#define ARMV8_PMUV3_PERFCTR_ST_RETIRED 0x07 -#define ARMV8_PMUV3_PERFCTR_EXC_TAKEN 0x09 -#define ARMV8_PMUV3_PERFCTR_EXC_RETURN 0x0A -#define ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED 0x0B -#define ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED 0x0C -#define ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED 0x0D -#define ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED 0x0E -#define ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED 0x0F -#define ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED 0x1C -#define ARMV8_PMUV3_PERFCTR_CHAIN 0x1E -#define ARMV8_PMUV3_PERFCTR_BR_RETIRED 0x21 - -/* Common microarchitectural events. */ -#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL 0x01 -#define ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL 0x02 -#define ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL 0x05 -#define ARMV8_PMUV3_PERFCTR_MEM_ACCESS 0x13 -#define ARMV8_PMUV3_PERFCTR_L1I_CACHE 0x14 -#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB 0x15 -#define ARMV8_PMUV3_PERFCTR_L2D_CACHE 0x16 -#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL 0x17 -#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB 0x18 -#define ARMV8_PMUV3_PERFCTR_BUS_ACCESS 0x19 -#define ARMV8_PMUV3_PERFCTR_MEMORY_ERROR 0x1A -#define ARMV8_PMUV3_PERFCTR_BUS_CYCLES 0x1D -#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE 0x1F -#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE 0x20 -#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED 0x22 -#define ARMV8_PMUV3_PERFCTR_STALL_FRONTEND 0x23 -#define ARMV8_PMUV3_PERFCTR_STALL_BACKEND 0x24 -#define ARMV8_PMUV3_PERFCTR_L1D_TLB 0x25 -#define ARMV8_PMUV3_PERFCTR_L1I_TLB 0x26 -#define ARMV8_PMUV3_PERFCTR_L2I_CACHE 0x27 -#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL 0x28 -#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE 0x29 -#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL 0x2A -#define ARMV8_PMUV3_PERFCTR_L3D_CACHE 0x2B -#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB 0x2C -#define ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL 0x2D -#define ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL 0x2E -#define ARMV8_PMUV3_PERFCTR_L2D_TLB 0x2F -#define ARMV8_PMUV3_PERFCTR_L2I_TLB 0x30 - -/* ARMv8 recommended implementation defined event types */ -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD 0x40 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR 0x41 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD 0x42 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR 0x43 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_INNER 0x44 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_OUTER 0x45 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_VICTIM 0x46 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WB_CLEAN 0x47 -#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_INVAL 0x48 - -#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD 0x4C -#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR 0x4D -#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD 0x4E -#define ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR 0x4F -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_RD 0x50 -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WR 0x51 -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_RD 0x52 -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_REFILL_WR 0x53 - -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_VICTIM 0x56 -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_WB_CLEAN 0x57 -#define ARMV8_IMPDEF_PERFCTR_L2D_CACHE_INVAL 0x58 - -#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_RD 0x5C -#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_REFILL_WR 0x5D -#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_RD 0x5E -#define ARMV8_IMPDEF_PERFCTR_L2D_TLB_WR 0x5F - -#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD 0x60 -#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR 0x61 -#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_SHARED 0x62 -#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NOT_SHARED 0x63 -#define 
ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_NORMAL 0x64 -#define ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_PERIPH 0x65 - -#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_RD 0x66 -#define ARMV8_IMPDEF_PERFCTR_MEM_ACCESS_WR 0x67 -#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LD_SPEC 0x68 -#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_ST_SPEC 0x69 -#define ARMV8_IMPDEF_PERFCTR_UNALIGNED_LDST_SPEC 0x6A - -#define ARMV8_IMPDEF_PERFCTR_LDREX_SPEC 0x6C -#define ARMV8_IMPDEF_PERFCTR_STREX_PASS_SPEC 0x6D -#define ARMV8_IMPDEF_PERFCTR_STREX_FAIL_SPEC 0x6E -#define ARMV8_IMPDEF_PERFCTR_STREX_SPEC 0x6F -#define ARMV8_IMPDEF_PERFCTR_LD_SPEC 0x70 -#define ARMV8_IMPDEF_PERFCTR_ST_SPEC 0x71 -#define ARMV8_IMPDEF_PERFCTR_LDST_SPEC 0x72 -#define ARMV8_IMPDEF_PERFCTR_DP_SPEC 0x73 -#define ARMV8_IMPDEF_PERFCTR_ASE_SPEC 0x74 -#define ARMV8_IMPDEF_PERFCTR_VFP_SPEC 0x75 -#define ARMV8_IMPDEF_PERFCTR_PC_WRITE_SPEC 0x76 -#define ARMV8_IMPDEF_PERFCTR_CRYPTO_SPEC 0x77 -#define ARMV8_IMPDEF_PERFCTR_BR_IMMED_SPEC 0x78 -#define ARMV8_IMPDEF_PERFCTR_BR_RETURN_SPEC 0x79 -#define ARMV8_IMPDEF_PERFCTR_BR_INDIRECT_SPEC 0x7A - -#define ARMV8_IMPDEF_PERFCTR_ISB_SPEC 0x7C -#define ARMV8_IMPDEF_PERFCTR_DSB_SPEC 0x7D -#define ARMV8_IMPDEF_PERFCTR_DMB_SPEC 0x7E - -#define ARMV8_IMPDEF_PERFCTR_EXC_UNDEF 0x81 -#define ARMV8_IMPDEF_PERFCTR_EXC_SVC 0x82 -#define ARMV8_IMPDEF_PERFCTR_EXC_PABORT 0x83 -#define ARMV8_IMPDEF_PERFCTR_EXC_DABORT 0x84 - -#define ARMV8_IMPDEF_PERFCTR_EXC_IRQ 0x86 -#define ARMV8_IMPDEF_PERFCTR_EXC_FIQ 0x87 -#define ARMV8_IMPDEF_PERFCTR_EXC_SMC 0x88 - -#define ARMV8_IMPDEF_PERFCTR_EXC_HVC 0x8A -#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_PABORT 0x8B -#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_DABORT 0x8C -#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_OTHER 0x8D -#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_IRQ 0x8E -#define ARMV8_IMPDEF_PERFCTR_EXC_TRAP_FIQ 0x8F -#define ARMV8_IMPDEF_PERFCTR_RC_LD_SPEC 0x90 -#define ARMV8_IMPDEF_PERFCTR_RC_ST_SPEC 0x91 - -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_RD 0xA0 -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WR 0xA1 -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_RD 0xA2 -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_REFILL_WR 0xA3 - -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_VICTIM 0xA6 -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_WB_CLEAN 0xA7 -#define ARMV8_IMPDEF_PERFCTR_L3D_CACHE_INVAL 0xA8 +#include <linux/sched_clock.h> +#include <linux/smp.h> /* ARMv8 Cortex-A53 specific event types. */ #define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 @@ -182,12 +34,10 @@ #define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS 0xEC #define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS 0xED -/* PMUv3 HW events mapping. */ - /* * ARMv8 Architectural defined events, not all of these may - * be supported on any given implementation. Undefined events will - * be disabled at run-time. + * be supported on any given implementation. Unsupported events will + * be disabled at run-time based on the PMCEID registers. 
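+ * The same PMCEID bitmaps also decide which event attributes are
+ * visible in sysfs (see armv8pmu_event_attr_is_visible()).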
*/ static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { PERF_MAP_ALL_UNSUPPORTED, @@ -209,8 +59,6 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, @@ -221,10 +69,11 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, + [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD, + [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_RD, + [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, }; static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -262,12 +111,6 @@ static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -318,113 +161,103 @@ armv8pmu_events_sysfs_show(struct device *dev, pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%03llx\n", pmu_attr->id); -} - -#define ARMV8_EVENT_ATTR_RESOLVE(m) #m -#define ARMV8_EVENT_ATTR(name, config) \ - PMU_EVENT_ATTR(name, armv8_event_attr_##name, \ - config, armv8pmu_events_sysfs_show) - -ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR); -ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL); -ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL); -ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE); -ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL); -ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED); -ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED); -ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED); -ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN); -ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN); -ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED); -ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED); -ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED); -ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED); -ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED); 
-ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED); -ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES); -ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED); -ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS); -ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE); -ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB); -ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE); -ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB); -ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS); -ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR); -ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC); -ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED); -ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES); -/* Don't expose the chain event in /sys, since it's useless in isolation */ -ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED); -ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED); -ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND); -ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND); -ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB); -ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB); -ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE); -ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL); -ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE); -ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB); -ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL); -ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL); -ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB); -ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); +} + +#define ARMV8_EVENT_ATTR(name, config) \ + PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config) static struct attribute *armv8_pmuv3_event_attrs[] = { - &armv8_event_attr_sw_incr.attr.attr, - &armv8_event_attr_l1i_cache_refill.attr.attr, - &armv8_event_attr_l1i_tlb_refill.attr.attr, - &armv8_event_attr_l1d_cache_refill.attr.attr, - &armv8_event_attr_l1d_cache.attr.attr, - &armv8_event_attr_l1d_tlb_refill.attr.attr, - &armv8_event_attr_ld_retired.attr.attr, - &armv8_event_attr_st_retired.attr.attr, - &armv8_event_attr_inst_retired.attr.attr, - &armv8_event_attr_exc_taken.attr.attr, - &armv8_event_attr_exc_return.attr.attr, - &armv8_event_attr_cid_write_retired.attr.attr, - &armv8_event_attr_pc_write_retired.attr.attr, - &armv8_event_attr_br_immed_retired.attr.attr, - &armv8_event_attr_br_return_retired.attr.attr, - &armv8_event_attr_unaligned_ldst_retired.attr.attr, - &armv8_event_attr_br_mis_pred.attr.attr, - &armv8_event_attr_cpu_cycles.attr.attr, - &armv8_event_attr_br_pred.attr.attr, - &armv8_event_attr_mem_access.attr.attr, - &armv8_event_attr_l1i_cache.attr.attr, - &armv8_event_attr_l1d_cache_wb.attr.attr, - &armv8_event_attr_l2d_cache.attr.attr, - &armv8_event_attr_l2d_cache_refill.attr.attr, - &armv8_event_attr_l2d_cache_wb.attr.attr, - 
&armv8_event_attr_bus_access.attr.attr, - &armv8_event_attr_memory_error.attr.attr, - &armv8_event_attr_inst_spec.attr.attr, - &armv8_event_attr_ttbr_write_retired.attr.attr, - &armv8_event_attr_bus_cycles.attr.attr, - &armv8_event_attr_l1d_cache_allocate.attr.attr, - &armv8_event_attr_l2d_cache_allocate.attr.attr, - &armv8_event_attr_br_retired.attr.attr, - &armv8_event_attr_br_mis_pred_retired.attr.attr, - &armv8_event_attr_stall_frontend.attr.attr, - &armv8_event_attr_stall_backend.attr.attr, - &armv8_event_attr_l1d_tlb.attr.attr, - &armv8_event_attr_l1i_tlb.attr.attr, - &armv8_event_attr_l2i_cache.attr.attr, - &armv8_event_attr_l2i_cache_refill.attr.attr, - &armv8_event_attr_l3d_cache_allocate.attr.attr, - &armv8_event_attr_l3d_cache_refill.attr.attr, - &armv8_event_attr_l3d_cache.attr.attr, - &armv8_event_attr_l3d_cache_wb.attr.attr, - &armv8_event_attr_l2d_tlb_refill.attr.attr, - &armv8_event_attr_l2i_tlb_refill.attr.attr, - &armv8_event_attr_l2d_tlb.attr.attr, - &armv8_event_attr_l2i_tlb.attr.attr, + ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR), + ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL), + ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL), + ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL), + ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE), + ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL), + ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED), + ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED), + ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED), + ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN), + ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN), + ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED), + ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED), + ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED), + ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED), + ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED), + ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED), + ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES), + ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED), + ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS), + ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE), + ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB), + ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE), + ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL), + ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB), + ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS), + ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR), + ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC), + ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED), + ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES), + /* Don't expose the chain event in /sys, since it's useless in isolation */ + ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE), + ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE), + ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED), + ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED), + ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND), + 
ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND), + ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB), + ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB), + ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE), + ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL), + ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE), + ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL), + ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE), + ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB), + ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL), + ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL), + ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB), + ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB), + ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS), + ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE), + ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS), + ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK), + ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK), + ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD), + ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD), + ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD), + ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED), + ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC), + ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL), + ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND), + ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND), + ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT), + ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP), + ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED), + ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE), + ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION), + ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES), + ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM), + ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(trb_wrap, ARMV8_PMUV3_PERFCTR_TRB_WRAP), + ARMV8_EVENT_ATTR(trb_trig, ARMV8_PMUV3_PERFCTR_TRB_TRIG), + ARMV8_EVENT_ATTR(trcextout0, ARMV8_PMUV3_PERFCTR_TRCEXTOUT0), + ARMV8_EVENT_ATTR(trcextout1, ARMV8_PMUV3_PERFCTR_TRCEXTOUT1), + ARMV8_EVENT_ATTR(trcextout2, ARMV8_PMUV3_PERFCTR_TRCEXTOUT2), + ARMV8_EVENT_ATTR(trcextout3, ARMV8_PMUV3_PERFCTR_TRCEXTOUT3), + ARMV8_EVENT_ATTR(cti_trigout4, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4), + ARMV8_EVENT_ATTR(cti_trigout5, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5), + ARMV8_EVENT_ATTR(cti_trigout6, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6), + ARMV8_EVENT_ATTR(cti_trigout7, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7), + ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT), + ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT), + ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT), + ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED), + 
ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD), + ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR), NULL, }; @@ -439,37 +272,147 @@ armv8pmu_event_attr_is_visible(struct kobject *kobj, pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); - if (test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) + if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && + test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) return attr->mode; + if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) { + u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; + + if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS && + test_bit(id, cpu_pmu->pmceid_ext_bitmap)) + return attr->mode; + } + return 0; } -static struct attribute_group armv8_pmuv3_events_attr_group = { +static const struct attribute_group armv8_pmuv3_events_attr_group = { .name = "events", .attrs = armv8_pmuv3_event_attrs, .is_visible = armv8pmu_event_attr_is_visible, }; PMU_FORMAT_ATTR(event, "config:0-15"); +PMU_FORMAT_ATTR(long, "config1:0"); +PMU_FORMAT_ATTR(rdpmc, "config1:1"); + +static int sysctl_perf_user_access __read_mostly; + +static inline bool armv8pmu_event_is_64bit(struct perf_event *event) +{ + return event->attr.config1 & 0x1; +} + +static inline bool armv8pmu_event_want_user_access(struct perf_event *event) +{ + return event->attr.config1 & 0x2; +} static struct attribute *armv8_pmuv3_format_attrs[] = { &format_attr_event.attr, + &format_attr_long.attr, + &format_attr_rdpmc.attr, NULL, }; -static struct attribute_group armv8_pmuv3_format_attr_group = { +static const struct attribute_group armv8_pmuv3_format_attr_group = { .name = "format", .attrs = armv8_pmuv3_format_attrs, }; +static ssize_t slots_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK; + + return sysfs_emit(page, "0x%08x\n", slots); +} + +static DEVICE_ATTR_RO(slots); + +static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT) + & ARMV8_PMU_BUS_SLOTS_MASK; + + return sysfs_emit(page, "0x%08x\n", bus_slots); +} + +static DEVICE_ATTR_RO(bus_slots); + +static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT) + & ARMV8_PMU_BUS_WIDTH_MASK; + u32 val = 0; + + /* Encoded as Log2(number of bytes), plus one */ + if (bus_width > 2 && bus_width < 13) + val = 1 << (bus_width - 1); + + return sysfs_emit(page, "0x%08x\n", val); +} + +static DEVICE_ATTR_RO(bus_width); + +static struct attribute *armv8_pmuv3_caps_attrs[] = { + &dev_attr_slots.attr, + &dev_attr_bus_slots.attr, + &dev_attr_bus_width.attr, + NULL, +}; + +static const struct attribute_group armv8_pmuv3_caps_attr_group = { + .name = "caps", + .attrs = armv8_pmuv3_caps_attrs, +}; + /* * Perf Events' indices */ #define ARMV8_IDX_CYCLE_COUNTER 0 #define ARMV8_IDX_COUNTER0 1 -#define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \ - (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) +#define ARMV8_IDX_CYCLE_COUNTER_USER 32 + +/* + * We unconditionally enable 
ARMv8.5-PMU long event counter support + * (64-bit events) where supported. Indicate if this arm_pmu has long + * event counter support. + */ +static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu) +{ + return (cpu_pmu->pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5); +} + +static inline bool armv8pmu_event_has_user_read(struct perf_event *event) +{ + return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT; +} + +/* + * We must chain two programmable counters for 64 bit events, + * except when we have allocated the 64bit cycle counter (for CPU + * cycles event) or when user space counter access is enabled. + */ +static inline bool armv8pmu_event_is_chained(struct perf_event *event) +{ + int idx = event->hw.idx; + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + + return !armv8pmu_event_has_user_read(event) && + armv8pmu_event_is_64bit(event) && + !armv8pmu_has_long_event(cpu_pmu) && + (idx != ARMV8_IDX_CYCLE_COUNTER); +} /* * ARMv8 low level PMU access @@ -481,6 +424,73 @@ static struct attribute_group armv8_pmuv3_format_attr_group = { #define ARMV8_IDX_TO_COUNTER(x) \ (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) +/* + * This code is really good + */ + +#define PMEVN_CASE(n, case_macro) \ + case n: case_macro(n); break + +#define PMEVN_SWITCH(x, case_macro) \ + do { \ + switch (x) { \ + PMEVN_CASE(0, case_macro); \ + PMEVN_CASE(1, case_macro); \ + PMEVN_CASE(2, case_macro); \ + PMEVN_CASE(3, case_macro); \ + PMEVN_CASE(4, case_macro); \ + PMEVN_CASE(5, case_macro); \ + PMEVN_CASE(6, case_macro); \ + PMEVN_CASE(7, case_macro); \ + PMEVN_CASE(8, case_macro); \ + PMEVN_CASE(9, case_macro); \ + PMEVN_CASE(10, case_macro); \ + PMEVN_CASE(11, case_macro); \ + PMEVN_CASE(12, case_macro); \ + PMEVN_CASE(13, case_macro); \ + PMEVN_CASE(14, case_macro); \ + PMEVN_CASE(15, case_macro); \ + PMEVN_CASE(16, case_macro); \ + PMEVN_CASE(17, case_macro); \ + PMEVN_CASE(18, case_macro); \ + PMEVN_CASE(19, case_macro); \ + PMEVN_CASE(20, case_macro); \ + PMEVN_CASE(21, case_macro); \ + PMEVN_CASE(22, case_macro); \ + PMEVN_CASE(23, case_macro); \ + PMEVN_CASE(24, case_macro); \ + PMEVN_CASE(25, case_macro); \ + PMEVN_CASE(26, case_macro); \ + PMEVN_CASE(27, case_macro); \ + PMEVN_CASE(28, case_macro); \ + PMEVN_CASE(29, case_macro); \ + PMEVN_CASE(30, case_macro); \ + default: WARN(1, "Invalid PMEV* index\n"); \ + } \ + } while (0) + +#define RETURN_READ_PMEVCNTRN(n) \ + return read_sysreg(pmevcntr##n##_el0) +static unsigned long read_pmevcntrn(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); + return 0; +} + +#define WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0) +static void write_pmevcntrn(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); +} + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0) +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); +} + static inline u32 armv8pmu_pmcr_read(void) { return read_sysreg(pmcr_el0); @@ -498,105 +508,223 @@ static inline int armv8pmu_has_overflowed(u32 pmovsr) return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; } -static inline int armv8pmu_counter_valid(struct arm_pmu *cpu_pmu, int idx) -{ - return idx >= ARMV8_IDX_CYCLE_COUNTER && - idx <= ARMV8_IDX_COUNTER_LAST(cpu_pmu); -} - static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) { return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline int armv8pmu_select_counter(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); 
- isb(); - return idx; + return read_pmevcntrn(counter); } -static inline u32 armv8pmu_read_counter(struct perf_event *event) +static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) +{ + int idx = event->hw.idx; + u64 val = armv8pmu_read_evcntr(idx); + + if (armv8pmu_event_is_chained(event)) + val = (val << 32) | armv8pmu_read_evcntr(idx - 1); + return val; +} + +/* + * The cycle counter is always a 64-bit counter. When ARMV8_PMU_PMCR_LP + * is set the event counters also become 64-bit counters. Unless the + * user has requested a long counter (attr.config1) then we want to + * interrupt upon 32-bit overflow - we achieve this by applying a bias. + */ +static bool armv8pmu_event_needs_bias(struct perf_event *event) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - u32 value = 0; - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u reading wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) - value = read_sysreg(pmccntr_el0); - else if (armv8pmu_select_counter(idx) == idx) - value = read_sysreg(pmxevcntr_el0); + if (armv8pmu_event_is_64bit(event)) + return false; + + if (armv8pmu_has_long_event(cpu_pmu) || + idx == ARMV8_IDX_CYCLE_COUNTER) + return true; + + return false; +} + +static u64 armv8pmu_bias_long_counter(struct perf_event *event, u64 value) +{ + if (armv8pmu_event_needs_bias(event)) + value |= GENMASK(63, 32); return value; } -static inline void armv8pmu_write_counter(struct perf_event *event, u32 value) +static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value) +{ + if (armv8pmu_event_needs_bias(event)) + value &= ~GENMASK(63, 32); + + return value; +} + +static u64 armv8pmu_read_counter(struct perf_event *event) { - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + u64 value; - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u writing wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) { - /* - * Set the upper 32bits as this is a 64bit counter but we only - * count using the lower 32bits and we want an interrupt when - * it overflows. 
- */ - u64 value64 = 0xffffffff00000000ULL | value; + if (idx == ARMV8_IDX_CYCLE_COUNTER) + value = read_sysreg(pmccntr_el0); + else + value = armv8pmu_read_hw_counter(event); - write_sysreg(value64, pmccntr_el0); - } else if (armv8pmu_select_counter(idx) == idx) - write_sysreg(value, pmxevcntr_el0); + return armv8pmu_unbias_long_counter(event, value); } -static inline void armv8pmu_write_evtype(int idx, u32 val) +static inline void armv8pmu_write_evcntr(int idx, u64 value) { - if (armv8pmu_select_counter(idx) == idx) { - val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + + write_pmevcntrn(counter, value); +} + +static inline void armv8pmu_write_hw_counter(struct perf_event *event, + u64 value) +{ + int idx = event->hw.idx; + + if (armv8pmu_event_is_chained(event)) { + armv8pmu_write_evcntr(idx, upper_32_bits(value)); + armv8pmu_write_evcntr(idx - 1, lower_32_bits(value)); + } else { + armv8pmu_write_evcntr(idx, value); } } -static inline int armv8pmu_enable_counter(int idx) +static void armv8pmu_write_counter(struct perf_event *event, u64 value) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenset_el0); - return idx; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + value = armv8pmu_bias_long_counter(event, value); + + if (idx == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(value, pmccntr_el0); + else + armv8pmu_write_hw_counter(event, value); } -static inline int armv8pmu_disable_counter(int idx) +static inline void armv8pmu_write_evtype(int idx, u32 val) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenclr_el0); - return idx; + + val &= ARMV8_PMU_EVTYPE_MASK; + write_pmevtypern(counter, val); } -static inline int armv8pmu_enable_intens(int idx) +static inline void armv8pmu_write_event_type(struct perf_event *event) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenset_el1); - return idx; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + /* + * For chained events, the low counter is programmed to count + * the event of interest and the high counter is programmed + * with CHAIN event code with filters set to count at all ELs. + */ + if (armv8pmu_event_is_chained(event)) { + u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN | + ARMV8_PMU_INCLUDE_EL2; + + armv8pmu_write_evtype(idx - 1, hwc->config_base); + armv8pmu_write_evtype(idx, chain_evt); + } else { + if (idx == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(hwc->config_base, pmccfiltr_el0); + else + armv8pmu_write_evtype(idx, hwc->config_base); + } } -static inline int armv8pmu_disable_intens(int idx) +static u32 armv8pmu_event_cnten_mask(struct perf_event *event) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenclr_el1); + int counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + u32 mask = BIT(counter); + + if (armv8pmu_event_is_chained(event)) + mask |= BIT(counter - 1); + return mask; +} + +static inline void armv8pmu_enable_counter(u32 mask) +{ + /* + * Make sure event configuration register writes are visible before we + * enable the counter. 
+ * */ + isb(); + write_sysreg(mask, pmcntenset_el0); +} + +static inline void armv8pmu_enable_event_counter(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + u32 mask = armv8pmu_event_cnten_mask(event); + + kvm_set_pmu_events(mask, attr); + + /* We rely on the hypervisor switch code to enable guest counters */ + if (!kvm_pmu_counter_deferred(attr)) + armv8pmu_enable_counter(mask); +} + +static inline void armv8pmu_disable_counter(u32 mask) +{ + write_sysreg(mask, pmcntenclr_el0); + /* + * Make sure the effects of disabling the counter are visible before we + * start configuring the event. + */ + isb(); +} + +static inline void armv8pmu_disable_event_counter(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + u32 mask = armv8pmu_event_cnten_mask(event); + + kvm_clr_pmu_events(mask); + + /* We rely on the hypervisor switch code to disable guest counters */ + if (!kvm_pmu_counter_deferred(attr)) + armv8pmu_disable_counter(mask); +} + +static inline void armv8pmu_enable_intens(u32 mask) +{ + write_sysreg(mask, pmintenset_el1); +} + +static inline void armv8pmu_enable_event_irq(struct perf_event *event) +{ + u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + armv8pmu_enable_intens(BIT(counter)); +} + +static inline void armv8pmu_disable_intens(u32 mask) +{ + write_sysreg(mask, pmintenclr_el1); isb(); /* Clear the overflow flag in case an interrupt is pending. */ - write_sysreg(BIT(counter), pmovsclr_el0); + write_sysreg(mask, pmovsclr_el0); isb(); +} - return idx; +static inline void armv8pmu_disable_event_irq(struct perf_event *event) +{ + u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + armv8pmu_disable_intens(BIT(counter)); } static inline u32 armv8pmu_getreset_flags(void) @@ -613,74 +741,93 @@ static inline u32 armv8pmu_getreset_flags(void) return value; } -static void armv8pmu_enable_event(struct perf_event *event) +static void armv8pmu_disable_user_access(void) { - unsigned long flags; - struct hw_perf_event *hwc = &event->hw; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - int idx = hwc->idx; + write_sysreg(0, pmuserenr_el0); +} + +static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) +{ + int i; + struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); + /* Clear any unused counters to avoid leaking their contents */ + for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) { + if (i == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(0, pmccntr_el0); + else + armv8pmu_write_evcntr(i, 0); + } + + write_sysreg(0, pmuserenr_el0); + write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0); +} + +static void armv8pmu_enable_event(struct perf_event *event) +{ /* * Enable counter and interrupt, and set the counter to count * the event that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* * Disable counter */ - armv8pmu_disable_counter(idx); + armv8pmu_disable_event_counter(event); /* - * Set event (if destined for PMNx counters). + * Set event. 
*/ - armv8pmu_write_evtype(idx, hwc->config_base); + armv8pmu_write_event_type(event); /* * Enable interrupt for this counter */ - armv8pmu_enable_intens(idx); + armv8pmu_enable_event_irq(event); /* * Enable counter */ - armv8pmu_enable_counter(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); + armv8pmu_enable_event_counter(event); } static void armv8pmu_disable_event(struct perf_event *event) { - unsigned long flags; - struct hw_perf_event *hwc = &event->hw; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - int idx = hwc->idx; - - /* - * Disable counter and interrupt - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* * Disable counter */ - armv8pmu_disable_counter(idx); + armv8pmu_disable_event_counter(event); /* * Disable interrupt for this counter */ - armv8pmu_disable_intens(idx); + armv8pmu_disable_event_irq(event); +} - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +static void armv8pmu_start(struct arm_pmu *cpu_pmu) +{ + struct perf_event_context *task_ctx = + this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx; + + if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user) + armv8pmu_enable_user_access(cpu_pmu); + else + armv8pmu_disable_user_access(); + + /* Enable all counters */ + armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); +} + +static void armv8pmu_stop(struct arm_pmu *cpu_pmu) +{ + /* Disable all counters */ + armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); } -static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) +static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) { u32 pmovsr; struct perf_sample_data data; - struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev; struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); struct pt_regs *regs; int idx; @@ -701,6 +848,11 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); + /* + * Stop the PMU while processing the counter overflows + * to prevent skews in group events. + */ + armv8pmu_stop(cpu_pmu); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -722,48 +874,55 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event)) continue; + /* + * Perf event overflow will queue the processing of the event as + * an irq_work which will be taken care of in the handling of + * IPI_IRQ_WORK. + */ if (perf_event_overflow(event, &data, regs)) cpu_pmu->disable(event); } - - /* - * Handle the pending perf events. - * - * Note: this call *must* be run with interrupts disabled. For - * platforms that can have the PMU interrupts raised as an NMI, this - * will not work. 
- */ - irq_work_run(); + armv8pmu_start(cpu_pmu); return IRQ_HANDLED; } -static void armv8pmu_start(struct arm_pmu *cpu_pmu) +static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, + struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + int idx; - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Enable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); + for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + } + return -EAGAIN; } -static void armv8pmu_stop(struct arm_pmu *cpu_pmu) +static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, + struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + int idx; - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Disable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); + /* + * Chaining requires two consecutive event counters, where + * the lower idx must be even. + */ + for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) { + if (!test_and_set_bit(idx, cpuc->used_mask)) { + /* Check if the preceding even counter is available */ + if (!test_and_set_bit(idx - 1, cpuc->used_mask)) + return idx; + /* Release the Odd counter */ + clear_bit(idx, cpuc->used_mask); + } + } + return -EAGAIN; } static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { - int idx; struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; @@ -772,22 +931,49 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) return ARMV8_IDX_CYCLE_COUNTER; + else if (armv8pmu_event_is_64bit(event) && + armv8pmu_event_want_user_access(event) && + !armv8pmu_has_long_event(cpu_pmu)) + return -EAGAIN; } /* * Otherwise use events counters */ - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) { - if (!test_and_set_bit(idx, cpuc->used_mask)) - return idx; - } + if (armv8pmu_event_is_chained(event)) + return armv8pmu_get_chain_idx(cpuc, cpu_pmu); + else + return armv8pmu_get_single_idx(cpuc, cpu_pmu); +} - /* The counters are all in use. */ - return -EAGAIN; +static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + int idx = event->hw.idx; + + clear_bit(idx, cpuc->used_mask); + if (armv8pmu_event_is_chained(event)) + clear_bit(idx - 1, cpuc->used_mask); +} + +static int armv8pmu_user_event_idx(struct perf_event *event) +{ + if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event)) + return 0; + + /* + * We remap the cycle counter index to 32 to + * match the offset applied to the rest of + * the counter indices. + */ + if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER) + return ARMV8_IDX_CYCLE_COUNTER_USER; + + return event->hw.idx; } /* - * Add an event filter to a given event. This will only work for PMUv2 PMUs. + * Add an event filter to a given event. 
*/ static int armv8pmu_set_event_filter(struct hw_perf_event *event, struct perf_event_attr *attr) @@ -804,14 +990,23 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, * with other architectures (x86 and Power). */ if (is_kernel_in_hyp_mode()) { - if (!attr->exclude_kernel) + if (!attr->exclude_kernel && !attr->exclude_host) config_base |= ARMV8_PMU_INCLUDE_EL2; - } else { - if (attr->exclude_kernel) + if (attr->exclude_guest) config_base |= ARMV8_PMU_EXCLUDE_EL1; - if (!attr->exclude_hv) + if (attr->exclude_host) + config_base |= ARMV8_PMU_EXCLUDE_EL0; + } else { + if (!attr->exclude_hv && !attr->exclude_host) config_base |= ARMV8_PMU_INCLUDE_EL2; } + + /* + * Filter out !VHE kernels and guest kernels + */ + if (attr->exclude_kernel) + config_base |= ARMV8_PMU_EXCLUDE_EL1; + if (attr->exclude_user) config_base |= ARMV8_PMU_EXCLUDE_EL0; @@ -824,23 +1019,35 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, return 0; } +static int armv8pmu_filter_match(struct perf_event *event) +{ + unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT; + return evtype != ARMV8_PMUV3_PERFCTR_CHAIN; +} + static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u32 idx, nb_cnt = cpu_pmu->num_events; + u32 pmcr; /* The counter and interrupt enable registers are unknown at reset. */ - for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { - armv8pmu_disable_counter(idx); - armv8pmu_disable_intens(idx); - } + armv8pmu_disable_counter(U32_MAX); + armv8pmu_disable_intens(U32_MAX); + + /* Clear the counters we flip at guest entry/exit */ + kvm_clr_pmu_events(U32_MAX); /* * Initialize & Reset PMNC. Request overflow interrupt for * 64 bit cycle counter but cheat in armv8pmu_write_counter(). */ - armv8pmu_pmcr_write(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | - ARMV8_PMU_PMCR_LC); + pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC; + + /* Enable long event counter support where available */ + if (armv8pmu_has_long_event(cpu_pmu)) + pmcr |= ARMV8_PMU_PMCR_LP; + + armv8pmu_pmcr_write(pmcr); } static int __armv8_pmuv3_map_event(struct perf_event *event, @@ -858,7 +1065,29 @@ static int __armv8_pmuv3_map_event(struct perf_event *event, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); - /* Onl expose micro/arch events supported by this PMU */ + if (armv8pmu_event_is_64bit(event)) + event->hw.flags |= ARMPMU_EVT_64BIT; + + /* + * User events must be allocated into a single counter, and so + * must not be chained. + * + * Most 64-bit events require long counter support, but 64-bit + * CPU_CYCLES events can be placed into the dedicated cycle + * counter when this is free. 
+ */ + if (armv8pmu_event_want_user_access(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + return -EINVAL; + if (armv8pmu_event_is_64bit(event) && + (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && + !armv8pmu_has_long_event(armpmu)) + return -EOPNOTSUPP; + + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; + } + + /* Only expose micro/arch events supported by this PMU */ if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { return hw_event_id; @@ -910,15 +1139,17 @@ static void __armv8pmu_probe_pmu(void *info) struct armv8pmu_probe_info *probe = info; struct arm_pmu *cpu_pmu = probe->pmu; u64 dfr0; + u64 pmceid_raw[2]; u32 pmceid[2]; int pmuver; dfr0 = read_sysreg(id_aa64dfr0_el1); - pmuver = cpuid_feature_extract_signed_field(dfr0, - ID_AA64DFR0_PMUVER_SHIFT); - if (pmuver < 1) + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF || pmuver == 0) return; + cpu_pmu->pmuver = pmuver; probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ @@ -928,12 +1159,23 @@ static void __armv8pmu_probe_pmu(void *info) /* Add the CPU cycles counter */ cpu_pmu->num_events += 1; - pmceid[0] = read_sysreg(pmceid0_el0); - pmceid[1] = read_sysreg(pmceid1_el0); + pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0); + pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0); - bitmap_from_u32array(cpu_pmu->pmceid_bitmap, - ARMV8_PMUV3_MAX_COMMON_EVENTS, pmceid, - ARRAY_SIZE(pmceid)); + bitmap_from_arr32(cpu_pmu->pmceid_bitmap, + pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + pmceid[0] = pmceid_raw[0] >> 32; + pmceid[1] = pmceid_raw[1] >> 32; + + bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap, + pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + /* store PMMIR_EL1 register for sysfs */ + if (pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4 && (pmceid_raw[1] & BIT(31))) + cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1); + else + cpu_pmu->reg_pmmir = 0; } static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) @@ -953,164 +1195,182 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) return probe.present ? 
0 : -ENODEV; } -static int armv8_pmu_init(struct arm_pmu *cpu_pmu) +static void armv8pmu_disable_user_access_ipi(void *unused) { - int ret = armv8pmu_probe_pmu(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->handle_irq = armv8pmu_handle_irq, - cpu_pmu->enable = armv8pmu_enable_event, - cpu_pmu->disable = armv8pmu_disable_event, - cpu_pmu->read_counter = armv8pmu_read_counter, - cpu_pmu->write_counter = armv8pmu_write_counter, - cpu_pmu->get_event_idx = armv8pmu_get_event_idx, - cpu_pmu->start = armv8pmu_start, - cpu_pmu->stop = armv8pmu_stop, - cpu_pmu->reset = armv8pmu_reset, - cpu_pmu->max_period = (1LLU << 32) - 1, - cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - - return 0; + armv8pmu_disable_user_access(); } -static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) +static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write || sysctl_perf_user_access) return ret; - cpu_pmu->name = "armv8_pmuv3"; - cpu_pmu->map_event = armv8_pmuv3_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - + on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1); return 0; } -static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; +static struct ctl_table armv8_pmu_sysctl_table[] = { + { + .procname = "perf_user_access", + .data = &sysctl_perf_user_access, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = armv8pmu_proc_user_access_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; - cpu_pmu->name = "armv8_cortex_a35"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; +static void armv8_pmu_register_sysctl_table(void) +{ + static u32 tbl_registered = 0; - return 0; + if (!cmpxchg_relaxed(&tbl_registered, 0, 1)) + register_sysctl("kernel", armv8_pmu_sysctl_table); } -static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) +static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, + int (*map_event)(struct perf_event *event), + const struct attribute_group *events, + const struct attribute_group *format, + const struct attribute_group *caps) { - int ret = armv8_pmu_init(cpu_pmu); + int ret = armv8pmu_probe_pmu(cpu_pmu); if (ret) return ret; - cpu_pmu->name = "armv8_cortex_a53"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; + cpu_pmu->handle_irq = armv8pmu_handle_irq; + cpu_pmu->enable = armv8pmu_enable_event; + cpu_pmu->disable = armv8pmu_disable_event; + cpu_pmu->read_counter = armv8pmu_read_counter; + cpu_pmu->write_counter = armv8pmu_write_counter; + cpu_pmu->get_event_idx = armv8pmu_get_event_idx; + cpu_pmu->clear_event_idx = armv8pmu_clear_event_idx; + cpu_pmu->start = armv8pmu_start; + cpu_pmu->stop = armv8pmu_stop; + cpu_pmu->reset = armv8pmu_reset; + cpu_pmu->set_event_filter = armv8pmu_set_event_filter; + cpu_pmu->filter_match = armv8pmu_filter_match; + + cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx; + cpu_pmu->name = name; + 
cpu_pmu->map_event = map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ? + events : &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ? + format : &armv8_pmuv3_format_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ? + caps : &armv8_pmuv3_caps_attr_group; + + armv8_pmu_register_sysctl_table(); return 0; } -static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) +static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name, + int (*map_event)(struct perf_event *event)) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; + return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL); +} - cpu_pmu->name = "armv8_cortex_a57"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; +#define PMUV3_INIT_SIMPLE(name) \ +static int name##_pmu_init(struct arm_pmu *cpu_pmu) \ +{ \ + return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\ +} - return 0; +PMUV3_INIT_SIMPLE(armv8_pmuv3) + +PMUV3_INIT_SIMPLE(armv8_cortex_a34) +PMUV3_INIT_SIMPLE(armv8_cortex_a55) +PMUV3_INIT_SIMPLE(armv8_cortex_a65) +PMUV3_INIT_SIMPLE(armv8_cortex_a75) +PMUV3_INIT_SIMPLE(armv8_cortex_a76) +PMUV3_INIT_SIMPLE(armv8_cortex_a77) +PMUV3_INIT_SIMPLE(armv8_cortex_a78) +PMUV3_INIT_SIMPLE(armv9_cortex_a510) +PMUV3_INIT_SIMPLE(armv9_cortex_a710) +PMUV3_INIT_SIMPLE(armv8_cortex_x1) +PMUV3_INIT_SIMPLE(armv9_cortex_x2) +PMUV3_INIT_SIMPLE(armv8_neoverse_e1) +PMUV3_INIT_SIMPLE(armv8_neoverse_n1) +PMUV3_INIT_SIMPLE(armv9_neoverse_n2) +PMUV3_INIT_SIMPLE(armv8_neoverse_v1) + +PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) +PMUV3_INIT_SIMPLE(armv8_nvidia_denver) + +static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) +{ + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35", + armv8_a53_map_event); } -static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) +static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53", + armv8_a53_map_event); +} - cpu_pmu->name = "armv8_cortex_a72"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; +static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) +{ + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57", + armv8_a57_map_event); +} - return 0; +static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) +{ + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72", + armv8_a57_map_event); } static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a73"; - cpu_pmu->map_event = armv8_a73_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73", + armv8_a73_map_event); } static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cavium_thunder"; - cpu_pmu->map_event = armv8_thunder_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - 
&armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder", + armv8_thunder_map_event); } static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) { - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_brcm_vulcan"; - cpu_pmu->map_event = armv8_vulcan_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan", + armv8_vulcan_map_event); } static const struct of_device_id armv8_pmu_of_device_ids[] = { - {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, + {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_pmu_init}, + {.compatible = "arm,cortex-a34-pmu", .data = armv8_cortex_a34_pmu_init}, {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, + {.compatible = "arm,cortex-a55-pmu", .data = armv8_cortex_a55_pmu_init}, {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, + {.compatible = "arm,cortex-a65-pmu", .data = armv8_cortex_a65_pmu_init}, {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, + {.compatible = "arm,cortex-a75-pmu", .data = armv8_cortex_a75_pmu_init}, + {.compatible = "arm,cortex-a76-pmu", .data = armv8_cortex_a76_pmu_init}, + {.compatible = "arm,cortex-a77-pmu", .data = armv8_cortex_a77_pmu_init}, + {.compatible = "arm,cortex-a78-pmu", .data = armv8_cortex_a78_pmu_init}, + {.compatible = "arm,cortex-a510-pmu", .data = armv9_cortex_a510_pmu_init}, + {.compatible = "arm,cortex-a710-pmu", .data = armv9_cortex_a710_pmu_init}, + {.compatible = "arm,cortex-x1-pmu", .data = armv8_cortex_x1_pmu_init}, + {.compatible = "arm,cortex-x2-pmu", .data = armv9_cortex_x2_pmu_init}, + {.compatible = "arm,neoverse-e1-pmu", .data = armv8_neoverse_e1_pmu_init}, + {.compatible = "arm,neoverse-n1-pmu", .data = armv8_neoverse_n1_pmu_init}, + {.compatible = "arm,neoverse-n2-pmu", .data = armv9_neoverse_n2_pmu_init}, + {.compatible = "arm,neoverse-v1-pmu", .data = armv8_neoverse_v1_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, + {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, + {.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init}, {}, }; @@ -1123,6 +1383,7 @@ static struct platform_driver armv8_pmu_driver = { .driver = { .name = ARMV8_PMU_PDEV_NAME, .of_match_table = armv8_pmu_of_device_ids, + .suppress_bind_attrs = true, }, .probe = armv8_pmu_device_probe, }; @@ -1132,6 +1393,69 @@ static int __init armv8_pmu_driver_init(void) if (acpi_disabled) return platform_driver_register(&armv8_pmu_driver); else - return arm_pmu_acpi_probe(armv8_pmuv3_init); + return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init); } device_initcall(armv8_pmu_driver_init) + +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) +{ + struct clock_read_data *rd; + unsigned int seq; + u64 ns; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_time_short = 0; + userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event); + + if (userpg->cap_user_rdpmc) { + if 
(event->hw.flags & ARMPMU_EVT_64BIT) + userpg->pmc_width = 64; + else + userpg->pmc_width = 32; + } + + do { + rd = sched_clock_read_begin(&seq); + + if (rd->read_sched_clock != arch_timer_read_counter) + return; + + userpg->time_mult = rd->mult; + userpg->time_shift = rd->shift; + userpg->time_zero = rd->epoch_ns; + userpg->time_cycles = rd->epoch_cyc; + userpg->time_mask = rd->sched_clock_mask; + + /* + * Subtract the cycle base, such that software that + * doesn't know about cap_user_time_short still 'works' + * assuming no wraps. + */ + ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); + userpg->time_zero -= ns; + + } while (sched_clock_read_retry(seq)); + + userpg->time_offset = userpg->time_zero - now; + + /* + * time_shift is not expected to be greater than 31 due to + * the original published conversion algorithm shifting a + * 32-bit value (now specifies a 64-bit value) - refer + * perf_event_mmap_page documentation in perf_event.h. + */ + if (userpg->time_shift == 32) { + userpg->time_shift = 31; + userpg->time_mult >>= 1; + } + + /* + * Internal timekeeping for enabled/running/stopped times + * is always computed with the sched_clock. + */ + userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; + userpg->cap_user_time_short = 1; +} diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index bd1b74c2436f..b4eece3eb17d 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -1,28 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/perf_event.h> #include <linux/bug.h> #include <linux/sched/task_stack.h> -#include <asm/compat.h> #include <asm/perf_regs.h> #include <asm/ptrace.h> +static u64 perf_ext_regs_value(int idx) +{ + switch (idx) { + case PERF_REG_ARM64_VG: + if (WARN_ON_ONCE(!system_supports_sve())) + return 0; + + /* + * Vector granule is current length in bits of SVE registers + * divided by 64. + */ + return (task_get_sve_vl(current) * 8) / 64; + default: + WARN_ON_ONCE(true); + return 0; + } +} + u64 perf_reg_value(struct pt_regs *regs, int idx) { - if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX)) + if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX)) return 0; /* - * Compat (i.e. 32 bit) mode: - * - PC has been set in the pt_regs struct in kernel_entry, - * - Handle SP and LR here. + * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but + * we're stuck with it for ABI compatibility reasons. + * + * For a 32-bit consumer inspecting a 32-bit task, then it will look at + * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h). + * These correspond directly to a prefix of the registers saved in our + * 'struct pt_regs', with the exception of the PC, so we copy that down + * (x15 corresponds to SP_hyp in the architecture). + * + * So far, so good. + * + * The oddity arises when a 64-bit consumer looks at a 32-bit task and + * asks for registers beyond PERF_REG_ARM_MAX. In this case, we return + * SP_usr, LR_usr and PC in the positions where the AArch64 SP, LR and + * PC registers would normally live. The initial idea was to allow a + * 64-bit unwinder to unwind a 32-bit task and, although it's not clear + * how well that works in practice, somebody might be relying on it. + * + * At the time we make a sample, we don't know whether the consumer is + * 32-bit or 64-bit, so we have to cater for both possibilities. 
*/ if (compat_user_mode(regs)) { if ((u32)idx == PERF_REG_ARM64_SP) return regs->compat_sp; if ((u32)idx == PERF_REG_ARM64_LR) return regs->compat_lr; + if (idx == 15) + return regs->pc; } if ((u32)idx == PERF_REG_ARM64_SP) @@ -31,6 +69,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) if ((u32)idx == PERF_REG_ARM64_PC) return regs->pc; + if ((u32)idx >= PERF_REG_ARM64_MAX) + return perf_ext_regs_value(idx); + return regs->regs[idx]; } @@ -38,7 +79,12 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + u64 reserved_mask = REG_RESERVED; + + if (system_supports_sve()) + reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG); + + if (!mask || mask & reserved_mask) return -EINVAL; return 0; @@ -53,8 +99,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile new file mode 100644 index 000000000000..839291430cb3 --- /dev/null +++ b/arch/arm64/kernel/pi/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2022 Google LLC + +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + $(call cc-option,-mbranch-protection=none) \ + -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ + -include $(srctree)/include/linux/hidden.h \ + -D__DISABLE_EXPORTS -ffreestanding -D__NO_FORTIFY \ + $(call cc-option,-fno-addrsig) + +# remove SCS flags from all objects in this directory +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) +# disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + +GCOV_PROFILE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ + --remove-section=.note.gnu.property \ + --prefix-alloc-sections=.init +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) + +obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c new file mode 100644 index 000000000000..17bff6e399e4 --- /dev/null +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2022 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +// NOTE: code in this file runs *very* early, and is not permitted to use +// global variables or anything that relies on absolute addressing. 
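Stepping aside from the early-boot code for a moment: the perf_regs.c hunk above adds a PERF_REG_ARM64_VG pseudo-register so samples can carry the current SVE vector granule. A rough userspace sketch of requesting it, assuming the exported uapi headers (<linux/perf_event.h> and <asm/perf_regs.h>) are available; illustrative only, not part of the patch:

/* Open a sampling event that records the SVE vector granule with each sample. */
#include <linux/perf_event.h>
#include <asm/perf_regs.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_cycles_with_vg(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_REGS_USER;
	attr.sample_regs_user = 1ULL << PERF_REG_ARM64_VG;  /* only the VG pseudo-register */
	attr.exclude_kernel = 1;

	return syscall(SYS_perf_event_open, &attr, pid, -1 /* any CPU */,
		       -1 /* no group */, 0);
}

perf_reg_validate() above rejects the VG bit on hardware without SVE, so a tool doing this would want to fall back to a mask without it when the open fails with EINVAL.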
+ +#include <linux/libfdt.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/archrandom.h> +#include <asm/memory.h> + +/* taken from lib/string.c */ +static char *__strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +static bool cmdline_contains_nokaslr(const u8 *cmdline) +{ + const u8 *str; + + str = __strstr(cmdline, "nokaslr"); + return str == cmdline || (str > cmdline && *(str - 1) == ' '); +} + +static bool is_kaslr_disabled_cmdline(void *fdt) +{ + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + + if (cmdline_contains_nokaslr(prop)) + return true; + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + goto out; + + return false; + } +out: + return cmdline_contains_nokaslr(CONFIG_CMDLINE); +} + +static u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +asmlinkage u64 kaslr_early_init(void *fdt) +{ + u64 seed; + + if (is_kaslr_disabled_cmdline(fdt)) + return 0; + + seed = get_kaslr_seed(fdt); + if (!seed) { + if (!__early_cpu_has_rndr() || + !__arm64_rndr((unsigned long *)&seed)) + return 0; + } + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of + * the lower and upper quarters to avoid colliding with other + * allocations. 
+ */ + return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); +} diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c new file mode 100644 index 000000000000..2708b620b4ae --- /dev/null +++ b/arch/arm64/kernel/pointer_auth.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/compat.h> +#include <linux/errno.h> +#include <linux/prctl.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <asm/cpufeature.h> +#include <asm/pointer_auth.h> + +int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) +{ + struct ptrauth_keys_user *keys = &tsk->thread.keys_user; + unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY | + PR_PAC_APDAKEY | PR_PAC_APDBKEY; + unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY; + + if (!system_supports_address_auth() && !system_supports_generic_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if (!arg) { + ptrauth_keys_init_user(keys); + return 0; + } + + if (arg & ~key_mask) + return -EINVAL; + + if (((arg & addr_key_mask) && !system_supports_address_auth()) || + ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth())) + return -EINVAL; + + if (arg & PR_PAC_APIAKEY) + get_random_bytes(&keys->apia, sizeof(keys->apia)); + if (arg & PR_PAC_APIBKEY) + get_random_bytes(&keys->apib, sizeof(keys->apib)); + if (arg & PR_PAC_APDAKEY) + get_random_bytes(&keys->apda, sizeof(keys->apda)); + if (arg & PR_PAC_APDBKEY) + get_random_bytes(&keys->apdb, sizeof(keys->apdb)); + if (arg & PR_PAC_APGAKEY) + get_random_bytes(&keys->apga, sizeof(keys->apga)); + ptrauth_keys_install_user(keys); + + return 0; +} + +static u64 arg_to_enxx_mask(unsigned long arg) +{ + u64 sctlr_enxx_mask = 0; + + WARN_ON(arg & ~PR_PAC_ENABLED_KEYS_MASK); + if (arg & PR_PAC_APIAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIA; + if (arg & PR_PAC_APIBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIB; + if (arg & PR_PAC_APDAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDA; + if (arg & PR_PAC_APDBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDB; + return sctlr_enxx_mask; +} + +int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys, + unsigned long enabled) +{ + u64 sctlr; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if ((keys & ~PR_PAC_ENABLED_KEYS_MASK) || (enabled & ~keys)) + return -EINVAL; + + preempt_disable(); + sctlr = tsk->thread.sctlr_user; + sctlr &= ~arg_to_enxx_mask(keys); + sctlr |= arg_to_enxx_mask(enabled); + tsk->thread.sctlr_user = sctlr; + if (tsk == current) + update_sctlr_el1(sctlr); + preempt_enable(); + + return 0; +} + +int ptrauth_get_enabled_keys(struct task_struct *tsk) +{ + int retval = 0; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIA) + retval |= PR_PAC_APIAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIB) + retval |= PR_PAC_APIBKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDA) + retval |= PR_PAC_APDAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDB) + retval |= PR_PAC_APDBKEY; + + return retval; +} diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile index 89b6df613dde..8e4be92e25b1 100644 --- a/arch/arm64/kernel/probes/Makefile +++ b/arch/arm64/kernel/probes/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o \ kprobes_trampoline.o \ 
simulate-insn.o diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6bf6657a5a52..104101f633b1 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/decode-insn.c * * Copyright (C) 2013 Linaro Limited. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/kernel.h> @@ -37,7 +29,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) aarch64_insn_is_msr_imm(insn) || aarch64_insn_is_msr_reg(insn) || aarch64_insn_is_exception(insn) || - aarch64_insn_is_eret(insn)) + aarch64_insn_is_eret(insn) || + aarch64_insn_is_eret_auth(insn)) return false; /* @@ -50,11 +43,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) != AARCH64_INSN_SPCLREG_DAIF; /* - * The HINT instruction is is problematic when single-stepping, - * except for the NOP case. + * The HINT instruction is steppable only if it is in whitelist + * and the rest of other such instructions are blocked for + * single stepping as they may cause exception or other + * unintended behaviour. */ if (aarch64_insn_is_hint(insn)) - return aarch64_insn_is_nop(insn); + return aarch64_insn_is_steppable_hint(insn); return true; } diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 192ab007bacb..8b758c5a2062 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/probes/decode-insn.h * * Copyright (C) 2013 Linaro Limited. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef _ARM_KERNEL_KPROBES_ARM64_H diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d849d9804011..c9e4d0720285 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/kprobes.c * @@ -5,34 +6,32 @@ * * Copyright (C) 2013 Linaro Limited. * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * */ + +#define pr_fmt(fmt) "kprobes: " fmt + +#include <linux/extable.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kprobes.h> -#include <linux/extable.h> +#include <linux/sched/debug.h> +#include <linux/set_memory.h> #include <linux/slab.h> #include <linux/stop_machine.h> -#include <linux/sched/debug.h> #include <linux/stringify.h> -#include <asm/traps.h> -#include <asm/ptrace.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> + #include <asm/cacheflush.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> -#include <asm/system_misc.h> #include <asm/insn.h> -#include <linux/uaccess.h> #include <asm/irq.h> +#include <asm/patching.h> +#include <asm/ptrace.h> #include <asm/sections.h> +#include <asm/system_misc.h> +#include <asm/traps.h> #include "decode-insn.h" @@ -40,16 +39,33 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); +post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - /* prepare insn slot */ - p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); + kprobe_opcode_t *addr = p->ainsn.api.insn; - flush_icache_range((uintptr_t) (p->ainsn.api.insn), - (uintptr_t) (p->ainsn.api.insn) + - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + /* + * Prepare insn slot, Mark Rutland points out it depends on a coupe of + * subtleties: + * + * - That the I-cache maintenance for these instructions is complete + * *before* the kprobe BRK is written (and aarch64_insn_patch_text_nosync() + * ensures this, but just omits causing a Context-Synchronization-Event + * on all CPUS). + * + * - That the kprobe BRK results in an exception (and consequently a + * Context-Synchronoization-Event), which ensures that the CPU will + * fetch thesingle-step slot instructions *after* this, ensuring that + * the new instructions are used + * + * It supposes to place ISB after patching to guarantee I-cache maintenance + * is observed on all CPUS, however, single-step slot is installed in + * the BRK exception handler, so it is unnecessary to generate + * Contex-Synchronization-Event via ISB again. + */ + aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. 
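For context on what exercises the slot layout prepared in arch_prepare_ss_slot() above (the original instruction followed by a BRK64_OPCODE_KPROBES_SS break), here is a minimal kprobe module sketch using the standard kprobes API, much like samples/kprobes/kprobe_example.c; the probed symbol name is only illustrative and is not part of this patch:

#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %ps, pc=%llx\n", p->addr, regs->pc);
	return 0;	/* 0: let the probed instruction be single-stepped from the slot */
}

static struct kprobe demo_kp = {
	.symbol_name = "kernel_clone",	/* illustrative; any kprobe-able symbol works */
	.pre_handler = demo_pre,
};

static int __init demo_init(void)
{
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With this series applied, hitting demo_pre() no longer relies on hardware single-step: the first BRK raises the exception, the handler runs, the CPU is redirected to the two-instruction slot, and the second BRK hands control back via kprobe_breakpoint_ss_handler() further down in this file.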
@@ -72,14 +88,12 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ - post_kprobe_handler(kcb, regs); + post_kprobe_handler(p, kcb, regs); } int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; - extern char __start_rodata[]; - extern char __end_rodata[]; if (probe_addr & 0x3) return -EINVAL; @@ -87,10 +101,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) /* copy instruction */ p->opcode = le32_to_cpu(*p->addr); - if (in_exception_text(probe_addr)) - return -EINVAL; - if (probe_addr >= (unsigned long) __start_rodata && - probe_addr <= (unsigned long) __end_rodata) + if (search_exception_tables(probe_addr)) return -EINVAL; /* decode instruction */ @@ -107,7 +118,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) if (!p->ainsn.api.insn) return -ENOMEM; break; - }; + } /* prepare the instruction */ if (p->ainsn.api.insn) @@ -118,27 +129,28 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) +void *alloc_insn_page(void) { - void *addrs[1]; - u32 insns[1]; - - addrs[0] = (void *)addr; - insns[0] = (u32)opcode; - - return aarch64_insn_patch_text(addrs, insns, 1); + return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { - patch_text(p->addr, BRK64_OPCODE_KPROBES); + void *addr = p->addr; + u32 insn = BRK64_OPCODE_KPROBES; + + aarch64_insn_patch_text(&addr, &insn, 1); } /* disarm kprobe: remove breakpoint from text */ void __kprobes arch_disarm_kprobe(struct kprobe *p) { - patch_text(p->addr, p->opcode); + void *addr = p->addr; + + aarch64_insn_patch_text(&addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) @@ -167,67 +179,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p) } /* - * When PSTATE.D is set (masked), then software step exceptions can not be - * generated. - * SPSR's D bit shows the value of PSTATE.D immediately before the - * exception was taken. PSTATE.D is set while entering into any exception - * mode, however software clears it for any normal (none-debug-exception) - * mode in the exception entry. Therefore, when we are entering into kprobe - * breakpoint handler from any normal mode then SPSR.D bit is already - * cleared, however it is set when we are entering from any debug exception - * mode. - * Since we always need to generate single step exception after a kprobe - * breakpoint exception therefore we need to clear it unconditionally, when - * we become sure that the current breakpoint exception is for kprobe. - */ -static void __kprobes -spsr_set_debug_flag(struct pt_regs *regs, int mask) -{ - unsigned long spsr = regs->pstate; - - if (mask) - spsr |= PSR_D_BIT; - else - spsr &= ~PSR_D_BIT; - - regs->pstate = spsr; -} - -/* - * Interrupts need to be disabled before single-step mode is set, and not - * reenabled until after single-step mode ends. - * Without disabling interrupt on local CPU, there is a chance of - * interrupt occurrence in the period of exception return and start of - * out-of-line single-step, that result in wrongly single stepping - * into the interrupt handler. 
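The replacement comment and helpers that follow switch from masking only PSTATE.I to saving and masking all of DAIF around the out-of-line step. A standalone model of that save/restore, not kernel code, with the DAIF mask (PSTATE bits 9:6, i.e. D, A, I, F) written out locally:

#include <assert.h>
#include <stdint.h>

#define DAIF_MASK 0x3c0ULL			/* PSTATE.{D,A,I,F}, bits 9:6 */

static uint64_t saved_daif;

static void save_and_mask(uint64_t *pstate)	/* cf. kprobes_save_local_irqflag() */
{
	saved_daif = *pstate & DAIF_MASK;	/* remember only the DAIF bits */
	*pstate |= DAIF_MASK;			/* mask debug, SError, IRQ and FIQ */
}

static void restore_daif(uint64_t *pstate)	/* cf. kprobes_restore_local_irqflag() */
{
	*pstate &= ~DAIF_MASK;
	*pstate |= saved_daif;
}

int main(void)
{
	uint64_t pstate = 0x80;			/* say IRQs were already masked */

	save_and_mask(&pstate);
	assert((pstate & DAIF_MASK) == DAIF_MASK);
	restore_daif(&pstate);
	assert(pstate == 0x80);			/* caller's mask state is preserved */
	return 0;
}

Only the DAIF bits are stashed, so whatever interrupt-mask state the probed context already had is reinstated exactly when the step completes.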
+ * Mask all of DAIF while executing the instruction out-of-line, to keep things + * simple and avoid nesting exceptions. Interrupts do have to be disabled since + * the kprobe state is per-CPU and doesn't get migrated. */ static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - kcb->saved_irqflag = regs->pstate; - regs->pstate |= PSR_I_BIT; + kcb->saved_irqflag = regs->pstate & DAIF_MASK; + regs->pstate |= DAIF_MASK; } static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - if (kcb->saved_irqflag & PSR_I_BIT) - regs->pstate |= PSR_I_BIT; - else - regs->pstate &= ~PSR_I_BIT; -} - -static void __kprobes -set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - kcb->ss_ctx.ss_pending = true; - kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t); -} - -static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) -{ - kcb->ss_ctx.ss_pending = false; - kcb->ss_ctx.match_addr = 0; + regs->pstate &= ~DAIF_MASK; + regs->pstate |= kcb->saved_irqflag; } static void __kprobes setup_singlestep(struct kprobe *p, @@ -249,13 +216,7 @@ static void __kprobes setup_singlestep(struct kprobe *p, /* prepare for single stepping */ slot = (unsigned long)p->ainsn.api.insn; - set_ss_context(kcb, slot); /* mark pending ss */ - - spsr_set_debug_flag(regs, 0); - - /* IRQs and single stepping do not mix well. */ kprobes_save_local_irqflag(kcb, regs); - kernel_enable_single_step(regs); instruction_pointer_set(regs, slot); } else { /* insn simulation */ @@ -275,7 +236,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected at %p.\n", p->addr); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; @@ -288,13 +249,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, } static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - struct kprobe *cur = kprobe_running(); - - if (!cur) - return; - /* return addr restore if non-branching insn */ if (cur->ainsn.api.restore != 0) instruction_pointer_set(regs, cur->ainsn.api.restore); @@ -306,12 +262,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) } /* call post handler */ kcb->kprobe_status = KPROBE_HIT_SSDONE; - if (cur->post_handler) { - /* post_handler can hit breakpoint and single step - * again, so we enable D-flag for recursive exception. - */ + if (cur->post_handler) cur->post_handler(cur, regs, 0); - } reset_current_kprobe(); } @@ -332,37 +284,19 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) * normal page fault. */ instruction_pointer_set(regs, (unsigned long) cur->addr); - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); - kernel_disable_single_step(); - - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. 
- */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, fsr)) - return 1; - - /* * In case the user-specified fault handler returned * zero, try to fix up. */ @@ -395,26 +329,14 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry, - * so get out doing nothing more here. - * - * pre_handler can hit a breakpoint and can step thru - * before return, keep PSTATE D-flag enabled until - * pre_handler return back. + * pre-handler and it returned non-zero, it will + * modify the execution path and no need to single + * stepping. Let's just reset current kprobe and exit. */ if (!p->pre_handler || !p->pre_handler(p, regs)) { setup_singlestep(p, regs, kcb, 0); - return; - } - } - } else if ((le32_to_cpu(*(kprobe_opcode_t *) addr) == - BRK64_OPCODE_KPROBES) && cur_kprobe) { - /* We probably hit a jprobe. Call its break handler. */ - if (cur_kprobe->break_handler && - cur_kprobe->break_handler(cur_kprobe, regs)) { - setup_singlestep(cur_kprobe, regs, kcb, 0); - return; + } else + reset_current_kprobe(); } } /* @@ -428,219 +350,83 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) } static int __kprobes -kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - if ((kcb->ss_ctx.ss_pending) - && (kcb->ss_ctx.match_addr == addr)) { - clear_ss_context(kcb); /* clear pending ss */ - return DBG_HOOK_HANDLED; - } - /* not ours, kprobes should ignore it */ - return DBG_HOOK_ERROR; -} - -int __kprobes -kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) +kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - int retval; - - /* return error if this is not our step */ - retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); + unsigned long addr = instruction_pointer(regs); + struct kprobe *cur = kprobe_running(); - if (retval == DBG_HOOK_HANDLED) { + if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && + ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); - kernel_disable_single_step(); + post_kprobe_handler(cur, kcb, regs); - post_kprobe_handler(kcb, regs); + return DBG_HOOK_HANDLED; } - return retval; + /* not ours, kprobes should ignore it */ + return DBG_HOOK_ERROR; } -int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) +static struct break_hook kprobes_break_ss_hook = { + .imm = KPROBES_BRK_SS_IMM, + .fn = kprobe_breakpoint_ss_handler, +}; + +static int __kprobes +kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) { kprobe_handler(regs); return DBG_HOOK_HANDLED; } -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - /* - * Since we can't be sure where in the stack frame "stacked" - * pass-by-value arguments are stored we just don't try to - * duplicate any of the stack. 
Do not use jprobes on functions that - * use more than 64 bytes (after padding each to an 8 byte boundary) - * of arguments, or pass individual arguments larger than 16 bytes. - */ - - instruction_pointer_set(regs, (unsigned long) jp->entry); - preempt_disable(); - pause_graph_tracing(); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - /* - * Jprobe handler return by entering break exception, - * encoded same as kprobe, but with following conditions - * -a special PC to identify it from the other kprobes. - * -restore stack addr to original saved pt_regs - */ - asm volatile(" mov sp, %0 \n" - "jprobe_return_break: brk %1 \n" - : - : "r" (kcb->jprobe_saved_regs.sp), - "I" (BRK64_ESR_KPROBES) - : "memory"); - - unreachable(); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - long stack_addr = kcb->jprobe_saved_regs.sp; - long orig_sp = kernel_stack_pointer(regs); - struct jprobe *jp = container_of(p, struct jprobe, kp); - extern const char jprobe_return_break[]; - - if (instruction_pointer(regs) != (u64) jprobe_return_break) - return 0; - - if (orig_sp != stack_addr) { - struct pt_regs *saved_regs = - (struct pt_regs *)kcb->jprobe_saved_regs.sp; - pr_err("current sp %lx does not match saved sp %lx\n", - orig_sp, stack_addr); - pr_err("Saved registers for jprobe %p\n", jp); - __show_regs(saved_regs); - pr_err("Current registers\n"); - __show_regs(regs); - BUG(); - } - unpause_graph_tracing(); - *regs = kcb->jprobe_saved_regs; - preempt_enable_no_resched(); - return 1; -} - -bool arch_within_kprobe_blacklist(unsigned long addr) -{ - if ((addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) || - (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - (addr >= (unsigned long)__idmap_text_start && - addr < (unsigned long)__idmap_text_end) || - !!search_exception_tables(addr)) - return true; - - if (!is_kernel_in_hyp_mode()) { - if ((addr >= (unsigned long)__hyp_text_start && - addr < (unsigned long)__hyp_text_end) || - (addr >= (unsigned long)__hyp_idmap_text_start && - addr < (unsigned long)__hyp_idmap_text_end)) - return true; - } +static struct break_hook kprobes_break_hook = { + .imm = KPROBES_BRK_IMM, + .fn = kprobe_breakpoint_handler, +}; - return false; +/* + * Provide a blacklist of symbols identifying ranges which cannot be kprobed. + * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). 
+ */ +int __init arch_populate_kprobe_blacklist(void) +{ + int ret; + + ret = kprobe_add_area_blacklist((unsigned long)__entry_text_start, + (unsigned long)__entry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); + if (ret || is_kernel_in_hyp_mode()) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + return ret; } void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = - (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. 
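The open-coded kretprobe hash walk being deleted here is replaced (just below) by a single call to the generic kretprobe_trampoline_handler(). From a probe author's point of view nothing changes; a minimal consumer-side sketch, with an illustrative target symbol and module boilerplate that are not part of this patch:

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* x0 at this point holds the probed function's return value. */
	pr_info("returned %lu\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe demo_krp = {
	.kp.symbol_name	= "kernel_clone",	/* illustrative target */
	.handler	= demo_ret,
	.maxactive	= 16,
};

static int __init demo_init(void)
{
	return register_kretprobe(&demo_krp);
}

static void __exit demo_exit(void)
{
	unregister_kretprobe(&demo_krp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

arch_prepare_kretprobe() below is what makes this work: it stashes the real x30 and the frame pointer in the instance and redirects the return through __kretprobe_trampoline.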
Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; + return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->regs[30]; + ri->fp = (void *)regs->regs[29]; /* replace return addr (x30) with trampoline */ - regs->regs[30] = (long)&kretprobe_trampoline; + regs->regs[30] = (long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) @@ -650,5 +436,8 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { + register_kernel_break_hook(&kprobes_break_hook); + register_kernel_break_hook(&kprobes_break_ss_hook); + return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 5d6e7f14638c..9a6499bed58b 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * trampoline entry and return code for kretprobes. */ @@ -24,7 +25,7 @@ stp x24, x25, [sp, #S_X24] stp x26, x27, [sp, #S_X26] stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE + add x0, sp, #PT_REGS_SIZE stp lr, x0, [sp, #S_LR] /* * Construct a useful saved PSTATE @@ -60,11 +61,14 @@ ldp x28, x29, [sp, #S_X28] .endm -ENTRY(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE +SYM_CODE_START(__kretprobe_trampoline) + sub sp, sp, #PT_REGS_SIZE save_all_base_regs + /* Setup a frame pointer. */ + add x29, sp, #S_FP + mov x0, sp bl trampoline_probe_handler /* @@ -73,9 +77,10 @@ ENTRY(kretprobe_trampoline) */ mov lr, x0 + /* The frame pointer (x29) is restored with other registers. */ restore_all_base_regs - add sp, sp, #S_FRAME_SIZE + add sp, sp, #PT_REGS_SIZE ret -ENDPROC(kretprobe_trampoline) +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index be05868418ee..22d0b3252476 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/simulate-insn.c * * Copyright (C) 2013 Linaro Limited. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include <linux/bitops.h> @@ -18,6 +10,7 @@ #include <linux/kprobes.h> #include <asm/ptrace.h> +#include <asm/traps.h> #include "simulate-insn.h" diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index 050bde683c2d..e065dc92218e 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/probes/simulate-insn.h * * Copyright (C) 2013 Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef _ARM_KERNEL_KPROBES_SIMULATE_INSN_H diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 636ca0119c0e..d49aef2657cd 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/highmem.h> #include <linux/ptrace.h> @@ -24,7 +21,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, memcpy(dst, src, len); /* flush caches (dcache/icache) */ - sync_icache_aliases(dst, len); + sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); kunmap_atomic(xol_page_kaddr); } @@ -41,7 +38,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) - return -ENOTSUPP; + return -EOPNOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; @@ -169,34 +166,29 @@ int arch_uprobe_exception_notify(struct notifier_block *self, } static int uprobe_breakpoint_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { - if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } static int uprobe_single_step_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { struct uprobe_task *utask = current->utask; - if (user_mode(regs)) { - WARN_ON(utask && - (instruction_pointer(regs) != utask->xol_vaddr + 4)); - - if (uprobe_post_sstep_notifier(regs)) - return DBG_HOOK_HANDLED; - } + WARN_ON(utask && (instruction_pointer(regs) != utask->xol_vaddr + 4)); + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } /* uprobe breakpoint handler hook */ static struct break_hook uprobes_break_hook = { - .esr_mask = BRK64_ESR_MASK, - .esr_val = BRK64_ESR_UPROBES, + .imm = UPROBES_BRK_IMM, .fn = uprobe_breakpoint_handler, }; @@ -207,8 +199,8 @@ static struct step_hook uprobes_step_hook = { static int __init arch_init_uprobes(void) { - register_break_hook(&uprobes_break_hook); - register_step_hook(&uprobes_step_hook); + register_user_break_hook(&uprobes_break_hook); + register_user_step_hook(&uprobes_step_hook); return 0; } diff --git 
a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2dc0f8482210..044a7d7f1f6a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -1,41 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/process.c * * Original Copyright (C) 1995 Linus Torvalds * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ - -#include <stdarg.h> - #include <linux/compat.h> #include <linux/efi.h> +#include <linux/elf.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/kernel.h> +#include <linux/mman.h> #include <linux/mm.h> +#include <linux/nospec.h> #include <linux/stddef.h> +#include <linux/sysctl.h> #include <linux/unistd.h> #include <linux/user.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/interrupt.h> -#include <linux/kallsyms.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/elfcore.h> @@ -49,19 +38,27 @@ #include <linux/notifier.h> #include <trace/events/power.h> #include <linux/percpu.h> +#include <linux/thread_info.h> +#include <linux/prctl.h> +#include <linux/stacktrace.h> #include <asm/alternative.h> #include <asm/compat.h> +#include <asm/cpufeature.h> #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> #include <asm/mmu_context.h> +#include <asm/mte.h> #include <asm/processor.h> +#include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> -#ifdef CONFIG_CC_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> -unsigned long __stack_chk_guard __read_mostly; +unsigned long __stack_chk_guard __ro_after_init; EXPORT_SYMBOL(__stack_chk_guard); #endif @@ -71,23 +68,6 @@ EXPORT_SYMBOL(__stack_chk_guard); void (*pm_power_off)(void); EXPORT_SYMBOL_GPL(pm_power_off); -void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); - -/* - * This is our default idle handler. - */ -void arch_cpu_idle(void) -{ - /* - * This should do all the clock switching and wait for interrupt - * tricks - */ - trace_cpu_idle_rcuidle(1, smp_processor_id()); - cpu_do_idle(); - local_irq_enable(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); -} - #ifdef CONFIG_HOTPLUG_CPU void arch_cpu_idle_dead(void) { @@ -102,11 +82,11 @@ void arch_cpu_idle_dead(void) * to execute e.g. a RAM-based pin loop is not sufficient. This allows the * kexec'd kernel to use any and all RAM as it sees fit, without having to * avoid any code or data used by any SW CPU pin loop. The CPU hotplug - * functionality embodied in disable_nonboot_cpus() to achieve this. + * functionality embodied in smpt_shutdown_nonboot_cpus() to achieve this. 
*/ void machine_shutdown(void) { - disable_nonboot_cpus(); + smp_shutdown_nonboot_cpus(reboot_cpu); } /* @@ -131,8 +111,7 @@ void machine_power_off(void) { local_irq_disable(); smp_send_stop(); - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); } /* @@ -158,10 +137,7 @@ void machine_restart(char *cmd) efi_reboot(reboot_mode, NULL); /* Now call the architecture specific reboot code. */ - if (arm_pm_restart) - arm_pm_restart(reboot_mode, cmd); - else - do_kernel_restart(cmd); + do_kernel_restart(cmd); /* * Whoops - the architecture was unable to reboot. @@ -170,6 +146,57 @@ void machine_restart(char *cmd) while (1); } +#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str +static const char *const btypes[] = { + bstr(NONE, "--"), + bstr( JC, "jc"), + bstr( C, "-c"), + bstr( J , "j-") +}; +#undef bstr + +static void print_pstate(struct pt_regs *regs) +{ + u64 pstate = regs->pstate; + + if (compat_user_mode(regs)) { + printk("pstate: %08llx (%c%c%c%c %c %s %s %c%c%c %cDIT %cSSBS)\n", + pstate, + pstate & PSR_AA32_N_BIT ? 'N' : 'n', + pstate & PSR_AA32_Z_BIT ? 'Z' : 'z', + pstate & PSR_AA32_C_BIT ? 'C' : 'c', + pstate & PSR_AA32_V_BIT ? 'V' : 'v', + pstate & PSR_AA32_Q_BIT ? 'Q' : 'q', + pstate & PSR_AA32_T_BIT ? "T32" : "A32", + pstate & PSR_AA32_E_BIT ? "BE" : "LE", + pstate & PSR_AA32_A_BIT ? 'A' : 'a', + pstate & PSR_AA32_I_BIT ? 'I' : 'i', + pstate & PSR_AA32_F_BIT ? 'F' : 'f', + pstate & PSR_AA32_DIT_BIT ? '+' : '-', + pstate & PSR_AA32_SSBS_BIT ? '+' : '-'); + } else { + const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> + PSR_BTYPE_SHIFT]; + + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO %cTCO %cDIT %cSSBS BTYPE=%s)\n", + pstate, + pstate & PSR_N_BIT ? 'N' : 'n', + pstate & PSR_Z_BIT ? 'Z' : 'z', + pstate & PSR_C_BIT ? 'C' : 'c', + pstate & PSR_V_BIT ? 'V' : 'v', + pstate & PSR_D_BIT ? 'D' : 'd', + pstate & PSR_A_BIT ? 'A' : 'a', + pstate & PSR_I_BIT ? 'I' : 'i', + pstate & PSR_F_BIT ? 'F' : 'f', + pstate & PSR_PAN_BIT ? '+' : '-', + pstate & PSR_UAO_BIT ? '+' : '-', + pstate & PSR_TCO_BIT ? '+' : '-', + pstate & PSR_DIT_BIT ? '+' : '-', + pstate & PSR_SSBS_BIT ? 
'+' : '-', + btype_str); + } +} + void __show_regs(struct pt_regs *regs) { int i, top_reg; @@ -186,39 +213,47 @@ void __show_regs(struct pt_regs *regs) } show_regs_print_info(KERN_DEFAULT); - print_symbol("PC is at %s\n", instruction_pointer(regs)); - print_symbol("LR is at %s\n", lr); - printk("pc : [<%016llx>] lr : [<%016llx>] pstate: %08llx\n", - regs->pc, lr, regs->pstate); + print_pstate(regs); + + if (!user_mode(regs)) { + printk("pc : %pS\n", (void *)regs->pc); + printk("lr : %pS\n", (void *)ptrauth_strip_insn_pac(lr)); + } else { + printk("pc : %016llx\n", regs->pc); + printk("lr : %016llx\n", lr); + } + printk("sp : %016llx\n", sp); + if (system_uses_irq_prio_masking()) + printk("pmr_save: %08llx\n", regs->pmr_save); + i = top_reg; while (i >= 0) { - printk("x%-2d: %016llx ", i, regs->regs[i]); - i--; + printk("x%-2d: %016llx", i, regs->regs[i]); - if (i % 2 == 0) { - pr_cont("x%-2d: %016llx ", i, regs->regs[i]); - i--; - } + while (i-- % 3) + pr_cont(" x%-2d: %016llx", i, regs->regs[i]); pr_cont("\n"); } } -void show_regs(struct pt_regs * regs) +void show_regs(struct pt_regs *regs) { __show_regs(regs); - dump_backtrace(regs, NULL); + dump_backtrace(regs, NULL, KERN_DEFAULT); } static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { - current->thread.tp_value = 0; + current->thread.uw.tp_value = 0; /* * We need to ensure ordering between the shadow state and the @@ -230,15 +265,23 @@ static void tls_thread_flush(void) } } +static void flush_tagged_addr_state(void) +{ + if (IS_ENABLED(CONFIG_ARM64_TAGGED_ADDR_ABI)) + clear_thread_flag(TIF_TAGGED_ADDR); +} + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); + flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) +void arch_release_task_struct(struct task_struct *tsk) { + fpsimd_release_task(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -246,19 +289,77 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) if (current->mm) fpsimd_preserve_current_state(); *dst = *src; + + /* We rely on the above assignment to initialize dst's thread_flags: */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK)); + + /* + * Detach src's sve_state (if any) from dst so that it does not + * get erroneously used or freed prematurely. dst's copies + * will be allocated on demand later on if dst uses SVE. + * For consistency, also clear TIF_SVE here: this could be done + * later in copy_process(), but to avoid tripping up future + * maintainers it is best not to leave TIF flags and buffers in + * an inconsistent state, even temporarily. + */ + dst->thread.sve_state = NULL; + clear_tsk_thread_flag(dst, TIF_SVE); + + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA state so duplicate it here. + * This may be shortly freed if we exec() or if CLONE_SETTLS + * but it's simpler to do it here. To avoid confusing the rest + * of the code ensure that we have a sve_state allocated + * whenever za_state is allocated. 
+ */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + dst->thread.za_state = kmemdup(src->thread.za_state, + za_state_size(src), + GFP_KERNEL); + if (!dst->thread.za_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.za_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + + /* clear any pending asynchronous tag fault raised by the parent */ + clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); + return 0; } asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); - if (likely(!(p->flags & PF_KTHREAD))) { + /* + * In case p was allocated the same task_struct pointer as some + * other recently-exited task, make sure p is disassociated from + * any cpu that may have run that now-exited task recently. + * Otherwise we could erroneously skip reloading the FPSIMD + * registers for p. + */ + fpsimd_flush_task_state(p); + + ptrauth_thread_init_kernel(p); + + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@ -267,6 +368,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@ -276,22 +379,34 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } /* - * If a TLS pointer was passed to clone (4th argument), use it - * for the new thread. + * If a TLS pointer was passed to clone, use it for the new + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) - p->thread.tp_value = childregs->regs[3]; + if (clone_flags & CLONE_SETTLS) { + p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { + /* + * A kthread has no context to ERET to, so ensure any buggy + * ERET is treated as an illegal exception return. + * + * When a user task is created from a kthread, childregs will + * be initialized by start_thread() or start_compat_thread(). + */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->pstate = PSR_MODE_EL1h; - if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_const_cap(ARM64_HAS_UAO)) - childregs->pstate |= PSR_UAO_BIT; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + * For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. 
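To make the kthread branch of copy_thread() below concrete: args->fn and args->fn_arg are what a caller such as kthread_run() passes in, and they reach the new task through cpu_context.x19/x20 and ret_from_fork. A small self-contained sketch, with illustrative module and thread names that are not part of this patch:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct task_struct *worker;

static int worker_fn(void *arg)
{
	/* Reached via ret_from_fork: x19 = args->fn, x20 = args->fn_arg. */
	while (!kthread_should_stop())
		msleep(100);
	return 0;
}

static int __init demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "copy_thread_demo");
	return PTR_ERR_OR_ZERO(worker);
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");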
+ */ + p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; ptrace_hw_copy_thread(p); @@ -301,31 +416,45 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; - tls_preserve_current_state(); - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.uw.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } -/* Restore the UAO state depending on next's addr_limit */ -void uao_thread_switch(struct task_struct *next) +/* + * Force SSBS state on context-switch, since it may be lost after migrating + * from a CPU which treats the bit as RES0 in a heterogeneous system. + */ +static void ssbs_thread_switch(struct task_struct *next) { - if (IS_ENABLED(CONFIG_ARM64_UAO)) { - if (task_thread_info(next)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); - } + /* + * Nothing to do for kernel threads, but 'regs' may be junk + * (e.g. idle task) so check the flags and bail early. + */ + if (unlikely(next->flags & PF_KTHREAD)) + return; + + /* + * If all CPUs implement the SSBS extension, then we just need to + * context-switch the PSTATE field. + */ + if (cpus_have_const_cap(ARM64_SSBS)) + return; + + spectre_v4_enable_task_mitigation(next); } /* @@ -343,9 +472,52 @@ static void entry_task_switch(struct task_struct *next) } /* + * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. + * Ensure access is disabled when switching to a 32bit task, ensure + * access is enabled when switching to a 64bit task. + */ +static void erratum_1418040_thread_switch(struct task_struct *next) +{ + if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || + !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) + return; + + if (is_compat_thread(task_thread_info(next))) + sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); + else + sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); +} + +static void erratum_1418040_new_exec(void) +{ + preempt_disable(); + erratum_1418040_thread_switch(current); + preempt_enable(); +} + +/* + * __switch_to() checks current->thread.sctlr_user as an optimisation. Therefore + * this function must be called with preemption disabled and the update to + * sctlr_user must be made in the same preemption disabled block so that + * __switch_to() does not see the variable update before the SCTLR_EL1 one. + */ +void update_sctlr_el1(u64 sctlr) +{ + /* + * EnIA must not be cleared while in the kernel as this is necessary for + * in-kernel PAC. It will be cleared on kernel exit if needed. + */ + sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK & ~SCTLR_ELx_ENIA, sctlr); + + /* ISB required for the kernel uaccess routines when setting TCF0. */ + isb(); +} + +/* * Thread switching. 
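The comment above update_sctlr_el1() imposes a calling convention on its users. A hypothetical in-kernel caller would look like the sketch below; set_task_sctlr_el1() is an illustrative name, not something added by this patch:

/* Sketch of a caller honouring the update_sctlr_el1() contract. */
static void set_task_sctlr_el1(u64 sctlr)
{
	/*
	 * __switch_to() only rewrites SCTLR_EL1 when prev and next disagree
	 * on thread.sctlr_user, so the field update and the register write
	 * must sit inside the same preemption-disabled region.
	 */
	preempt_disable();
	current->thread.sctlr_user = sctlr;
	update_sctlr_el1(sctlr);
	preempt_enable();
}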
*/ -__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, +__notrace_funcgraph __sched +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) { struct task_struct *last; @@ -355,7 +527,9 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); entry_task_switch(next); - uao_thread_switch(next); + ssbs_thread_switch(next); + erratum_1418040_thread_switch(next); + ptrauth_thread_switch_user(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -365,62 +539,219 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, */ dsb(ish); + /* + * MTE thread switching must happen after the DSB above to ensure that + * any asynchronous tag check faults have been logged in the TFSR*_EL1 + * registers. + */ + mte_thread_switch(next); + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (prev->thread.sctlr_user != next->thread.sctlr_user) + update_sctlr_el1(next->thread.sctlr_user); + /* the actual thread switch */ last = cpu_switch_to(prev, next); return last; } -unsigned long get_wchan(struct task_struct *p) +struct wchan_info { + unsigned long pc; + int count; +}; + +static bool get_wchan_cb(void *arg, unsigned long pc) { - struct stackframe frame; - unsigned long stack_page, ret = 0; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; + struct wchan_info *wchan_info = arg; - stack_page = (unsigned long)try_get_task_stack(p); - if (!stack_page) + if (!in_sched_functions(pc)) { + wchan_info->pc = pc; + return false; + } + return wchan_info->count++ < 16; +} + +unsigned long __get_wchan(struct task_struct *p) +{ + struct wchan_info wchan_info = { + .pc = 0, + .count = 0, + }; + + if (!try_get_task_stack(p)) return 0; - frame.fp = thread_saved_fp(p); - frame.pc = thread_saved_pc(p); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = p->curr_ret_stack; -#endif - do { - if (unwind_frame(p, &frame)) - goto out; - if (!in_sched_functions(frame.pc)) { - ret = frame.pc; - goto out; - } - } while (count ++ < 16); + arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL); -out: put_task_stack(p); - return ret; + + return wchan_info.pc; } unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) +#ifdef CONFIG_COMPAT +int compat_elf_check_arch(const struct elf32_hdr *hdr) { - if (is_compat_task()) - return randomize_page(mm->brk, SZ_32M); - else - return randomize_page(mm->brk, SZ_1G); + if (!system_supports_32bit_el0()) + return false; + + if ((hdr)->e_machine != EM_ARM) + return false; + + if (!((hdr)->e_flags & EF_ARM_EABI_MASK)) + return false; + + /* + * Prevent execve() of a 32-bit program from a deadline task + * if the restricted affinity mask would be inadmissible on an + * asymmetric system. + */ + return !static_branch_unlikely(&arm64_mismatched_32bit_el0) || + !dl_task_check_affinity(current, system_32bit_el0_cpumask()); } +#endif /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ void arch_setup_new_exec(void) { - current->mm->context.flags = is_compat_task() ? 
MMCF_AARCH32 : 0; + unsigned long mmflags = 0; + + if (is_compat_task()) { + mmflags = MMCF_AARCH32; + + /* + * Restrict the CPU affinity mask for a 32-bit task so that + * it contains only 32-bit-capable CPUs. + * + * From the perspective of the task, this looks similar to + * what would happen if the 64-bit-only CPUs were hot-unplugged + * at the point of execve(), although we try a bit harder to + * honour the cpuset hierarchy. + */ + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + force_compatible_cpus_allowed_ptr(current); + } else if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) { + relax_compatible_cpus_allowed_ptr(current); + } + + current->mm->context.flags = mmflags; + ptrauth_thread_init_user(); + mte_thread_init_user(); + erratum_1418040_new_exec(); + + if (task_spec_ssb_noexec(current)) { + arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, + PR_SPEC_ENABLE); + } +} + +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +/* + * Control the relaxed ABI allowing tagged user addresses into the kernel. + */ +static unsigned int tagged_addr_disabled; + +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) +{ + unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) + return -EINVAL; + + if (system_supports_mte()) + valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ + | PR_MTE_TAG_MASK; + + if (arg & ~valid_mask) + return -EINVAL; + + /* + * Do not allow the enabling of the tagged address ABI if globally + * disabled via sysctl abi.tagged_addr_disabled. + */ + if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) + return -EINVAL; + + if (set_mte_ctrl(task, arg) != 0) + return -EINVAL; + + update_ti_thread_flag(ti, TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); + + return 0; +} + +long get_tagged_addr_ctrl(struct task_struct *task) +{ + long ret = 0; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) + return -EINVAL; + + if (test_ti_thread_flag(ti, TIF_TAGGED_ADDR)) + ret = PR_TAGGED_ADDR_ENABLE; + + ret |= get_mte_ctrl(task); + + return ret; +} + +/* + * Global sysctl to disable the tagged user addresses support. This control + * only prevents the tagged address ABI enabling via prctl() and does not + * disable it for tasks that already opted in to the relaxed ABI. + */ + +static struct ctl_table tagged_addr_sysctl_table[] = { + { + .procname = "tagged_addr_disabled", + .mode = 0644, + .data = &tagged_addr_disabled, + .maxlen = sizeof(int), + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init tagged_addr_init(void) +{ + if (!register_sysctl("abi", tagged_addr_sysctl_table)) + return -EINVAL; + return 0; } + +core_initcall(tagged_addr_init); +#endif /* CONFIG_ARM64_TAGGED_ADDR_ABI */ + +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + /* + * For dynamically linked executables the interpreter is + * responsible for setting PROT_BTI on everything except + * itself. 
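The tagged address ABI added above is opted into per task via prctl(), subject to the abi.tagged_addr_disabled sysctl. A minimal userspace sketch; the constants come from the prctl UAPI and the fallback definitions are only there for older headers:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL	55
#define PR_GET_TAGGED_ADDR_CTRL	56
#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
#endif

int main(void)
{
	/* Opt this task in to the relaxed tagged-address ABI. */
	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0)) {
		perror("PR_SET_TAGGED_ADDR_CTRL");
		return 1;
	}

	printf("tagged addr ctrl: %d\n",
	       (int)prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0));
	return 0;
}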
+ */ + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; +} +#endif diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c new file mode 100644 index 000000000000..bfce41c2a53b --- /dev/null +++ b/arch/arm64/kernel/proton-pack.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as + * detailed at: + * + * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability + * + * This code was originally written hastily under an awful lot of stress and so + * aspects of it are somewhat hacky. Unfortunately, changing anything in here + * instantly makes me feel ill. Thanks, Jann. Thann. + * + * Copyright (C) 2018 ARM Ltd, All Rights Reserved. + * Copyright (C) 2020 Google LLC + * + * "If there's something strange in your neighbourhood, who you gonna call?" + * + * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org> + */ + +#include <linux/arm-smccc.h> +#include <linux/bpf.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/nospec.h> +#include <linux/prctl.h> +#include <linux/sched/task_stack.h> + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/spectre.h> +#include <asm/traps.h> +#include <asm/vectors.h> +#include <asm/virt.h> + +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + + /* Userspace almost certainly can't deal with this. */ + if (WARN_ON(system_capabilities_finalized())) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} + +/* + * Spectre v1. + * + * The kernel can't protect userspace for this one: it's each person for + * themselves. Advertise what we're doing and be done with it. + */ +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +/* + * Spectre v2. + * + * This one sucks. A CPU is either: + * + * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2. + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in software by firmware. + * - Mitigated in software by a CPU-specific dance in the kernel and a + * firmware call at EL2. + * - Vulnerable. + * + * It's not unlikely for different CPUs in a big.LITTLE system to fall into + * different camps. 
+ */ +static enum mitigation_state spectre_v2_state; + +static bool __read_mostly __nospectre_v2; +static int __init parse_spectre_v2_param(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_spectre_v2_param); + +static bool spectre_v2_mitigations_off(void) +{ + bool ret = __nospectre_v2 || cpu_mitigations_off(); + + if (ret) + pr_info_once("spectre-v2 mitigation disabled by command line option\n"); + + return ret; +} + +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + + switch (spectre_v2_state) { + case SPECTRE_UNAFFECTED: + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + fallthrough; + case SPECTRE_MITIGATED: + if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) +{ + u64 pfr0; + static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ } + }; + + /* If the CPU has CSV2 set, we're safe */ + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT)) + return SPECTRE_UNAFFECTED; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + return SPECTRE_UNAFFECTED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + return true; +} + +enum mitigation_state arm64_get_spectre_v2_state(void) +{ + return spectre_v2_state; +} + 
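The cpu_show_spectre_v2() hook above is what backs the generic sysfs vulnerability files. A small userspace sketch that reads the reported state; the example output string is only indicative:

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "Mitigation: Branch predictor hardening, BHB" */
	fclose(f);
	return 0;
}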
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static void install_bp_hardening_cb(bp_hardening_cb_t fn) +{ + __this_cpu_write(bp_hardening_data.fn, fn); + + /* + * Vinz Clortho takes the hyp_vecs start/end "keys" at + * the door when we're a guest. Skip the hyp-vectors work. + */ + if (!is_hyp_mode_available()) + return; + + __this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void qcom_link_stack_sanitisation(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) +{ + u32 midr = read_cpuid_id(); + if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && + ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) + return NULL; + + return qcom_link_stack_sanitisation; +} + +static enum mitigation_state spectre_v2_enable_fw_mitigation(void) +{ + bp_hardening_cb_t cb; + enum mitigation_state state; + + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v2_mitigations_off()) + return SPECTRE_VULNERABLE; + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + cb = call_hvc_arch_workaround_1; + break; + + case SMCCC_CONDUIT_SMC: + cb = call_smc_arch_workaround_1; + break; + + default: + return SPECTRE_VULNERABLE; + } + + /* + * Prefer a CPU-specific workaround if it exists. Note that we + * still rely on firmware for the mitigation at EL2. + */ + cb = spectre_v2_get_sw_mitigation_cb() ?: cb; + install_bp_hardening_cb(cb); + return SPECTRE_MITIGATED; +} + +void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v2_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v2_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v2_state, state); +} + +/* + * Spectre-v3a. + * + * Phew, there's not an awful lot to do here! We just instruct EL2 to use + * an indirect trampoline for the hyp vectors so that guests can't read + * VBAR_EL2 to defeat randomisation of the hypervisor VA layout. + */ +bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) +{ + static const struct midr_range spectre_v3a_unsafe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, + }; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); +} + +void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (this_cpu_has_cap(ARM64_SPECTRE_V3A)) + data->slot += HYP_VECTOR_INDIRECT; +} + +/* + * Spectre v4. + * + * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is + * either: + * + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in hardware via PSTATE.SSBS. + * - Mitigated in software by firmware (sometimes referred to as SSBD). 
+ * + * Wait, that doesn't sound so bad, does it? Keep reading... + * + * A major source of headaches is that the software mitigation is enabled both + * on a per-task basis, but can also be forced on for the kernel, necessitating + * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs + * allow EL0 to toggle SSBS directly, which can end up with the prctl() state + * being stale when re-entering the kernel. The usual big.LITTLE caveats apply, + * so you can have systems that have both firmware and SSBS mitigations. This + * means we actually have to reject late onlining of CPUs with mitigations if + * all of the currently onlined CPUs are safelisted, as the mitigation tends to + * be opt-in for userspace. Yes, really, the cure is worse than the disease. + * + * The only good part is that if the firmware mitigation is present, then it is + * present for all CPUs, meaning we don't have to worry about late onlining of a + * vulnerable CPU if one of the boot CPUs is using the firmware mitigation. + * + * Give me a VAX-11/780 any day of the week... + */ +static enum mitigation_state spectre_v4_state; + +/* This is the per-cpu state tracking whether we need to talk to firmware */ +DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); + +enum spectre_v4_policy { + SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, + SPECTRE_V4_POLICY_MITIGATION_ENABLED, + SPECTRE_V4_POLICY_MITIGATION_DISABLED, +}; + +static enum spectre_v4_policy __read_mostly __spectre_v4_policy; + +static const struct spectre_v4_param { + const char *str; + enum spectre_v4_policy policy; +} spectre_v4_params[] = { + { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, }, + { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, }, + { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, }, +}; +static int __init parse_spectre_v4_param(char *str) +{ + int i; + + if (!str || !str[0]) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) { + const struct spectre_v4_param *param = &spectre_v4_params[i]; + + if (strncmp(str, param->str, strlen(param->str))) + continue; + + __spectre_v4_policy = param->policy; + return 0; + } + + return -EINVAL; +} +early_param("ssbd", parse_spectre_v4_param); + +/* + * Because this was all written in a rush by people working in different silos, + * we've ended up with multiple command line options to control the same thing. + * Wrap these up in some helpers, which prefer disabling the mitigation if faced + * with contradictory parameters. The mitigation is always either "off", + * "dynamic" or "on". + */ +static bool spectre_v4_mitigations_off(void) +{ + bool ret = cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; + + if (ret) + pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); + + return ret; +} + +/* Do we need to toggle the mitigation state on entry to/exit from the kernel? 
*/ +static bool spectre_v4_mitigations_dynamic(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC; +} + +static bool spectre_v4_mitigations_on(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED; +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +enum mitigation_state arm64_get_spectre_v4_state(void) +{ + return spectre_v4_state; +} + +static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) +{ + static const struct midr_range spectre_v4_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ }, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + return SPECTRE_UNAFFECTED; + + /* CPU features are detected first */ + if (this_cpu_has_cap(ARM64_SSBS)) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_2, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + fallthrough; + case SMCCC_RET_NOT_REQUIRED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope) +{ + enum mitigation_state state; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_get_cpu_fw_mitigation_state(); + + return state != SPECTRE_UNAFFECTED; +} + +static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) +{ + if (user_mode(regs)) + return 1; + + if (instr & BIT(PSTATE_Imm_shift)) + regs->pstate |= PSR_SSBS_BIT; + else + regs->pstate &= ~PSR_SSBS_BIT; + + arm64_skip_faulting_instruction(regs, 4); + return 0; +} + +static struct undef_hook ssbs_emulation_hook = { + .instr_mask = ~(1U << PSTATE_Imm_shift), + .instr_val = 0xd500401f | PSTATE_SSBS, + .fn = ssbs_emulation_handler, +}; + +static enum mitigation_state spectre_v4_enable_hw_mitigation(void) +{ + static bool undef_hook_registered = false; + static DEFINE_RAW_SPINLOCK(hook_lock); + enum mitigation_state state; + + /* + * If the system is mitigated but this CPU doesn't have SSBS, then + * we must be on the safelist and there's nothing more to do. 
+ */ + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS)) + return state; + + raw_spin_lock(&hook_lock); + if (!undef_hook_registered) { + register_undef_hook(&ssbs_emulation_hook); + undef_hook_registered = true; + } + raw_spin_unlock(&hook_lock); + + if (spectre_v4_mitigations_off()) { + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); + set_pstate_ssbs(1); + return SPECTRE_VULNERABLE; + } + + /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ + set_pstate_ssbs(0); + return SPECTRE_MITIGATED; +} + +/* + * Patch a branch over the Spectre-v4 mitigation code with a NOP so that + * we fallthrough and check whether firmware needs to be called on this CPU. + */ +void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (spectre_v4_mitigations_off()) + return; + + if (cpus_have_cap(ARM64_SSBS)) + return; + + if (spectre_v4_mitigations_dynamic()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* + * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction + * to call into firmware to adjust the mitigation state. + */ +void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + u32 insn; + + BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */ + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + insn = aarch64_insn_get_hvc_value(); + break; + case SMCCC_CONDUIT_SMC: + insn = aarch64_insn_get_smc_value(); + break; + default: + return; + } + + *updptr = cpu_to_le32(insn); +} + +static enum mitigation_state spectre_v4_enable_fw_mitigation(void) +{ + enum mitigation_state state; + + state = spectre_v4_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v4_mitigations_off()) { + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL); + return SPECTRE_VULNERABLE; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL); + + if (spectre_v4_mitigations_dynamic()) + __this_cpu_write(arm64_ssbd_callback_required, 1); + + return SPECTRE_MITIGATED; +} + +void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v4_enable_hw_mitigation(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v4_state, state); +} + +static void __update_pstate_ssbs(struct pt_regs *regs, bool state) +{ + u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + if (state) + regs->pstate |= bit; + else + regs->pstate &= ~bit; +} + +void spectre_v4_enable_task_mitigation(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + bool ssbs = false, kthread = tsk->flags & PF_KTHREAD; + + if (spectre_v4_mitigations_off()) + ssbs = true; + else if (spectre_v4_mitigations_dynamic() && !kthread) + ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD); + + __update_pstate_ssbs(regs, ssbs); +} + +/* + * The Spectre-v4 mitigation can be controlled via a prctl() from userspace. + * This is interesting because the "speculation disabled" behaviour can be + * configured so that it is preserved across exec(), which means that the + * prctl() may be necessary even when PSTATE.SSBS can be toggled directly + * from userspace. 
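As the comment above notes, the Spectre-v4 mitigation is driven from userspace through the generic speculation-control prctl, implemented by ssbd_prctl_set()/ssbd_prctl_get() just below. A minimal userspace sketch with error handling trimmed:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	int state;

	/* Ask for speculative store bypass to be disabled for this task. */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
		  PR_SPEC_DISABLE, 0, 0)) {
		perror("PR_SET_SPECULATION_CTRL");
		return 1;
	}

	/* Returns a mask such as PR_SPEC_PRCTL | PR_SPEC_DISABLE. */
	state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);
	printf("spec_store_bypass state: %#x\n", state);
	return 0;
}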
+ */ +static void ssbd_prctl_enable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_set_spec_ssb_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); +} + +static void ssbd_prctl_disable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_clear_spec_ssb_disable(task); + clear_tsk_thread_flag(task, TIF_SSBD); +} + +static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + /* Enable speculation: disable mitigation */ + /* + * Force disabled speculation prevents it from being + * re-enabled. + */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + + /* + * If the mitigation is forced on, then speculation is forced + * off and we again prevent it from being re-enabled. + */ + if (spectre_v4_mitigations_on()) + return -EPERM; + + ssbd_prctl_disable_mitigation(task); + break; + case PR_SPEC_FORCE_DISABLE: + /* Force disable speculation: force enable mitigation */ + /* + * If the mitigation is forced off, then speculation is forced + * on and we prevent it from being disabled. + */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + task_set_spec_ssb_force_disable(task); + fallthrough; + case PR_SPEC_DISABLE: + /* Disable speculation: enable mitigation */ + /* Same as PR_SPEC_FORCE_DISABLE */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + ssbd_prctl_enable_mitigation(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + /* Disable speculation until execve(): enable mitigation */ + /* + * If the mitigation state is forced one way or the other, then + * we must fail now before we try to toggle it on execve(). + */ + if (task_spec_ssb_force_disable(task) || + spectre_v4_mitigations_off() || + spectre_v4_mitigations_on()) { + return -EPERM; + } + + ssbd_prctl_enable_mitigation(task); + task_set_spec_ssb_noexec(task); + break; + default: + return -ERANGE; + } + + spectre_v4_enable_task_mitigation(task); + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +static int ssbd_prctl_get(struct task_struct *task) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return PR_SPEC_NOT_AFFECTED; + case SPECTRE_MITIGATED: + if (spectre_v4_mitigations_on()) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v4_mitigations_dynamic()) + break; + + /* Mitigations are disabled, so we're vulnerable. */ + fallthrough; + case SPECTRE_VULNERABLE: + fallthrough; + default: + return PR_SPEC_ENABLE; + } + + /* Check the mitigation state for this task */ + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_get(task); + default: + return -ENODEV; + } +} + +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. 
+ * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + +enum bhb_mitigation_bits { + BHB_LOOP, + BHB_FW, + BHB_HW, + BHB_INSN, +}; +static unsigned long system_bhb_mitigations; + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. + */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if 
(supports_csv2p3(scope)) + return false; + + if (supports_clearbhb(scope)) + return true; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + if (slot < 0) + return; + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. + */ + if (arm64_kernel_unmapped_at_el0()) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +static bool __read_mostly __nospectre_bhb; +static int __init parse_spectre_bhb_param(char *str) +{ + __nospectre_bhb = true; + return 0; +} +early_param("nospectre_bhb", parse_spectre_bhb_param); + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + bp_hardening_cb_t cpu_cb; + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off() || __nospectre_bhb) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + set_bit(BHB_HW, &system_bhb_mitigations); + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have ClearBHB + * added. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; + set_bit(BHB_INSN, &system_bhb_mitigations); + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have the + * branchy-loop added. A57/A72-r0 will already have selected + * the spectre-indirect vector, which is sufficient for BHB + * too. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + state = SPECTRE_MITIGATED; + set_bit(BHB_LOOP, &system_bhb_mitigations); + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. 
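(Editor's aside, not part of the diff.) The mitigation selected by spectre_bhb_enable_mitigation() above ultimately feeds the generic CPU-vulnerabilities reporting; a hedged sketch for inspecting the result from userspace, assuming the usual /sys/devices/system/cpu/vulnerabilities layout (the exact strings printed are implementation-defined):

/* Editor's illustration only -- not part of this patch. */
#include <stdio.h>

static void show(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* line already ends in '\n' */
	fclose(f);
}

int main(void)
{
	/* On arm64 the BHB state is typically folded into the spectre_v2 string. */
	show("/sys/devices/system/cpu/vulnerabilities/spectre_v2");
	show("/sys/devices/system/cpu/vulnerabilities/spec_store_bypass");
	return 0;
}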
+ */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_LOOP, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_FW, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to correct the immediate */ +void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to mov WA3 when supported */ +void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + !test_bit(BHB_FW, &system_bhb_mitigations)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_REG_ZR, rd, + ARM_SMCCC_ARCH_WORKAROUND_3); + if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT)) + return; + + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to NOP when not supported */ +void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 2); + + if (test_bit(BHB_INSN, &system_bhb_mitigations)) + return; + + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +#ifdef CONFIG_BPF_SYSCALL +#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n" +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_state == SPECTRE_VULNERABLE || + spectre_bhb_state != SPECTRE_MITIGATED) + return; + + if (!new_state) + pr_err("WARNING: %s", EBPF_WARN); +} +#endif diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index e8edbf13302a..29a8e444db83 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -1,12 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
* * Copyright (C) 2013 ARM Limited * @@ -24,7 +17,6 @@ #include <uapi/linux/psci.h> -#include <asm/compiler.h> #include <asm/cpu_ops.h> #include <asm/errno.h> #include <asm/smp_plat.h> @@ -46,7 +38,8 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu) static int cpu_psci_cpu_boot(unsigned int cpu) { - int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry)); + phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry); + int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); if (err) pr_err("failed to boot CPU%d (%d)\n", cpu, err); @@ -54,6 +47,11 @@ static int cpu_psci_cpu_boot(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU +static bool cpu_psci_cpu_can_disable(unsigned int cpu) +{ + return !psci_tos_resident_on(cpu); +} + static int cpu_psci_cpu_disable(unsigned int cpu) { /* Fail early if we don't have CPU_OFF support */ @@ -69,7 +67,6 @@ static int cpu_psci_cpu_disable(unsigned int cpu) static void cpu_psci_cpu_die(unsigned int cpu) { - int ret; /* * There are no known implementations of PSCI actually using the * power state field, pass a sensible default for now. @@ -77,14 +74,13 @@ static void cpu_psci_cpu_die(unsigned int cpu) u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT; - ret = psci_ops.cpu_off(state); - - pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); + psci_ops.cpu_off(state); } static int cpu_psci_cpu_kill(unsigned int cpu) { - int err, i; + int err; + unsigned long start, end; if (!psci_ops.affinity_info) return 0; @@ -94,16 +90,18 @@ static int cpu_psci_cpu_kill(unsigned int cpu) * while it is dying. So, try again a few times. */ - for (i = 0; i < 10; i++) { + start = jiffies; + end = start + msecs_to_jiffies(100); + do { err = psci_ops.affinity_info(cpu_logical_map(cpu), 0); if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) { - pr_info("CPU%d killed.\n", cpu); + pr_info("CPU%d killed (polled %d ms)\n", cpu, + jiffies_to_msecs(jiffies - start)); return 0; } - msleep(10); - pr_info("Retrying again to check for CPU kill\n"); - } + usleep_range(100, 1000); + } while (time_before(jiffies, end)); pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n", cpu, err); @@ -113,14 +111,11 @@ static int cpu_psci_cpu_kill(unsigned int cpu) const struct cpu_operations cpu_psci_ops = { .name = "psci", -#ifdef CONFIG_CPU_IDLE - .cpu_init_idle = psci_cpu_init_idle, - .cpu_suspend = psci_cpu_suspend_enter, -#endif .cpu_init = cpu_psci_cpu_init, .cpu_prepare = cpu_psci_cpu_prepare, .cpu_boot = cpu_psci_cpu_boot, #ifdef CONFIG_HOTPLUG_CPU + .cpu_can_disable = cpu_psci_cpu_can_disable, .cpu_disable = cpu_psci_cpu_disable, .cpu_die = cpu_psci_cpu_die, .cpu_kill = cpu_psci_cpu_kill, diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 9cbb6123208f..c2fb5755bbec 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/ptrace.c * @@ -5,18 +6,6 @@ * edited by Linus Torvalds * ARM modifications Copyright (C) 2000 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/audit.h> @@ -25,6 +14,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> +#include <linux/nospec.h> #include <linux/smp.h> #include <linux/ptrace.h> #include <linux/user.h> @@ -32,16 +22,19 @@ #include <linux/security.h> #include <linux/init.h> #include <linux/signal.h> +#include <linux/string.h> #include <linux/uaccess.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/elf.h> #include <asm/compat.h> +#include <asm/cpufeature.h> #include <asm/debug-monitors.h> -#include <asm/pgtable.h> +#include <asm/fpsimd.h> +#include <asm/mte.h> +#include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> @@ -128,7 +121,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr); + on_irq_stack(addr, sizeof(unsigned long)); } /** @@ -178,36 +171,32 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct pt_regs *regs) { struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = TRAP_HWBKPT, - .si_addr = (void __user *)(bkpt->trigger), - }; + const char *desc = "Hardware breakpoint trap (ptrace)"; #ifdef CONFIG_COMPAT - int i; - - if (!is_compat_task()) - goto send_sig; + if (is_compat_task()) { + int si_errno = 0; + int i; - for (i = 0; i < ARM_MAX_BRP; ++i) { - if (current->thread.debug.hbp_break[i] == bp) { - info.si_errno = (i << 1) + 1; - break; + for (i = 0; i < ARM_MAX_BRP; ++i) { + if (current->thread.debug.hbp_break[i] == bp) { + si_errno = (i << 1) + 1; + break; + } } - } - for (i = 0; i < ARM_MAX_WRP; ++i) { - if (current->thread.debug.hbp_watch[i] == bp) { - info.si_errno = -((i << 1) + 1); - break; + for (i = 0; i < ARM_MAX_WRP; ++i) { + if (current->thread.debug.hbp_watch[i] == bp) { + si_errno = -((i << 1) + 1); + break; + } } + arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, + desc); + return; } - -send_sig: #endif - force_sig_info(SIGTRAP, &info, current); + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* @@ -247,15 +236,20 @@ static struct perf_event *ptrace_hbp_get_event(unsigned int note_type, switch (note_type) { case NT_ARM_HW_BREAK: - if (idx < ARM_MAX_BRP) - bp = tsk->thread.debug.hbp_break[idx]; + if (idx >= ARM_MAX_BRP) + goto out; + idx = array_index_nospec(idx, ARM_MAX_BRP); + bp = tsk->thread.debug.hbp_break[idx]; break; case NT_ARM_HW_WATCH: - if (idx < ARM_MAX_WRP) - bp = tsk->thread.debug.hbp_watch[idx]; + if (idx >= ARM_MAX_WRP) + goto out; + idx = array_index_nospec(idx, ARM_MAX_WRP); + bp = tsk->thread.debug.hbp_watch[idx]; break; } +out: return bp; } @@ -268,19 +262,22 @@ static int ptrace_hbp_set_event(unsigned int note_type, switch (note_type) { case NT_ARM_HW_BREAK: - if (idx < ARM_MAX_BRP) { - tsk->thread.debug.hbp_break[idx] = bp; - err = 0; - } + if (idx >= ARM_MAX_BRP) + goto out; + idx = array_index_nospec(idx, ARM_MAX_BRP); + tsk->thread.debug.hbp_break[idx] = bp; + err = 0; break; case NT_ARM_HW_WATCH: - if (idx < ARM_MAX_WRP) { - tsk->thread.debug.hbp_watch[idx] = bp; - err = 0; - } + if (idx >= ARM_MAX_WRP) + goto 
out; + idx = array_index_nospec(idx, ARM_MAX_WRP); + tsk->thread.debug.hbp_watch[idx] = bp; + err = 0; break; } +out: return err; } @@ -475,11 +472,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, static int hw_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { unsigned int note_type = regset->core_note_type; - int ret, idx = 0, offset, limit; + int ret, idx = 0; u32 info, ctrl; u64 addr; @@ -488,49 +484,21 @@ static int hw_break_get(struct task_struct *target, if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &info, 0, - sizeof(info)); - if (ret) - return ret; - - /* Pad */ - offset = offsetof(struct user_hwdebug_state, pad); - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - + membuf_write(&to, &info, sizeof(info)); + membuf_zero(&to, sizeof(u32)); /* (address, ctrl) registers */ - offset = offsetof(struct user_hwdebug_state, dbg_regs); - limit = regset->n * regset->size; - while (count && offset < limit) { + while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &addr, - offset, offset + PTRACE_HBP_ADDR_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_ADDR_SZ; - ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &ctrl, - offset, offset + PTRACE_HBP_CTRL_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_CTRL_SZ; - - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_PAD_SZ; + membuf_store(&to, addr); + membuf_store(&to, ctrl); + membuf_zero(&to, sizeof(u32)); idx++; } - return 0; } @@ -590,11 +558,10 @@ static int hw_break_set(struct task_struct *target, static int gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1); + return membuf_write(&to, uregs, sizeof(*uregs)); } static int gpr_set(struct task_struct *target, const struct user_regset *regset, @@ -615,20 +582,66 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, return 0; } +static int fpr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!system_supports_fpsimd()) + return -ENODEV; + return regset->n; +} + /* * TODO: update fp accessors for lazy context switching (sync/flush hwstate) */ -static int fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +static int __fpr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) { struct user_fpsimd_state *uregs; - uregs = &target->thread.fpsimd_state.user_fpsimd; + + sve_sync_to_fpsimd(target); + + uregs = &target->thread.uw.fpsimd_state; + + return membuf_write(&to, uregs, sizeof(*uregs)); +} + +static int fpr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_fpsimd()) + return -EINVAL; if (target == current) fpsimd_preserve_current_state(); - return user_regset_copyout(&pos, &count, &kbuf, 
&ubuf, uregs, 0, -1); + return __fpr_get(target, regset, to); +} + +static int __fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + unsigned int start_pos) +{ + int ret; + struct user_fpsimd_state newstate; + + /* + * Ensure target->thread.uw.fpsimd_state is up to date, so that a + * short copyin can't resurrect stale data. + */ + sve_sync_to_fpsimd(target); + + newstate = target->thread.uw.fpsimd_state; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, + start_pos, start_pos + sizeof(newstate)); + if (ret) + return ret; + + target->thread.uw.fpsimd_state = newstate; + + return ret; } static int fpr_set(struct task_struct *target, const struct user_regset *regset, @@ -636,28 +649,35 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { int ret; - struct user_fpsimd_state newstate = - target->thread.fpsimd_state.user_fpsimd; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (!system_supports_fpsimd()) + return -EINVAL; + + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0); if (ret) return ret; - target->thread.fpsimd_state.user_fpsimd = newstate; + sve_sync_from_fpsimd_zeropad(target); fpsimd_flush_task_state(target); + return ret; } static int tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - unsigned long *tls = &target->thread.tp_value; + int ret; if (target == current) tls_preserve_current_state(); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, tls, 0, -1); + ret = membuf_store(&to, target->thread.uw.tp_value); + if (system_supports_tpidr2()) + ret = membuf_store(&to, target->thread.tpidr2_el0); + else + ret = membuf_zero(&to, sizeof(u64)); + + return ret; } static int tls_set(struct task_struct *target, const struct user_regset *regset, @@ -665,25 +685,28 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { int ret; - unsigned long tls = target->thread.tp_value; + unsigned long tls[2]; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); + tls[0] = target->thread.uw.tp_value; + if (system_supports_sme()) + tls[1] = target->thread.tpidr2_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count); if (ret) return ret; - target->thread.tp_value = tls; + target->thread.uw.tp_value = tls[0]; + if (system_supports_sme()) + target->thread.tpidr2_el0 = tls[1]; + return ret; } static int system_call_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - int syscallno = task_pt_regs(target)->syscallno; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &syscallno, 0, -1); + return membuf_store(&to, task_pt_regs(target)->syscallno); } static int system_call_set(struct task_struct *target, @@ -702,6 +725,633 @@ static int system_call_set(struct task_struct *target, return ret; } +#ifdef CONFIG_ARM64_SVE + +static void sve_init_header_from_task(struct user_sve_header *header, + struct task_struct *target, + enum vec_type type) +{ + unsigned int vq; + bool active; + bool fpsimd_only; + enum vec_type task_type; + + memset(header, 0, sizeof(*header)); + + /* Check if the requested registers are active for the task */ + 
if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); + + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = false; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (active) { + if (fpsimd_only) { + header->flags |= SVE_PT_REGS_FPSIMD; + } else { + header->flags |= SVE_PT_REGS_SVE; + } + } + + header->vl = task_get_vl(target, type); + vq = sve_vq_from_vl(header->vl); + + header->max_vl = vec_max_vl(type); + header->size = SVE_PT_SIZE(vq, header->flags); + header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), + SVE_PT_REGS_SVE); +} + +static unsigned int sve_size_from_header(struct user_sve_header const *header) +{ + return ALIGN(header->size, SVE_VQ_BYTES); +} + +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) +{ + struct user_sve_header header; + unsigned int vq; + unsigned long start, end; + + /* Header */ + sve_init_header_from_task(&header, target, type); + vq = sve_vq_from_vl(header.vl); + + membuf_write(&to, &header, sizeof(header)); + + if (target == current) + fpsimd_preserve_current_state(); + + BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: + return __fpr_get(target, regset, to); + + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); + + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); + + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); + + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); + + default: + return 0; + } +} + +static int sve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) +{ + int ret; + struct user_sve_header header; + unsigned int vq; + unsigned long start, end; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for us: + */ + ret = vec_set_vector_length(target, type, header.vl, + ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_vl(target, type)); + + /* Enter/exit streaming mode */ + if (system_supports_sme()) { + u64 old_svcr = target->thread.svcr; + + switch (type) { + 
case ARM64_VEC_SVE: + target->thread.svcr &= ~SVCR_SM_MASK; + break; + case ARM64_VEC_SME: + target->thread.svcr |= SVCR_SM_MASK; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * If we switched then invalidate any existing SVE + * state and ensure there's storage. + */ + if (target->thread.svcr != old_svcr) + sve_alloc(target, true); + } + + /* Registers: FPSIMD-only case */ + + BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); + if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + clear_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SME) + fpsimd_force_sync_to_sve(target); + goto out; + } + + /* + * Otherwise: no registers or full SVE case. For backwards + * compatibility reasons we treat empty flags as SVE registers. + */ + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. + */ + if (count && vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + sve_alloc(target, true); + if (!target->thread.sve_state) { + ret = -ENOMEM; + clear_tsk_thread_flag(target, TIF_SVE); + goto out; + } + + /* + * Ensure target->thread.sve_state is up to date with target's + * FPSIMD regs, so that a short copyin leaves trailing + * registers unmodified. Always enable SVE even if going into + * streaming mode. + */ + fpsimd_sync_to_sve(target); + set_tsk_thread_flag(target, TIF_SVE); + + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.sve_state, + start, end); + if (ret) + goto out; + + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + start, end); + if (ret) + goto out; + + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.uw.fpsimd_state.fpsr, + start, end); + +out: + fpsimd_flush_task_state(target); + return ret; +} + +static int sve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + +#endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, 
TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? */ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.za_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: + */ + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); + + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target, false); + if (!target->thread.sve_state) { + ret = -ENOMEM; + goto out; + } + } + + /* Allocate/reinit ZA storage */ + sme_alloc(target); + if (!target->thread.za_state) { + ret = -ENOMEM; + goto out; + } + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SVCR_ZA_MASK; + goto out; + } + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. + */ + if (vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.za_state, + start, end); + if (ret) + goto out; + + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SVCR_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + +#endif /* CONFIG_ARM64_SME */ + +#ifdef CONFIG_ARM64_PTR_AUTH +static int pac_mask_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + /* + * The PAC bits can differ across data and instruction pointers + * depending on TCR_EL1.TBID*, which we may make use of in future, so + * we expose separate masks. 
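(Editor's aside, not part of the diff.) Regsets such as NT_ARM_PAC_MASK added in this file are consumed through PTRACE_GETREGSET with an iovec; a minimal tracer-side sketch, assuming an arm64 host, a tracee already in a ptrace stop, and headers new enough to define NT_ARM_PAC_MASK:

/* Editor's illustration only -- not part of this patch. */
#include <stdio.h>
#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/elf.h>

/* Mirrors struct user_pac_mask from the arm64 uapi headers. */
struct pac_mask {
	uint64_t data_mask;
	uint64_t insn_mask;
};

int read_pac_mask(pid_t pid)
{
	struct pac_mask masks;
	struct iovec iov = { .iov_base = &masks, .iov_len = sizeof(masks) };

	/* The ptrace "addr" argument selects the regset note type. */
	if (ptrace(PTRACE_GETREGSET, pid,
		   (void *)(uintptr_t)NT_ARM_PAC_MASK, &iov))
		return -1;

	printf("PAC data mask %#llx, insn mask %#llx\n",
	       (unsigned long long)masks.data_mask,
	       (unsigned long long)masks.insn_mask);
	return 0;
}

The other note types added here (NT_ARM_SVE, NT_ARM_ZA, NT_ARM_TAGGED_ADDR_CTRL, ...) are read the same way, with the iovec length bounding how much of the regset is copied.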
+ */ + unsigned long mask = ptrauth_user_pac_mask(); + struct user_pac_mask uregs = { + .data_mask = mask, + .insn_mask = mask, + }; + + if (!system_supports_address_auth()) + return -EINVAL; + + return membuf_write(&to, &uregs, sizeof(uregs)); +} + +static int pac_enabled_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); +} + +static int pac_enabled_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, + sizeof(long)); + if (ret) + return ret; + + return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, + enabled_keys); +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static __uint128_t pac_key_to_user(const struct ptrauth_key *key) +{ + return (__uint128_t)key->hi << 64 | key->lo; +} + +static struct ptrauth_key pac_key_from_user(__uint128_t ukey) +{ + struct ptrauth_key key = { + .lo = (unsigned long)ukey, + .hi = (unsigned long)(ukey >> 64), + }; + + return key; +} + +static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, + const struct ptrauth_keys_user *keys) +{ + ukeys->apiakey = pac_key_to_user(&keys->apia); + ukeys->apibkey = pac_key_to_user(&keys->apib); + ukeys->apdakey = pac_key_to_user(&keys->apda); + ukeys->apdbkey = pac_key_to_user(&keys->apdb); +} + +static void pac_address_keys_from_user(struct ptrauth_keys_user *keys, + const struct user_pac_address_keys *ukeys) +{ + keys->apia = pac_key_from_user(ukeys->apiakey); + keys->apib = pac_key_from_user(ukeys->apibkey); + keys->apda = pac_key_from_user(ukeys->apdakey); + keys->apdb = pac_key_from_user(ukeys->apdbkey); +} + +static int pac_address_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + + return membuf_write(&to, &user_keys, sizeof(user_keys)); +} + +static int pac_address_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + int ret; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_address_keys_from_user(keys, &user_keys); + + return 0; +} + +static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, + const struct ptrauth_keys_user *keys) +{ + ukeys->apgakey = pac_key_to_user(&keys->apga); +} + +static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys, + const struct user_pac_generic_keys *ukeys) +{ + keys->apga = pac_key_from_user(ukeys->apgakey); +} + +static int pac_generic_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct ptrauth_keys_user *keys = 
&target->thread.keys_user; + struct user_pac_generic_keys user_keys; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + + return membuf_write(&to, &user_keys, sizeof(user_keys)); +} + +static int pac_generic_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + int ret; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_generic_keys_from_user(keys, &user_keys); + + return 0; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_ARM64_PTR_AUTH */ + +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -711,6 +1361,24 @@ enum aarch64_regset { REGSET_HW_WATCH, #endif REGSET_SYSTEM_CALL, +#ifdef CONFIG_ARM64_SVE + REGSET_SVE, +#endif +#ifdef CONFIG_ARM64_SVE + REGSET_SSVE, + REGSET_ZA, +#endif +#ifdef CONFIG_ARM64_PTR_AUTH + REGSET_PAC_MASK, + REGSET_PAC_ENABLED_KEYS, +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_PACA_KEYS, + REGSET_PACG_KEYS, +#endif +#endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + REGSET_TAGGED_ADDR_CTRL, +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -719,7 +1387,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), - .get = gpr_get, + .regset_get = gpr_get, .set = gpr_set }, [REGSET_FPR] = { @@ -731,15 +1399,16 @@ static const struct user_regset aarch64_regsets[] = { */ .size = sizeof(u32), .align = sizeof(u32), - .get = fpr_get, + .active = fpr_active, + .regset_get = fpr_get, .set = fpr_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, - .n = 1, + .n = 2, .size = sizeof(void *), .align = sizeof(void *), - .get = tls_get, + .regset_get = tls_get, .set = tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -748,7 +1417,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { @@ -756,7 +1425,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif @@ -765,9 +1434,93 @@ static const struct user_regset aarch64_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, +#ifdef CONFIG_ARM64_SVE + [REGSET_SVE] = { /* 
Scalable Vector Extension */ + .core_note_type = NT_ARM_SVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = sve_get, + .set = sve_set, + }, +#endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + .core_note_type = NT_ARM_SSVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, + [REGSET_ZA] = { /* SME ZA */ + .core_note_type = NT_ARM_ZA, + /* + * ZA is a single register but it's variably sized and + * the ptrace core requires that the size of any data + * be an exact multiple of the configured register + * size so report as though we had SVE_VQ_BYTES + * registers. These values aren't exposed to + * userspace. + */ + .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, +#endif +#ifdef CONFIG_ARM64_PTR_AUTH + [REGSET_PAC_MASK] = { + .core_note_type = NT_ARM_PAC_MASK, + .n = sizeof(struct user_pac_mask) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = pac_mask_get, + /* this cannot be set dynamically */ + }, + [REGSET_PAC_ENABLED_KEYS] = { + .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = pac_enabled_keys_get, + .set = pac_enabled_keys_set, + }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_PACA_KEYS] = { + .core_note_type = NT_ARM_PACA_KEYS, + .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .regset_get = pac_address_keys_get, + .set = pac_address_keys_set, + }, + [REGSET_PACG_KEYS] = { + .core_note_type = NT_ARM_PACG_KEYS, + .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .regset_get = pac_generic_keys_get, + .set = pac_generic_keys_set, + }, +#endif +#endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { @@ -776,63 +1529,36 @@ static const struct user_regset_view user_aarch64_view = { }; #ifdef CONFIG_COMPAT -#include <linux/compat.h> - enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, }; +static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx) +{ + struct pt_regs *regs = task_pt_regs(task); + + switch (idx) { + case 15: + return regs->pc; + case 16: + return pstate_to_compat_psr(regs->pstate); + case 17: + return regs->orig_x0; + default: + return regs->regs[idx]; + } +} + static int compat_gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - int ret = 0; - unsigned int i, start, num_regs; - - /* Calculate the number of AArch32 registers contained in count */ - num_regs = count / regset->size; - - /* Convert pos into an register number */ - start = pos / regset->size; - - if (start + num_regs > regset->n) - return -EIO; - - for (i = 0; i < num_regs; ++i) { - unsigned int idx = start + i; - compat_ulong_t reg; + int i = 0; - switch (idx) { - case 15: - reg = 
task_pt_regs(target)->pc; - break; - case 16: - reg = task_pt_regs(target)->pstate; - break; - case 17: - reg = task_pt_regs(target)->orig_x0; - break; - default: - reg = task_pt_regs(target)->regs[idx]; - } - - if (kbuf) { - memcpy(kbuf, ®, sizeof(reg)); - kbuf += sizeof(reg); - } else { - ret = copy_to_user(ubuf, ®, sizeof(reg)); - if (ret) { - ret = -EFAULT; - break; - } - - ubuf += sizeof(reg); - } - } - - return ret; + while (to.left) + membuf_store(&to, compat_get_user_reg(target, i++)); + return 0; } static int compat_gpr_set(struct task_struct *target, @@ -877,6 +1603,7 @@ static int compat_gpr_set(struct task_struct *target, newregs.pc = reg; break; case 16: + reg = compat_psr_to_pstate(reg); newregs.pstate = reg; break; case 17: @@ -898,14 +1625,15 @@ static int compat_gpr_set(struct task_struct *target, static int compat_vfp_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; - int ret, vregs_end_pos; - uregs = &target->thread.fpsimd_state.user_fpsimd; + if (!system_supports_fpsimd()) + return -EINVAL; + + uregs = &target->thread.uw.fpsimd_state; if (target == current) fpsimd_preserve_current_state(); @@ -914,19 +1642,10 @@ static int compat_vfp_get(struct task_struct *target, * The VFP registers are packed into the fpsimd_state, so they all sit * nicely together for us. We just need to create the fpscr separately. */ - vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, - 0, vregs_end_pos); - - if (count && !ret) { - fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | - (uregs->fpcr & VFP_FPSCR_CTRL_MASK); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpscr, - vregs_end_pos, VFP_STATE_SIZE); - } - - return ret; + membuf_write(&to, uregs, VFP_STATE_SIZE - sizeof(compat_ulong_t)); + fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | + (uregs->fpcr & VFP_FPSCR_CTRL_MASK); + return membuf_store(&to, fpscr); } static int compat_vfp_set(struct task_struct *target, @@ -938,7 +1657,10 @@ static int compat_vfp_set(struct task_struct *target, compat_ulong_t fpscr; int ret, vregs_end_pos; - uregs = &target->thread.fpsimd_state.user_fpsimd; + if (!system_supports_fpsimd()) + return -EINVAL; + + uregs = &target->thread.uw.fpsimd_state; vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, uregs, 0, @@ -958,11 +1680,10 @@ static int compat_vfp_set(struct task_struct *target, } static int compat_tls_get(struct task_struct *target, - const struct user_regset *regset, unsigned int pos, - unsigned int count, void *kbuf, void __user *ubuf) + const struct user_regset *regset, + struct membuf to) { - compat_ulong_t tls = (compat_ulong_t)target->thread.tp_value; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); + return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value); } static int compat_tls_set(struct task_struct *target, @@ -971,13 +1692,13 @@ static int compat_tls_set(struct task_struct *target, const void __user *ubuf) { int ret; - compat_ulong_t tls = target->thread.tp_value; + compat_ulong_t tls = target->thread.uw.tp_value; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); if (ret) return ret; - target->thread.tp_value = tls; + target->thread.uw.tp_value = tls; return ret; } @@ -987,7 +1708,7 @@ static const struct user_regset aarch32_regsets[] = { .n = 
COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), - .get = compat_gpr_get, + .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { @@ -995,7 +1716,8 @@ static const struct user_regset aarch32_regsets[] = { .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_vfp_get, + .active = fpr_active, + .regset_get = compat_vfp_get, .set = compat_vfp_set }, }; @@ -1011,7 +1733,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), - .get = compat_gpr_get, + .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_FPR] = { @@ -1019,7 +1741,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_vfp_get, + .regset_get = compat_vfp_get, .set = compat_vfp_set }, [REGSET_TLS] = { @@ -1027,7 +1749,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_tls_get, + .regset_get = compat_tls_get, .set = compat_tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1036,7 +1758,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { @@ -1044,7 +1766,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif @@ -1053,7 +1775,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, }; @@ -1078,9 +1800,7 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, else if (off == COMPAT_PT_TEXT_END_ADDR) tmp = tsk->mm->end_code; else if (off < sizeof(compat_elf_gregset_t)) - return copy_regset_to_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), ret); + tmp = compat_get_user_reg(tsk, off >> 2); else if (off >= COMPAT_USER_SZ) return -EIO; else @@ -1092,8 +1812,8 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t val) { - int ret; - mm_segment_t old_fs = get_fs(); + struct pt_regs newregs = *task_pt_regs(tsk); + unsigned int idx = off / 4; if (off & 3 || off >= COMPAT_USER_SZ) return -EIO; @@ -1101,14 +1821,25 @@ static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, if (off >= sizeof(compat_elf_gregset_t)) return 0; - set_fs(KERNEL_DS); - ret = copy_regset_from_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), - &val); - set_fs(old_fs); + switch (idx) { + case 15: + newregs.pc = val; + break; + case 16: + newregs.pstate = compat_psr_to_pstate(val); + break; + case 17: + newregs.orig_x0 = val; + break; + default: + newregs.regs[idx] = val; + } + + if (!valid_user_regs(&newregs.user_regs, tsk)) + return -EINVAL; - return ret; + *task_pt_regs(tsk) = newregs; + return 0; } #ifdef 
CONFIG_HAVE_HW_BREAKPOINT @@ -1155,7 +1886,7 @@ static int compat_ptrace_hbp_get(unsigned int note_type, u64 addr = 0; u32 ctrl = 0; - int err, idx = compat_ptrace_hbp_num_to_idx(num);; + int err, idx = compat_ptrace_hbp_num_to_idx(num); if (num & 1) { err = ptrace_hbp_get_addr(note_type, tsk, idx, &addr); @@ -1194,9 +1925,7 @@ static int compat_ptrace_gethbpregs(struct task_struct *tsk, compat_long_t num, { int ret; u32 kdata; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); /* Watchpoint */ if (num < 0) { ret = compat_ptrace_hbp_get(NT_ARM_HW_WATCH, tsk, num, &kdata); @@ -1207,7 +1936,6 @@ static int compat_ptrace_gethbpregs(struct task_struct *tsk, compat_long_t num, } else { ret = compat_ptrace_hbp_get(NT_ARM_HW_BREAK, tsk, num, &kdata); } - set_fs(old_fs); if (!ret) ret = put_user(kdata, data); @@ -1220,7 +1948,6 @@ static int compat_ptrace_sethbpregs(struct task_struct *tsk, compat_long_t num, { int ret; u32 kdata = 0; - mm_segment_t old_fs = get_fs(); if (num == 0) return 0; @@ -1229,12 +1956,10 @@ static int compat_ptrace_sethbpregs(struct task_struct *tsk, compat_long_t num, if (ret) return ret; - set_fs(KERNEL_DS); if (num < 0) ret = compat_ptrace_hbp_set(NT_ARM_HW_WATCH, tsk, num, &kdata); else ret = compat_ptrace_hbp_set(NT_ARM_HW_BREAK, tsk, num, &kdata); - set_fs(old_fs); return ret; } @@ -1274,7 +1999,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; case COMPAT_PTRACE_GET_THREAD_AREA: - ret = put_user((compat_ulong_t)child->thread.tp_value, + ret = put_user((compat_ulong_t)child->thread.uw.tp_value, (compat_ulong_t __user *)datap); break; @@ -1339,6 +2064,12 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + switch (request) { + case PTRACE_PEEKMTETAGS: + case PTRACE_POKEMTETAGS: + return mte_ptrace_copy_tags(child, request, addr, data); + } + return ptrace_request(child, request, addr, data); } @@ -1347,36 +2078,63 @@ enum ptrace_syscall_dir { PTRACE_SYSCALL_EXIT, }; -static void tracehook_report_syscall(struct pt_regs *regs, - enum ptrace_syscall_dir dir) +static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) { int regno; unsigned long saved_reg; /* - * A scratch register (ip(r12) on AArch32, x7 on AArch64) is - * used to denote syscall entry/exit: + * We have some ABI weirdness here in the way that we handle syscall + * exit stops because we indicate whether or not the stop has been + * signalled from syscall entry or syscall exit by clobbering a general + * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee + * and restoring its old value after the stop. This means that: + * + * - Any writes by the tracer to this register during the stop are + * ignored/discarded. + * + * - The actual value of the register is not available during the stop, + * so the tracer cannot save it and restore it later. + * + * - Syscall stops behave differently to seccomp and pseudo-step traps + * (the latter do not nobble any registers). */ regno = (is_compat_task() ? 
12 : 7); saved_reg = regs->regs[regno]; regs->regs[regno] = dir; - if (dir == PTRACE_SYSCALL_EXIT) - tracehook_report_syscall_exit(regs, 0); - else if (tracehook_report_syscall_entry(regs)) - forget_syscall(regs); + if (dir == PTRACE_SYSCALL_ENTER) { + if (ptrace_report_syscall_entry(regs)) + forget_syscall(regs); + regs->regs[regno] = saved_reg; + } else if (!test_thread_flag(TIF_SINGLESTEP)) { + ptrace_report_syscall_exit(regs, 0); + regs->regs[regno] = saved_reg; + } else { + regs->regs[regno] = saved_reg; - regs->regs[regno] = saved_reg; + /* + * Signal a pseudo-step exception since we are stepping but + * tracer modifications to the registers may have rewound the + * state machine. + */ + ptrace_report_syscall_exit(regs, 1); + } } -asmlinkage int syscall_trace_enter(struct pt_regs *regs) +int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + unsigned long flags = read_thread_flags(); + + if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { + report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (flags & _TIF_SYSCALL_EMU) + return NO_SYSCALL; + } /* Do the secure computing after ptrace; failures should be fast. */ - if (secure_computing(NULL) == -1) - return -1; + if (secure_computing() == -1) + return NO_SYSCALL; if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); @@ -1387,27 +2145,36 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) return regs->syscallno; } -asmlinkage void syscall_trace_exit(struct pt_regs *regs) +void syscall_trace_exit(struct pt_regs *regs) { + unsigned long flags = read_thread_flags(); + audit_syscall_exit(regs); - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_exit(regs, regs_return_value(regs)); + if (flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) + report_syscall(regs, PTRACE_SYSCALL_EXIT); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); + rseq_syscall(regs); } /* - * Bits which are always architecturally RES0 per ARM DDI 0487A.h + * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. + * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is + * not described in ARM DDI 0487D.a. + * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may + * be allocated an EL0 meaning in future. * Userspace cannot use these until they have an architectural meaning. + * Note that this follows the SPSR_ELx format, not the AArch32 PSR format. * We also reserve IL for the kernel; SS is handled dynamically. 
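(Editor's aside, not part of the diff.) The report_syscall() comment above describes x7 (r12 for AArch32) being clobbered with the stop direction; a tracer can observe this with PTRACE_GETREGSET/NT_PRSTATUS. A sketch assuming an arm64 host and a tracee already stopped via PTRACE_SYSCALL:

/* Editor's illustration only -- not part of this patch. */
#include <stdio.h>
#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/elf.h>
#include <asm/ptrace.h>		/* struct user_pt_regs (arm64 uapi) */

void show_syscall_stop(pid_t pid)
{
	struct user_pt_regs regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, pid,
		   (void *)(uintptr_t)NT_PRSTATUS, &iov))
		return;

	/*
	 * x7 reads as 0 at syscall-entry stops and 1 at syscall-exit stops;
	 * anything the tracer writes to it during the stop is discarded.
	 */
	printf("x8 (syscall nr) = %llu, x7 (direction) = %llu\n",
	       (unsigned long long)regs.regs[8],
	       (unsigned long long)regs.regs[7]);
}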
*/ #define SPSR_EL1_AARCH64_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(27, 22) | GENMASK_ULL(20, 10) | \ - GENMASK_ULL(5, 5)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(24, 22) | GENMASK_ULL(20,20)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) static int valid_compat_regs(struct user_pt_regs *regs) { @@ -1415,15 +2182,15 @@ static int valid_compat_regs(struct user_pt_regs *regs) if (!system_supports_mixed_endian_el0()) { if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) - regs->pstate |= COMPAT_PSR_E_BIT; + regs->pstate |= PSR_AA32_E_BIT; else - regs->pstate &= ~COMPAT_PSR_E_BIT; + regs->pstate &= ~PSR_AA32_E_BIT; } if (user_mode(regs) && (regs->pstate & PSR_MODE32_BIT) && - (regs->pstate & COMPAT_PSR_A_BIT) == 0 && - (regs->pstate & COMPAT_PSR_I_BIT) == 0 && - (regs->pstate & COMPAT_PSR_F_BIT) == 0) { + (regs->pstate & PSR_AA32_A_BIT) == 0 && + (regs->pstate & PSR_AA32_I_BIT) == 0 && + (regs->pstate & PSR_AA32_F_BIT) == 0) { return 1; } @@ -1431,11 +2198,11 @@ static int valid_compat_regs(struct user_pt_regs *regs) * Force PSR to a valid 32-bit EL0t, preserving the same bits as * arch/arm. */ - regs->pstate &= COMPAT_PSR_N_BIT | COMPAT_PSR_Z_BIT | - COMPAT_PSR_C_BIT | COMPAT_PSR_V_BIT | - COMPAT_PSR_Q_BIT | COMPAT_PSR_IT_MASK | - COMPAT_PSR_GE_MASK | COMPAT_PSR_E_BIT | - COMPAT_PSR_T_BIT; + regs->pstate &= PSR_AA32_N_BIT | PSR_AA32_Z_BIT | + PSR_AA32_C_BIT | PSR_AA32_V_BIT | + PSR_AA32_Q_BIT | PSR_AA32_IT_MASK | + PSR_AA32_GE_MASK | PSR_AA32_E_BIT | + PSR_AA32_T_BIT; regs->pstate |= PSR_MODE32_BIT; return 0; @@ -1465,8 +2232,8 @@ static int valid_native_regs(struct user_pt_regs *regs) */ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SINGLESTEP)) - regs->pstate &= ~DBG_SPSR_SS; + /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */ + user_regs_reset_single_step(regs, task); if (is_compat_thread(task_thread_info(task))) return valid_compat_regs(regs); diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c index c124752a8bd3..99f2ffe9fc05 100644 --- a/arch/arm64/kernel/reloc_test_core.c +++ b/arch/arm64/kernel/reloc_test_core.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
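(Editor's aside, not part of the diff.) As a worked check of the SPSR_EL1_*_RES0_BITS arithmetic above: GENMASK_ULL(h, l) sets bits h down to l inclusive, so the AArch64 mask can be recomputed in plain C:

/* Editor's illustration only -- not part of this patch. */
#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>

/* Userspace re-implementation of the kernel's GENMASK_ULL(). */
static uint64_t genmask_ull(unsigned int h, unsigned int l)
{
	return (~0ULL << l) & (~0ULL >> (63 - h));
}

int main(void)
{
	uint64_t res0 = genmask_ull(63, 32) | genmask_ull(27, 26) |
			genmask_ull(23, 22) | genmask_ull(20, 13) |
			genmask_ull(5, 5);

	/* Expected value: 0xffffffff0cdfe020 */
	printf("SPSR_EL1_AARCH64_RES0_BITS = %#" PRIx64 "\n", res0);
	return 0;
}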
- * */ #include <linux/module.h> @@ -28,6 +24,7 @@ asmlinkage u64 absolute_data16(void); asmlinkage u64 signed_movw(void); asmlinkage u64 unsigned_movw(void); asmlinkage u64 relative_adrp(void); +asmlinkage u64 relative_adrp_far(void); asmlinkage u64 relative_adr(void); asmlinkage u64 relative_data64(void); asmlinkage u64 relative_data32(void); @@ -43,16 +40,15 @@ static struct { { "R_AARCH64_ABS16", absolute_data16, UL(SYM16_ABS_VAL) }, { "R_AARCH64_MOVW_SABS_Gn", signed_movw, UL(SYM64_ABS_VAL) }, { "R_AARCH64_MOVW_UABS_Gn", unsigned_movw, UL(SYM64_ABS_VAL) }, -#ifndef CONFIG_ARM64_ERRATUM_843419 { "R_AARCH64_ADR_PREL_PG_HI21", relative_adrp, (u64)&sym64_rel }, -#endif + { "R_AARCH64_ADR_PREL_PG_HI21", relative_adrp_far, (u64)&memstart_addr }, { "R_AARCH64_ADR_PREL_LO21", relative_adr, (u64)&sym64_rel }, { "R_AARCH64_PREL64", relative_data64, (u64)&sym64_rel }, { "R_AARCH64_PREL32", relative_data32, (u64)&sym64_rel }, { "R_AARCH64_PREL16", relative_data16, (u64)&sym64_rel }, }; -static int reloc_test_init(void) +static int __init reloc_test_init(void) { int i; @@ -71,7 +67,7 @@ static int reloc_test_init(void) return 0; } -static void reloc_test_exit(void) +static void __exit reloc_test_exit(void) { } diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S index e1edcefeb02d..c50f45fa29fa 100644 --- a/arch/arm64/kernel/reloc_test_syms.S +++ b/arch/arm64/kernel/reloc_test_syms.S @@ -1,83 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/linkage.h> -ENTRY(absolute_data64) +SYM_FUNC_START(absolute_data64) ldr x0, 0f ret 0: .quad sym64_abs -ENDPROC(absolute_data64) +SYM_FUNC_END(absolute_data64) -ENTRY(absolute_data32) +SYM_FUNC_START(absolute_data32) ldr w0, 0f ret 0: .long sym32_abs -ENDPROC(absolute_data32) +SYM_FUNC_END(absolute_data32) -ENTRY(absolute_data16) +SYM_FUNC_START(absolute_data16) adr x0, 0f ldrh w0, [x0] ret 0: .short sym16_abs, 0 -ENDPROC(absolute_data16) +SYM_FUNC_END(absolute_data16) -ENTRY(signed_movw) +SYM_FUNC_START(signed_movw) movz x0, #:abs_g2_s:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(signed_movw) +SYM_FUNC_END(signed_movw) -ENTRY(unsigned_movw) +SYM_FUNC_START(unsigned_movw) movz x0, #:abs_g3:sym64_abs movk x0, #:abs_g2_nc:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(unsigned_movw) +SYM_FUNC_END(unsigned_movw) -#ifndef CONFIG_ARM64_ERRATUM_843419 - -ENTRY(relative_adrp) + .align 12 + .space 0xff8 +SYM_FUNC_START(relative_adrp) adrp x0, sym64_rel add x0, x0, #:lo12:sym64_rel ret -ENDPROC(relative_adrp) +SYM_FUNC_END(relative_adrp) -#endif + .align 12 + .space 0xffc +SYM_FUNC_START(relative_adrp_far) + adrp x0, memstart_addr + add x0, x0, #:lo12:memstart_addr + ret +SYM_FUNC_END(relative_adrp_far) -ENTRY(relative_adr) +SYM_FUNC_START(relative_adr) adr x0, sym64_rel ret -ENDPROC(relative_adr) +SYM_FUNC_END(relative_adr) -ENTRY(relative_data64) +SYM_FUNC_START(relative_data64) adr x1, 0f ldr x0, [x1] add x0, x0, x1 ret 0: .quad sym64_rel - . -ENDPROC(relative_data64) +SYM_FUNC_END(relative_data64) -ENTRY(relative_data32) +SYM_FUNC_START(relative_data32) adr x1, 0f ldr w0, [x1] add x0, x0, x1 ret 0: .long sym64_rel - . 
-ENDPROC(relative_data32) +SYM_FUNC_END(relative_data32) -ENTRY(relative_data16) +SYM_FUNC_START(relative_data16) adr x1, 0f ldrsh w0, [x1] add x0, x0, x1 ret 0: .short sym64_rel - ., 0 -ENDPROC(relative_data16) +SYM_FUNC_END(relative_data16) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index ce704a4aeadd..413f899e4ac6 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -1,12 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * kexec for arm64 * * Copyright (C) Linaro. * Copyright (C) Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * Copyright (C) 2021, Microsoft Corporation. + * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kexec.h> @@ -16,115 +15,86 @@ #include <asm/kexec.h> #include <asm/page.h> #include <asm/sysreg.h> +#include <asm/virt.h> + +.macro turn_off_mmu tmp1, tmp2 + mov_q \tmp1, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el1, \tmp1 + isb +.endm +.section ".kexec_relocate.text", "ax" /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. * - * The memory that the old kernel occupies may be overwritten when coping the + * The memory that the old kernel occupies may be overwritten when copying the * new image to its final location. To assure that the * arm64_relocate_new_kernel routine which does that copy is not overwritten, * all code and data needed by arm64_relocate_new_kernel must be between the * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec - * control_code_page, a special page which has been set up to be preserved - * during the copy operation. + * safe memory that has been set up to be preserved during the copy operation. */ -ENTRY(arm64_relocate_new_kernel) +SYM_CODE_START(arm64_relocate_new_kernel) + /* + * The kimage structure isn't allocated specially and may be clobbered + * during relocation. We must load any values we need from it prior to + * any relocation occurring. + */ + ldr x28, [x0, #KIMAGE_START] + ldr x27, [x0, #KIMAGE_ARCH_EL2_VECTORS] + ldr x26, [x0, #KIMAGE_ARCH_DTB_MEM] /* Setup the list loop variables. */ - mov x17, x1 /* x17 = kimage_start */ - mov x16, x0 /* x16 = kimage_head */ - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ - mov x14, xzr /* x14 = entry ptr */ - mov x13, xzr /* x13 = copy dest */ - - /* Clear the sctlr_el2 flags. */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - ldr x1, =SCTLR_ELx_FLAGS - bic x0, x0, x1 - msr sctlr_el2, x0 - isb -1: - - /* Check if the new image needs relocation. */ - tbnz x16, IND_DONE_BIT, .Ldone - + ldr x18, [x0, #KIMAGE_ARCH_ZERO_PAGE] /* x18 = zero page for BBM */ + ldr x17, [x0, #KIMAGE_ARCH_TTBR1] /* x17 = linear map copy */ + ldr x16, [x0, #KIMAGE_HEAD] /* x16 = kimage_head */ + ldr x22, [x0, #KIMAGE_ARCH_PHYS_OFFSET] /* x22 phys_offset */ + raw_dcache_line_size x15, x1 /* x15 = dcache line size */ + break_before_make_ttbr_switch x18, x17, x1, x2 /* set linear map */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ - + sub x12, x12, x22 /* Convert x12 to virt */ /* Test the entry flags. */ .Ltest_source: tbz x16, IND_SOURCE_BIT, .Ltest_indirection /* Invalidate dest page to PoC. 
*/ - mov x0, x13 - add x20, x0, #PAGE_SIZE - sub x1, x15, #1 - bic x0, x0, x1 -2: dc ivac, x0 - add x0, x0, x15 - cmp x0, x20 - b.lo 2b - dsb sy - - mov x20, x13 - mov x21, x12 - copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7 - - /* dest += PAGE_SIZE */ - add x13, x13, PAGE_SIZE + mov x19, x13 + copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 + add x1, x19, #PAGE_SIZE + dcache_by_myline_op civac, sy, x19, x1, x15, x20 b .Lnext - .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination - - /* ptr = addr */ - mov x14, x12 + mov x14, x12 /* ptr = addr */ b .Lnext - .Ltest_destination: tbz x16, IND_DESTINATION_BIT, .Lnext - - /* dest = addr */ - mov x13, x12 - + mov x13, x12 /* dest = addr */ .Lnext: - /* entry = *ptr++ */ - ldr x16, [x14], #8 - - /* while (!(entry & DONE)) */ - tbz x16, IND_DONE_BIT, .Lloop - -.Ldone: + ldr x16, [x14], #8 /* entry = *ptr++ */ + tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */ /* wait for writes from copy_page to finish */ dsb nsh ic iallu dsb nsh isb + turn_off_mmu x12, x13 /* Start new image. */ - mov x0, xzr + cbz x27, .Lel1 + mov x1, x28 /* kernel entry point */ + mov x2, x26 /* dtb address */ + mov x3, xzr + mov x4, xzr + mov x0, #HVC_SOFT_RESTART + hvc #0 /* Jumps from el2 */ +.Lel1: + mov x0, x26 /* dtb address */ mov x1, xzr mov x2, xzr mov x3, xzr - br x17 - -ENDPROC(arm64_relocate_new_kernel) - -.ltorg - -.align 3 /* To keep the 64-bit values below naturally aligned. */ - -.Lcopy_end: -.org KEXEC_CONTROL_PAGE_SIZE - -/* - * arm64_relocate_new_kernel_size - Number of bytes to copy to the - * control_code_page. - */ -.globl arm64_relocate_new_kernel_size -arm64_relocate_new_kernel_size: - .quad .Lcopy_end - arm64_relocate_new_kernel + br x28 /* Jumps from el1 */ +SYM_CODE_END(arm64_relocate_new_kernel) diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 933adbc0f654..68330017d04f 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -1,53 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/return_address.c * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
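The hunk above rewrites the relocation loop, but the entry-list format it walks is unchanged. The sketch below is a C model of that walk for reference, assuming the usual kernel PAGE_SIZE/PAGE_MASK definitions; the IND_* values mirror include/linux/kexec.h, copy_page_phys() is an invented stand-in for the copy_page/dcache sequence, and the phys-to-virt adjustment and cache maintenance done by the real code are omitted.

/* C model of the entry-list walk performed by arm64_relocate_new_kernel. */
#define IND_DESTINATION	(1UL << 0)	/* entry sets the copy destination */
#define IND_INDIRECTION	(1UL << 1)	/* entry points at more entries */
#define IND_DONE	(1UL << 2)	/* end of the list */
#define IND_SOURCE	(1UL << 3)	/* entry is a source page to copy */

static void kimage_walk_model(unsigned long head)
{
	unsigned long entry = head;
	unsigned long dest = 0;
	unsigned long *ptr = NULL;

	while (!(entry & IND_DONE)) {
		unsigned long addr = entry & PAGE_MASK;

		if (entry & IND_DESTINATION)
			dest = addr;			/* next copy target */
		else if (entry & IND_INDIRECTION)
			ptr = (unsigned long *)addr;	/* next page of entries */
		else if (entry & IND_SOURCE) {
			copy_page_phys(dest, addr);	/* invented helper */
			dest += PAGE_SIZE;
		}

		entry = *ptr++;				/* "entry = *ptr++" in the asm */
	}
}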
*/ #include <linux/export.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> +#include <linux/stacktrace.h> #include <asm/stack_pointer.h> -#include <asm/stacktrace.h> struct return_address_data { unsigned int level; void *addr; }; -static int save_return_addr(struct stackframe *frame, void *d) +static bool save_return_addr(void *d, unsigned long pc) { struct return_address_data *data = d; if (!data->level) { - data->addr = (void *)frame->pc; - return 1; + data->addr = (void *)pc; + return false; } else { --data->level; - return 0; + return true; } } +NOKPROBE_SYMBOL(save_return_addr); void *return_address(unsigned int level) { struct return_address_data data; - struct stackframe frame; data.level = level + 2; data.addr = NULL; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)return_address; /* dummy */ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = current->curr_ret_stack; -#endif - - walk_stackframe(current, &frame, save_return_addr, &data); + arch_stack_walk(save_return_addr, &data, current, NULL); if (!data.level) return data.addr; @@ -55,3 +47,4 @@ void *return_address(unsigned int level) return NULL; } EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c new file mode 100644 index 000000000000..d56e170e1ca7 --- /dev/null +++ b/arch/arm64/kernel/sdei.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#define pr_fmt(fmt) "sdei: " fmt + +#include <linux/arm-smccc.h> +#include <linux/arm_sdei.h> +#include <linux/hardirq.h> +#include <linux/irqflags.h> +#include <linux/sched/task_stack.h> +#include <linux/scs.h> +#include <linux/uaccess.h> + +#include <asm/alternative.h> +#include <asm/exception.h> +#include <asm/kprobes.h> +#include <asm/mmu.h> +#include <asm/ptrace.h> +#include <asm/sections.h> +#include <asm/stacktrace.h> +#include <asm/sysreg.h> +#include <asm/vmap_stack.h> + +unsigned long sdei_exit_mode; + +/* + * VMAP'd stacks checking for stack overflow on exception using sp as a scratch + * register, meaning SDEI has to switch to its own stack. We need two stacks as + * a critical event may interrupt a normal event that has just taken a + * synchronous exception, and is using sp as scratch register. For a critical + * event interrupting a normal event, we can't reliably tell if we were on the + * sdei stack. + * For now, we allocate stacks when the driver is probed. 
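save_return_addr() above now follows the arch_stack_walk() callback contract: return true to keep unwinding, false to stop. A sketch of another consumer using the same contract is below; the buffer type, its size and dump_current_stack() are illustrative, not part of the patch.

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

struct trace_buf {
	unsigned long entries[16];
	unsigned int nr;
};

/* Same contract as save_return_addr(): true = keep walking, false = stop. */
static bool collect_entry(void *cookie, unsigned long pc)
{
	struct trace_buf *buf = cookie;

	buf->entries[buf->nr++] = pc;
	return buf->nr < ARRAY_SIZE(buf->entries);
}

static void dump_current_stack(void)
{
	struct trace_buf buf = { .nr = 0 };
	unsigned int i;

	arch_stack_walk(collect_entry, &buf, current, NULL);

	for (i = 0; i < buf.nr; i++)
		pr_info("  %pS\n", (void *)buf.entries[i]);
}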
+ */ +DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); + +#ifdef CONFIG_VMAP_STACK +DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); +#endif + +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); +#endif + +static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = per_cpu(*ptr, cpu); + if (p) { + per_cpu(*ptr, cpu) = NULL; + vfree(p); + } +} + +static void free_sdei_stacks(void) +{ + int cpu; + + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return; + + for_each_possible_cpu(cpu) { + _free_sdei_stack(&sdei_stack_normal_ptr, cpu); + _free_sdei_stack(&sdei_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = arch_alloc_vmap_stack(SDEI_STACK_SIZE, cpu_to_node(cpu)); + if (!p) + return -ENOMEM; + per_cpu(*ptr, cpu) = p; + + return 0; +} + +static int init_sdei_stacks(void) +{ + int cpu; + int err = 0; + + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_stack(&sdei_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_stacks(); + + return err; +} + +static void _free_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = per_cpu(*ptr, cpu); + if (s) { + per_cpu(*ptr, cpu) = NULL; + scs_free(s); + } +} + +static void free_sdei_scs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + _free_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + _free_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = scs_alloc(cpu_to_node(cpu)); + if (!s) + return -ENOMEM; + per_cpu(*ptr, cpu) = s; + + return 0; +} + +static int init_sdei_scs(void) +{ + int cpu; + int err = 0; + + if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + return 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_scs(); + + return err; +} + +unsigned long sdei_arch_get_entry_point(int conduit) +{ + /* + * SDEI works between adjacent exception levels. If we booted at EL1 we + * assume a hypervisor is marshalling events. If we booted at EL2 and + * dropped to EL1 because we don't support VHE, then we can't support + * SDEI. + */ + if (is_hyp_nvhe()) { + pr_err("Not supported on this hardware/boot configuration\n"); + goto out_err; + } + + if (init_sdei_stacks()) + goto out_err; + + if (init_sdei_scs()) + goto out_err_free_stacks; + + sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? 
SDEI_EXIT_HVC : SDEI_EXIT_SMC; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + if (arm64_kernel_unmapped_at_el0()) { + unsigned long offset; + + offset = (unsigned long)__sdei_asm_entry_trampoline - + (unsigned long)__entry_tramp_text_start; + return TRAMP_VALIAS + offset; + } else +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + return (unsigned long)__sdei_asm_handler; + +out_err_free_stacks: + free_sdei_stacks(); +out_err: + return 0; +} + +/* + * do_sdei_event() returns one of: + * SDEI_EV_HANDLED - success, return to the interrupted context. + * SDEI_EV_FAILED - failure, return this error code to firmare. + * virtual-address - success, return to this address. + */ +unsigned long __kprobes do_sdei_event(struct pt_regs *regs, + struct sdei_registered_event *arg) +{ + u32 mode; + int i, err = 0; + int clobbered_registers = 4; + u64 elr = read_sysreg(elr_el1); + u32 kernel_mode = read_sysreg(CurrentEL) | 1; /* +SPSel */ + unsigned long vbar = read_sysreg(vbar_el1); + + if (arm64_kernel_unmapped_at_el0()) + clobbered_registers++; + + /* Retrieve the missing registers values */ + for (i = 0; i < clobbered_registers; i++) { + /* from within the handler, this call always succeeds */ + sdei_api_event_context(i, ®s->regs[i]); + } + + err = sdei_event_handler(regs, arg); + if (err) + return SDEI_EV_FAILED; + + if (elr != read_sysreg(elr_el1)) { + /* + * We took a synchronous exception from the SDEI handler. + * This could deadlock, and if you interrupt KVM it will + * hyp-panic instead. + */ + pr_warn("unsafe: exception during handler\n"); + } + + mode = regs->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK); + + /* + * If we interrupted the kernel with interrupts masked, we always go + * back to wherever we came from. + */ + if (mode == kernel_mode && !interrupts_enabled(regs)) + return SDEI_EV_HANDLED; + + /* + * Otherwise, we pretend this was an IRQ. This lets user space tasks + * receive signals before we return to them, and KVM to invoke it's + * world switch to do the same. + * + * See DDI0487B.a Table D1-7 'Vector offsets from vector table base + * address'. + */ + if (mode == kernel_mode) + return vbar + 0x280; + else if (mode & PSR_MODE32_BIT) + return vbar + 0x680; + + return vbar + 0x480; +} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index d4b740538ad5..fea3223704b6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/setup.c * * Copyright (C) 1995-2001 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
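do_sdei_event() above funnels firmware-delivered events into handlers registered through the SDEI driver. The sketch below shows the registration side, assuming the include/linux/arm_sdei.h API; the event number is a placeholder, since real bound-event numbers come from firmware, and the handler body is illustrative.

#include <linux/arm_sdei.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <asm/ptrace.h>

#define EXAMPLE_SDEI_EVENT	0x40000000	/* placeholder event number */

/* Called from do_sdei_event(): effectively NMI context, keep it minimal. */
static int example_sdei_handler(u32 event, struct pt_regs *regs, void *arg)
{
	pr_crit("SDEI event %u taken at pc %llx\n", event, regs->pc);
	return 0;
}

static int __init example_sdei_init(void)
{
	int err = sdei_event_register(EXAMPLE_SDEI_EVENT, example_sdei_handler,
				      NULL);
	if (err)
		return err;

	return sdei_event_enable(EXAMPLE_SDEI_EVENT);
}
late_initcall(example_sdei_init);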
*/ #include <linux/acpi.h> @@ -23,11 +12,9 @@ #include <linux/stddef.h> #include <linux/ioport.h> #include <linux/delay.h> -#include <linux/utsname.h> #include <linux/initrd.h> #include <linux/console.h> #include <linux/cache.h> -#include <linux/bootmem.h> #include <linux/screen_info.h> #include <linux/init.h> #include <linux/kexec.h> @@ -36,6 +23,7 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> +#include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> @@ -48,6 +36,7 @@ #include <asm/fixmap.h> #include <asm/cpu.h> #include <asm/cputype.h> +#include <asm/daifflags.h> #include <asm/elf.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> @@ -59,11 +48,13 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> -#include <asm/memblock.h> #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> +static int num_standard_resources; +static struct resource *standard_resources; + phys_addr_t __fdt_pointer __initdata; /* @@ -95,15 +86,10 @@ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; - cpu_logical_map(0) = mpidr; + set_cpu_logical_map(0, mpidr); - /* - * clear __my_cpu_offset on boot CPU to avoid hang caused by - * using percpu variable early, for example, lockdep will - * access percpu variable inside lock_release - */ - set_my_cpu_offset(0); - pr_info("Booting Linux on physical CPU 0x%lx\n", (unsigned long)mpidr); + pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", + (unsigned long)mpidr, read_cpuid_id()); } bool arch_match_cpu_phys_id(int cpu, u64 phys_id) @@ -177,22 +163,49 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } +static void *early_fdt_ptr __initdata; + +void __init *get_early_fdt_ptr(void) +{ + return early_fdt_ptr; +} + +asmlinkage void __init early_fdt_map(u64 dt_phys) +{ + int fdt_size; + + early_fixmap_init(); + early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); +} + static void __init setup_machine_fdt(phys_addr_t dt_phys) { - void *dt_virt = fixmap_remap_fdt(dt_phys); + int size; + void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; + if (dt_virt) + memblock_reserve(dt_phys, size); + if (!dt_virt || !early_init_dt_scan(dt_virt)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n" + "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" "\nPlease check your bootloader.", &dt_phys, dt_virt); + /* + * Note that in this _really_ early stage we cannot even BUG() + * or oops, so the least terrible thing to do is cpu_relax(), + * or else we could end-up printing non-initialized data, etc. 
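setup_machine_fdt() above refuses a DTB that is misaligned or larger than 2 MB, and at that point can only spin. A userspace sketch of the same sanity checks, useful before handing a DTB to a bootloader, is below; it assumes only the standard FDT header layout (big-endian magic 0xd00dfeed, totalsize at offset 4), and the struct name is local to the sketch.

#include <stdint.h>
#include <arpa/inet.h>	/* ntohl(): FDT header fields are big-endian */

struct fdt_header_prefix {
	uint32_t magic;		/* 0xd00dfeed */
	uint32_t totalsize;	/* remaining header fields not needed here */
};

static int check_dtb(const struct fdt_header_prefix *hdr,
		     unsigned long load_addr)
{
	if (ntohl(hdr->magic) != 0xd00dfeed)
		return -1;	/* not a flattened device tree */
	if (load_addr & 7)
		return -1;	/* kernel requires 8-byte alignment */
	if (ntohl(hdr->totalsize) > 2 * 1024 * 1024)
		return -1;	/* kernel maps at most 2 MB of DTB */
	return 0;
}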
+ */ while (true) cpu_relax(); } + /* Early fixups are done, map the FDT as read-only now */ + fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + name = of_flat_dt_get_machine_name(); if (!name) return; @@ -205,67 +218,106 @@ static void __init request_standard_resources(void) { struct memblock_region *region; struct resource *res; + unsigned long i = 0; + size_t res_size; - kernel_code.start = __pa_symbol(_text); + kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); + insert_resource(&iomem_resource, &kernel_code); + insert_resource(&iomem_resource, &kernel_data); - for_each_memblock(memory, region) { - res = alloc_bootmem_low(sizeof(*res)); + num_standard_resources = memblock.memory.cnt; + res_size = num_standard_resources * sizeof(*standard_resources); + standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); + if (!standard_resources) + panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + + for_each_mem_region(region) { + res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; + res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; + } + + insert_resource(&iomem_resource, res); + } +} + +static int __init reserve_memblock_reserved_regions(void) +{ + u64 i, j; + + for (i = 0; i < num_standard_resources; ++i) { + struct resource *mem = &standard_resources[i]; + phys_addr_t r_start, r_end, mem_size = resource_size(mem); + + if (!memblock_is_region_reserved(mem->start, mem_size)) + continue; + + for_each_reserved_mem_range(j, &r_start, &r_end) { + resource_size_t start, end; + + start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); + end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end); + + if (start > mem->end || end < mem->start) + continue; + + reserve_region_with_split(mem, start, end, "reserved"); } - res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); - res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; - - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); -#ifdef CONFIG_KEXEC_CORE - /* Userspace will find "Crash kernel" region in /proc/iomem. */ - if (crashk_res.end && crashk_res.start >= res->start && - crashk_res.end <= res->end) - request_resource(res, &crashk_res); -#endif } + + return 0; } +arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... 
NR_CPUS-1] = INVALID_HWID }; -void __init setup_arch(char **cmdline_p) +u64 cpu_logical_map(unsigned int cpu) { - pr_info("Boot CPU: AArch64 Processor [%08x]\n", read_cpuid_id()); + return __cpu_logical_map[cpu]; +} - sprintf(init_utsname()->machine, UTS_MACHINE); - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; +void __init __no_sanitize_address setup_arch(char **cmdline_p) +{ + setup_initial_init_mm(_stext, _etext, _edata, _end); *cmdline_p = boot_command_line; + /* + * If know now we are going to need KPTI then use non-global + * mappings from the start, avoiding the cost of rewriting + * everything later. + */ + arm64_use_ng_mappings = kaslr_requires_kpti(); + early_fixmap_init(); early_ioremap_init(); setup_machine_fdt(__fdt_pointer); + /* + * Initialise the static keys early as they may be enabled by the + * cpufeature code and early parameters. + */ + jump_label_init(); parse_early_param(); /* - * Unmask asynchronous aborts after bringing up possible earlycon. - * (Report possible System Errors once we can report this occurred) + * Unmask asynchronous aborts and fiq after bringing up possible + * earlycon. (Report possible System Errors once we can report this + * occurred). */ - local_async_enable(); + local_daif_restore(DAIF_PROCCTX_NOIRQ); /* * TTBR0 is only used for the identity mapping at this stage. Make it @@ -275,6 +327,10 @@ void __init setup_arch(char **cmdline_p) xen_early_init(); efi_init(); + + if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0) + pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); + arm64_memblock_init(); paging_init(); @@ -300,26 +356,22 @@ void __init setup_arch(char **cmdline_p) else psci_acpi_init(); - cpu_read_bootcpu_ops(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_sw_tags(); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page); + init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" @@ -328,16 +380,24 @@ void __init setup_arch(char **cmdline_p) } } +static inline bool cpu_can_disable(unsigned int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops && ops->cpu_can_disable) + return ops->cpu_can_disable(cpu); +#endif + return false; +} + static int __init topology_init(void) { int i; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); - cpu->hotpluggable = 1; + cpu->hotpluggable = cpu_can_disable(i); register_cpu(cpu, i); } @@ -345,31 +405,36 @@ static int __init topology_init(void) } subsys_initcall(topology_init); -/* - * Dump out kernel offset information on panic. 
- */ -static int dump_kernel_offset(struct notifier_block *self, unsigned long v, - void *p) +static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", offset, KIMAGE_VADDR); + pr_emerg("PHYS_OFFSET: 0x%llx\n", PHYS_OFFSET); } else { pr_emerg("Kernel Offset: disabled\n"); } +} + +static int arm64_panic_block_dump(struct notifier_block *self, + unsigned long v, void *p) +{ + dump_kernel_offset(); + dump_cpu_features(); + dump_mem_limit(); return 0; } -static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset +static struct notifier_block arm64_panic_block = { + .notifier_call = arm64_panic_block_dump }; -static int __init register_kernel_offset_dumper(void) +static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, - &kernel_offset_notifier); + &arm64_panic_block); return 0; } -__initcall(register_kernel_offset_dumper); +device_initcall(register_arm64_panic_block); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 0bdc96c61bc0..9ad911f1647c 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1,36 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -38,7 +28,9 @@ #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> +#include <asm/syscall.h> #include <asm/signal32.h> +#include <asm/traps.h> #include <asm/vdso.h> /* @@ -63,6 +55,8 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long sve_offset; + unsigned long za_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -97,7 +91,7 @@ static size_t sigframe_size(struct rt_sigframe_user_layout const *user) * not taken into account. This limit is not a guarantee and is * NOT ABI. 
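The sve_offset/za_offset records added below are laid out in the __reserved[] area of the signal frame as records chained by size. A userspace sketch of locating the SVE record from a signal handler follows; the structures and magic value are copied from the kernel UAPI (asm/sigcontext.h) so the sketch stays self-contained, and EXTRA_MAGIC handling is omitted for brevity.

#include <signal.h>
#include <stddef.h>
#include <ucontext.h>

/* Local mirrors of struct _aarch64_ctx / struct sve_context and SVE_MAGIC
 * from the kernel's asm/sigcontext.h. */
struct ctx_hdr {
	unsigned int magic;
	unsigned int size;
};

struct sve_ctx {
	struct ctx_hdr head;
	unsigned short vl;	/* vector length in bytes */
	unsigned short flags;
	unsigned short __reserved[2];
};

#define SVE_CTX_MAGIC	0x53564501

static struct sve_ctx *find_sve_record(ucontext_t *uc)
{
	struct ctx_hdr *head = (struct ctx_hdr *)uc->uc_mcontext.__reserved;

	/* Records are chained by size and terminated by a zero magic. A full
	 * parser would also follow an EXTRA_MAGIC record; omitted here. */
	while (head->magic && head->size) {
		if (head->magic == SVE_CTX_MAGIC)
			return (struct sve_ctx *)head;
		head = (struct ctx_hdr *)((char *)head + head->size);
	}

	return NULL;	/* FPSIMD-only frame: no SVE record present */
}

static volatile unsigned short last_signal_vl;

static void example_handler(int sig, siginfo_t *info, void *ucontext)
{
	struct sve_ctx *sve = find_sve_record(ucontext);

	if (sve)
		last_signal_vl = sve->vl;	/* VL in effect for this frame */
}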
*/ -#define SIGFRAME_MAXSZ SZ_64K +#define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) @@ -176,12 +170,10 @@ static void __user *apply_user_offset( static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.uw.fpsimd_state; int err; - /* dump the hardware registers to the fpsimd_state structure */ - fpsimd_preserve_current_state(); - /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -196,7 +188,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) static int restore_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; __u32 magic, size; int err = 0; @@ -214,6 +206,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) __get_user_error(fpsimd.fpsr, &ctx->fpsr, err); __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); + clear_thread_flag(TIF_SVE); + /* load the hardware registers from the fpsimd_state structure */ if (!err) fpsimd_update_current_state(&fpsimd); @@ -221,10 +215,243 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } + struct user_ctxs { struct fpsimd_context __user *fpsimd; + struct sve_context __user *sve; + struct za_context __user *za; }; +#ifdef CONFIG_ARM64_SVE + +static int preserve_sve_context(struct sve_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + u16 flags = 0; + unsigned int vl = task_get_sve_vl(current); + unsigned int vq = 0; + + if (thread_sm_enabled(¤t->thread)) { + vl = task_get_sme_vl(current); + vq = sve_vq_from_vl(vl); + flags |= SVE_SIG_FLAG_SM; + } else if (test_thread_flag(TIF_SVE)) { + vq = sve_vq_from_vl(vl); + } + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(SVE_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + __put_user_error(flags, &ctx->flags, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + /* + * This assumes that the SVE state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, + current->thread.sve_state, + SVE_SIG_REGS_SIZE(vq)); + } + + return err ? 
-EFAULT : 0; +} + +static int restore_sve_fpsimd_context(struct user_ctxs *user) +{ + int err; + unsigned int vl, vq; + struct user_fpsimd_state fpsimd; + struct sve_context sve; + + if (__copy_from_user(&sve, user->sve, sizeof(sve))) + return -EFAULT; + + if (sve.flags & SVE_SIG_FLAG_SM) { + if (!system_supports_sme()) + return -EINVAL; + + vl = task_get_sme_vl(current); + } else { + if (!system_supports_sve()) + return -EINVAL; + + vl = task_get_sve_vl(current); + } + + if (sve.vl != vl) + return -EINVAL; + + if (sve.head.size <= sizeof(*user->sve)) { + clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; + goto fpsimd_only; + } + + vq = sve_vq_from_vl(sve.vl); + + if (sve.head.size < SVE_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + /* + * Careful: we are about __copy_from_user() directly into + * thread.sve_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + + sve_alloc(current, true); + if (!current->thread.sve_state) { + clear_thread_flag(TIF_SVE); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.sve_state, + (char __user const *)user->sve + + SVE_SIG_REGS_OFFSET, + SVE_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + if (sve.flags & SVE_SIG_FLAG_SM) + current->thread.svcr |= SVCR_SM_MASK; + else + set_thread_flag(TIF_SVE); + +fpsimd_only: + /* copy the FP and status/control registers */ + /* restore_sigframe() already checked that user->fpsimd != NULL. */ + err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, + sizeof(fpsimd.vregs)); + __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); + __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + + /* load the hardware registers from the fpsimd_state structure */ + if (!err) + fpsimd_update_current_state(&fpsimd); + + return err ? -EFAULT : 0; +} + +#else /* ! CONFIG_ARM64_SVE */ + +static int restore_sve_fpsimd_context(struct user_ctxs *user) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +/* Turn any non-optimised out attempts to use this into a link error: */ +extern int preserve_sve_context(void __user *ctx); + +#endif /* ! CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +static int preserve_za_context(struct za_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + unsigned int vl = task_get_sme_vl(current); + unsigned int vq; + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + else + vq = 0; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZA_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + /* + * This assumes that the ZA state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, + current->thread.za_state, + ZA_SIG_REGS_SIZE(vq)); + } + + return err ? 
-EFAULT : 0; +} + +static int restore_za_context(struct user_ctxs *user) +{ + int err; + unsigned int vq; + struct za_context za; + + if (__copy_from_user(&za, user->za, sizeof(za))) + return -EFAULT; + + if (za.vl != task_get_sme_vl(current)) + return -EINVAL; + + if (za.head.size <= sizeof(*user->za)) { + current->thread.svcr &= ~SVCR_ZA_MASK; + return 0; + } + + vq = sve_vq_from_vl(za.vl); + + if (za.head.size < ZA_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + /* + * Careful: we are about __copy_from_user() directly into + * thread.za_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + + sme_alloc(current); + if (!current->thread.za_state) { + current->thread.svcr &= ~SVCR_ZA_MASK; + clear_thread_flag(TIF_SME); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.za_state, + (char __user const *)user->za + + ZA_SIG_REGS_OFFSET, + ZA_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + set_thread_flag(TIF_SME); + current->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} +#else /* ! CONFIG_ARM64_SME */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_za_context(void __user *ctx); +extern int restore_za_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_SME */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -237,6 +464,8 @@ static int parse_user_sigframe(struct user_ctxs *user, char const __user *const sfp = (char const __user *)sf; user->fpsimd = NULL; + user->sve = NULL; + user->za = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -274,6 +503,8 @@ static int parse_user_sigframe(struct user_ctxs *user, goto done; case FPSIMD_MAGIC: + if (!system_supports_fpsimd()) + goto invalid; if (user->fpsimd) goto invalid; @@ -287,6 +518,32 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case SVE_MAGIC: + if (!system_supports_sve() && !system_supports_sme()) + goto invalid; + + if (user->sve) + goto invalid; + + if (size < sizeof(*user->sve)) + goto invalid; + + user->sve = (struct sve_context __user *)head; + break; + + case ZA_MAGIC: + if (!system_supports_sme()) + goto invalid; + + if (user->za) + goto invalid; + + if (size < sizeof(*user->za)) + goto invalid; + + user->za = (struct za_context __user *)head; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -343,6 +600,10 @@ static int parse_user_sigframe(struct user_ctxs *user, */ offset = 0; limit = extra_size; + + if (!access_ok(base, limit)) + goto invalid; + continue; default: @@ -359,9 +620,6 @@ static int parse_user_sigframe(struct user_ctxs *user, } done: - if (!user->fpsimd) - goto invalid; - return 0; invalid: @@ -395,14 +653,25 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0) err = parse_user_sigframe(&user, sf); - if (err == 0) - err = restore_fpsimd_context(user.fpsimd); + if (err == 0 && system_supports_fpsimd()) { + if (!user.fpsimd) + return -EINVAL; + + if (user.sve) + err = restore_sve_fpsimd_context(&user); + else + err = restore_fpsimd_context(user.fpsimd); + } + + if (err == 0 && system_supports_sme() && user.za) + err = restore_za_context(&user); return err; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +SYSCALL_DEFINE0(rt_sigreturn) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user 
*frame; /* Always make any pending restarted system calls return -EINTR */ @@ -417,7 +686,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) frame = (struct rt_sigframe __user *)regs->sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame)) @@ -429,36 +698,77 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return regs->regs[0]; badframe: - if (show_unhandled_signals) - pr_info_ratelimited("%s[%d]: bad frame in %s: pc=%08llx sp=%08llx\n", - current->comm, task_pid_nr(current), __func__, - regs->pc, regs->sp); - force_sig(SIGSEGV, current); + arm64_notify_segfault(regs->sp); return 0; } -/* Determine the layout of optional records in the signal frame */ -static int setup_sigframe_layout(struct rt_sigframe_user_layout *user) +/* + * Determine the layout of optional records in the signal frame + * + * add_all: if true, lays out the biggest possible signal frame for + * this task; otherwise, generates a layout for the current state + * of the task. + */ +static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, + bool add_all) { int err; - err = sigframe_alloc(user, &user->fpsimd_offset, - sizeof(struct fpsimd_context)); - if (err) - return err; + if (system_supports_fpsimd()) { + err = sigframe_alloc(user, &user->fpsimd_offset, + sizeof(struct fpsimd_context)); + if (err) + return err; + } /* fault information, if valid */ - if (current->thread.fault_code) { + if (add_all || current->thread.fault_code) { err = sigframe_alloc(user, &user->esr_offset, sizeof(struct esr_context)); if (err) return err; } + if (system_supports_sve()) { + unsigned int vq = 0; + + if (add_all || test_thread_flag(TIF_SVE) || + thread_sm_enabled(¤t->thread)) { + int vl = max(sve_max_vl(), sme_max_vl()); + + if (!add_all) + vl = thread_get_cur_vl(¤t->thread); + + vq = sve_vq_from_vl(vl); + } + + err = sigframe_alloc(user, &user->sve_offset, + SVE_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + + if (system_supports_sme()) { + unsigned int vl; + unsigned int vq = 0; + + if (add_all) + vl = sme_max_vl(); + else + vl = task_get_sme_vl(current); + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + + err = sigframe_alloc(user, &user->za_offset, + ZA_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + return sigframe_alloc_end(user); } - static int setup_sigframe(struct rt_sigframe_user_layout *user, struct pt_regs *regs, sigset_t *set) { @@ -480,7 +790,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - if (err == 0) { + if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); @@ -496,6 +806,21 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + /* Scalable Vector Extension state (including streaming), if present */ + if ((system_supports_sve() || system_supports_sme()) && + err == 0 && user->sve_offset) { + struct sve_context __user *sve_ctx = + apply_user_offset(user, user->sve_offset); + err |= preserve_sve_context(sve_ctx); + } + + /* ZA state if present */ + if (system_supports_sme() && err == 0 && user->za_offset) { + struct za_context __user *za_ctx = + apply_user_offset(user, user->za_offset); + err |= preserve_za_context(za_ctx); + } + if (err == 0 && user->extra_offset) { char __user *sfp 
= (char __user *)user->sigframe; char __user *userp = @@ -549,7 +874,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, int err; init_user_layout(user); - err = setup_sigframe_layout(user); + err = setup_sigframe_layout(user, false); if (err) return err; @@ -564,7 +889,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, user->sigframe, sp_top - sp)) + if (!access_ok(user->sigframe, sp_top - sp)) return -EFAULT; return 0; @@ -580,6 +905,42 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; + /* + * Signal delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. + */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } + + /* TCO (Tag Check Override) always cleared for signal handlers */ + regs->pstate &= ~PSR_TCO_BIT; + + /* Signal handlers are invoked with ZA and streaming mode disabled */ + if (system_supports_sme()) { + /* + * If we were in streaming mode the saved register + * state was SVE but we will exit SM and use the + * FPSIMD register state - flush the saved FPSIMD + * register state in case it gets loaded. + */ + if (current->thread.svcr & SVCR_SM_MASK) + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + + current->thread.svcr &= ~(SVCR_ZA_MASK | + SVCR_SM_MASK); + sme_smstop(); + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else @@ -595,6 +956,8 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct rt_sigframe __user *frame; int err = 0; + fpsimd_signal_preserve_current_state(); + if (get_sigframe(&user, ksig, regs)) return 1; @@ -630,11 +993,12 @@ static void setup_restart_syscall(struct pt_regs *regs) */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - struct task_struct *tsk = current; sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; + rseq_signal_deliver(ksig, regs); + /* * Set up the stack frame */ @@ -652,14 +1016,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) */ ret |= !valid_user_regs(®s->user_regs, current); - /* - * Fast forward the stepping logic so we step into the signal - * handler. - */ - if (!ret) - user_fastforward_single_step(tsk); - - signal_setup_done(ret, ksig, 0); + /* Step into the signal handler if we are stepping */ + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* @@ -676,11 +1034,12 @@ static void do_signal(struct pt_regs *regs) unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; + bool syscall = in_syscall(regs); /* * If we were from a system call, check for system call restarting... */ - if (in_syscall(regs)) { + if (syscall) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 
2 : 4); retval = regs->regs[0]; @@ -720,7 +1079,7 @@ static void do_signal(struct pt_regs *regs) retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { - regs->regs[0] = -EINTR; + syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } @@ -732,7 +1091,7 @@ static void do_signal(struct pt_regs *regs) * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ - if (in_syscall(regs) && regs->pc == restart_addr) { + if (syscall && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); @@ -741,41 +1100,102 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -asmlinkage void do_notify_resume(struct pt_regs *regs, - unsigned int thread_flags) +void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { - /* - * The assembly code enters us with IRQs off, but it hasn't - * informed the tracing code of that for efficiency reasons. - * Update the trace code with the current status. - */ - trace_hardirqs_off(); - do { - /* Check valid user FS if needed */ - addr_limit_user_check(); - if (thread_flags & _TIF_NEED_RESCHED) { + /* Unmask Debug and SError for the next task */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + schedule(); } else { - local_irq_enable(); + local_daif_restore(DAIF_PROCCTX); if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); - if (thread_flags & _TIF_SIGPENDING) + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); + } + + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); } - local_irq_disable(); - thread_flags = READ_ONCE(current_thread_info()->flags); + local_daif_mask(); + thread_flags = read_thread_flags(); } while (thread_flags & _TIF_WORK_MASK); } + +unsigned long __ro_after_init signal_minsigstksz; + +/* + * Determine the stack space required for guaranteed signal devliery. + * This function is used to populate AT_MINSIGSTKSZ at process startup. + * cpufeatures setup is assumed to be complete. + */ +void __init minsigstksz_setup(void) +{ + struct rt_sigframe_user_layout user; + + init_user_layout(&user); + + /* + * If this fails, SIGFRAME_MAXSZ needs to be enlarged. It won't + * be big enough, but it's our best guess: + */ + if (WARN_ON(setup_sigframe_layout(&user, true))) + return; + + signal_minsigstksz = sigframe_size(&user) + + round_up(sizeof(struct frame_record), 16) + + 16; /* max alignment padding */ +} + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. 
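minsigstksz_setup() above sizes the largest possible signal frame, and the result reaches userspace as AT_MINSIGSTKSZ. A sketch of the consumer side when sizing an alternate signal stack is below; the fallback #define and the 16 KB of extra headroom are arbitrary choices for the sketch, not ABI.

#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ	51	/* older libc headers may lack this */
#endif

static int setup_altstack(void)
{
	unsigned long min = getauxval(AT_MINSIGSTKSZ);
	stack_t ss;

	if (!min)
		min = MINSIGSTKSZ;		/* kernel too old to report it */

	ss.ss_size = min + 16 * 1024;		/* headroom for the handler */
	ss.ss_sp = malloc(ss.ss_size);
	ss.ss_flags = 0;
	if (!ss.ss_sp)
		return -1;

	return sigaltstack(&ss, NULL);
}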
+ */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 9); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 8); +static_assert(offsetof(siginfo_t, si_signo) == 0x00); +static_assert(offsetof(siginfo_t, si_errno) == 0x04); +static_assert(offsetof(siginfo_t, si_code) == 0x08); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); +static_assert(offsetof(siginfo_t, si_tid) == 0x10); +static_assert(offsetof(siginfo_t, si_overrun) == 0x14); +static_assert(offsetof(siginfo_t, si_status) == 0x18); +static_assert(offsetof(siginfo_t, si_utime) == 0x20); +static_assert(offsetof(siginfo_t, si_stime) == 0x28); +static_assert(offsetof(siginfo_t, si_value) == 0x18); +static_assert(offsetof(siginfo_t, si_int) == 0x18); +static_assert(offsetof(siginfo_t, si_ptr) == 0x18); +static_assert(offsetof(siginfo_t, si_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); +static_assert(offsetof(siginfo_t, si_lower) == 0x20); +static_assert(offsetof(siginfo_t, si_upper) == 0x28); +static_assert(offsetof(siginfo_t, si_pkey) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); +static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); +static_assert(offsetof(siginfo_t, si_band) == 0x10); +static_assert(offsetof(siginfo_t, si_fd) == 0x18); +static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_syscall) == 0x18); +static_assert(offsetof(siginfo_t, si_arch) == 0x1c); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index e09bf5d15606..4700f8522d27 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. * Modified by Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/compat.h> @@ -26,44 +15,10 @@ #include <asm/esr.h> #include <asm/fpsimd.h> #include <asm/signal32.h> +#include <asm/traps.h> #include <linux/uaccess.h> #include <asm/unistd.h> - -struct compat_sigcontext { - /* We always set these two fields to 0 */ - compat_ulong_t trap_no; - compat_ulong_t error_code; - - compat_ulong_t oldmask; - compat_ulong_t arm_r0; - compat_ulong_t arm_r1; - compat_ulong_t arm_r2; - compat_ulong_t arm_r3; - compat_ulong_t arm_r4; - compat_ulong_t arm_r5; - compat_ulong_t arm_r6; - compat_ulong_t arm_r7; - compat_ulong_t arm_r8; - compat_ulong_t arm_r9; - compat_ulong_t arm_r10; - compat_ulong_t arm_fp; - compat_ulong_t arm_ip; - compat_ulong_t arm_sp; - compat_ulong_t arm_lr; - compat_ulong_t arm_pc; - compat_ulong_t arm_cpsr; - compat_ulong_t fault_address; -}; - -struct compat_ucontext { - compat_ulong_t uc_flags; - compat_uptr_t uc_link; - compat_stack_t uc_stack; - struct compat_sigcontext uc_mcontext; - compat_sigset_t uc_sigmask; - int __unused[32 - (sizeof (compat_sigset_t) / sizeof (int))]; - compat_ulong_t uc_regspace[128] __attribute__((__aligned__(8))); -}; +#include <asm/vdso.h> struct compat_vfp_sigframe { compat_ulong_t magic; @@ -91,18 +46,6 @@ struct compat_aux_sigframe { unsigned long end_magic; } __attribute__((__aligned__(8))); -struct compat_sigframe { - struct compat_ucontext uc; - compat_ulong_t retcode[2]; -}; - -struct compat_rt_sigframe { - struct compat_siginfo info; - struct compat_sigframe sig; -}; - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) { compat_sigset_t cset; @@ -125,86 +68,6 @@ static inline int get_sigset_t(sigset_t *set, return 0; } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(*to))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - * This routine must convert siginfo from 64bit to 32bit as well - * at the same time. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, - SI_PAD_SIZE); - else switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr, - &to->si_addr); -#ifdef BUS_MCEERR_AO - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. 
- */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_SYS: - err |= __put_user((compat_uptr_t)(unsigned long) - from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - if (copy_from_user(to, from, __ARCH_SI_PREAMBLE_SIZE) || - copy_from_user(to->_sifields._pad, - from->_sifields._pad, SI_PAD_SIZE)) - return -EFAULT; - - return 0; -} - /* * VFP save/restore code. * @@ -228,7 +91,8 @@ union __fpsimd_vreg { static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.uw.fpsimd_state; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr, fpexc; @@ -239,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_preserve_current_state(); + fpsimd_signal_preserve_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -277,7 +141,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr; @@ -321,12 +185,11 @@ static int compat_restore_sigframe(struct pt_regs *regs, int err; sigset_t set; struct compat_aux_sigframe __user *aux; + unsigned long psr; err = get_sigset_t(&set, &sf->uc.uc_sigmask); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } __get_user_error(regs->regs[0], &sf->uc.uc_mcontext.arm_r0, err); __get_user_error(regs->regs[1], &sf->uc.uc_mcontext.arm_r1, err); @@ -344,7 +207,9 @@ static int compat_restore_sigframe(struct pt_regs *regs, __get_user_error(regs->compat_sp, &sf->uc.uc_mcontext.arm_sp, err); __get_user_error(regs->compat_lr, &sf->uc.uc_mcontext.arm_lr, err); __get_user_error(regs->pc, &sf->uc.uc_mcontext.arm_pc, err); - __get_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); + __get_user_error(psr, &sf->uc.uc_mcontext.arm_cpsr, err); + + regs->pstate = compat_psr_to_pstate(psr); /* * Avoid compat_sys_sigreturn() restarting. 
@@ -354,14 +219,15 @@ static int compat_restore_sigframe(struct pt_regs *regs, err |= !valid_user_regs(®s->user_regs, current); aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_restore_vfp_context(&aux->vfp); return err; } -asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) +COMPAT_SYSCALL_DEFINE0(sigreturn) { + struct pt_regs *regs = current_pt_regs(); struct compat_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ @@ -377,7 +243,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) frame = (struct compat_sigframe __user *)regs->compat_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (compat_restore_sigframe(regs, frame)) @@ -386,16 +252,13 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) return regs->regs[0]; badframe: - if (show_unhandled_signals) - pr_info_ratelimited("%s[%d]: bad frame in %s: pc=%08llx sp=%08llx\n", - current->comm, task_pid_nr(current), __func__, - regs->pc, regs->compat_sp); - force_sig(SIGSEGV, current); + arm64_notify_segfault(regs->compat_sp); return 0; } -asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) +COMPAT_SYSCALL_DEFINE0(rt_sigreturn) { + struct pt_regs *regs = current_pt_regs(); struct compat_rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ @@ -411,7 +274,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) frame = (struct compat_rt_sigframe __user *)regs->compat_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (compat_restore_sigframe(regs, &frame->sig)) @@ -423,11 +286,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) return regs->regs[0]; badframe: - if (show_unhandled_signals) - pr_info_ratelimited("%s[%d]: bad frame in %s: pc=%08llx sp=%08llx\n", - current->comm, task_pid_nr(current), __func__, - regs->pc, regs->compat_sp); - force_sig(SIGSEGV, current); + arm64_notify_segfault(regs->compat_sp); return 0; } @@ -446,7 +305,7 @@ static void __user *compat_get_sigframe(struct ksignal *ksig, /* * Check that we can actually write to the signal frame. 
*/ - if (!access_ok(VERIFY_WRITE, frame, framesize)) + if (!access_ok(frame, framesize)) frame = NULL; return frame; @@ -458,22 +317,22 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, { compat_ulong_t handler = ptr_to_compat(ka->sa.sa_handler); compat_ulong_t retcode; - compat_ulong_t spsr = regs->pstate & ~(PSR_f | COMPAT_PSR_E_BIT); + compat_ulong_t spsr = regs->pstate & ~(PSR_f | PSR_AA32_E_BIT); int thumb; /* Check if the handler is written for ARM or Thumb */ thumb = handler & 1; if (thumb) - spsr |= COMPAT_PSR_T_BIT; + spsr |= PSR_AA32_T_BIT; else - spsr &= ~COMPAT_PSR_T_BIT; + spsr &= ~PSR_AA32_T_BIT; /* The IT state must be cleared for both ARM and Thumb-2 */ - spsr &= ~COMPAT_PSR_IT_MASK; + spsr &= ~PSR_AA32_IT_MASK; /* Restore the original endianness */ - spsr |= COMPAT_PSR_ENDSTATE; + spsr |= PSR_AA32_ENDSTATE; if (ka->sa.sa_flags & SA_RESTORER) { retcode = ptr_to_compat(ka->sa.sa_restorer); @@ -484,8 +343,7 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = AARCH32_VECTORS_BASE + - AARCH32_KERN_SIGRET_CODE_OFFSET + + retcode = (unsigned long)current->mm->context.sigpage + (idx << 2) + thumb; } @@ -500,6 +358,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, struct pt_regs *regs, sigset_t *set) { struct compat_aux_sigframe __user *aux; + unsigned long psr = pstate_to_compat_psr(regs->pstate); int err = 0; __put_user_error(regs->regs[0], &sf->uc.uc_mcontext.arm_r0, err); @@ -518,7 +377,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, __put_user_error(regs->compat_sp, &sf->uc.uc_mcontext.arm_sp, err); __put_user_error(regs->compat_lr, &sf->uc.uc_mcontext.arm_lr, err); __put_user_error(regs->pc, &sf->uc.uc_mcontext.arm_pc, err); - __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); + __put_user_error(psr, &sf->uc.uc_mcontext.arm_cpsr, err); __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); /* set the compat FSR WnR */ @@ -531,7 +390,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_preserve_vfp_context(&aux->vfp); __put_user_error(0, &aux->end_magic, err); @@ -594,3 +453,43 @@ void compat_setup_restart_syscall(struct pt_regs *regs) { regs->regs[7] = __NR_compat_restart_syscall; } + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. 
+ */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 9); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(compat_siginfo_t) == 128); +static_assert(__alignof__(compat_siginfo_t) == 4); +static_assert(offsetof(compat_siginfo_t, si_signo) == 0x00); +static_assert(offsetof(compat_siginfo_t, si_errno) == 0x04); +static_assert(offsetof(compat_siginfo_t, si_code) == 0x08); +static_assert(offsetof(compat_siginfo_t, si_pid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_uid) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_tid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_overrun) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_status) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_utime) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_stime) == 0x1c); +static_assert(offsetof(compat_siginfo_t, si_value) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_int) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_ptr) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_addr_lsb) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_lower) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_call_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_syscall) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_arch) == 0x14); diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S new file mode 100644 index 000000000000..ccbd4aab4ba4 --- /dev/null +++ b/arch/arm64/kernel/sigreturn32.S @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch32 sigreturn code. + * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. + * + * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> + * Copyright (C) 2012-2018 ARM Ltd. + * + * For ARM syscalls, the syscall number has to be loaded into r7. + * We do not support an OABI userspace. + * + * For Thumb syscalls, we also pass the syscall number via r7. We therefore + * need two 16-bit instructions. 
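The static_assert() block added to signal32.c above pins the user-visible layout of compat_siginfo_t at build time, so a header change that silently moved a field would now fail to compile instead of breaking the 32-bit signal ABI. A minimal standalone illustration of the same technique, using an invented struct rather than the kernel's:

    #include <assert.h>   /* static_assert (C11) */
    #include <stddef.h>   /* offsetof */
    #include <stdint.h>

    /* Made-up "ABI" struct: the layout is a contract with userspace. */
    struct example_abi {
        int32_t  signo;   /* expected at offset 0x00 */
        int32_t  error;   /* expected at offset 0x04 */
        int32_t  code;    /* expected at offset 0x08 */
        uint32_t addr;    /* expected at offset 0x0c */
    };

    static_assert(sizeof(struct example_abi) == 16, "ABI size changed");
    static_assert(offsetof(struct example_abi, code) == 0x08, "code moved");
    static_assert(offsetof(struct example_abi, addr) == 0x0c, "addr moved");

    int main(void) { return 0; }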
+ */ + +#include <asm/unistd.h> + + .section .rodata + .globl __aarch32_sigret_code_start +__aarch32_sigret_code_start: + + /* + * ARM Code + */ + .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn + + /* + * ARM code + */ + .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn + + .globl __aarch32_sigret_code_end +__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index df67652e46f0..97c9de57725d 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -1,7 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/errno.h> #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> +#include <asm/smp.h> .text /* @@ -26,7 +28,7 @@ * aff0 = mpidr_masked & 0xff; * aff1 = mpidr_masked & 0xff00; * aff2 = mpidr_masked & 0xff0000; - * aff2 = mpidr_masked & 0xff00000000; + * aff3 = mpidr_masked & 0xff00000000; * dst = (aff0 >> rs0 | aff1 >> rs1 | aff2 >> rs2 | aff3 >> rs3); *} * Input registers: rs0, rs1, rs2, rs3, mpidr, mask @@ -60,7 +62,7 @@ * * x0 = struct sleep_stack_data area */ -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] @@ -93,21 +95,27 @@ ENTRY(__cpu_suspend_enter) ldp x29, lr, [sp], #16 mov x0, #1 ret -ENDPROC(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) - .pushsection ".idmap.text", "ax" -ENTRY(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + .pushsection ".idmap.text", "awx" +SYM_CODE_START(cpu_resume) + bl init_kernel_el + bl finalise_el2 +#if VA_BITS > 48 + ldr_l x0, vabits_actual +#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ + adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 -ENDPROC(cpu_resume) +SYM_CODE_END(cpu_resume) .ltorg .popsection -ENTRY(_cpu_resume) +SYM_FUNC_START(_cpu_resume) mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -130,7 +138,7 @@ ENTRY(_cpu_resume) */ bl cpu_do_resume -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif @@ -143,4 +151,4 @@ ENTRY(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -ENDPROC(_cpu_resume) +SYM_FUNC_END(_cpu_resume) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 62522342e1e4..487381164ff6 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -1,34 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2015, Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License Version 2 as - * published by the Free Software Foundation. 
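The trampoline above is hand-assembled: each .byte group is one little-endian instruction encoding with the syscall number folded into the low byte, so the file needs no AArch32 assembler support. As a sanity check, the standalone sketch below rebuilds the two Thumb halfwords the way they end up in memory; in the T1 encodings, 0x27xx is movs r7, #imm8 and 0xdfxx is svc #imm8, so the 0x27 line is the mov and the 0xdf line is the svc. The value 119 is only a stand-in for __NR_compat_sigreturn.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t nr = 119;  /* stand-in for __NR_compat_sigreturn */
        uint8_t bytes[4] = { nr, 0x27, nr, 0xdf };

        /* Thumb halfwords are stored little-endian: low byte first. */
        uint16_t insn0 = bytes[0] | (uint16_t)(bytes[1] << 8); /* movs r7, #nr */
        uint16_t insn1 = bytes[2] | (uint16_t)(bytes[3] << 8); /* svc  #nr     */

        printf("movs r7, #%u -> %#06x\n", nr, insn0);
        printf("svc  #%u    -> %#06x\n", nr, insn1);
        return 0;
    }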
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/linkage.h> #include <linux/arm-smccc.h> + #include <asm/asm-offsets.h> +#include <asm/assembler.h> +#include <asm/thread_info.h> + +/* + * If we have SMCCC v1.3 and (as is likely) no SVE state in + * the registers then set the SMCCC hint bit to say there's no + * need to preserve it. Do this by directly adjusting the SMCCC + * function value which is already stored in x0 ready to be called. + */ +SYM_FUNC_START(__arm_smccc_sve_check) + + ldr_l x16, smccc_has_sve_hint + cbz x16, 2f + + get_current_task x16 + ldr x16, [x16, #TSK_TI_FLAGS] + tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? + tbnz x16, #TIF_SVE, 2f // Does that state include SVE? + +1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT + +2: ret +SYM_FUNC_END(__arm_smccc_sve_check) +EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr - .cfi_startproc + stp x29, x30, [sp, #-16]! + mov x29, sp +alternative_if ARM64_SVE + bl __arm_smccc_sve_check +alternative_else_nop_endif \instr #0 - ldr x4, [sp] + ldr x4, [sp, #16] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] - ldr x4, [sp, #8] + ldr x4, [sp, #24] cbz x4, 1f /* no quirk structure */ ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS] cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6 b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] -1: ret - .cfi_endproc +1: ldp x29, x30, [sp], #16 + ret .endm /* @@ -37,9 +57,10 @@ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_smc) +SYM_FUNC_START(__arm_smccc_smc) SMCCC smc -ENDPROC(__arm_smccc_smc) +SYM_FUNC_END(__arm_smccc_smc) +EXPORT_SYMBOL(__arm_smccc_smc) /* * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2, @@ -47,6 +68,64 @@ ENDPROC(__arm_smccc_smc) * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_hvc) +SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc -ENDPROC(__arm_smccc_hvc) +SYM_FUNC_END(__arm_smccc_hvc) +EXPORT_SYMBOL(__arm_smccc_hvc) + + .macro SMCCC_1_2 instr + /* Save `res` and free a GPR that won't be clobbered */ + stp x1, x19, [sp, #-16]! 
+ + /* Ensure `args` won't be clobbered while loading regs in next step */ + mov x19, x0 + + /* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */ + ldp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + ldp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + ldp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + ldp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + ldp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + ldp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + ldp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + ldp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + ldp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + \instr #0 + + /* Load the `res` from the stack */ + ldr x19, [sp] + + /* Store the registers x0 - x17 into the result structure */ + stp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + stp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + stp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + stp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + stp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + stp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + stp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + stp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + stp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + /* Restore original x19 */ + ldp xzr, x19, [sp], #16 + ret +.endm + +/* + * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_hvc) + SMCCC_1_2 hvc +SYM_FUNC_END(arm_smccc_1_2_hvc) +EXPORT_SYMBOL(arm_smccc_1_2_hvc) + +/* + * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_smc) + SMCCC_1_2 smc +SYM_FUNC_END(arm_smccc_1_2_smc) +EXPORT_SYMBOL(arm_smccc_1_2_smc) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 9f7195a5773e..ffc5d76cf695 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1,23 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * SMP initialisation and IPI support * Based on arch/arm/kernel/smp.c * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
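For callers, the arm_smccc_1_2_{smc,hvc}() entry points added above exchange a full x0-x17 register window through struct arm_smccc_1_2_regs instead of the fixed eight-argument prototype of __arm_smccc_smc(). A minimal kernel-context usage sketch; the function ID used is just a well-known example and is not tied to this file:

    #include <linux/arm-smccc.h>

    /* Sketch: issue an SMC using the wide SMCCC v1.2 calling convention. */
    static unsigned long example_smccc_1_2_call(void)
    {
        struct arm_smccc_1_2_regs args = {
            .a0 = ARM_SMCCC_VERSION_FUNC_ID,
        };
        struct arm_smccc_1_2_regs res;

        arm_smccc_1_2_smc(&args, &res);

        return res.a0;  /* a negative value means "not supported" */
    }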
*/ #include <linux/acpi.h> +#include <linux/arm_sdei.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/spinlock.h> @@ -34,12 +24,15 @@ #include <linux/smp.h> #include <linux/seq_file.h> #include <linux/irq.h> +#include <linux/irqchip/arm-gic-v3.h> #include <linux/percpu.h> #include <linux/clockchips.h> #include <linux/completion.h> #include <linux/of.h> #include <linux/irq_work.h> +#include <linux/kernel_stat.h> #include <linux/kexec.h> +#include <linux/kvm_host.h> #include <asm/alternative.h> #include <asm/atomic.h> @@ -47,10 +40,10 @@ #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> +#include <asm/daifflags.h> +#include <asm/kvm_mmu.h> #include <asm/mmu_context.h> #include <asm/numa.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/smp_plat.h> #include <asm/sections.h> @@ -71,7 +64,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. */ -int cpus_stuck_in_kernel; +static int cpus_stuck_in_kernel; enum ipi_msg_type { IPI_RESCHEDULE, @@ -80,47 +73,18 @@ enum ipi_msg_type { IPI_CPU_CRASH_STOP, IPI_TIMER, IPI_IRQ_WORK, - IPI_WAKEUP + IPI_WAKEUP, + NR_IPI }; -#ifdef CONFIG_ARM64_VHE - -/* Whether the boot CPU is running in HYP mode or not*/ -static bool boot_cpu_hyp_mode; - -static inline void save_boot_cpu_run_el(void) -{ - boot_cpu_hyp_mode = is_kernel_in_hyp_mode(); -} - -static inline bool is_boot_cpu_in_hyp_mode(void) -{ - return boot_cpu_hyp_mode; -} - -/* - * Verify that a secondary CPU is running the kernel at the same - * EL as that of the boot CPU. - */ -void verify_cpu_run_el(void) -{ - bool in_el2 = is_kernel_in_hyp_mode(); - bool boot_cpu_el2 = is_boot_cpu_in_hyp_mode(); - - if (in_el2 ^ boot_cpu_el2) { - pr_crit("CPU%d: mismatched Exception Level(EL%d) with boot CPU(EL%d)\n", - smp_processor_id(), - in_el2 ? 2 : 1, - boot_cpu_el2 ? 2 : 1); - cpu_panic_kernel(); - } -} +static int ipi_irq_base __read_mostly; +static int nr_ipi __read_mostly = NR_IPI; +static struct irq_desc *ipi_desc[NR_IPI] __read_mostly; -#else -static inline void save_boot_cpu_run_el(void) {} -#endif +static void ipi_setup(int cpu); #ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu); static int op_cpu_kill(unsigned int cpu); #else static inline int op_cpu_kill(unsigned int cpu) @@ -136,8 +100,10 @@ static inline int op_cpu_kill(unsigned int cpu) */ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { - if (cpu_ops[cpu]->cpu_boot) - return cpu_ops[cpu]->cpu_boot(cpu); + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops->cpu_boot) + return ops->cpu_boot(cpu); return -EOPNOTSUPP; } @@ -154,73 +120,85 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); - /* - * Now bring the CPU into our world. - */ + /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); - if (ret == 0) { - /* - * CPU was successfully started, wait for it to come online or - * time out. 
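In the reworked __cpu_up() above, the early boot status word carries both a primary state (extracted with CPU_BOOT_STATUS_MASK) and auxiliary "stuck" reason flags such as CPU_STUCK_REASON_52_BIT_VA. The standalone sketch below shows the general encode/decode pattern; the bit assignments are invented for the example and do not match the kernel's definitions.

    #include <stdio.h>

    /* Invented layout: low byte = primary state, upper bits = reason flags. */
    #define STATUS_MASK        0xffUL
    #define STATE_STUCK        0x03UL
    #define REASON_NO_52BIT_VA (1UL << 8)
    #define REASON_BAD_GRANULE (1UL << 9)

    static void report(unsigned long status)
    {
        switch (status & STATUS_MASK) {
        case STATE_STUCK:
            printf("secondary is stuck in the kernel\n");
            if (status & REASON_NO_52BIT_VA)
                printf("  reason: no 52-bit VA support\n");
            if (status & REASON_BAD_GRANULE)
                printf("  reason: page-size granule not supported\n");
            break;
        default:
            printf("unexpected state %#lx\n", status & STATUS_MASK);
            break;
        }
    }

    int main(void)
    {
        report(STATE_STUCK | REASON_BAD_GRANULE);
        return 0;
    }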
- */ - wait_for_completion_timeout(&cpu_running, - msecs_to_jiffies(1000)); - - if (!cpu_online(cpu)) { - pr_crit("CPU%u: failed to come online\n", cpu); - ret = -EIO; - } - } else { + if (ret) { pr_err("CPU%u: failed to boot: %d\n", cpu, ret); + return ret; } + /* + * CPU was successfully started, wait for it to come online or + * time out. + */ + wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(5000)); + if (cpu_online(cpu)) + return 0; + + pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; - secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); - if (ret && status) { - - if (status == CPU_MMU_OFF) - status = READ_ONCE(__early_cpu_boot_status); + if (status == CPU_MMU_OFF) + status = READ_ONCE(__early_cpu_boot_status); - switch (status) { - default: - pr_err("CPU%u: failed in unknown state : 0x%lx\n", - cpu, status); - break; - case CPU_KILL_ME: - if (!op_cpu_kill(cpu)) { - pr_crit("CPU%u: died during early boot\n", cpu); - break; - } - /* Fall through */ - pr_crit("CPU%u: may not have shut down cleanly\n", cpu); - case CPU_STUCK_IN_KERNEL: - pr_crit("CPU%u: is stuck in kernel\n", cpu); - cpus_stuck_in_kernel++; + switch (status & CPU_BOOT_STATUS_MASK) { + default: + pr_err("CPU%u: failed in unknown state : 0x%lx\n", + cpu, status); + cpus_stuck_in_kernel++; + break; + case CPU_KILL_ME: + if (!op_cpu_kill(cpu)) { + pr_crit("CPU%u: died during early boot\n", cpu); break; - case CPU_PANIC_KERNEL: - panic("CPU%u detected unsupported configuration\n", cpu); } + pr_crit("CPU%u: may not have shut down cleanly\n", cpu); + fallthrough; + case CPU_STUCK_IN_KERNEL: + pr_crit("CPU%u: is stuck in kernel\n", cpu); + if (status & CPU_STUCK_REASON_52_BIT_VA) + pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); + if (status & CPU_STUCK_REASON_NO_GRAN) { + pr_crit("CPU%u: does not support %luK granule\n", + cpu, PAGE_SIZE / SZ_1K); + } + cpus_stuck_in_kernel++; + break; + case CPU_PANIC_KERNEL: + panic("CPU%u detected unsupported configuration\n", cpu); } - return ret; + return -EIO; +} + +static void init_gic_priority_masking(void) +{ + u32 cpuflags; + + if (WARN_ON(!gic_enable_sre())) + return; + + cpuflags = read_sysreg(daif); + + WARN_ON(!(cpuflags & PSR_I_BIT)); + WARN_ON(!(cpuflags & PSR_F_BIT)); + + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } /* * This is the secondary CPU boot entry. We're using this CPUs * idle thread stack, but a set of temporary page tables. */ -asmlinkage void secondary_start_kernel(void) +asmlinkage notrace void secondary_start_kernel(void) { + u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; - unsigned int cpu; - - cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); + const struct cpu_operations *ops; + unsigned int cpu = smp_processor_id(); /* * All kernel threads share the same mm context; grab a @@ -235,7 +213,10 @@ asmlinkage void secondary_start_kernel(void) */ cpu_uninstall_idmap(); - preempt_disable(); + if (system_uses_irq_prio_masking()) + init_gic_priority_masking(); + + rcu_cpu_starting(cpu); trace_hardirqs_off(); /* @@ -245,34 +226,38 @@ asmlinkage void secondary_start_kernel(void) */ check_local_cpu_capabilities(); - if (cpu_ops[cpu]->cpu_postboot) - cpu_ops[cpu]->cpu_postboot(); + ops = get_cpu_ops(cpu); + if (ops->cpu_postboot) + ops->cpu_postboot(); /* * Log the CPU info before it is marked online and might get read. */ cpuinfo_store_cpu(); + store_cpu_topology(cpu); /* * Enable GIC and timers. 
*/ notify_cpu_starting(cpu); - store_cpu_topology(cpu); + ipi_setup(cpu); + + numa_add_cpu(cpu); /* * OK, now it's safe to let the boot CPU continue. Wait for * the CPU migration code to notice that the CPU is online * before we continue. */ - pr_info("CPU%u: Booted secondary processor [%08x]\n", - cpu, read_cpuid_id()); + pr_info("CPU%u: Booted secondary processor 0x%010lx [0x%08x]\n", + cpu, (unsigned long)mpidr, + read_cpuid_id()); update_cpu_boot_status(CPU_BOOT_SUCCESS); set_cpu_online(cpu, true); complete(&cpu_running); - local_irq_enable(); - local_async_enable(); + local_daif_restore(DAIF_PROCCTX); /* * OK, it's off to the idle thread for us @@ -283,19 +268,21 @@ asmlinkage void secondary_start_kernel(void) #ifdef CONFIG_HOTPLUG_CPU static int op_cpu_disable(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we don't have a cpu_die method, abort before we reach the point * of no return. CPU0 may not have an cpu_ops, so test for it. */ - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_die) + if (!ops || !ops->cpu_die) return -EOPNOTSUPP; /* * We may need to abort a hot unplug for some other mechanism-specific * reason. */ - if (cpu_ops[cpu]->cpu_disable) - return cpu_ops[cpu]->cpu_disable(cpu); + if (ops->cpu_disable) + return ops->cpu_disable(cpu); return 0; } @@ -312,11 +299,15 @@ int __cpu_disable(void) if (ret) return ret; + remove_cpu_topology(cpu); + numa_remove_cpu(cpu); + /* * Take this CPU offline. Once we clear this, we can't return, * and we must not schedule until we're ready to give up the cpu. */ set_cpu_online(cpu, false); + ipi_teardown(cpu); /* * OK - migrate IRQs away from this CPU @@ -328,15 +319,17 @@ int __cpu_disable(void) static int op_cpu_kill(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we have no means of synchronising with the dying CPU, then assume * that it is really dead. We can only wait for an arbitrary length of * time and hope that it's dead, so let's skip the wait and just hope. */ - if (!cpu_ops[cpu]->cpu_kill) + if (!ops->cpu_kill) return 0; - return cpu_ops[cpu]->cpu_kill(cpu); + return ops->cpu_kill(cpu); } /* @@ -351,7 +344,7 @@ void __cpu_die(unsigned int cpu) pr_crit("CPU%u: cpu didn't die\n", cpu); return; } - pr_notice("CPU%u: shutdown\n", cpu); + pr_debug("CPU%u: shutdown\n", cpu); /* * Now that the dying CPU is beyond the point of no return w.r.t. @@ -361,25 +354,21 @@ void __cpu_die(unsigned int cpu) */ err = op_cpu_kill(cpu); if (err) - pr_warn("CPU%d may not have shut down cleanly: %d\n", - cpu, err); + pr_warn("CPU%d may not have shut down cleanly: %d\n", cpu, err); } /* * Called from the idle thread for the CPU which has been shutdown. * - * Note that we disable IRQs here, but do not re-enable them - * before returning to the caller. This is also the behaviour - * of the other hotplug-cpu capable cores, so presumably coming - * out of idle fixes this. */ void cpu_die(void) { unsigned int cpu = smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(cpu); idle_task_exit(); - local_irq_disable(); + local_daif_mask(); /* Tell __cpu_die() that this CPU is now safe to dispose of */ (void)cpu_report_death(); @@ -389,12 +378,22 @@ void cpu_die(void) * mechanism must perform all required cache maintenance to ensure that * no dirty lines are lost in the process of shutting down the CPU. 
*/ - cpu_ops[cpu]->cpu_die(cpu); + ops->cpu_die(cpu); BUG(); } #endif +static void __cpu_try_die(int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops && ops->cpu_die) + ops->cpu_die(cpu); +#endif +} + /* * Kill the calling secondary CPU, early in bringup before it is turned * online. @@ -407,13 +406,13 @@ void cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); + rcu_report_dead(cpu); + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + update_cpu_boot_status(CPU_KILL_ME); + __cpu_try_die(cpu); + } -#ifdef CONFIG_HOTPLUG_CPU - update_cpu_boot_status(CPU_KILL_ME); - /* Check if we can park ourselves */ - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif update_cpu_boot_status(CPU_STUCK_IN_KERNEL); cpu_park_loop(); @@ -428,6 +427,10 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); + if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) { + kvm_compute_layout(); + kvm_apply_hyp_relocations(); + } } void __init smp_cpus_done(unsigned int max_cpus) @@ -441,47 +444,26 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); /* - * Initialise the static keys early as they may be enabled by the - * cpufeature code. + * The runtime per-cpu areas have been allocated by + * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be + * freed shortly, so we must move over to the runtime per-cpu area. */ - jump_label_init(); + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); cpuinfo_store_boot_cpu(); - save_boot_cpu_run_el(); - /* - * Run the errata work around checks on the boot CPU, once we have - * initialised the cpu feature infrastructure from - * cpuinfo_store_boot_cpu() above. - */ - update_cpu_errata_workarounds(); -} - -static u64 __init of_get_cpu_mpidr(struct device_node *dn) -{ - const __be32 *cell; - u64 hwid; /* - * A cpu node with missing "reg" property is - * considered invalid to build a cpu_logical_map - * entry. + * We now know enough about the boot CPU to apply the + * alternatives that cannot wait until interrupt handling + * and/or scheduling is enabled. 
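cpu_die_early() above replaces an #ifdef CONFIG_HOTPLUG_CPU block with if (IS_ENABLED(CONFIG_HOTPLUG_CPU)), keeping the hotplug branch visible to the compiler in every configuration. A small kernel-context sketch of the idiom; the function name and messages are illustrative only:

    #include <linux/kconfig.h>
    #include <linux/printk.h>

    /*
     * Unlike an #ifdef block, both branches below are parsed and
     * type-checked even when CONFIG_HOTPLUG_CPU is off; the dead branch
     * is discarded because IS_ENABLED() expands to a constant 0 or 1.
     */
    static void example_report_park(int cpu)
    {
        if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
            pr_debug("CPU%d: will try the hotplug die path\n", cpu);
        else
            pr_debug("CPU%d: no hotplug support built in\n", cpu);
    }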
*/ - cell = of_get_property(dn, "reg", NULL); - if (!cell) { - pr_err("%pOF: missing reg property\n", dn); - return INVALID_HWID; - } + apply_boot_alternatives(); - hwid = of_read_number(cell, of_n_addr_cells(dn)); - /* - * Non affinity bits must be set to 0 in the DT - */ - if (hwid & ~MPIDR_HWID_BITMASK) { - pr_err("%pOF: invalid reg property\n", dn); - return INVALID_HWID; - } - return hwid; + /* Conditionally switch to GIC PMR for interrupt masking */ + if (system_uses_irq_prio_masking()) + init_gic_priority_masking(); + + kasan_init_hw_tags(); } /* @@ -506,10 +488,13 @@ static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid) */ static int __init smp_cpu_setup(int cpu) { - if (cpu_read_ops(cpu)) + const struct cpu_operations *ops; + + if (init_cpu_ops(cpu)) return -ENODEV; - if (cpu_ops[cpu]->cpu_init(cpu)) + ops = get_cpu_ops(cpu); + if (ops->cpu_init(cpu)) return -ENODEV; set_cpu_possible(cpu, true); @@ -527,6 +512,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry @@ -563,7 +549,6 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) } bootcpu_valid = true; cpu_madt_gicc[0] = *processor; - early_map_cpu_to_node(0, acpi_numa_get_nid(0, hwid)); return; } @@ -571,7 +556,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) return; /* map the logical cpu id to cpu MPIDR */ - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); cpu_madt_gicc[cpu_count] = *processor; @@ -586,13 +571,11 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) */ acpi_set_mailbox_entry(cpu_count, processor); - early_map_cpu_to_node(cpu_count, acpi_numa_get_nid(cpu_count, hwid)); - cpu_count++; } static int __init -acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, +acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -601,14 +584,40 @@ acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, if (BAD_MADT_GICC_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_map_gic_cpu_interface(processor); return 0; } + +static void __init acpi_parse_and_init_cpus(void) +{ + int i; + + /* + * do a walk of MADT to determine how many CPUs + * we have including disabled CPUs, and get information + * we need for SMP init. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + acpi_parse_gic_cpu_interface, 0); + + /* + * In ACPI, SMP and CPU NUMA information is provided in separate + * static tables, namely the MADT and the SRAT. + * + * Thus, it is simpler to first create the cpu logical map through + * an MADT walk and then map the logical cpus to their node ids + * as separate steps. + */ + acpi_map_cpus_to_nodes(); + + for (i = 0; i < nr_cpu_ids; i++) + early_map_cpu_to_node(i, acpi_numa_get_nid(i)); +} #else -#define acpi_table_parse_madt(...) do { } while (0) +#define acpi_parse_and_init_cpus(...) 
do { } while (0) #endif /* @@ -620,10 +629,10 @@ static void __init of_parse_and_init_cpus(void) { struct device_node *dn; - for_each_node_by_type(dn, "cpu") { - u64 hwid = of_get_cpu_mpidr(dn); + for_each_of_cpu_node(dn) { + u64 hwid = of_get_cpu_hwid(dn, 0); - if (hwid == INVALID_HWID) + if (hwid & ~MPIDR_HWID_BITMASK) goto next; if (is_mpidr_duplicate(cpu_count, hwid)) { @@ -661,7 +670,7 @@ static void __init of_parse_and_init_cpus(void) goto next; pr_debug("cpu logical map 0x%llx\n", hwid); - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); early_map_cpu_to_node(cpu_count, of_node_to_nid(dn)); next: @@ -681,13 +690,7 @@ void __init smp_init_cpus(void) if (acpi_disabled) of_parse_and_init_cpus(); else - /* - * do a walk of MADT to determine how many CPUs - * we have including disabled CPUs, and get information - * we need for SMP init - */ - acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, - acpi_parse_gic_cpu_interface, 0); + acpi_parse_and_init_cpus(); if (cpu_count > nr_cpu_ids) pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n", @@ -708,13 +711,14 @@ void __init smp_init_cpus(void) for (i = 1; i < nr_cpu_ids; i++) { if (cpu_logical_map(i) != INVALID_HWID) { if (smp_cpu_setup(i)) - cpu_logical_map(i) = INVALID_HWID; + set_cpu_logical_map(i, INVALID_HWID); } } } void __init smp_prepare_cpus(unsigned int max_cpus) { + const struct cpu_operations *ops; int err; unsigned int cpu; unsigned int this_cpu; @@ -724,6 +728,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) this_cpu = smp_processor_id(); store_cpu_topology(this_cpu); numa_store_cpu_info(this_cpu); + numa_add_cpu(this_cpu); /* * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set @@ -744,10 +749,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (cpu == smp_processor_id()) continue; - if (!cpu_ops[cpu]) + ops = get_cpu_ops(cpu); + if (!ops) continue; - err = cpu_ops[cpu]->cpu_prepare(cpu); + err = ops->cpu_prepare(cpu); if (err) continue; @@ -756,31 +762,21 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } -void (*__smp_cross_call)(const struct cpumask *, unsigned int); - -void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) -{ - __smp_cross_call = fn; -} - static const char *ipi_types[NR_IPI] __tracepoint_string = { -#define S(x,s) [x] = s - S(IPI_RESCHEDULE, "Rescheduling interrupts"), - S(IPI_CALL_FUNC, "Function call interrupts"), - S(IPI_CPU_STOP, "CPU stop interrupts"), - S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"), - S(IPI_TIMER, "Timer broadcast interrupts"), - S(IPI_IRQ_WORK, "IRQ work interrupts"), - S(IPI_WAKEUP, "CPU wake-up interrupts"), + [IPI_RESCHEDULE] = "Rescheduling interrupts", + [IPI_CALL_FUNC] = "Function call interrupts", + [IPI_CPU_STOP] = "CPU stop interrupts", + [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_TIMER] = "Timer broadcast interrupts", + [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_WAKEUP] = "CPU wake-up interrupts", }; -static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) -{ - trace_ipi_raise(target, ipi_types[ipinr]); - __smp_cross_call(target, ipinr); -} +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); + +unsigned long irq_err_count; -void show_ipi_list(struct seq_file *p, int prec) +int arch_show_interrupts(struct seq_file *p, int prec) { unsigned int cpu, i; @@ -788,21 +784,12 @@ void show_ipi_list(struct seq_file *p, int prec) seq_printf(p, "%*s%u:%s", prec - 1, 
"IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", - __get_irq_stat(cpu, ipi_irqs[i])); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_types[i]); } -} - -u64 smp_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = 0; - int i; - - for (i = 0; i < NR_IPI; i++) - sum += __get_irq_stat(cpu, ipi_irqs[i]); - return sum; + seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); + return 0; } void arch_send_call_function_ipi_mask(const struct cpumask *mask) @@ -825,22 +812,27 @@ void arch_send_wakeup_ipi_mask(const struct cpumask *mask) #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - if (__smp_cross_call) - smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -/* - * ipi_cpu_stop - handle IPI from smp_send_stop() - */ -static void ipi_cpu_stop(unsigned int cpu) +static void local_cpu_stop(void) { - set_cpu_online(cpu, false); + set_cpu_online(smp_processor_id(), false); - local_irq_disable(); + local_daif_mask(); + sdei_mask_local_cpu(); + cpu_park_loop(); +} - while (1) - cpu_relax(); +/* + * We need to implement panic_smp_self_stop() for parallel panic() calls, so + * that cpu_online_mask gets correctly updated and smp_send_stop() can skip + * CPUs that have already stopped themselves. + */ +void panic_smp_self_stop(void) +{ + local_cpu_stop(); } #ifdef CONFIG_KEXEC_CORE @@ -855,11 +847,10 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) atomic_dec(&waiting_for_crash_ipi); local_irq_disable(); + sdei_mask_local_cpu(); -#ifdef CONFIG_HOTPLUG_CPU - if (cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + __cpu_try_die(cpu); /* just in case */ cpu_park_loop(); @@ -869,15 +860,12 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) /* * Main handler for inter-processor interrupts */ -void handle_IPI(int ipinr, struct pt_regs *regs) +static void do_handle_IPI(int ipinr) { unsigned int cpu = smp_processor_id(); - struct pt_regs *old_regs = set_irq_regs(regs); - if ((unsigned)ipinr < NR_IPI) { + if ((unsigned)ipinr < NR_IPI) trace_ipi_entry_rcuidle(ipi_types[ipinr]); - __inc_irq_stat(cpu, ipi_irqs[ipinr]); - } switch (ipinr) { case IPI_RESCHEDULE: @@ -885,21 +873,16 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CALL_FUNC: - irq_enter(); generic_smp_call_function_interrupt(); - irq_exit(); break; case IPI_CPU_STOP: - irq_enter(); - ipi_cpu_stop(cpu); - irq_exit(); + local_cpu_stop(); break; case IPI_CPU_CRASH_STOP: if (IS_ENABLED(CONFIG_KEXEC_CORE)) { - irq_enter(); - ipi_cpu_crash_stop(cpu, regs); + ipi_cpu_crash_stop(cpu, get_irq_regs()); unreachable(); } @@ -907,17 +890,13 @@ void handle_IPI(int ipinr, struct pt_regs *regs) #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST case IPI_TIMER: - irq_enter(); tick_receive_broadcast(); - irq_exit(); break; #endif #ifdef CONFIG_IRQ_WORK case IPI_IRQ_WORK: - irq_enter(); irq_work_run(); - irq_exit(); break; #endif @@ -936,7 +915,66 @@ void handle_IPI(int ipinr, struct pt_regs *regs) if ((unsigned)ipinr < NR_IPI) trace_ipi_exit_rcuidle(ipi_types[ipinr]); - set_irq_regs(old_regs); +} + +static irqreturn_t ipi_handler(int irq, void *data) +{ + do_handle_IPI(irq - ipi_irq_base); + return IRQ_HANDLED; +} + +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) +{ + trace_ipi_raise(target, ipi_types[ipinr]); + __ipi_send_mask(ipi_desc[ipinr], target); +} + +static void 
ipi_setup(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) + enable_percpu_irq(ipi_irq_base + i, 0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) + disable_percpu_irq(ipi_irq_base + i); +} +#endif + +void __init set_smp_ipi_range(int ipi_base, int n) +{ + int i; + + WARN_ON(n < NR_IPI); + nr_ipi = min(n, NR_IPI); + + for (i = 0; i < nr_ipi; i++) { + int err; + + err = request_percpu_irq(ipi_base + i, ipi_handler, + "IPI", &cpu_number); + WARN_ON(err); + + ipi_desc[i] = irq_to_desc(ipi_base + i); + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + } + + ipi_irq_base = ipi_base; + + /* Setup the boot CPU immediately */ + ipi_setup(smp_processor_id()); } void smp_send_reschedule(int cpu) @@ -951,11 +989,22 @@ void tick_broadcast(const struct cpumask *mask) } #endif +/* + * The number of CPUs online, not counting this CPU (which may not be + * fully online and so not counted in num_online_cpus()). + */ +static inline unsigned int num_other_online_cpus(void) +{ + unsigned int this_cpu_online = cpu_online(smp_processor_id()); + + return num_online_cpus() - this_cpu_online; +} + void smp_send_stop(void) { unsigned long timeout; - if (num_online_cpus() > 1) { + if (num_other_online_cpus()) { cpumask_t mask; cpumask_copy(&mask, cpu_online_mask); @@ -968,12 +1017,14 @@ void smp_send_stop(void) /* Wait up to one second for other CPUs to stop */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && timeout--) + while (num_other_online_cpus() && timeout--) udelay(1); - if (num_online_cpus() > 1) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + if (num_other_online_cpus()) + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(cpu_online_mask)); + + sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE @@ -992,13 +1043,19 @@ void crash_smp_send_stop(void) cpus_stopped = 1; - if (num_online_cpus() == 1) + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. 
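set_smp_ipi_range() and ipi_setup() above move IPIs onto ordinary per-CPU interrupts: the range is registered once with request_percpu_irq() and then enabled on each CPU as it comes online. The sketch below shows the same per-CPU IRQ pattern from a driver's point of view; the IRQ number, name and per-CPU token are hypothetical.

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(int, example_dev_id);   /* hypothetical per-CPU token */

    static irqreturn_t example_ppi_handler(int irq, void *dev_id)
    {
        /* Runs on whichever CPU received the per-CPU interrupt. */
        return IRQ_HANDLED;
    }

    /* Register once, then enable on the local CPU (repeated as CPUs come up). */
    static int example_register_ppi(int irq)
    {
        int err = request_percpu_irq(irq, example_ppi_handler,
                                     "example-ppi", &example_dev_id);
        if (err)
            return err;

        enable_percpu_irq(irq, IRQ_TYPE_NONE);
        return 0;
    }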
+ */ + if (num_other_online_cpus() == 0) { + sdei_mask_local_cpu(); return; + } cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_CRASH_STOP); @@ -1009,8 +1066,10 @@ void crash_smp_send_stop(void) udelay(1); if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + sdei_mask_local_cpu(); } bool smp_crash_stop_failed(void) @@ -1019,20 +1078,13 @@ bool smp_crash_stop_failed(void) } #endif -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static bool have_cpu_die(void) { #ifdef CONFIG_HOTPLUG_CPU int any_cpu = raw_smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(any_cpu); - if (cpu_ops[any_cpu] && cpu_ops[any_cpu]->cpu_die) + if (ops && ops->cpu_die) return true; #endif return false; @@ -1042,5 +1094,6 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 93034651c87e..49029eace3ad 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Spin Table SMP initialisation * * Copyright (C) 2013 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/delay.h> @@ -47,7 +36,7 @@ static void write_pen_release(u64 val) unsigned long size = sizeof(secondary_holding_pen_release); secondary_holding_pen_release = val; - __flush_dcache_area(start, size); + dcache_clean_inval_poc((unsigned long)start, (unsigned long)start + size); } @@ -77,6 +66,7 @@ static int smp_spin_table_cpu_init(unsigned int cpu) static int smp_spin_table_cpu_prepare(unsigned int cpu) { __le64 __iomem *release_addr; + phys_addr_t pa_holding_pen = __pa_symbol(secondary_holding_pen); if (!cpu_release_addr[cpu]) return -ENODEV; @@ -94,14 +84,15 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) /* * We write the release address as LE regardless of the native - * endianess of the kernel. Therefore, any boot-loaders that + * endianness of the kernel. Therefore, any boot-loaders that * read this address need to convert this address to the - * boot-loader's endianess before jumping. This is mandated by + * boot-loader's endianness before jumping. This is mandated by * the boot protocol. 
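As the comment above notes, the spin-table release address is published little-endian regardless of the kernel's endianness, so firmware only ever has to parse one byte order. A standalone sketch of publishing a value in a fixed byte order; the mailbox here is just a local array standing in for the release-address location:

    #include <stdint.h>
    #include <stdio.h>

    /* Store a 64-bit value as little-endian bytes, independent of the
     * CPU's native endianness -- the userspace analogue of an __le64 store. */
    static void put_le64(uint8_t *out, uint64_t v)
    {
        for (int i = 0; i < 8; i++)
            out[i] = (uint8_t)(v >> (8 * i));
    }

    int main(void)
    {
        uint8_t mailbox[8];   /* stand-in for the release-address slot */

        put_le64(mailbox, 0x80000000ULL);
        for (int i = 0; i < 8; i++)
            printf("%02x ", mailbox[i]);
        printf("\n");
        return 0;
    }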
*/ - writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); - __flush_dcache_area((__force void *)release_addr, - sizeof(*release_addr)); + writeq_relaxed(pa_holding_pen, release_addr); + dcache_clean_inval_poc((__force unsigned long)release_addr, + (__force unsigned long)release_addr + + sizeof(*release_addr)); /* * Send an event to wake up the secondary CPU. diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 76809ccd309c..634279b3b03d 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Stack tracing support * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kernel.h> #include <linux/export.h> @@ -28,161 +17,204 @@ #include <asm/stacktrace.h> /* - * AArch64 PCS assigns the frame pointer to x29. + * Start an unwind from a pt_regs. * - * A simple function prologue looks like this: - * sub sp, sp, #0x10 - * stp x29, x30, [sp] - * mov x29, sp + * The unwind will begin at the PC within the regs. * - * A simple function epilogue looks like this: - * mov sp, x29 - * ldp x29, x30, [sp] - * add sp, sp, #0x10 + * The regs must be on a stack currently owned by the calling task. */ -int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) +static inline void unwind_init_from_regs(struct unwind_state *state, + struct pt_regs *regs) { - unsigned long fp = frame->fp; + unwind_init_common(state, current); - if (fp & 0xf) - return -EINVAL; + state->fp = regs->regs[29]; + state->pc = regs->pc; +} - if (!tsk) - tsk = current; +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void unwind_init_from_caller(struct unwind_state *state) +{ + unwind_init_common(state, current); + + state->fp = (unsigned long)__builtin_frame_address(1); + state->pc = (unsigned long)__builtin_return_address(0); +} + +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked tasks saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static inline void unwind_init_from_task(struct unwind_state *state, + struct task_struct *task) +{ + unwind_init_common(state, task); - if (!on_accessible_stack(tsk, fp)) - return -EINVAL; + state->fp = thread_saved_fp(task); + state->pc = thread_saved_pc(task); +} - frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. 
a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. + */ +static int notrace unwind_next(struct unwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->fp; + int err; + + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + err = unwind_next_frame_record(state); + if (err) + return err; + + state->pc = ptrauth_strip_insn_pac(state->pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (tsk->ret_stack && - (frame->pc == (unsigned long)return_to_handler)) { + (state->pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; /* * This is a case where function graph tracer has * modified a return address (LR) in a stack frame * to hook a function return. * So replace it to an original value. */ - frame->pc = tsk->ret_stack[frame->graph--].ret; + orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, + (void *)state->fp); + if (WARN_ON_ONCE(state->pc == orig_pc)) + return -EINVAL; + state->pc = orig_pc; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. - */ - if (!frame->fp && !frame->pc) - return -EINVAL; +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->pc)) + state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); +#endif return 0; } +NOKPROBE_SYMBOL(unwind_next); -void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) { while (1) { int ret; - if (fn(frame, data)) + if (!consume_entry(cookie, state->pc)) break; - ret = unwind_frame(tsk, frame); + ret = unwind_next(state); if (ret < 0) break; } } +NOKPROBE_SYMBOL(unwind); -#ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; - -static int save_trace(struct stackframe *frame, void *d) +static bool dump_backtrace_entry(void *arg, unsigned long where) { - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; - - if (data->no_sched_functions && in_sched_functions(addr)) - return 0; - if (data->skip) { - data->skip--; - return 0; - } - - trace->entries[trace->nr_entries++] = addr; - - return trace->nr_entries >= trace->max_entries; + char *loglvl = arg; + printk("%s %pSb\n", loglvl, (void *)where); + return true; } -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - struct stack_trace_data data; - struct stackframe frame; - - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = 0; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = current->curr_ret_stack; -#endif - - walk_stackframe(current, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} + if (regs && user_mode(regs)) + return; -static noinline void __save_stack_trace(struct task_struct *tsk, - struct stack_trace 
*trace, unsigned int nosched) -{ - struct stack_trace_data data; - struct stackframe frame; + if (!tsk) + tsk = current; if (!try_get_task_stack(tsk)) return; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = nosched; - - if (tsk != current) { - frame.fp = thread_saved_fp(tsk); - frame.pc = thread_saved_pc(tsk); - } else { - /* We don't want this function nor the caller */ - data.skip += 2; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)__save_stack_trace; - } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = tsk->curr_ret_stack; -#endif - - walk_stackframe(tsk, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + printk("%sCall trace:\n", loglvl); + arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - __save_stack_trace(tsk, trace, 1); + dump_backtrace(NULL, tsk, loglvl); + barrier(); } -void save_stack_trace(struct stack_trace *trace) -{ - __save_stack_trace(current, trace, 0); -} +/* + * Per-cpu stacks are only accessible when unwinding the current task in a + * non-preemptible context. + */ +#define STACKINFO_CPU(name) \ + ({ \ + ((task == current) && !preemptible()) \ + ? stackinfo_get_##name() \ + : stackinfo_get_unknown(); \ + }) -EXPORT_SYMBOL_GPL(save_stack_trace); +/* + * SDEI stacks are only accessible when unwinding the current task in an NMI + * context. + */ +#define STACKINFO_SDEI(name) \ + ({ \ + ((task == current) && in_nmi()) \ + ? stackinfo_get_sdei_##name() \ + : stackinfo_get_unknown(); \ + }) + +noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct stack_info stacks[] = { + stackinfo_get_task(task), + STACKINFO_CPU(irq), +#if defined(CONFIG_VMAP_STACK) + STACKINFO_CPU(overflow), #endif +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) + STACKINFO_SDEI(normal), + STACKINFO_SDEI(critical), +#endif + }; + struct unwind_state state = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }; + + if (regs) { + if (task != current) + return; + unwind_init_from_regs(&state, regs); + } else if (task == current) { + unwind_init_from_caller(&state); + } else { + unwind_init_from_task(&state, task); + } + + unwind(&state, consume_entry, cookie); +} diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 1e3be9064cfa..8b02d310838f 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -1,17 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/pgtable.h> #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> +#include <asm/cpuidle.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/exec.h> -#include <asm/pgtable.h> +#include <asm/mte.h> #include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/smp_plat.h> #include <asm/suspend.h> -#include <asm/tlbflush.h> /* * This is allocated by cpu_suspend_init(), and used to store a pointer to @@ -39,6 +43,8 @@ void notrace __cpu_suspend_exit(void) { unsigned int cpu = smp_processor_id(); + mte_suspend_exit(); + /* * We are 
resuming from reset with the idmap active in TTBR0_EL1. * We must uninstall the idmap and restore the expected MMU @@ -46,21 +52,33 @@ void notrace __cpu_suspend_exit(void) */ cpu_uninstall_idmap(); + /* Restore CnP bit in TTBR1_EL1 */ + if (system_supports_cnp()) + cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, - CONFIG_ARM64_PAN)); - uao_thread_switch(current); + __uaccess_enable_hw_pan(); /* * Restore HW breakpoint registers to sane values * before debug exceptions are possibly reenabled - * through local_dbg_restore. + * by cpu_suspend()s local_daif_restore() call. */ if (hw_breakpoint_restore) hw_breakpoint_restore(cpu); + + /* + * On resume, firmware implementing dynamic mitigation will + * have turned the mitigation on. If the user has forcefully + * disabled it, make sure their wishes are obeyed. + */ + spectre_v4_enable_mitigation(NULL); + + /* Restore additional feature-specific configuration */ + ptrauth_suspend_exit(); } /* @@ -75,21 +93,31 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int ret = 0; unsigned long flags; struct sleep_stack_data state; + struct arm_cpuidle_irq_context context; + + /* Report any MTE async fault before going to suspend */ + mte_suspend_enter(); /* * From this point debug exceptions are disabled to prevent * updates to mdscr register (saved and restored along with * general purpose registers) from kernel debuggers. */ - local_dbg_save(flags); + flags = local_daif_save(); /* - * Function graph tracer state gets incosistent when the kernel + * Function graph tracer state gets inconsistent when the kernel * calls functions that never return (aka suspend finishers) hence * disable graph tracing during their execution. */ pause_graph_tracing(); + /* + * Switch to using DAIF.IF instead of PMR in order to reliably + * resume if we're using pseudo-NMIs. + */ + arm_cpuidle_save_irq_context(&context); + if (__cpu_suspend_enter(&state)) { /* Call the suspend finisher */ ret = fn(arg); @@ -104,17 +132,19 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) if (!ret) ret = -EOPNOTSUPP; } else { - __cpu_suspend_exit(); + RCU_NONIDLE(__cpu_suspend_exit()); } + arm_cpuidle_restore_irq_context(&context); + unpause_graph_tracing(); /* * Restore pstate flags. OS lock and mdscr have been already * restored, so from this point onwards, debugging is fully - * renabled if it was enabled when core started shutdown. + * reenabled if it was enabled when core started shutdown. */ - local_dbg_restore(flags); + local_daif_restore(flags); return ret; } diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index 26fe8ea93ea2..d5ffaaab31a7 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64-specific system calls implementation * * Copyright (C) 2012 ARM Ltd. * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
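cpu_suspend() above takes an arbitrary argument plus a "suspend finisher" callback; the finisher runs with the MMU still on, is expected to hand control to firmware and, on success, never returns (the CPU comes back through cpu_resume() instead), which is why a plain return is forced to an error. A kernel-context sketch of the shape of such a caller; the stub finisher simply reports failure so the example stays self-contained rather than invoking any real firmware interface:

    #include <linux/errno.h>
    #include <asm/suspend.h>

    /* A real finisher would issue the firmware (e.g. PSCI) call here and
     * only fall through on failure. */
    static int example_finisher(unsigned long arg)
    {
        return -EOPNOTSUPP;
    }

    static int example_enter_idle_state(void)
    {
        /* 0 is passed through unchanged to example_finisher(). */
        return cpu_suspend(0, example_finisher);
    }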
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/compiler.h> @@ -25,16 +14,18 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/syscalls.h> + #include <asm/cpufeature.h> +#include <asm/syscall.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { if (offset_in_page(off) != 0) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) @@ -42,24 +33,29 @@ SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) if (personality(personality) == PER_LINUX32 && !system_supports_32bit_el0()) return -EINVAL; - return sys_personality(personality); + return ksys_personality(personality); +} + +asmlinkage long sys_ni_syscall(void); + +asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused) +{ + return sys_ni_syscall(); } /* * Wrappers to pass the pt_regs argument. */ -asmlinkage long sys_rt_sigreturn_wrapper(void); -#define sys_rt_sigreturn sys_rt_sigreturn_wrapper -#define sys_personality sys_arm64_personality +#define __arm64_sys_personality __arm64_sys_arm64_personality #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, +#define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); +#include <asm/unistd.h> -/* - * The sys_call_table array must be 4K aligned to be accessible from - * kernel/entry.S. - */ -void * const sys_call_table[__NR_syscalls] __aligned(4096) = { - [0 ... __NR_syscalls - 1] = sys_ni_syscall, +#undef __SYSCALL +#define __SYSCALL(nr, sym) [nr] = __arm64_##sym, + +const syscall_fn_t sys_call_table[__NR_syscalls] = { + [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall, #include <asm/unistd.h> }; diff --git a/arch/arm64/kernel/sys32.c b/arch/arm64/kernel/sys32.c index a40b1343b819..fc40386afb1b 100644 --- a/arch/arm64/kernel/sys32.c +++ b/arch/arm64/kernel/sys32.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/sys32.c * * Copyright (C) 2015 ARM Ltd. - * - * This program is free software(void); you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http(void);//www.gnu.org/licenses/>. 
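The rewritten sys.c above builds the syscall table by expanding the same __SYSCALL() list twice: once to declare the pt_regs-based wrappers emitted by SYSCALL_DEFINEx(), and once to fill the designated initialisers. A standalone sketch of that two-pass pattern, using mock names rather than the real <asm/unistd.h> list:

/* Standalone mock, not kernel code: illustrates the two-pass __SYSCALL trick. */
#include <stdio.h>

struct pt_regs { unsigned long regs[31]; };
typedef long (*syscall_fn_t)(const struct pt_regs *);

/* Stand-in for <asm/unistd.h>: one __SYSCALL() invocation per syscall. */
#define MOCK_SYSCALL_LIST \
	__SYSCALL(0, sys_getpid) \
	__SYSCALL(1, sys_getppid)

/* Pass 1: prototypes for the wrappers (the real ones come from SYSCALL_DEFINEx). */
#define __SYSCALL(nr, sym) long __arm64_##sym(const struct pt_regs *);
MOCK_SYSCALL_LIST
#undef __SYSCALL

long __arm64_sys_getpid(const struct pt_regs *regs)  { (void)regs; return 42; }
long __arm64_sys_getppid(const struct pt_regs *regs) { (void)regs; return 41; }

/* Pass 2: designated initialisers keyed by syscall number. */
#define __SYSCALL(nr, sym) [nr] = __arm64_##sym,
static const syscall_fn_t mock_sys_call_table[] = { MOCK_SYSCALL_LIST };
#undef __SYSCALL

int main(void)
{
	struct pt_regs regs = { { 0 } };

	printf("syscall 0 returns %ld\n", mock_sys_call_table[0](&regs));
	return 0;
}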
*/ /* @@ -22,31 +11,125 @@ */ #define __COMPAT_SYSCALL_NR +#include <linux/compat.h> #include <linux/compiler.h> #include <linux/syscalls.h> -asmlinkage long compat_sys_sigreturn_wrapper(void); -asmlinkage long compat_sys_rt_sigreturn_wrapper(void); -asmlinkage long compat_sys_statfs64_wrapper(void); -asmlinkage long compat_sys_fstatfs64_wrapper(void); -asmlinkage long compat_sys_pread64_wrapper(void); -asmlinkage long compat_sys_pwrite64_wrapper(void); -asmlinkage long compat_sys_truncate64_wrapper(void); -asmlinkage long compat_sys_ftruncate64_wrapper(void); -asmlinkage long compat_sys_readahead_wrapper(void); -asmlinkage long compat_sys_fadvise64_64_wrapper(void); -asmlinkage long compat_sys_sync_file_range2_wrapper(void); -asmlinkage long compat_sys_fallocate_wrapper(void); -asmlinkage long compat_sys_mmap2_wrapper(void); +#include <asm/syscall.h> -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, +asmlinkage long compat_sys_sigreturn(void); +asmlinkage long compat_sys_rt_sigreturn(void); + +COMPAT_SYSCALL_DEFINE3(aarch32_statfs64, const char __user *, pathname, + compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + /* + * 32-bit ARM applies an OABI compatibility fixup to statfs64 and + * fstatfs64 regardless of whether OABI is in use, and therefore + * arbitrary binaries may rely upon it, so we must do the same. + * For more details, see commit: + * + * 713c481519f19df9 ("[ARM] 3108/2: old ABI compat: statfs64 and + * fstatfs64") + */ + if (sz == 88) + sz = 84; + + return kcompat_sys_statfs64(pathname, sz, buf); +} + +COMPAT_SYSCALL_DEFINE3(aarch32_fstatfs64, unsigned int, fd, compat_size_t, sz, + struct compat_statfs64 __user *, buf) +{ + /* see aarch32_statfs64 */ + if (sz == 88) + sz = 84; + + return kcompat_sys_fstatfs64(fd, sz, buf); +} /* - * The sys_call_table array must be 4K aligned to be accessible from - * kernel/entry.S. + * Note: off_4k is always in units of 4K. If we can't do the + * requested offset because it is not page-aligned, we return -EINVAL. */ -void * const compat_sys_call_table[__NR_compat_syscalls] __aligned(4096) = { - [0 ... 
__NR_compat_syscalls - 1] = sys_ni_syscall, +COMPAT_SYSCALL_DEFINE6(aarch32_mmap2, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off_4k) +{ + if (off_4k & (~PAGE_MASK >> 12)) + return -EINVAL; + + off_4k >>= (PAGE_SHIFT - 12); + + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off_4k); +} + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define arg_u32p(name) u32, name##_hi, u32, name##_lo +#else +#define arg_u32p(name) u32, name##_lo, u32, name##_hi +#endif + +#define arg_u64(name) (((u64)name##_hi << 32) | name##_lo) + +COMPAT_SYSCALL_DEFINE6(aarch32_pread64, unsigned int, fd, char __user *, buf, + size_t, count, u32, __pad, arg_u32p(pos)) +{ + return ksys_pread64(fd, buf, count, arg_u64(pos)); +} + +COMPAT_SYSCALL_DEFINE6(aarch32_pwrite64, unsigned int, fd, + const char __user *, buf, size_t, count, u32, __pad, + arg_u32p(pos)) +{ + return ksys_pwrite64(fd, buf, count, arg_u64(pos)); +} + +COMPAT_SYSCALL_DEFINE4(aarch32_truncate64, const char __user *, pathname, + u32, __pad, arg_u32p(length)) +{ + return ksys_truncate(pathname, arg_u64(length)); +} + +COMPAT_SYSCALL_DEFINE4(aarch32_ftruncate64, unsigned int, fd, u32, __pad, + arg_u32p(length)) +{ + return ksys_ftruncate(fd, arg_u64(length)); +} + +COMPAT_SYSCALL_DEFINE5(aarch32_readahead, int, fd, u32, __pad, + arg_u32p(offset), size_t, count) +{ + return ksys_readahead(fd, arg_u64(offset), count); +} + +COMPAT_SYSCALL_DEFINE6(aarch32_fadvise64_64, int, fd, int, advice, + arg_u32p(offset), arg_u32p(len)) +{ + return ksys_fadvise64_64(fd, arg_u64(offset), arg_u64(len), advice); +} + +COMPAT_SYSCALL_DEFINE6(aarch32_sync_file_range2, int, fd, unsigned int, flags, + arg_u32p(offset), arg_u32p(nbytes)) +{ + return ksys_sync_file_range(fd, arg_u64(offset), arg_u64(nbytes), + flags); +} + +COMPAT_SYSCALL_DEFINE6(aarch32_fallocate, int, fd, int, mode, + arg_u32p(offset), arg_u32p(len)) +{ + return ksys_fallocate(fd, mode, arg_u64(offset), arg_u64(len)); +} + +#undef __SYSCALL +#define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); +#include <asm/unistd32.h> + +#undef __SYSCALL +#define __SYSCALL(nr, sym) [nr] = __arm64_##sym, + +const syscall_fn_t compat_sys_call_table[__NR_compat_syscalls] = { + [0 ... __NR_compat_syscalls - 1] = __arm64_sys_ni_syscall, #include <asm/unistd32.h> }; diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 8b8bbd3eaa52..df14336c3a29 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -1,25 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/sys_arm.c * * Copyright (C) People who wrote linux/arch/i386/kernel/sys_i386.c * Copyright (C) 1995, 1996 Russell King. * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
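The arg_u32p()/arg_u64() helpers above exist because a 64-bit argument from a 32-bit caller arrives split across two registers, in an order that depends on endianness. A minimal userspace sketch of the reassembly (values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

static uint64_t reassemble(uint32_t pos_lo, uint32_t pos_hi)
{
	/* Same shape as arg_u64(pos): (((u64)pos_hi << 32) | pos_lo) */
	return ((uint64_t)pos_hi << 32) | pos_lo;
}

int main(void)
{
	/* A 6 GiB file offset split by a 32-bit caller into two registers. */
	uint32_t lo = 0x80000000u, hi = 0x1u;

	printf("offset = %llu\n", (unsigned long long)reassemble(lo, hi));
	return 0;
}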
*/ #include <linux/compat.h> -#include <linux/personality.h> +#include <linux/cpufeature.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> @@ -27,6 +16,8 @@ #include <linux/uaccess.h> #include <asm/cacheflush.h> +#include <asm/system_misc.h> +#include <asm/tlbflush.h> #include <asm/unistd.h> static long @@ -40,7 +31,16 @@ __do_compat_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = __flush_cache_user_range(start, start + chunk); + if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + /* + * The workaround requires an inner-shareable tlbi. + * We pick the reserved-ASID to minimise the impact. + */ + __tlbi(aside1is, __TLBI_VADDR(0, 0)); + dsb(ish); + } + + ret = caches_clean_inval_user_pou(start, start + chunk); if (ret) return ret; @@ -57,7 +57,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) if (end < start || flags) return -EINVAL; - if (!access_ok(VERIFY_READ, start, end - start)) + if (!access_ok((const void __user *)start, end - start)) return -EFAULT; return __do_compat_cache_op(start, end); @@ -65,11 +65,11 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) /* * Handle all unrecognised system calls. */ -long compat_arm_syscall(struct pt_regs *regs) +long compat_arm_syscall(struct pt_regs *regs, int scno) { - unsigned int no = regs->regs[7]; + unsigned long addr; - switch (no) { + switch (scno) { /* * Flush a region from virtual address 'r0' to virtual address 'r1' * _exclusive_. There is no alignment requirement on either address; @@ -88,7 +88,7 @@ long compat_arm_syscall(struct pt_regs *regs) return do_compat_cache_op(regs->regs[0], regs->regs[1], regs->regs[2]); case __ARM_NR_compat_set_tls: - current->thread.tp_value = regs->regs[0]; + current->thread.uw.tp_value = regs->regs[0]; /* * Protect against register corruption from context switch. @@ -99,6 +99,20 @@ long compat_arm_syscall(struct pt_regs *regs) return 0; default: - return -ENOSYS; + /* + * Calls 0xf0xxx..0xf07ff are defined to return -ENOSYS + * if not implemented, rather than raising SIGILL. This + * way the calling program can gracefully determine whether + * a feature is supported. + */ + if (scno < __ARM_NR_COMPAT_END) + return -ENOSYS; + break; } + + addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 
2 : 4); + + arm64_notify_die("Oops - bad compat syscall(2)", regs, + SIGILL, ILL_ILLTRP, addr, 0); + return 0; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c new file mode 100644 index 000000000000..d72e8f23422d --- /dev/null +++ b/arch/arm64/kernel/syscall.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/compiler.h> +#include <linux/context_tracking.h> +#include <linux/errno.h> +#include <linux/nospec.h> +#include <linux/ptrace.h> +#include <linux/randomize_kstack.h> +#include <linux/syscalls.h> + +#include <asm/daifflags.h> +#include <asm/debug-monitors.h> +#include <asm/exception.h> +#include <asm/fpsimd.h> +#include <asm/syscall.h> +#include <asm/thread_info.h> +#include <asm/unistd.h> + +long compat_arm_syscall(struct pt_regs *regs, int scno); +long sys_ni_syscall(void); + +static long do_ni_syscall(struct pt_regs *regs, int scno) +{ +#ifdef CONFIG_COMPAT + long ret; + if (is_compat_task()) { + ret = compat_arm_syscall(regs, scno); + if (ret != -ENOSYS) + return ret; + } +#endif + + return sys_ni_syscall(); +} + +static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) +{ + return syscall_fn(regs); +} + +static void invoke_syscall(struct pt_regs *regs, unsigned int scno, + unsigned int sc_nr, + const syscall_fn_t syscall_table[]) +{ + long ret; + + add_random_kstack_offset(); + + if (scno < sc_nr) { + syscall_fn_t syscall_fn; + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; + ret = __invoke_syscall(regs, syscall_fn); + } else { + ret = do_ni_syscall(regs, scno); + } + + syscall_set_return_value(current, regs, 0, ret); + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for arm64 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 9 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: the AAPCS mandates a + * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * + * The resulting 5 bits of entropy is seen in SP[8:4]. + */ + choose_random_kstack_offset(get_random_u16() & 0x1FF); +} + +static inline bool has_syscall_work(unsigned long flags) +{ + return unlikely(flags & _TIF_SYSCALL_WORK); +} + +int syscall_trace_enter(struct pt_regs *regs); +void syscall_trace_exit(struct pt_regs *regs); + +static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, + const syscall_fn_t syscall_table[]) +{ + unsigned long flags = read_thread_flags(); + + regs->orig_x0 = regs->regs[0]; + regs->syscallno = scno; + + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) + */ + + local_daif_restore(DAIF_PROCCTX); + + if (flags & _TIF_MTE_ASYNC_FAULT) { + /* + * Process the asynchronous tag check fault before the actual + * syscall. do_notify_resume() will send a signal to userspace + * before the syscall is restarted. 
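The choose_random_kstack_offset() comment above can be checked with a little arithmetic: 9 random bits survive the mask, 16-byte stack alignment discards the low 4, so 32 distinct offsets (5 bits, visible in SP[8:4]) remain. A standalone sketch of that count:

#include <stdio.h>

int main(void)
{
	unsigned int distinct = 0, seen[32] = { 0 };

	for (unsigned int r = 0; r < 0x10000; r++) {
		unsigned int off = (r & 0x1FF) & ~0xFU; /* 9 bits, 16-byte aligned */

		if (!seen[off >> 4]++)
			distinct++;
	}
	printf("distinct aligned offsets: %u (5 bits of entropy)\n", distinct);
	return 0;
}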
+ */ + syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); + return; + } + + if (has_syscall_work(flags)) { + /* + * The de-facto standard way to skip a system call using ptrace + * is to set the system call to -1 (NO_SYSCALL) and set x0 to a + * suitable error code for consumption by userspace. However, + * this cannot be distinguished from a user-issued syscall(-1) + * and so we must set x0 to -ENOSYS here in case the tracer doesn't + * issue the skip and we fall into trace_exit with x0 preserved. + * + * This is slightly odd because it also means that if a tracer + * sets the system call number to -1 but does not initialise x0, + * then x0 will be preserved for all system calls apart from a + * user-issued syscall(-1). However, requesting a skip and not + * setting the return value is unlikely to do anything sensible + * anyway. + */ + if (scno == NO_SYSCALL) + syscall_set_return_value(current, regs, -ENOSYS, 0); + scno = syscall_trace_enter(regs); + if (scno == NO_SYSCALL) + goto trace_exit; + } + + invoke_syscall(regs, scno, sc_nr, syscall_table); + + /* + * The tracing status may have changed under our feet, so we have to + * check again. However, if we were tracing entry, then we always trace + * exit regardless, as the old entry assembly did. + */ + if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { + local_daif_mask(); + flags = read_thread_flags(); + if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) + return; + local_daif_restore(DAIF_PROCCTX); + } + +trace_exit: + syscall_trace_exit(regs); +} + +/* + * As per the ABI exit SME streaming mode and clear the SVE state not + * shared with FPSIMD on syscall entry. + */ +static inline void fp_user_discard(void) +{ + /* + * If SME is active then exit streaming mode. If ZA is active + * then flush the SVE registers but leave userspace access to + * both SVE and SME enabled, otherwise disable SME for the + * task and fall through to disabling SVE too. This means + * that after a syscall we never have any streaming mode + * register state to track, if this changes the KVM code will + * need updating. + */ + if (system_supports_sme() && test_thread_flag(TIF_SME)) { + u64 svcr = read_sysreg_s(SYS_SVCR); + + if (svcr & SVCR_SM_MASK) + sme_smstop_sm(); + } + + if (!system_supports_sve()) + return; + + /* + * If SME is not active then disable SVE, the registers will + * be cleared when userspace next attempts to access them and + * we do not need to track the SVE register state until then. + */ + clear_thread_flag(TIF_SVE); + + /* + * task_fpsimd_load() won't be called to update CPACR_EL1 in + * ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only + * happens if a context switch or kernel_neon_begin() or context + * modification (sigreturn, ptrace) intervenes. + * So, ensure that CPACR_EL1 is already correct for the fast-path case. 
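For the ptrace skip convention described in el0_svc_common() above, a tracer stopped at syscall entry writes -1 (NO_SYSCALL) through the NT_ARM_SYSTEM_CALL regset and supplies the return value in x0. A hedged userspace sketch of that sequence (arm64 host assumed, error handling trimmed, exact headers may vary by libc; this is not taken from the patch):

#include <elf.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <asm/ptrace.h>

static void skip_syscall(pid_t pid, long fake_ret)
{
	int nr = -1;				/* NO_SYSCALL */
	struct user_pt_regs regs;
	struct iovec iov = { .iov_base = &nr, .iov_len = sizeof(nr) };

	/* Request the skip... */
	ptrace(PTRACE_SETREGSET, pid, NT_ARM_SYSTEM_CALL, &iov);

	/* ...and choose what the tracee sees in x0, e.g. -EPERM. */
	iov.iov_base = &regs;
	iov.iov_len = sizeof(regs);
	ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
	regs.regs[0] = fake_ret;
	ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov);
}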
+ */ + sve_user_disable(); +} + +void do_el0_svc(struct pt_regs *regs) +{ + fp_user_discard(); + el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); +} + +#ifdef CONFIG_COMPAT +void do_el0_svc_compat(struct pt_regs *regs) +{ + el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls, + compat_sys_call_table); +} +#endif diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index a4391280fba9..b5855eb7435d 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/time.c * * Copyright (C) 1991, 1992, 1995 Linus Torvalds * Modifications for ARM (C) 1994-2001 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/clockchips.h> @@ -29,38 +18,37 @@ #include <linux/timex.h> #include <linux/errno.h> #include <linux/profile.h> +#include <linux/stacktrace.h> #include <linux/syscore_ops.h> #include <linux/timer.h> #include <linux/irq.h> #include <linux/delay.h> #include <linux/clocksource.h> -#include <linux/clk-provider.h> +#include <linux/of_clk.h> #include <linux/acpi.h> #include <clocksource/arm_arch_timer.h> #include <asm/thread_info.h> -#include <asm/stacktrace.h> +#include <asm/paravirt.h> -unsigned long profile_pc(struct pt_regs *regs) +static bool profile_pc_cb(void *arg, unsigned long pc) { - struct stackframe frame; + unsigned long *prof_pc = arg; - if (!in_lock_functions(regs->pc)) - return regs->pc; + if (in_lock_functions(pc)) + return true; + *prof_pc = pc; + return false; +} - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = -1; /* no task info */ -#endif - do { - int ret = unwind_frame(NULL, &frame); - if (ret < 0) - return 0; - } while (in_lock_functions(frame.pc)); +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long prof_pc = 0; - return frame.pc; + arch_stack_walk(profile_pc_cb, &prof_pc, current, regs); + + return prof_pc; } EXPORT_SYMBOL(profile_pc); @@ -79,4 +67,6 @@ void __init time_init(void) /* Calibrate the delay loop directly */ lpj_fine = arch_timer_rate / HZ; + + pv_time_init(); } diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 8d48b233e6ce..817d788cd866 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -11,303 +11,338 @@ * for more details. 
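profile_pc() above shows the consumer-callback shape of arch_stack_walk(): the callback receives one PC per frame and returns false to stop the walk. A hypothetical kernel-context sketch of another consumer, a bounded trace collector for the current task (not part of this patch):

struct trace_buf {
	unsigned long *entries;
	unsigned int nr, limit;
};

static bool collect_entry(void *cookie, unsigned long pc)
{
	struct trace_buf *buf = cookie;

	buf->entries[buf->nr++] = pc;
	return buf->nr < buf->limit;	/* returning false stops the unwind */
}

static unsigned int collect_current_trace(unsigned long *entries,
					  unsigned int limit)
{
	struct trace_buf buf = { .entries = entries, .limit = limit };

	arch_stack_walk(collect_entry, &buf, current, NULL);
	return buf.nr;
}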
*/ +#include <linux/acpi.h> #include <linux/arch_topology.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> +#include <linux/cacheinfo.h> +#include <linux/cpufreq.h> #include <linux/init.h> #include <linux/percpu.h> -#include <linux/node.h> -#include <linux/nodemask.h> -#include <linux/of.h> -#include <linux/sched.h> -#include <linux/sched/topology.h> -#include <linux/slab.h> -#include <linux/string.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> -static int __init get_cpu_for_node(struct device_node *node) +#ifdef CONFIG_ACPI +static bool __init acpi_cpu_is_threaded(int cpu) { - struct device_node *cpu_node; - int cpu; + int is_threaded = acpi_pptt_cpu_is_thread(cpu); + + /* + * if the PPTT doesn't have thread information, assume a homogeneous + * machine and return the current CPU's thread state. + */ + if (is_threaded < 0) + is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; + + return !!is_threaded; +} - cpu_node = of_parse_phandle(node, "cpu", 0); - if (!cpu_node) - return -1; +/* + * Propagate the topology information of the processor_topology_node tree to the + * cpu_topology array. + */ +int __init parse_acpi_topology(void) +{ + int cpu, topology_id; + + if (acpi_disabled) + return 0; for_each_possible_cpu(cpu) { - if (of_get_cpu_node(cpu, NULL) == cpu_node) { - topology_parse_cpu_capacity(cpu_node, cpu); - of_node_put(cpu_node); - return cpu; + topology_id = find_acpi_cpu_topology(cpu, 0); + if (topology_id < 0) + return topology_id; + + if (acpi_cpu_is_threaded(cpu)) { + cpu_topology[cpu].thread_id = topology_id; + topology_id = find_acpi_cpu_topology(cpu, 1); + cpu_topology[cpu].core_id = topology_id; + } else { + cpu_topology[cpu].thread_id = -1; + cpu_topology[cpu].core_id = topology_id; } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; + topology_id = find_acpi_cpu_topology_package(cpu); + cpu_topology[cpu].package_id = topology_id; } - pr_crit("Unable to find CPU node for %pOF\n", cpu_node); + return 0; +} +#endif + +#ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif + +#undef pr_fmt +#define pr_fmt(fmt) "AMU: " fmt - of_node_put(cpu_node); - return -1; +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale); +static DEFINE_PER_CPU(u64, arch_const_cycles_prev); +static DEFINE_PER_CPU(u64, arch_core_cycles_prev); +static cpumask_var_t amu_fie_cpus; + +void update_freq_counters_refs(void) +{ + this_cpu_write(arch_core_cycles_prev, read_corecnt()); + this_cpu_write(arch_const_cycles_prev, read_constcnt()); } -static int __init parse_core(struct device_node *core, int cluster_id, - int core_id) +static inline bool freq_counters_valid(int cpu) { - char name[10]; - bool leaf = true; - int i = 0; - int cpu; - struct device_node *t; - - do { - snprintf(name, sizeof(name), "thread%d", i); - t = of_get_child_by_name(core, name); - if (t) { - leaf = false; - cpu = get_cpu_for_node(t); - if (cpu >= 0) { - cpu_topology[cpu].cluster_id = cluster_id; - cpu_topology[cpu].core_id = core_id; - cpu_topology[cpu].thread_id = i; - } else { - pr_err("%pOF: Can't get CPU for thread\n", - t); - of_node_put(t); - return -EINVAL; - } - of_node_put(t); - } - i++; - } while (t); - - cpu = get_cpu_for_node(core); - if (cpu >= 0) { - if (!leaf) { - pr_err("%pOF: Core has both threads and CPU\n", - core); - return -EINVAL; - } + 
if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; - cpu_topology[cpu].cluster_id = cluster_id; - cpu_topology[cpu].core_id = core_id; - } else if (leaf) { - pr_err("%pOF: Can't get CPU for leaf core\n", core); - return -EINVAL; + if (!cpu_has_amu_feat(cpu)) { + pr_debug("CPU%d: counters are not supported.\n", cpu); + return false; } - return 0; + if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || + !per_cpu(arch_core_cycles_prev, cpu))) { + pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); + return false; + } + + return true; } -static int __init parse_cluster(struct device_node *cluster, int depth) +static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) { - char name[10]; - bool leaf = true; - bool has_cores = false; - struct device_node *c; - static int cluster_id __initdata; - int core_id = 0; - int i, ret; + u64 ratio; + + if (unlikely(!max_rate || !ref_rate)) { + pr_debug("CPU%d: invalid maximum or reference frequency.\n", + cpu); + return -EINVAL; + } /* - * First check for child clusters; we currently ignore any - * information about the nesting of clusters and present the - * scheduler with a flat list of them. + * Pre-compute the fixed ratio between the frequency of the constant + * reference counter and the maximum frequency of the CPU. + * + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate + * + * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² + * in order to ensure a good resolution for arch_max_freq_scale for + * very low reference frequencies (down to the KHz range which should + * be unlikely). */ - i = 0; - do { - snprintf(name, sizeof(name), "cluster%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - leaf = false; - ret = parse_cluster(c, depth + 1); - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - /* Now check for cores */ - i = 0; - do { - snprintf(name, sizeof(name), "core%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - has_cores = true; - - if (depth == 0) { - pr_err("%pOF: cpu-map children should be clusters\n", - c); - of_node_put(c); - return -EINVAL; - } - - if (leaf) { - ret = parse_core(c, cluster_id, core_id++); - } else { - pr_err("%pOF: Non-leaf cluster with core %s\n", - cluster, name); - ret = -EINVAL; - } - - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - if (leaf && !has_cores) - pr_warn("%pOF: empty cluster\n", cluster); + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); + if (!ratio) { + WARN_ONCE(1, "Reference frequency too low.\n"); + return -EINVAL; + } - if (leaf) - cluster_id++; + per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio; return 0; } -static int __init parse_dt_topology(void) +static void amu_scale_freq_tick(void) { - struct device_node *cn, *map; - int ret = 0; - int cpu; + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; - cn = of_find_node_by_path("/cpus"); - if (!cn) { - pr_err("No CPU information found in DT\n"); - return 0; - } + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); - /* - * When topology is provided cpu-map is essentially a root - * cluster with restricted subnodes. 
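To make the arch_max_freq_scale precomputation above concrete, here is a worked example with assumed rates (a 25 MHz constant reference counter and a 2.5 GHz maximum CPU frequency) and SCHED_CAPACITY_SHIFT = 10 as in current kernels:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
	uint64_t ref_rate = 25000000ULL;	/* constant counter rate (assumed) */
	uint64_t max_rate = 2500000000ULL;	/* CPU maximum frequency (assumed) */
	uint64_t ratio = (ref_rate << (2 * SCHED_CAPACITY_SHIFT)) / max_rate;

	/* 25e6 * 2^20 / 2.5e9 = 10485, i.e. SCHED_CAPACITY_SCALE^2 / 100 */
	printf("arch_max_freq_scale = %llu\n", (unsigned long long)ratio);
	return 0;
}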
- */ - map = of_get_child_by_name(cn, "cpu-map"); - if (!map) - goto out; + update_freq_counters_refs(); - ret = parse_cluster(map, 0); - if (ret != 0) - goto out_map; + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); - topology_normalize_cpu_scale(); + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; /* - * Check that all cores are in the topology; the SMP code will - * only mark cores described in the DT as possible. + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. */ - for_each_possible_cpu(cpu) - if (cpu_topology[cpu].cluster_id == -1) - ret = -EINVAL; - -out_map: - of_node_put(map); -out: - of_node_put(cn); - return ret; -} + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); -/* - * cpu topology table - */ -struct cpu_topology cpu_topology[NR_CPUS]; -EXPORT_SYMBOL_GPL(cpu_topology); - -const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_sibling; + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); } -static void update_siblings_masks(unsigned int cpuid) +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; + +static void amu_fie_setup(const struct cpumask *cpus) { - struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; int cpu; - /* update core and thread sibling masks */ - for_each_possible_cpu(cpu) { - cpu_topo = &cpu_topology[cpu]; + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - if (cpuid_topo->cluster_id != cpu_topo->cluster_id) - continue; + for_each_cpu(cpu, cpus) { + if (!freq_counters_valid(cpu) || + freq_inv_set_max_ratio(cpu, + cpufreq_get_hw_max_freq(cpu) * 1000ULL, + arch_timer_get_rate())) + return; + } - cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - if (cpuid_topo->core_id != cpu_topo->core_id) - continue; + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); - } + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); } -void store_cpu_topology(unsigned int cpuid) +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) { - struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; - u64 mpidr; + struct cpufreq_policy *policy = data; - if (cpuid_topo->cluster_id != -1) - goto topology_populated; + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); - mpidr = read_cpuid_mpidr(); + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. 
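And the per-tick computation in amu_scale_freq_tick() above, continued with assumed counter deltas for a CPU that ran at half its maximum frequency over one tick; the result lands close to SCHED_CAPACITY_SCALE / 2:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1ULL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t arch_max_freq_scale = 10485;	/* from the previous example */
	uint64_t d_core  = 5000000;		/* core cycles this tick (assumed) */
	uint64_t d_const = 100000;		/* const-rate cycles this tick (assumed) */
	uint64_t scale;

	scale = d_core * arch_max_freq_scale;
	scale = (scale >> SCHED_CAPACITY_SHIFT) / d_const;
	if (scale > SCHED_CAPACITY_SCALE)
		scale = SCHED_CAPACITY_SCALE;

	/* Prints roughly SCHED_CAPACITY_SCALE / 2 for a half-speed CPU. */
	printf("arch_freq_scale = %llu\n", (unsigned long long)scale);
	return 0;
}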
The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. + */ - /* Uniprocessor systems can rely on default topology values */ - if (mpidr & MPIDR_UP_BITMASK) - return; + return 0; +} - /* Create cpu topology mapping based on MPIDR. */ - if (mpidr & MPIDR_MT_BITMASK) { - /* Multiprocessor system : Multi-threads per core */ - cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8; - } else { - /* Multiprocessor system : Single-thread per core */ - cpuid_topo->thread_id = -1; - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) | - MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16; - } +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; - pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n", - cpuid, cpuid_topo->cluster_id, cpuid_topo->core_id, - cpuid_topo->thread_id, mpidr); + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); -topology_populated: - update_siblings_masks(cpuid); + return ret; } +core_initcall(init_amu_fie); -static void __init reset_cpu_topology(void) +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> + +static void cpu_read_corecnt(void *val) { - unsigned int cpu; + /* + * A value of 0 can be returned if the current CPU does not support AMUs + * or if the counter is disabled for this CPU. A return value of 0 at + * counter read is properly handled as an error case by the users of the + * counter. + */ + *(u64 *)val = read_corecnt(); +} - for_each_possible_cpu(cpu) { - struct cpu_topology *cpu_topo = &cpu_topology[cpu]; +static void cpu_read_constcnt(void *val) +{ + /* + * Return 0 if the current CPU is affected by erratum 2457168. A value + * of 0 is also returned if the current CPU does not support AMUs or if + * the counter is disabled. A return value of 0 at counter read is + * properly handled as an error case by the users of the counter. + */ + *(u64 *)val = this_cpu_has_cap(ARM64_WORKAROUND_2457168) ? + 0UL : read_constcnt(); +} + +static inline +int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) +{ + /* + * Abort call on counterless CPU or when interrupts are + * disabled - can lead to deadlock in smp sync call. + */ + if (!cpu_has_amu_feat(cpu)) + return -EOPNOTSUPP; - cpu_topo->thread_id = -1; - cpu_topo->core_id = 0; - cpu_topo->cluster_id = -1; + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; - cpumask_clear(&cpu_topo->core_sibling); - cpumask_set_cpu(cpu, &cpu_topo->core_sibling); - cpumask_clear(&cpu_topo->thread_sibling); - cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); - } + smp_call_function_single(cpu, func, val, 1); + + return 0; } -void __init init_cpu_topology(void) +/* + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. 
+ */ +bool cpc_ffh_supported(void) { - reset_cpu_topology(); + int cpu = get_cpu_with_amu_feat(); /* - * Discard anything that was parsed if we hit an error so we - * don't use partial information. + * FFH is considered supported if there is at least one present CPU that + * supports AMUs. Using FFH to read core and reference counters for CPUs + * that do not support AMUs, have counters disabled or that are affected + * by errata, will result in a return value of 0. + * + * This is done to allow any enabled and valid counters to be read + * through FFH, knowing that potentially returning 0 as counter value is + * properly handled by the users of these counters. */ - if (of_have_populated_dt() && parse_dt_topology()) - reset_cpu_topology(); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; + + return true; +} + +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + int ret = -EOPNOTSUPP; + + switch ((u64)reg->address) { + case 0x0: + ret = counters_read_on_cpu(cpu, cpu_read_corecnt, val); + break; + case 0x1: + ret = counters_read_on_cpu(cpu, cpu_read_constcnt, val); + break; + } + + if (!ret) { + *val &= GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + *val >>= reg->bit_offset; + } + + return ret; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -EOPNOTSUPP; } +#endif /* CONFIG_ACPI_CPPC_LIB */ diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h index ae1dd598ea65..6c40f58b844a 100644 --- a/arch/arm64/kernel/trace-events-emulation.h +++ b/arch/arm64/kernel/trace-events-emulation.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM emulation diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 5ea4b85aee0e..23d281ed7621 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1,26 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/traps.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
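The tail of cpc_read_ffh() above extracts the field described by the CPC register's bit_offset/bit_width before returning it. A standalone sketch of that masking, with GENMASK_ULL open-coded so the example builds in userspace:

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

static uint64_t extract_field(uint64_t val, unsigned int bit_offset,
			      unsigned int bit_width)
{
	val &= GENMASK_ULL(bit_offset + bit_width - 1, bit_offset);
	return val >> bit_offset;
}

int main(void)
{
	/* e.g. a 32-bit field starting at bit 0 of a raw 64-bit counter value */
	printf("%#llx\n",
	       (unsigned long long)extract_field(0x123456789ULL, 0, 32));
	return 0;
}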
*/ #include <linux/bug.h> +#include <linux/context_tracking.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/kallsyms.h> +#include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/hardirq.h> @@ -35,195 +25,168 @@ #include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> +#include <linux/kasan.h> +#include <linux/cfi.h> #include <asm/atomic.h> #include <asm/bug.h> +#include <asm/cpufeature.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/esr.h> +#include <asm/exception.h> +#include <asm/extable.h> #include <asm/insn.h> +#include <asm/kprobes.h> +#include <asm/patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> -#include <asm/exception.h> #include <asm/system_misc.h> #include <asm/sysreg.h> -static const char *handler[]= { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" -}; - -int show_unhandled_signals = 1; +static bool __kprobes __check_eq(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) != 0; +} -/* - * Dump out the contents of some kernel memory nicely... - */ -static void dump_mem(const char *lvl, const char *str, unsigned long bottom, - unsigned long top) +static bool __kprobes __check_ne(unsigned long pstate) { - unsigned long first; - mm_segment_t fs; - int i; + return (pstate & PSR_Z_BIT) == 0; +} - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. - */ - fs = get_fs(); - set_fs(KERNEL_DS); +static bool __kprobes __check_cs(unsigned long pstate) +{ + return (pstate & PSR_C_BIT) != 0; +} - printk("%s%s(0x%016lx to 0x%016lx)\n", lvl, str, bottom, top); +static bool __kprobes __check_cc(unsigned long pstate) +{ + return (pstate & PSR_C_BIT) == 0; +} - for (first = bottom & ~31; first < top; first += 32) { - unsigned long p; - char str[sizeof(" 12345678") * 8 + 1]; +static bool __kprobes __check_mi(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) != 0; +} - memset(str, ' ', sizeof(str)); - str[sizeof(str) - 1] = '\0'; +static bool __kprobes __check_pl(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) == 0; +} - for (p = first, i = 0; i < (32 / 8) - && p < top; i++, p += 8) { - if (p >= bottom && p < top) { - unsigned long val; +static bool __kprobes __check_vs(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) != 0; +} - if (__get_user(val, (unsigned long *)p) == 0) - sprintf(str + i * 17, " %016lx", val); - else - sprintf(str + i * 17, " ????????????????"); - } - } - printk("%s%04lx:%s\n", lvl, first & 0xffff, str); - } +static bool __kprobes __check_vc(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) == 0; +} - set_fs(fs); +static bool __kprobes __check_hi(unsigned long pstate) +{ + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) != 0; } -static void dump_backtrace_entry(unsigned long where) +static bool __kprobes __check_ls(unsigned long pstate) { - /* - * Note that 'where' can have a physical address, but it's not handled. 
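The __check_hi()/__check_ls() helpers above rely on PSR_Z_BIT sitting one position above PSR_C_BIT, so "pstate &= ~(pstate >> 1)" folds Z into C and the surviving C bit encodes C && !Z. A quick exhaustive check of that identity, using the NZCV bit positions:

#include <stdio.h>

#define PSR_C_BIT (1UL << 29)
#define PSR_Z_BIT (1UL << 30)
#define PSR_N_BIT (1UL << 31)

int main(void)
{
	for (unsigned int n = 0; n < 2; n++)
	for (unsigned int z = 0; z < 2; z++)
	for (unsigned int c = 0; c < 2; c++) {
		unsigned long pstate = (n ? PSR_N_BIT : 0) |
				       (z ? PSR_Z_BIT : 0) |
				       (c ? PSR_C_BIT : 0);
		unsigned long p = pstate & ~(pstate >> 1);
		int hi_trick = (p & PSR_C_BIT) != 0;
		int hi_ref = c && !z;		/* architectural HI condition */

		if (hi_trick != hi_ref)
			printf("mismatch at N=%u Z=%u C=%u\n", n, z, c);
	}
	printf("check complete\n");
	return 0;
}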
- */ - print_ip_sym(where); + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) == 0; } -static void __dump_instr(const char *lvl, struct pt_regs *regs) +static bool __kprobes __check_ge(unsigned long pstate) { - unsigned long addr = instruction_pointer(regs); - char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; - int i; + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) == 0; +} - for (i = -4; i < 1; i++) { - unsigned int val, bad; +static bool __kprobes __check_lt(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) != 0; +} - bad = __get_user(val, &((u32 *)addr)[i]); +static bool __kprobes __check_gt(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); - if (!bad) - p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); - else { - p += sprintf(p, "bad PC value"); - break; - } - } - printk("%sCode: %s\n", lvl, str); + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) == 0; } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static bool __kprobes __check_le(unsigned long pstate) { - if (!user_mode(regs)) { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - __dump_instr(lvl, regs); - set_fs(fs); - } else { - __dump_instr(lvl, regs); - } + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); + + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) != 0; } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static bool __kprobes __check_al(unsigned long pstate) { - struct stackframe frame; - int skip; + return true; +} + +/* + * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that + * it behaves identically to 0b1110 ("al"). + */ +pstate_check_t * const aarch32_opcode_cond_checks[16] = { + __check_eq, __check_ne, __check_cs, __check_cc, + __check_mi, __check_pl, __check_vs, __check_vc, + __check_hi, __check_ls, __check_ge, __check_lt, + __check_gt, __check_le, __check_al, __check_al +}; - pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); +int show_unhandled_signals = 0; - if (!tsk) - tsk = current; +static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) +{ + unsigned long addr = instruction_pointer(regs); + char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; + int i; - if (!try_get_task_stack(tsk)) + if (user_mode(regs)) return; - if (tsk == current) { - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)dump_backtrace; - } else { - /* - * task blocked in __switch_to - */ - frame.fp = thread_saved_fp(tsk); - frame.pc = thread_saved_pc(tsk); - } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = tsk->curr_ret_stack; -#endif + for (i = -4; i < 1; i++) { + unsigned int val, bad; - skip = !!regs; - printk("Call trace:\n"); - while (1) { - unsigned long stack; - int ret; - - /* skip until specified stack frame */ - if (!skip) { - dump_backtrace_entry(frame.pc); - } else if (frame.fp == regs->regs[29]) { - skip = 0; - /* - * Mostly, this is the case where this function is - * called in panic/abort. As exception handler's - * stack frame does not contain the corresponding pc - * at which an exception has taken place, use regs->pc - * instead. 
- */ - dump_backtrace_entry(regs->pc); - } - ret = unwind_frame(tsk, &frame); - if (ret < 0) - break; - if (in_entry_text(frame.pc)) { - stack = frame.fp - offsetof(struct pt_regs, stackframe); + bad = aarch64_insn_read(&((u32 *)addr)[i], &val); - if (on_accessible_stack(tsk, stack)) - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (!bad) + p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); + else { + p += sprintf(p, "bad PC value"); + break; } } - put_task_stack(tsk); -} - -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - dump_backtrace(NULL, tsk); - barrier(); + printk("%sCode: %s\n", lvl, str); } #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" +#elif defined(CONFIG_PREEMPT_RT) +#define S_PREEMPT " PREEMPT_RT" #else #define S_PREEMPT "" #endif + #define S_SMP " SMP" -static int __die(const char *str, int err, struct pt_regs *regs) +static int __die(const char *str, long err, struct pt_regs *regs) { - struct task_struct *tsk = current; static int die_counter; int ret; - pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -232,15 +195,9 @@ static int __die(const char *str, int err, struct pt_regs *regs) return ret; print_modules(); - __show_regs(regs); - pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), - end_of_stack(tsk)); + show_regs(regs); - if (!user_mode(regs)) { - dump_backtrace(regs, tsk); - dump_instr(KERN_EMERG, regs); - } + dump_kernel_instr(KERN_EMERG, regs); return ret; } @@ -250,7 +207,7 @@ static DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
*/ -void die(const char *str, struct pt_regs *regs, int err) +void die(const char *str, struct pt_regs *regs, long err) { int ret; unsigned long flags; @@ -271,28 +228,151 @@ void die(const char *str, struct pt_regs *regs, int err) oops_exit(); if (in_interrupt()) - panic("Fatal exception in interrupt"); + panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) - panic("Fatal exception"); + panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); +} + +static void arm64_show_signal(int signo, const char *str) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct task_struct *tsk = current; + unsigned long esr = tsk->thread.fault_code; + struct pt_regs *regs = task_pt_regs(tsk); + + /* Leave if the signal won't be shown */ + if (!show_unhandled_signals || + !unhandled_signal(tsk, signo) || + !__ratelimit(&rs)) + return; + + pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); + if (esr) + pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); + + pr_cont("%s", str); + print_vma_addr(KERN_CONT " in ", regs->pc); + pr_cont("\n"); + __show_regs(regs); +} + +void arm64_force_sig_fault(int signo, int code, unsigned long far, + const char *str) +{ + arm64_show_signal(signo, str); + if (signo == SIGKILL) + force_sig(SIGKILL); + else + force_sig_fault(signo, code, (void __user *)far); +} + +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, + const char *str) +{ + arm64_show_signal(SIGBUS, str); + force_sig_mceerr(code, (void __user *)far, lsb); +} + +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, + const char *str) +{ + arm64_show_signal(SIGTRAP, str); + force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, - struct siginfo *info, int err) + int signo, int sicode, unsigned long far, + unsigned long err) { if (user_mode(regs)) { + WARN_ON(regs != current_pt_regs()); current->thread.fault_address = 0; current->thread.fault_code = err; - force_sig_info(info->si_signo, info, current); + + arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } } +#ifdef CONFIG_COMPAT +#define PSTATE_IT_1_0_SHIFT 25 +#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) +#define PSTATE_IT_7_2_SHIFT 10 +#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) + +static u32 compat_get_it_state(struct pt_regs *regs) +{ + u32 it, pstate = regs->pstate; + + it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; + it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; + + return it; +} + +static void compat_set_it_state(struct pt_regs *regs, u32 it) +{ + u32 pstate_it; + + pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; + pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; + + regs->pstate &= ~PSR_AA32_IT_MASK; + regs->pstate |= pstate_it; +} + +static void advance_itstate(struct pt_regs *regs) +{ + u32 it; + + /* ARM mode */ + if (!(regs->pstate & PSR_AA32_T_BIT) || + !(regs->pstate & PSR_AA32_IT_MASK)) + return; + + it = compat_get_it_state(regs); + + /* + * If this is the last instruction of the block, wipe the IT + * state. Otherwise advance it. 
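compat_get_it_state()/compat_set_it_state() above reassemble the 8-bit ITSTATE from its two PSTATE fields (IT[1:0] at bits 26:25, IT[7:2] at bits 15:10); the advance keeps the base condition in the top three bits and shifts the mask until the block drains. A standalone sketch of that advance over an assumed three-instruction IT block:

#include <stdio.h>

static unsigned int advance(unsigned int it)
{
	if (!(it & 7))			/* last instruction: wipe the state */
		return 0;
	return (it & 0xe0) | ((it << 1) & 0x1f);
}

int main(void)
{
	unsigned int it = 0x12;		/* assumed: 3-insn IT block, firstcond NE */
	int step = 0;

	while (it) {
		printf("step %d: ITSTATE = 0x%02x\n", step++, it);
		it = advance(it);
	}
	printf("block drained after %d conditional instructions\n", step);
	return 0;
}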
+ */ + if (!(it & 7)) + it = 0; + else + it = (it & 0xe0) | ((it << 1) & 0x1f); + + compat_set_it_state(regs, it); +} +#else +static void advance_itstate(struct pt_regs *regs) +{ +} +#endif + +void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) +{ + regs->pc += size; + + /* + * If we were single stepping, we want to get the step exception after + * we return from the trap. + */ + if (user_mode(regs)) + user_fastforward_single_step(current); + + if (compat_user_mode(regs)) + advance_itstate(regs); + else + regs->pstate &= ~PSR_BTYPE_MASK; +} + static LIST_HEAD(undef_hook); static DEFINE_RAW_SPINLOCK(undef_lock); @@ -320,12 +400,14 @@ static int call_undef_hook(struct pt_regs *regs) unsigned long flags; u32 instr; int (*fn)(struct pt_regs *regs, u32 instr) = NULL; - void __user *pc = (void __user *)instruction_pointer(regs); - - if (!user_mode(regs)) - return 1; + unsigned long pc = instruction_pointer(regs); - if (compat_thumb_mode(regs)) { + if (!user_mode(regs)) { + __le32 instr_le; + if (get_kernel_nofault(instr_le, (__le32 *)pc)) + goto exit; + instr = le32_to_cpu(instr_le); + } else if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ __le16 instr_le; if (get_user(instr_le, (__le16 __user *)pc)) @@ -358,12 +440,13 @@ exit: return fn ? fn(regs, instr) : 1; } -static void force_signal_inject(int signal, int code, struct pt_regs *regs, - unsigned long address) +void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { - siginfo_t info; - void __user *pc = (void __user *)instruction_pointer(regs); const char *desc; + struct pt_regs *regs = current_pt_regs(); + + if (WARN_ON(!user_mode(regs))) + return; switch (signal) { case SIGILL: @@ -373,43 +456,37 @@ static void force_signal_inject(int signal, int code, struct pt_regs *regs, desc = "illegal memory access"; break; default: - desc = "bad mode"; + desc = "unknown or unrecoverable error"; break; } - if (unhandled_signal(current, signal) && - show_unhandled_signals_ratelimited()) { - pr_info("%s[%d]: %s: pc=%p\n", - current->comm, task_pid_nr(current), desc, pc); - dump_instr(KERN_INFO, regs); + /* Force signals we don't understand to SIGKILL */ + if (WARN_ON(signal != SIGKILL && + siginfo_layout(signal, code) != SIL_FAULT)) { + signal = SIGKILL; } - info.si_signo = signal; - info.si_errno = 0; - info.si_code = code; - info.si_addr = pc; - - arm64_notify_die(desc, regs, &info, 0); + arm64_notify_die(desc, regs, signal, code, address, err); } /* * Set up process info to signal segmentation fault - called on access error. 
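call_undef_hook() above walks a list of registered hooks, each matching on the faulting instruction's encoding and on PSTATE, and treats a return value of 0 as "handled". A hypothetical kernel-context sketch of such a hook (placeholder encoding, field names as in asm/traps.h; not part of this patch):

static int emulate_my_insn(struct pt_regs *regs, u32 instr)
{
	/* Pretend the instruction was emulated, then step past it. */
	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
	return 0;
}

static struct undef_hook my_insn_hook = {
	.instr_mask	= 0xffff0000,		/* placeholder encoding */
	.instr_val	= 0x12340000,		/* placeholder encoding */
	.pstate_mask	= PSR_MODE_MASK,
	.pstate_val	= PSR_MODE_EL0t,	/* user mode only */
	.fn		= emulate_my_insn,
};

/* Registered from an initcall: register_undef_hook(&my_insn_hook); */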
*/ -void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr) +void arm64_notify_segfault(unsigned long addr) { int code; - down_read(¤t->mm->mmap_sem); - if (find_vma(current->mm, addr) == NULL) + mmap_read_lock(current->mm); + if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); - force_signal_inject(SIGSEGV, code, regs, addr); + force_signal_inject(SIGSEGV, code, addr, 0); } -asmlinkage void __exception do_undefinstr(struct pt_regs *regs) +void do_undefinstr(struct pt_regs *regs, unsigned long esr) { /* check for AArch32 breakpoint instructions */ if (!aarch32_break_handler(regs)) @@ -418,17 +495,41 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) if (call_undef_hook(regs) == 0) return; - force_signal_inject(SIGILL, ILL_ILLOPC, regs, 0); + if (!user_mode(regs)) + die("Oops - Undefined instruction", regs, esr); + + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} +NOKPROBE_SYMBOL(do_undefinstr); + +void do_el0_bti(struct pt_regs *regs) +{ + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } -int cpu_enable_cache_maint_trap(void *__unused) +void do_el1_bti(struct pt_regs *regs, unsigned long esr) { - config_sctlr_el1(SCTLR_EL1_UCI, 0); - return 0; + die("Oops - BTI", regs, esr); } +NOKPROBE_SYMBOL(do_el1_bti); + +void do_el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); +} + +void do_el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + /* + * Unexpected FPAC exception in the kernel: kill the task before it + * does any more harm. + */ + die("Oops - FPAC", regs, esr); +} +NOKPROBE_SYMBOL(do_el1_fpac) #define __user_cache_maint(insn, address, res) \ - if (address >= user_addr_max()) { \ + if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ @@ -436,25 +537,21 @@ int cpu_enable_cache_maint_trap(void *__unused) "1: " insn ", %1\n" \ " mov %w0, #0\n" \ "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %w0, %w2\n" \ - " b 2b\n" \ - " .popsection\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \ : "=r" (res) \ - : "r" (address), "i" (-EFAULT)); \ + : "r" (address)); \ uaccess_ttbr0_disable(); \ } -static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) +static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { - unsigned long address; - int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + unsigned long tagged_address, address; + int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = untagged_addr(pt_regs_read_reg(regs, rt)); + tagged_address = pt_regs_read_reg(regs, rt); + address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -463,6 +560,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ + __user_cache_maint("sys 3, c7, c13, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; @@ -473,49 +573,74 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) __user_cache_maint("ic ivau", 
address, ret); break; default: - force_signal_inject(SIGILL, ILL_ILLOPC, regs, 0); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } if (ret) - arm64_notify_segfault(regs, address); + arm64_notify_segfault(tagged_address); else - regs->pc += 4; + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) +static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); + if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + /* Hide DIC so that we can trap the unnecessary maintenance...*/ + val &= ~BIT(CTR_EL0_DIC_SHIFT); + + /* ... and fake IminLine to reduce the number of traps. */ + val &= ~CTR_EL0_IminLine_MASK; + val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; + } + pt_regs_write_reg(regs, rt, val); - regs->pc += 4; + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_counter_get_cntvct()); - regs->pc += 4; + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + int rt = ESR_ELx_SYS64_ISS_RT(esr); pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - regs->pc += 4; + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} + +static void mrs_handler(unsigned long esr, struct pt_regs *regs) +{ + u32 sysreg, rt; + + rt = ESR_ELx_SYS64_ISS_RT(esr); + sysreg = esr_sys64_to_sysreg(esr); + + if (do_emulate_mrs(regs, sysreg, rt) != 0) + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} + +static void wfi_handler(unsigned long esr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { - unsigned int esr_mask; - unsigned int esr_val; - void (*handler)(unsigned int esr, struct pt_regs *regs); + unsigned long esr_mask; + unsigned long esr_val; + void (*handler)(unsigned long esr, struct pt_regs *regs); }; -static struct sys64_hook sys64_hooks[] = { +static const struct sys64_hook sys64_hooks[] = { { .esr_mask = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL, @@ -534,55 +659,154 @@ static struct sys64_hook sys64_hooks[] = { .handler = cntvct_read_handler, }, { + /* Trap read access to CNTVCTSS_EL0 */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS, + .handler = cntvct_read_handler, + }, + { /* Trap read access to CNTFRQ_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, .handler = cntfrq_read_handler, }, + { + /* Trap read access to CPUID registers */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL, + .handler = mrs_handler, + }, + { + /* Trap WFI instructions executed in userspace */ + .esr_mask = ESR_ELx_WFx_MASK, + .esr_val = ESR_ELx_WFx_WFI_VAL, + .handler = wfi_handler, + }, {}, }; -asmlinkage void 
__exception do_sysinstr(unsigned int esr, struct pt_regs *regs) +#ifdef CONFIG_COMPAT +static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { - struct sys64_hook *hook; + int cond; - for (hook = sys64_hooks; hook->handler; hook++) + /* Only a T32 instruction can trap without CV being set */ + if (!(esr & ESR_ELx_CV)) { + u32 it; + + it = compat_get_it_state(regs); + if (!it) + return true; + + cond = it >> 4; + } else { + cond = (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; + } + + return aarch32_opcode_cond_checks[cond](regs->pstate); +} + +static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) +{ + int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; + + pt_regs_write_reg(regs, reg, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, 4); +} + +static const struct sys64_hook cp15_32_hooks[] = { + { + .esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK, + .esr_val = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ, + .handler = compat_cntfrq_read_handler, + }, + {}, +}; + +static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) +{ + int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; + int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; + u64 val = arch_timer_read_counter(); + + pt_regs_write_reg(regs, rt, lower_32_bits(val)); + pt_regs_write_reg(regs, rt2, upper_32_bits(val)); + arm64_skip_faulting_instruction(regs, 4); +} + +static const struct sys64_hook cp15_64_hooks[] = { + { + .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, + .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT, + .handler = compat_cntvct_read_handler, + }, + { + .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, + .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS, + .handler = compat_cntvct_read_handler, + }, + {}, +}; + +void do_cp15instr(unsigned long esr, struct pt_regs *regs) +{ + const struct sys64_hook *hook, *hook_base; + + if (!cp15_cond_valid(esr, regs)) { + /* + * There is no T16 variant of a CP access, so we + * always advance PC by 4 bytes. + */ + arm64_skip_faulting_instruction(regs, 4); + return; + } + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_CP15_32: + hook_base = cp15_32_hooks; + break; + case ESR_ELx_EC_CP15_64: + hook_base = cp15_64_hooks; + break; + default: + do_undefinstr(regs, esr); + return; + } + + for (hook = hook_base; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { hook->handler(esr, regs); return; } /* - * New SYS instructions may previously have been undefined at EL0. Fall - * back to our usual undefined instruction handler so that we handle - * these consistently. + * New cp15 instructions may previously have been undefined at + * EL0. Fall back to our usual undefined instruction handler + * so that we handle these consistently. 
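The sys64_hooks and cp15_*_hooks tables above all share one dispatch scheme: scan the table until (esr & esr_mask) == esr_val, and fall back to the undefined-instruction path when nothing matches. A minimal stand-alone C sketch of that scheme follows; the struct layout, the mask/value pairs and the names are invented for illustration and are not the real ESR_ELx_* encodings.

#include <stdio.h>

struct hook {
	unsigned long esr_mask;
	unsigned long esr_val;
	const char *name;		/* stands in for the handler callback */
};

/* Invented entries: an entry matches when (esr & esr_mask) == esr_val. */
static const struct hook hooks[] = {
	{ .esr_mask = 0xffff0000UL, .esr_val = 0x12340000UL, .name = "hook A" },
	{ .esr_mask = 0xfff000ffUL, .esr_val = 0x56700042UL, .name = "hook B" },
	{},				/* sentinel, like the terminating {} above */
};

static const char *dispatch(unsigned long esr)
{
	const struct hook *hook;

	for (hook = hooks; hook->name; hook++)
		if ((esr & hook->esr_mask) == hook->esr_val)
			return hook->name;

	return "no match: fall back to the undefined-instruction path";
}

int main(void)
{
	printf("%s\n", dispatch(0x1234abcdUL));	/* matches hook A */
	printf("%s\n", dispatch(0x99999999UL));	/* falls through */
	return 0;
}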
*/ - do_undefinstr(regs); + do_undefinstr(regs, esr); } +NOKPROBE_SYMBOL(do_cp15instr); +#endif -long compat_arm_syscall(struct pt_regs *regs); - -asmlinkage long do_ni_syscall(struct pt_regs *regs) +void do_sysinstr(unsigned long esr, struct pt_regs *regs) { -#ifdef CONFIG_COMPAT - long ret; - if (is_compat_task()) { - ret = compat_arm_syscall(regs); - if (ret != -ENOSYS) - return ret; - } -#endif + const struct sys64_hook *hook; - if (show_unhandled_signals_ratelimited()) { - pr_info("%s[%d]: syscall %d\n", current->comm, - task_pid_nr(current), regs->syscallno); - dump_instr("", regs); - if (user_mode(regs)) - __show_regs(regs); - } + for (hook = sys64_hooks; hook->handler; hook++) + if ((hook->esr_mask & esr) == hook->esr_val) { + hook->handler(esr, regs); + return; + } - return sys_ni_syscall(); + /* + * New SYS instructions may previously have been undefined at EL0. Fall + * back to our usual undefined instruction handler so that we handle + * these consistently. + */ + do_undefinstr(regs, esr); } +NOKPROBE_SYMBOL(do_sysinstr); static const char *esr_class_str[] = { [0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC", @@ -594,7 +818,9 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC", [ESR_ELx_EC_FP_ASIMD] = "ASIMD", [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", + [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", @@ -603,6 +829,10 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_HVC64] = "HVC (AArch64)", [ESR_ELx_EC_SMC64] = "SMC (AArch64)", [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", + [ESR_ELx_EC_SVE] = "SVE", + [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", + [ESR_ELx_EC_FPAC] = "FPAC", + [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", @@ -624,51 +854,24 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; -const char *esr_get_class_string(u32 esr) +const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } /* - * bad_mode handles the impossible case in the exception vector. This is always - * fatal. - */ -asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) -{ - console_verbose(); - - pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", - handler[reason], smp_processor_id(), esr, - esr_get_class_string(esr)); - - die("Oops - bad mode", regs, 0); - local_irq_disable(); - panic("bad mode"); -} - -/* * bad_el0_sync handles unexpected, but potentially recoverable synchronous - * exceptions taken from EL0. Unlike bad_mode, this returns. + * exceptions taken from EL0. 
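The esr_class_str[] table above is indexed by the exception class (EC) field of the ESR, which the architecture keeps in bits [31:26]. A small user-space sketch of that lookup; the ESR_EC* macros here are local stand-ins for the kernel's ESR_ELx_EC() helper, and only two of the class names are filled in.

#include <stdio.h>

#define ESR_EC_SHIFT	26
#define ESR_EC_MASK	0x3fUL
#define ESR_EC(esr)	(((esr) >> ESR_EC_SHIFT) & ESR_EC_MASK)

/* Same idea as esr_class_str[]; the range initializer is a GNU C extension. */
static const char *const class_str[ESR_EC_MASK + 1] = {
	[0 ... ESR_EC_MASK] = "UNRECOGNIZED EC",
	[0x15] = "SVC (AArch64)",
	[0x3c] = "BRK (AArch64)",
};

int main(void)
{
	unsigned long esr = 0x56000000UL;	/* ESR produced by "svc #0" from AArch64 */

	printf("EC %#lx -> %s\n", ESR_EC(esr), class_str[ESR_EC(esr)]);
	return 0;
}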
*/ -asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) +void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { - siginfo_t info; - void __user *pc = (void __user *)instruction_pointer(regs); - console_verbose(); - - pr_crit("Bad EL0 synchronous exception detected on CPU%d, code 0x%08x -- %s\n", - smp_processor_id(), esr, esr_get_class_string(esr)); - __show_regs(regs); - - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_ILLOPC; - info.si_addr = pc; + unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; - current->thread.fault_code = 0; + current->thread.fault_code = esr; - force_sig_info(info.si_signo, &info, current); + arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, + "Bad EL0 synchronous exception"); } #ifdef CONFIG_VMAP_STACK @@ -676,24 +879,22 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -asmlinkage void handle_bad_stack(struct pt_regs *regs) +void panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); - unsigned int esr = read_sysreg(esr_el1); - unsigned long far = read_sysreg(far_el1); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); - pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", - irq_stk, irq_stk + THREAD_SIZE); + irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); @@ -708,24 +909,58 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) } #endif -void __pte_error(const char *file, int line, unsigned long val) +void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { - pr_err("%s:%d: bad pte %016lx.\n", file, line, val); -} + console_verbose(); -void __pmd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); + pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", + smp_processor_id(), esr, esr_get_class_string(esr)); + if (regs) + __show_regs(regs); + + nmi_panic(regs, "Asynchronous SError Interrupt"); + + cpu_park_loop(); + unreachable(); } -void __pud_error(const char *file, int line, unsigned long val) +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { - pr_err("%s:%d: bad pud %016lx.\n", file, line, val); + unsigned long aet = arm64_ras_serror_get_severity(esr); + + switch (aet) { + case ESR_ELx_AET_CE: /* corrected error */ + case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ + /* + * The CPU can make progress. We may take UEO again as + * a more severe error. + */ + return false; + + case ESR_ELx_AET_UEU: /* Uncorrected Unrecoverable */ + case ESR_ELx_AET_UER: /* Uncorrected Recoverable */ + /* + * The CPU can't make progress. The exception may have + * been imprecise. + * + * Neoverse-N1 #1349291 means a non-KVM SError reported as + * Unrecoverable should be treated as Uncontainable. We + * call arm64_serror_panic() in both cases. 
+ */ + return true; + + case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ + default: + /* Error has been silently propagated */ + arm64_serror_panic(regs, esr); + } } -void __pgd_error(const char *file, int line, unsigned long val) +void do_serror(struct pt_regs *regs, unsigned long esr) { - pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); + /* non-RAS errors are not containable */ + if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) + arm64_serror_panic(regs, esr); } /* GENERIC_BUG traps */ @@ -742,14 +977,11 @@ int is_valid_bugaddr(unsigned long addr) return 1; } -static int bug_handler(struct pt_regs *regs, unsigned int esr) +static int bug_handler(struct pt_regs *regs, unsigned long esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: - die("Oops - BUG", regs, 0); + die("Oops - BUG", regs, esr); break; case BUG_TRAP_TYPE_WARN: @@ -761,28 +993,138 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr) } /* If thread survives, skip over the BUG instruction and continue: */ - regs->pc += AARCH64_INSN_SIZE; /* skip BRK and resume */ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } static struct break_hook bug_break_hook = { - .esr_val = 0xf2000000 | BUG_BRK_IMM, - .esr_mask = 0xffffffff, .fn = bug_handler, + .imm = BUG_BRK_IMM, +}; + +#ifdef CONFIG_CFI_CLANG +static int cfi_handler(struct pt_regs *regs, unsigned long esr) +{ + unsigned long target; + u32 type; + + target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr)); + type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr)); + + switch (report_cfi_failure(regs, regs->pc, &target, type)) { + case BUG_TRAP_TYPE_BUG: + die("Oops - CFI", regs, 0); + break; + + case BUG_TRAP_TYPE_WARN: + break; + + default: + return DBG_HOOK_ERROR; + } + + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + return DBG_HOOK_HANDLED; +} + +static struct break_hook cfi_break_hook = { + .fn = cfi_handler, + .imm = CFI_BRK_IMM_BASE, + .mask = CFI_BRK_IMM_MASK, +}; +#endif /* CONFIG_CFI_CLANG */ + +static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) +{ + pr_err("%s generated an invalid instruction at %pS!\n", + "Kernel text patching", + (void *)instruction_pointer(regs)); + + /* We cannot handle this */ + return DBG_HOOK_ERROR; +} + +static struct break_hook fault_break_hook = { + .fn = reserved_fault_handler, + .imm = FAULT_BRK_IMM, }; +#ifdef CONFIG_KASAN_SW_TAGS + +#define KASAN_ESR_RECOVER 0x20 +#define KASAN_ESR_WRITE 0x10 +#define KASAN_ESR_SIZE_MASK 0x0f +#define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) + +static int kasan_handler(struct pt_regs *regs, unsigned long esr) +{ + bool recover = esr & KASAN_ESR_RECOVER; + bool write = esr & KASAN_ESR_WRITE; + size_t size = KASAN_ESR_SIZE(esr); + u64 addr = regs->regs[0]; + u64 pc = regs->pc; + + kasan_report(addr, size, write, pc); + + /* + * The instrumentation allows to control whether we can proceed after + * a crash was detected. This is done by passing the -recover flag to + * the compiler. Disabling recovery allows to generate more compact + * code. + * + * Unfortunately disabling recovery doesn't work for the kernel right + * now. KASAN reporting is disabled in some contexts (for example when + * the allocator accesses slab object metadata; this is controlled by + * current->kasan_depth). 
All these accesses are detected by the tool, + * even though the reports for them are not printed. + * + * This is something that might be fixed at some point in the future. + */ + if (!recover) + die("Oops - KASAN", regs, esr); + + /* If thread survives, skip over the brk instruction and continue: */ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + return DBG_HOOK_HANDLED; +} + +static struct break_hook kasan_break_hook = { + .fn = kasan_handler, + .imm = KASAN_BRK_IMM, + .mask = KASAN_BRK_MASK, +}; +#endif + + +#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK) + /* * Initial handler for AArch64 BRK exceptions * This handler only used until debug_traps_init(). */ -int __init early_brk64(unsigned long addr, unsigned int esr, +int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs) { +#ifdef CONFIG_CFI_CLANG + if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE) + return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; +#endif +#ifdef CONFIG_KASAN_SW_TAGS + if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; +#endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } -/* This registration must happen early, before debug_traps_init(). */ void __init trap_init(void) { - register_break_hook(&bug_break_hook); + register_kernel_break_hook(&bug_break_hook); +#ifdef CONFIG_CFI_CLANG + register_kernel_break_hook(&cfi_break_hook); +#endif + register_kernel_break_hook(&fault_break_hook); +#ifdef CONFIG_KASAN_SW_TAGS + register_kernel_break_hook(&kasan_break_hook); +#endif + debug_traps_init(); } diff --git a/arch/arm64/kernel/vdso-wrap.S b/arch/arm64/kernel/vdso-wrap.S new file mode 100644 index 000000000000..c4b1990bf2be --- /dev/null +++ b/arch/arm64/kernel/vdso-wrap.S @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + + .globl vdso_start, vdso_end + .section .rodata + .balign PAGE_SIZE +vdso_start: + .incbin "arch/arm64/kernel/vdso/vdso.so" + .balign PAGE_SIZE +vdso_end: + + .previous + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 2d419006ad43..99ae81ab91a7 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * VDSO implementation for AArch64 and vector page setup for AArch32. + * VDSO implementations. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * * Author: Will Deacon <will.deacon@arm.com> */ @@ -29,228 +18,458 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> +#include <linux/time_namespace.h> #include <linux/timekeeper_internal.h> #include <linux/vmalloc.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> #include <asm/cacheflush.h> #include <asm/signal32.h> #include <asm/vdso.h> -#include <asm/vdso_datapage.h> -extern char vdso_start[], vdso_end[]; -static unsigned long vdso_pages __ro_after_init; +enum vdso_abi { + VDSO_ABI_AA64, + VDSO_ABI_AA32, +}; + +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + +struct vdso_abi_info { + const char *name; + const char *vdso_code_start; + const char *vdso_code_end; + unsigned long vdso_pages; + /* Data Mapping */ + struct vm_special_mapping *dm; + /* Code Mapping */ + struct vm_special_mapping *cm; +}; + +static struct vdso_abi_info vdso_info[] __ro_after_init = { + [VDSO_ABI_AA64] = { + .name = "vdso", + .vdso_code_start = vdso_start, + .vdso_code_end = vdso_end, + }, +#ifdef CONFIG_COMPAT_VDSO + [VDSO_ABI_AA32] = { + .name = "vdso32", + .vdso_code_start = vdso32_start, + .vdso_code_end = vdso32_end, + }, +#endif /* CONFIG_COMPAT_VDSO */ +}; /* * The vDSO data page. */ static union { - struct vdso_data data; + struct vdso_data data[CS_BASES]; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +struct vdso_data *vdso_data = vdso_data_store.data; -#ifdef CONFIG_COMPAT -/* - * Create and map the vectors page for AArch32 tasks. - */ -static struct page *vectors_page[1] __ro_after_init; +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} -static int __init alloc_vectors_page(void) +static int __init __vdso_init(enum vdso_abi abi) { - extern char __kuser_helper_start[], __kuser_helper_end[]; - extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; + int i; + struct page **vdso_pagelist; + unsigned long pfn; - int kuser_sz = __kuser_helper_end - __kuser_helper_start; - int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long vpage; + if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { + pr_err("vDSO is not a valid ELF object!\n"); + return -EINVAL; + } - vpage = get_zeroed_page(GFP_ATOMIC); + vdso_info[abi].vdso_pages = ( + vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start) >> + PAGE_SHIFT; - if (!vpage) + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, + sizeof(struct page *), + GFP_KERNEL); + if (vdso_pagelist == NULL) return -ENOMEM; - /* kuser helpers */ - memcpy((void *)vpage + 0x1000 - kuser_sz, __kuser_helper_start, - kuser_sz); + /* Grab the vDSO code pages. 
*/ + pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); - /* sigreturn code */ - memcpy((void *)vpage + AARCH32_KERN_SIGRET_CODE_OFFSET, - __aarch32_sigret_code_start, sigret_sz); + for (i = 0; i < vdso_info[abi].vdso_pages; i++) + vdso_pagelist[i] = pfn_to_page(pfn + i); - flush_icache_range(vpage, vpage + PAGE_SIZE); - vectors_page[0] = virt_to_page(vpage); + vdso_info[abi].cm->pages = vdso_pagelist; return 0; } -arch_initcall(alloc_vectors_page); -int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) { - struct mm_struct *mm = current->mm; - unsigned long addr = AARCH32_VECTORS_BASE; - static const struct vm_special_mapping spec = { - .name = "[vectors]", - .pages = vectors_page, - - }; - void *ret; - - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - current->mm->context.vdso = (void *)addr; - - /* Map vectors page at the high address. */ - ret = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC, - &spec); - - up_write(&mm->mmap_sem); - - return PTR_ERR_OR_ZERO(ret); + return (struct vdso_data *)(vvar_page); } -#endif /* CONFIG_COMPAT */ -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_end - vdso_start; + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - if (vdso_size != new_size) - return -EINVAL; + mmap_read_lock(mm); - current->mm->context.vdso = (void *)new_vma->vm_start; + for_each_vma(vmi, vma) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) + zap_page_range(vma, vma->vm_start, size); +#ifdef CONFIG_COMPAT_VDSO + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) + zap_page_range(vma, vma->vm_start, size); +#endif + } + mmap_read_unlock(mm); return 0; } -static struct vm_special_mapping vdso_spec[2] __ro_after_init = { - { - .name = "[vvar]", - }, - { - .name = "[vdso]", - .mremap = vdso_mremap, - }, -}; +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops. 
+ * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif -static int __init vdso_init(void) +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) { - int i; - struct page **vdso_pagelist; + struct page *timens_page = find_timens_vvar_page(vma); unsigned long pfn; - if (memcmp(vdso_start, "\177ELF", 4)) { - pr_err("vDSO is not a valid ELF object!\n"); - return -EINVAL; + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = sym_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = sym_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; } - vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n", - vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data); - - /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), - GFP_KERNEL); - if (vdso_pagelist == NULL) - return -ENOMEM; - - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); - - - /* Grab the vDSO code pages. */ - pfn = sym_to_pfn(vdso_start); - - for (i = 0; i < vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); - - vdso_spec[0].pages = &vdso_pagelist[0]; - vdso_spec[1].pages = &vdso_pagelist[1]; - - return 0; + return vmf_insert_pfn(vma, vmf->address, pfn); } -arch_initcall(vdso_init); -int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) +static int __setup_additional_pages(enum vdso_abi abi, + struct mm_struct *mm, + struct linux_binprm *bprm, + int uses_interp) { - struct mm_struct *mm = current->mm; unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + unsigned long gp_flags = 0; void *ret; - vdso_text_len = vdso_pages << PAGE_SHIFT; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { ret = ERR_PTR(vdso_base); goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, - &vdso_spec[0]); + + ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_PFNMAP, + vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; - vdso_base += PAGE_SIZE; + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + gp_flags = VM_ARM64_BTI; + + vdso_base += VVAR_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_spec[1]); + vdso_info[abi].cm); if (IS_ERR(ret)) goto 
up_fail; - - up_write(&mm->mmap_sem); return 0; up_fail: mm->context.vdso = NULL; - up_write(&mm->mmap_sem); return PTR_ERR(ret); } +#ifdef CONFIG_COMPAT /* - * Update the vDSO data page to keep in sync with kernel timekeeping. + * Create and map the vectors page for AArch32 tasks. */ -void update_vsyscall(struct timekeeper *tk) +enum aarch32_map { + AA32_MAP_VECTORS, /* kuser helpers */ + AA32_MAP_SIGPAGE, + AA32_MAP_VVAR, + AA32_MAP_VDSO, +}; + +static struct page *aarch32_vectors_page __ro_after_init; +static struct page *aarch32_sig_page __ro_after_init; + +static int aarch32_sigpage_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + current->mm->context.sigpage = (void *)new_vma->vm_start; + + return 0; +} + +static struct vm_special_mapping aarch32_vdso_maps[] = { + [AA32_MAP_VECTORS] = { + .name = "[vectors]", /* ABI */ + .pages = &aarch32_vectors_page, + }, + [AA32_MAP_SIGPAGE] = { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_sig_page, + .mremap = aarch32_sigpage_mremap, + }, + [AA32_MAP_VVAR] = { + .name = "[vvar]", + .fault = vvar_fault, + }, + [AA32_MAP_VDSO] = { + .name = "[vdso]", + .mremap = vdso_mremap, + }, +}; + +static int aarch32_alloc_kuser_vdso_page(void) +{ + extern char __kuser_helper_start[], __kuser_helper_end[]; + int kuser_sz = __kuser_helper_end - __kuser_helper_start; + unsigned long vdso_page; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + vdso_page = get_zeroed_page(GFP_KERNEL); + if (!vdso_page) + return -ENOMEM; + + memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, + kuser_sz); + aarch32_vectors_page = virt_to_page(vdso_page); + return 0; +} + +#define COMPAT_SIGPAGE_POISON_WORD 0xe7fddef1 +static int aarch32_alloc_sigpage(void) +{ + extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; + int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; + __le32 poison = cpu_to_le32(COMPAT_SIGPAGE_POISON_WORD); + void *sigpage; + + sigpage = (void *)__get_free_page(GFP_KERNEL); + if (!sigpage) + return -ENOMEM; + + memset32(sigpage, (__force u32)poison, PAGE_SIZE / sizeof(poison)); + memcpy(sigpage, __aarch32_sigret_code_start, sigret_sz); + aarch32_sig_page = virt_to_page(sigpage); + return 0; +} + +static int __init __aarch32_alloc_vdso_pages(void) +{ + + if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) + return 0; + + vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; + vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; + + return __vdso_init(VDSO_ABI_AA32); +} + +static int __init aarch32_alloc_vdso_pages(void) +{ + int ret; + + ret = __aarch32_alloc_vdso_pages(); + if (ret) + return ret; + + ret = aarch32_alloc_sigpage(); + if (ret) + return ret; + + return aarch32_alloc_kuser_vdso_page(); +} +arch_initcall(aarch32_alloc_vdso_pages); + +static int aarch32_kuser_helpers_setup(struct mm_struct *mm) { - u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct; - - ++vdso_data->tb_seq_count; - smp_wmb(); - - vdso_data->use_syscall = use_syscall; - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift; - vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; - vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; - - if (!use_syscall) { - /* tkr_mono.cycle_last == tkr_raw.cycle_last */ - vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_sec; - vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; - vdso_data->xtime_clock_sec = 
tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mono_mult = tk->tkr_mono.mult; - vdso_data->cs_raw_mult = tk->tkr_raw.mult; - /* tkr_mono.shift == tkr_raw.shift */ - vdso_data->cs_shift = tk->tkr_mono.shift; + void *ret; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + /* + * Avoid VM_MAYWRITE for compatibility with arch/arm/, where it's + * not safe to CoW the page containing the CPU exception vectors. + */ + ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, + VM_READ | VM_EXEC | + VM_MAYREAD | VM_MAYEXEC, + &aarch32_vdso_maps[AA32_MAP_VECTORS]); + + return PTR_ERR_OR_ZERO(ret); +} + +static int aarch32_sigreturn_setup(struct mm_struct *mm) +{ + unsigned long addr; + void *ret; + + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = ERR_PTR(addr); + goto out; + } + + /* + * VM_MAYWRITE is required to allow gdb to Copy-on-Write and + * set breakpoints. + */ + ret = _install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ | VM_EXEC | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC, + &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); + if (IS_ERR(ret)) + goto out; + + mm->context.sigpage = (void *)addr; + +out: + return PTR_ERR_OR_ZERO(ret); +} + +int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + int ret; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = aarch32_kuser_helpers_setup(mm); + if (ret) + goto out; + + if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, + uses_interp); + if (ret) + goto out; } - smp_wmb(); - ++vdso_data->tb_seq_count; + ret = aarch32_sigreturn_setup(mm); +out: + mmap_write_unlock(mm); + return ret; } +#endif /* CONFIG_COMPAT */ + +enum aarch64_map { + AA64_MAP_VVAR, + AA64_MAP_VDSO, +}; + +static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { + [AA64_MAP_VVAR] = { + .name = "[vvar]", + .fault = vvar_fault, + }, + [AA64_MAP_VDSO] = { + .name = "[vdso]", + .mremap = vdso_mremap, + }, +}; -void update_vsyscall_tz(void) +static int __init vdso_init(void) +{ + vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; + + return __vdso_init(VDSO_ABI_AA64); +} +arch_initcall(vdso_init); + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; + struct mm_struct *mm = current->mm; + int ret; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); + + return ret; } diff --git a/arch/arm64/kernel/vdso/.gitignore b/arch/arm64/kernel/vdso/.gitignore index f8b69d84238e..652e31d82582 100644 --- a/arch/arm64/kernel/vdso/.gitignore +++ b/arch/arm64/kernel/vdso/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso.lds diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 62c84f7cb01b..619e2dc7ee14 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Building a vDSO image for AArch64. # @@ -5,33 +6,63 @@ # Heavily based on the vDSO Makefiles for other archs. # -obj-vdso := gettimeofday.o note.o sigreturn.o +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of generic Makefile. 
+ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64 +include $(srctree)/lib/vdso/Makefile + +obj-vdso := vgettimeofday.o note.o sigreturn.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti + +# -Bsymbolic has been added for consistency with arm, the compat vDSO and +# potential future proofing if we end up with internal calls to the exported +# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so +# preparation in build-time C")). +ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ + -Bsymbolic --build-id=sha1 -n $(btildflags-y) + +ifdef CONFIG_LD_ORPHAN_WARN + ldflags-y += --orphan-handling=warn +endif + +ldflags-y += -T + +ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +# -Wmissing-prototypes and -Wmissing-declarations are removed from +# the CFLAGS of vgettimeofday.c to make possible to build the +# kernel with CONFIG_WERROR enabled. +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ + -Wmissing-prototypes -Wmissing-declarations +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +UBSAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y +KCOV_INSTRUMENT := n + +CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables + +ifneq ($(c-gettimeofday-y),) + CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) +endif # Disable gcov profiling for VDSO code GCOV_PROFILE := n -# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared -# down to collect2, resulting in silent corruption of the vDSO image. 
-ccflags-y += -Wl,-shared - -obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (incbin is bad) -$(obj)/vdso.o : $(obj)/vdso.so - # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) - $(call if_changed,vdsold) +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,vdsold_and_vdso_check) # Strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -41,22 +72,14 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # Generate VDSO offsets using helper script gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -endef + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) -# Assembly rules for the .S files -$(obj-vdso): %.o: %.S FORCE - $(call if_changed_dep,vdsoas) - # Actual build commands -quiet_cmd_vdsold = VDSOL $@ - cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@ -quiet_cmd_vdsoas = VDSOA $@ - cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdsold_and_vdso_check = LD $@ + cmd_vdsold_and_vdso_check = $(cmd_ld); $(cmd_vdso_check) # Install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh index 01924ff071ad..0387209d65b1 100755 --- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh +++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0 # # Match symbols in the DSO that look like VDSO_*; produce a header file @@ -7,9 +8,9 @@ # Doing this inside the Makefile will break the $(filter-out) function, # causing Kbuild to rebuild the vdso-offsets header file every time. # -# Author: Will Deacon <will.deacon@arm.com +# Author: Will Deacon <will.deacon@arm.com> # LC_ALL=C sed -n -e 's/^00*/0/' -e \ -'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2\t0x\1/p' +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2 0x\1/p' diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S deleted file mode 100644 index 76320e920965..000000000000 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Userspace implementations of gettimeofday() and friends. - * - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * Author: Will Deacon <will.deacon@arm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - -#define NSEC_PER_SEC_LO16 0xca00 -#define NSEC_PER_SEC_HI16 0x3b9a - -vdso_data .req x6 -seqcnt .req w7 -w_tmp .req w8 -x_tmp .req x8 - -/* - * Conventions for macro arguments: - * - An argument is write-only if its name starts with "res". - * - All other arguments are read-only, unless otherwise specified. 
- */ - - .macro seqcnt_acquire -9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] - tbnz seqcnt, #0, 9999b - dmb ishld - .endm - - .macro seqcnt_check fail - dmb ishld - ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] - cmp w_tmp, seqcnt - b.ne \fail - .endm - - .macro syscall_check fail - ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] - cbnz w_tmp, \fail - .endm - - .macro get_nsec_per_sec res - mov \res, #NSEC_PER_SEC_LO16 - movk \res, #NSEC_PER_SEC_HI16, lsl #16 - .endm - - /* - * Returns the clock delta, in nanoseconds left-shifted by the clock - * shift. - */ - .macro get_clock_shifted_nsec res, cycle_last, mult - /* Read the virtual counter. */ - isb - mrs x_tmp, cntvct_el0 - /* Calculate cycle delta and convert to ns. */ - sub \res, x_tmp, \cycle_last - /* We can only guarantee 56 bits of precision. */ - movn x_tmp, #0xff00, lsl #48 - and \res, x_tmp, \res - mul \res, \res, \mult - .endm - - /* - * Returns in res_{sec,nsec} the REALTIME timespec, based on the - * "wall time" (xtime) and the clock_mono delta. - */ - .macro get_ts_realtime res_sec, res_nsec, \ - clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec - add \res_nsec, \clock_nsec, \xtime_nsec - udiv x_tmp, \res_nsec, \nsec_to_sec - add \res_sec, \xtime_sec, x_tmp - msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec - .endm - - /* - * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, - * used for CLOCK_MONOTONIC_RAW. - */ - .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec - udiv \res_sec, \clock_nsec, \nsec_to_sec - msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec - .endm - - /* sec and nsec are modified in place. */ - .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec - /* Add timespec. */ - add \sec, \sec, \ts_sec - add \nsec, \nsec, \ts_nsec - - /* Normalise the new timespec. */ - cmp \nsec, \nsec_to_sec - b.lt 9999f - sub \nsec, \nsec, \nsec_to_sec - add \sec, \sec, #1 -9999: - cmp \nsec, #0 - b.ge 9998f - add \nsec, \nsec, \nsec_to_sec - sub \sec, \sec, #1 -9998: - .endm - - .macro clock_gettime_return, shift=0 - .if \shift == 1 - lsr x11, x11, x12 - .endif - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret - .endm - - .macro jump_slot jumptable, index, label - .if (. - \jumptable) != 4 * (\index) - .error "Jump slot index mismatch" - .endif - b \label - .endm - - .text - -/* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ -ENTRY(__kernel_gettimeofday) - .cfi_startproc - adr vdso_data, _vdso_data - /* If tv is NULL, skip to the timezone code. */ - cbz x0, 2f - - /* Compute the time of day. */ -1: seqcnt_acquire - syscall_check fail=4f - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=1b - - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - /* Convert ns to us. */ - mov x13, #1000 - lsl x13, x13, x12 - udiv x11, x11, x13 - stp x10, x11, [x0, #TVAL_TV_SEC] -2: - /* If tz is NULL, return 0. */ - cbz x1, 3f - ldp w4, w5, [vdso_data, #VDSO_TZ_MINWEST] - stp w4, w5, [x1, #TZ_MINWEST] -3: - mov x0, xzr - ret -4: - /* Syscall fallback. 
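The seqcnt_acquire/seqcnt_check pair above is a lockless reader of the vDSO data page: spin while the sequence count is odd (the kernel is mid-update), read the fields, then retry if the count has changed. A rough user-space C sketch of the same retry protocol; the structure and field names are invented, and the writer side (which increments the counter before and after each update) is assumed rather than shown.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct timedata {
	atomic_uint seq;		/* odd while an update is in progress */
	_Atomic uint64_t sec;
	_Atomic uint64_t nsec;
};

static void read_time(struct timedata *d, uint64_t *sec, uint64_t *nsec)
{
	unsigned int start;

	do {
		/* seqcnt_acquire: wait for an even (stable) count. */
		do {
			start = atomic_load_explicit(&d->seq, memory_order_acquire);
		} while (start & 1);

		*sec = atomic_load_explicit(&d->sec, memory_order_relaxed);
		*nsec = atomic_load_explicit(&d->nsec, memory_order_relaxed);

		/* seqcnt_check: retry if the writer moved underneath us. */
		atomic_thread_fence(memory_order_acquire);
	} while (atomic_load_explicit(&d->seq, memory_order_relaxed) != start);
}

int main(void)
{
	static struct timedata d = { .sec = 1, .nsec = 500000000 };
	uint64_t sec, nsec;

	read_time(&d, &sec, &nsec);
	printf("%llu.%09llu\n", (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}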
*/ - mov x8, #__NR_gettimeofday - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_gettimeofday) - -#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE - -/* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ -ENTRY(__kernel_clock_gettime) - .cfi_startproc - cmp w0, #JUMPSLOT_MAX - b.hi syscall - adr vdso_data, _vdso_data - adr x_tmp, jumptable - add x_tmp, x_tmp, w0, uxtw #2 - br x_tmp - - ALIGN -jumptable: - jump_slot jumptable, CLOCK_REALTIME, realtime - jump_slot jumptable, CLOCK_MONOTONIC, monotonic - b syscall - b syscall - jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw - jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse - jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse - - .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) - .error "Wrong jumptable size" - .endif - - ALIGN -realtime: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=realtime - - /* All computations are done with left-shifted nsecs. */ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic - - /* All computations are done with left-shifted nsecs. */ - lsl x4, x4, x12 - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic_raw: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_raw_mult, w12 = cs_shift */ - ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] - ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] - seqcnt_check fail=monotonic_raw - - /* All computations are done with left-shifted nsecs. */ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_clock_raw res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -realtime_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - seqcnt_check fail=realtime_coarse - clock_gettime_return - - ALIGN -monotonic_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic_coarse - - /* Computations are done in (non-shifted) nsecs. */ - get_nsec_per_sec res=x9 - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return - - ALIGN -syscall: /* Syscall fallback. 
*/ - mov x8, #__NR_clock_gettime - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_clock_gettime) - -/* int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); */ -ENTRY(__kernel_clock_getres) - .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne - b.ne 1f - - ldr x2, 5f - b 2f -1: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 4f - ldr x2, 6f -2: - cbz w1, 3f - stp xzr, x2, [x1] - -3: /* res == NULL. */ - mov w0, wzr - ret - -4: /* Syscall fallback. */ - mov x8, #__NR_clock_getres - svc #0 - ret -5: - .quad CLOCK_REALTIME_RES -6: - .quad CLOCK_COARSE_RES - .cfi_endproc -ENDPROC(__kernel_clock_getres) diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index b82c85e5d972..3d4e82290c80 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -1,18 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> * * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. @@ -22,7 +11,13 @@ #include <linux/uts.h> #include <linux/version.h> #include <linux/elfnote.h> +#include <linux/build-salt.h> +#include <asm/assembler.h> ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +BUILD_SALT + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S index 20d98effa7dd..0e18729abc3b 100644 --- a/arch/arm64/kernel/vdso/sigreturn.S +++ b/arch/arm64/kernel/vdso/sigreturn.S @@ -1,37 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Sigreturn trampoline for returning from a signal when the SA_RESTORER - * flag is not set. + * flag is not set. It serves primarily as a hall of shame for crappy + * unwinders and features an exciting but mysterious NOP instruction. * - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * It's also fragile as hell, so please think twice before changing anything + * in here. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * Copyright (C) 2012 ARM Limited * * Author: Will Deacon <will.deacon@arm.com> */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unistd.h> .text - nop -ENTRY(__kernel_rt_sigreturn) - .cfi_startproc - .cfi_signal_frame - .cfi_def_cfa x29, 0 - .cfi_offset x29, 0 * 8 - .cfi_offset x30, 1 * 8 +/* + * NOTE!!! 
You may notice that all of the .cfi directives in this file have + * been commented out. This is because they have been shown to trigger segfaults + * in libgcc when unwinding out of a SIGCANCEL handler to invoke pthread + * cleanup handlers during the thread cancellation dance. By omitting the + * directives, we trigger an arm64-specific fallback path in the unwinder which + * recognises the signal frame and restores many of the registers directly from + * the sigcontext. Re-enabling the cfi directives here therefore needs to be + * much more comprehensive to reduce the risk of further regressions. + */ + +/* Ensure that the mysterious NOP can be associated with a function. */ +// .cfi_startproc + +/* + * .cfi_signal_frame causes the corresponding Frame Description Entry (FDE) in + * the .eh_frame section to be annotated as a signal frame. This allows DWARF + * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo() and identify + * the next frame using the unmodified return address instead of subtracting 1, + * which may yield the wrong FDE. + */ +// .cfi_signal_frame + +/* + * Tell the unwinder where to locate the frame record linking back to the + * interrupted context. We don't provide unwind info for registers other than + * the frame pointer and the link register here; in practice, this is likely to + * be insufficient for unwinding in C/C++ based runtimes, especially without a + * means to restore the stack pointer. Thankfully, unwinders and debuggers + * already have baked-in strategies for attempting to unwind out of signals. + */ +// .cfi_def_cfa x29, 0 +// .cfi_offset x29, 0 * 8 +// .cfi_offset x30, 1 * 8 + +/* + * This mysterious NOP is required for some unwinders (e.g. libc++) that + * unconditionally subtract one from the result of _Unwind_GetIP() in order to + * identify the calling function. + * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S. + */ + nop // Mysterious NOP + +/* + * GDB, libgcc and libunwind rely on being able to identify the sigreturn + * instruction sequence to unwind from signal handlers. We cannot, therefore, + * use SYM_FUNC_START() here, as it will emit a BTI C instruction and break the + * unwinder. Thankfully, this function is only ever called from a RET and so + * omitting the landing pad is perfectly fine. + */ +SYM_CODE_START(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY mov x8, #__NR_rt_sigreturn +// PLEASE DO NOT MODIFY svc #0 - .cfi_endproc -ENDPROC(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY +// .cfi_endproc +SYM_CODE_END(__kernel_rt_sigreturn) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S deleted file mode 100644 index 82379a70ef03..000000000000 --- a/arch/arm64/kernel/vdso/vdso.S +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * Author: Will Deacon <will.deacon@arm.com> - */ - -#include <linux/init.h> -#include <linux/linkage.h> -#include <linux/const.h> -#include <asm/page.h> - - .globl vdso_start, vdso_end - .section .rodata - .balign PAGE_SIZE -vdso_start: - .incbin "arch/arm64/kernel/vdso/vdso.so" - .balign PAGE_SIZE -vdso_end: - - .previous diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index beca249bc2f3..6028f1fe2d1c 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * GNU linker script for the VDSO library. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> * Heavily based on the vDSO linker scripts for other archs. */ @@ -22,13 +11,17 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -39,6 +32,13 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } + /* + * Discard .note.gnu.property sections which are unused and have + * different alignment requirement from vDSO note sections. + */ + /DISCARD/ : { + *(.note.GNU-stack .note.gnu.property) + } .note : { *(.note.*) } :text :note . = ALIGN(16); @@ -48,20 +48,37 @@ SECTIONS PROVIDE (_etext = .); PROVIDE (etext = .); - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text + . = ALIGN(4); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } .dynamic : { *(.dynamic) } :text :dynamic - .rodata : { *(.rodata*) } :text + .rela.dyn : ALIGN(8) { *(.rela .rela*) } + + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.plt.*) + *(.iplt) + *(.igot .igot.plt) + } :text _end = .; PROVIDE(end = .); + DWARF_DEBUG + ELF_DETAILS + /DISCARD/ : { - *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.eh_frame .eh_frame_hdr) } } @@ -74,7 +91,6 @@ PHDRS text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; } /* diff --git a/arch/arm64/kernel/vdso/vgettimeofday.c b/arch/arm64/kernel/vdso/vgettimeofday.c new file mode 100644 index 000000000000..4236cf34d7d9 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgettimeofday.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 userspace implementations of gettimeofday() and similar. 
+ * + * Copyright (C) 2018 ARM Limited + * + */ + +int __kernel_clock_gettime(clockid_t clock, + struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __kernel_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __kernel_clock_getres(clockid_t clock_id, + struct __kernel_timespec *res) +{ + return __cvdso_clock_getres(clock_id, res); +} diff --git a/arch/arm64/kernel/vdso32-wrap.S b/arch/arm64/kernel/vdso32-wrap.S new file mode 100644 index 000000000000..e72ac7bc4c04 --- /dev/null +++ b/arch/arm64/kernel/vdso32-wrap.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 ARM Limited + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/page.h> + + .globl vdso32_start, vdso32_end + .section .rodata + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/arm64/kernel/vdso32/vdso.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/arm64/kernel/vdso32/.gitignore b/arch/arm64/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..3542fa24e26b --- /dev/null +++ b/arch/arm64/kernel/vdso32/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso.lds +vdso.so.raw diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile new file mode 100644 index 000000000000..36c8f66cad25 --- /dev/null +++ b/arch/arm64/kernel/vdso32/Makefile @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for vdso32 +# + +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of generic Makefile. +ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 +include $(srctree)/lib/vdso/Makefile + +# Same as cc-*option, but using CC_COMPAT instead of CC +ifeq ($(CONFIG_CC_IS_CLANG), y) +CC_COMPAT ?= $(CC) +CC_COMPAT += --target=arm-linux-gnueabi +else +CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc +endif + +ifeq ($(CONFIG_LD_IS_LLD), y) +LD_COMPAT ?= $(LD) +else +LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld +endif + +cc32-option = $(call try-run,\ + $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) +cc32-disable-warning = $(call try-run,\ + $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + +# We cannot use the global flags to compile the vDSO files, the main reason +# being that the 32-bit compiler may be older than the main (64-bit) compiler +# and therefore may not understand flags set using $(cc-option ...). Besides, +# arch-specific options should be taken from the arm Makefile instead of the +# arm64 one. +# As a result we set our own flags here. 
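Stepping back to the __kernel_clock_gettime(), __kernel_gettimeofday() and __kernel_clock_getres() wrappers shown earlier: user code never calls them by name; it reaches them through the ordinary libc entry points, which a typical glibc routes through the vDSO when one is mapped and otherwise falls back to the real system calls. A minimal caller, under that assumption:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

int main(void)
{
	struct timespec ts;
	struct timeval tv;

	/* Usually serviced by __kernel_clock_gettime without entering the kernel. */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		printf("CLOCK_MONOTONIC: %lld.%09ld\n",
		       (long long)ts.tv_sec, ts.tv_nsec);

	/* Usually serviced by __kernel_gettimeofday. */
	if (gettimeofday(&tv, NULL) == 0)
		printf("gettimeofday:    %lld.%06ld\n",
		       (long long)tv.tv_sec, (long)tv.tv_usec);

	return 0;
}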
+ +# KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) +VDSO_CPPFLAGS += $(LINUXINCLUDE) + +# Common C and assembly flags +# From top-level Makefile +VDSO_CAFLAGS := $(VDSO_CPPFLAGS) +VDSO_CAFLAGS += $(call cc32-option,-fno-PIE) +ifdef CONFIG_DEBUG_INFO +VDSO_CAFLAGS += -g +endif + +# From arm Makefile +VDSO_CAFLAGS += $(call cc32-option,-fno-dwarf2-cfi-asm) +VDSO_CAFLAGS += -mabi=aapcs-linux -mfloat-abi=soft +ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) +VDSO_CAFLAGS += -mbig-endian +else +VDSO_CAFLAGS += -mlittle-endian +endif + +# From arm vDSO Makefile +VDSO_CAFLAGS += -fPIC -fno-builtin -fno-stack-protector +VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING +VDSO_CAFLAGS += -march=armv8-a + +VDSO_CFLAGS := $(VDSO_CAFLAGS) +VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 +# KBUILD_CFLAGS from top-level Makefile +VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ + -fno-strict-aliasing -fno-common \ + -Werror-implicit-function-declaration \ + -Wno-format-security \ + -Wdeclaration-after-statement \ + -std=gnu11 +VDSO_CFLAGS += -O2 +# Some useful compiler-dependent flags from top-level Makefile +VDSO_CFLAGS += $(call cc32-option,-Wdeclaration-after-statement,) +VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) +VDSO_CFLAGS += -fno-strict-overflow +VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) +VDSO_CFLAGS += -Werror=date-time +VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) + +# The 32-bit compiler does not provide 128-bit integers, which are used in +# some headers that are indirectly included from the vDSO code. +# This hack makes the compiler happy and should trigger a warning/error if +# variables of such type are referenced. +VDSO_CFLAGS += -D__uint128_t='void*' +# Silence some warnings coming from headers that operate on long's +# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) +VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) +VDSO_CFLAGS += -Wno-int-to-pointer-cast + +# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is +# unreliable. 
+ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) +VDSO_CFLAGS += -mthumb -fomit-frame-pointer +else +VDSO_CFLAGS += -marm +endif + +VDSO_AFLAGS := $(VDSO_CAFLAGS) +VDSO_AFLAGS += -D__ASSEMBLY__ + +# From arm vDSO Makefile +VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 +VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 +VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 +VDSO_LDFLAGS += --orphan-handling=warn + + +# Borrow vdsomunge.c from the arm vDSO +# We have to use a relative path because scripts/Makefile.host prefixes +# $(hostprogs) with $(obj) +munge := ../../../arm/vdso/vdsomunge +hostprogs := $(munge) + +c-obj-vdso := note.o +c-obj-vdso-gettimeofday := vgettimeofday.o + +ifneq ($(c-gettimeofday-y),) +VDSO_CFLAGS_gettimeofday_o += -include $(c-gettimeofday-y) +endif + +VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os + +# Build rules +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso.so.dbg vdso.so.raw +c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) +c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) +asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) +obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) + +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) + +# Strip rule for vdso.so +$(obj)/vdso.so: OBJCOPYFLAGS := -S +$(obj)/vdso.so: $(obj)/vdso.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE + $(call if_changed,vdsomunge) + +# Link rule for the .so file, .lds has to be first +$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,vdsold_and_vdso_check) + +# Compilation rules for the vDSO sources +$(c-obj-vdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) +$(c-obj-vdso-gettimeofday): %.o: %.c FORCE + $(call if_changed_dep,vdsocc_gettimeofday) +$(asm-obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +# Actual build commands +quiet_cmd_vdsold_and_vdso_check = LD32 $@ + cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check) + +quiet_cmd_vdsold = LD32 $@ + cmd_vdsold = $(LD_COMPAT) $(VDSO_LDFLAGS) \ + -T $(filter %.lds,$^) $(filter %.o,$^) -o $@ +quiet_cmd_vdsocc = CC32 $@ + cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $< +quiet_cmd_vdsocc_gettimeofday = CC32 $@ + cmd_vdsocc_gettimeofday = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) $(VDSO_CFLAGS_gettimeofday_o) -c -o $@ $< +quiet_cmd_vdsoas = AS32 $@ + cmd_vdsoas = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_AFLAGS) -c -o $@ $< + +quiet_cmd_vdsomunge = MUNGE $@ + cmd_vdsomunge = $(obj)/$(munge) $< $@ + +# Generate vDSO offsets using helper script (borrowed from the 64-bit vDSO) +gen-vdsosym := $(srctree)/$(src)/../vdso/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ +# The AArch64 nm should be able to read an AArch32 binary + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +# Install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL32 $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/vdso32.so + +vdso.so: $(obj)/vdso.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso32/note.c b/arch/arm64/kernel/vdso32/note.c new file mode 100644 index 000000000000..eff5bf9efb8b --- /dev/null +++ b/arch/arm64/kernel/vdso32/note.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 
2012-2018 ARM Limited + * + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> +#include <linux/build-salt.h> + +ELFNOTE32("Linux", 0, LINUX_VERSION_CODE); +BUILD_SALT; diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S new file mode 100644 index 000000000000..8d95d7d35057 --- /dev/null +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Adapted from arm64 version. + * + * GNU linker script for the VDSO library. + * Heavily based on the vDSO linker scripts for other archs. + * + * Copyright (C) 2012-2018 ARM Limited + */ + +#include <linux/const.h> +#include <asm/page.h> +#include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> + +OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") +OUTPUT_ARCH(arm) + +SECTIONS +{ + PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.rel.iplt) + *(.iplt) + *(.igot.plt) + } :text + + .text : { + *(.text*) + *(.glue_7) + *(.glue_7t) + *(.vfp11_veneer) + *(.v4_bx) + } :text =0xe7f001f2 + + .rel.dyn : { *(.rel*) } + + .ARM.exidx : { *(.ARM.exidx*) } + DWARF_DEBUG + ELF_DETAILS + .ARM.attributes 0 : { *(.ARM.attributes) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ +} + +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_clock_getres; + __vdso_clock_gettime64; + local: *; + }; +} diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c new file mode 100644 index 000000000000..5acff29c5991 --- /dev/null +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 compat userspace implementations of gettimeofday() and similar. 
+ * + * Copyright (C) 2018 ARM Limited + * + */ + +int __vdso_clock_gettime(clockid_t clock, + struct old_timespec32 *ts) +{ + return __cvdso_clock_gettime32(clock, ts); +} + +int __vdso_clock_gettime64(clockid_t clock, + struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __vdso_clock_getres(clockid_t clock_id, + struct old_timespec32 *res) +{ + return __cvdso_clock_getres_time32(clock_id, res); +} + +/* Avoid unresolved references emitted by GCC */ + +void __aeabi_unwind_cpp_pr0(void) +{ +} + +void __aeabi_unwind_cpp_pr1(void) +{ +} + +void __aeabi_unwind_cpp_pr2(void) +{ +} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index fe56c268a7d9..45131e354e27 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -1,65 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ld script to make ARM Linux kernel * taken from the i386 version by Russell King * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> */ +#include <asm/hyp_image.h> +#ifdef CONFIG_KVM +#define HYPERVISOR_EXTABLE \ + . = ALIGN(SZ_8); \ + __start___kvm_ex_table = .; \ + *(__kvm_ex_table) \ + __stop___kvm_ex_table = .; + +#define HYPERVISOR_DATA_SECTIONS \ + HYP_SECTION_NAME(.rodata) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_start = .; \ + *(HYP_SECTION_NAME(.data..ro_after_init)) \ + *(HYP_SECTION_NAME(.rodata)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_end = .; \ + } + +#define HYPERVISOR_PERCPU_SECTION \ + . = ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } + +#define HYPERVISOR_RELOC_SECTION \ + .hyp.reloc : ALIGN(4) { \ + __hyp_reloc_begin = .; \ + *(.hyp.reloc) \ + __hyp_reloc_end = .; \ + } + +#define BSS_FIRST_SECTIONS \ + __hyp_bss_start = .; \ + *(HYP_SECTION_NAME(.bss)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_bss_end = .; + +/* + * We require that __hyp_bss_start and __bss_start are aligned, and enforce it + * with an assertion. But the BSS_SECTION macro places an empty .sbss section + * between them, which can in some cases cause the linker to misalign them. To + * work around the issue, force a page alignment for __bss_start. + */ +#define SBSS_ALIGN PAGE_SIZE +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_PERCPU_SECTION +#define HYPERVISOR_RELOC_SECTION +#define SBSS_ALIGN 0 +#endif + +#define RO_EXCEPTION_TABLE_ALIGN 4 +#define RUNTIME_DISCARD_EXIT + #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> #include <asm/kernel-pgtable.h> -#include <asm/thread_info.h> +#include <asm/kexec.h> #include <asm/memory.h> #include <asm/page.h> -#include <asm/pgtable.h> #include "image.h" -/* .exit.text needed in case of alternative patching */ -#define ARM_EXIT_KEEP(x) x -#define ARM_EXIT_DISCARD(x) - OUTPUT_ARCH(aarch64) ENTRY(_text) jiffies = jiffies_64; #define HYPERVISOR_TEXT \ - /* \ - * Align to 4 KB so that \ - * a) the HYP vector table is at its minimum \ - * alignment of 2048 bytes \ - * b) the HYP init code will not cross a page \ - * boundary if its size does not exceed \ - * 4 KB (see related ASSERT() below) \ - */ \ - . = ALIGN(SZ_4K); \ - VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \ + . 
= ALIGN(PAGE_SIZE); \ + __hyp_idmap_text_start = .; \ *(.hyp.idmap.text) \ - VMLINUX_SYMBOL(__hyp_idmap_text_end) = .; \ - VMLINUX_SYMBOL(__hyp_text_start) = .; \ + __hyp_idmap_text_end = .; \ + __hyp_text_start = .; \ *(.hyp.text) \ - VMLINUX_SYMBOL(__hyp_text_end) = .; + HYPERVISOR_EXTABLE \ + . = ALIGN(PAGE_SIZE); \ + __hyp_text_end = .; #define IDMAP_TEXT \ . = ALIGN(SZ_4K); \ - VMLINUX_SYMBOL(__idmap_text_start) = .; \ + __idmap_text_start = .; \ *(.idmap.text) \ - VMLINUX_SYMBOL(__idmap_text_end) = .; + __idmap_text_end = .; #ifdef CONFIG_HIBERNATION #define HIBERNATE_TEXT \ - . = ALIGN(SZ_4K); \ - VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\ + __hibernate_exit_text_start = .; \ *(.hibernate_exit.text) \ - VMLINUX_SYMBOL(__hibernate_exit_text_end) = .; + __hibernate_exit_text_end = .; #else #define HIBERNATE_TEXT #endif +#ifdef CONFIG_KEXEC_CORE +#define KEXEC_TEXT \ + __relocate_new_kernel_start = .; \ + *(.kexec_relocate.text) \ + __relocate_new_kernel_end = .; +#else +#define KEXEC_TEXT +#endif + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + __entry_tramp_text_start = .; \ + *(.entry.tramp.text) \ + . = ALIGN(PAGE_SIZE); \ + __entry_tramp_text_end = .; \ + *(.entry.tramp.rodata) +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which - * runs from stext to _edata, must be a round multiple of the PE/COFF - * FileAlignment, which we set to its minimum value of 0x200. 'stext' + * runs from _stext to _edata, must be a round multiple of the PE/COFF + * FileAlignment, which we set to its minimum value of 0x200. '_stext' * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned * boundary should be sufficient. */ @@ -80,27 +145,20 @@ SECTIONS * matching the same input section name. There is no documented * order of matching. */ + DISCARDS /DISCARD/ : { - ARM_EXIT_DISCARD(EXIT_TEXT) - ARM_EXIT_DISCARD(EXIT_DATA) - EXIT_CALL - *(.discard) - *(.discard.*) *(.interp .dynamic) - *(.dynsym .dynstr .hash) + *(.dynsym .dynstr .hash .gnu.hash) } - . = KIMAGE_VADDR + TEXT_OFFSET; + . = KIMAGE_VADDR; .head.text : { _text = .; HEAD_TEXT } - .text : { /* Real text segment */ + .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ - __exception_text_start = .; - *(.exception.text) - __exception_text_end = .; IRQENTRY_TEXT SOFTIRQENTRY_TEXT ENTRY_TEXT @@ -111,28 +169,60 @@ SECTIONS KPROBES_TEXT HYPERVISOR_TEXT IDMAP_TEXT - HIBERNATE_TEXT - *(.fixup) *(.gnu.warning) . = ALIGN(16); *(.got) /* Global offset table */ } + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, + "Unexpected GOT/PLT entries detected!") + . = ALIGN(SEGMENT_ALIGN); _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ - NOTES + /* everything from this point to __init_begin will be marked RO NX */ + RO_DATA(PAGE_SIZE) + + HYPERVISOR_DATA_SECTIONS + + /* code sections that are never executed via the kernel mapping */ + .rodata.text : { + TRAMP_TEXT + HIBERNATE_TEXT + KEXEC_TEXT + . = ALIGN(PAGE_SIZE); + } + + idmap_pg_dir = .; + . += PAGE_SIZE; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + + reserved_pg_dir = .; + . += PAGE_SIZE; + + swapper_pg_dir = .; + . += PAGE_SIZE; . 
= ALIGN(SEGMENT_ALIGN); __init_begin = .; __inittext_begin = .; INIT_TEXT_SECTION(8) + + __exittext_begin = .; .exit.text : { - ARM_EXIT_KEEP(EXIT_TEXT) + EXIT_TEXT } + __exittext_end = .; . = ALIGN(4); .altinstructions : { @@ -140,35 +230,43 @@ SECTIONS *(.altinstructions) __alt_instructions_end = .; } - .altinstr_replacement : { - *(.altinstr_replacement) - } - . = ALIGN(PAGE_SIZE); + . = ALIGN(SEGMENT_ALIGN); __inittext_end = .; __initdata_begin = .; + init_idmap_pg_dir = .; + . += INIT_IDMAP_DIR_SIZE; + init_idmap_pg_end = .; + .init.data : { INIT_DATA INIT_SETUP(16) INIT_CALLS CON_INITCALL - SECURITY_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ } .exit.data : { - ARM_EXIT_KEEP(EXIT_DATA) + EXIT_DATA } PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION + + HYPERVISOR_RELOC_SECTION - .rela : ALIGN(8) { + .rela.dyn : ALIGN(8) { + __rela_start = .; *(.rela .rela*) + __rela_end = .; } - __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela); + .relr.dyn : ALIGN(8) { + __relr_start = .; + *(.relr.dyn) + __relr_end = .; + } . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; @@ -176,7 +274,7 @@ SECTIONS _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires @@ -200,41 +298,74 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; - BSS_SECTION(0, 0, 0) + BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); - idmap_pg_dir = .; - . += IDMAP_DIR_SIZE; - swapper_pg_dir = .; - . += SWAPPER_DIR_SIZE; - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif + init_pg_dir = .; + . += INIT_DIR_SIZE; + init_pg_end = .; + . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS HEAD_SYMBOLS + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } +#include "image-vars.h" + /* - * The HYP init code and ID map text can't be longer than a page each, - * and should not cross a page boundary. + * The HYP init code and ID map text can't be longer than a page each. The + * former is page-aligned, but the latter may not be with 16K or 64K pages, so + * it should also not cross a page boundary. 
*/ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, - "HYP init code too big or misaligned") +ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, + "HYP init code too big") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") #ifdef CONFIG_HIBERNATION -ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) - <= SZ_4K, "Hibernate exit text too big or misaligned") +ASSERT(__hibernate_exit_text_end - __hibernate_exit_text_start <= SZ_4K, + "Hibernate exit text is bigger than 4 KiB") +#endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, + "Entry trampoline text too big") +#endif +#ifdef CONFIG_KVM +ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned") #endif - /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") + +ASSERT(swapper_pg_dir - reserved_pg_dir == RESERVED_SWAPPER_OFFSET, + "RESERVED_SWAPPER_OFFSET is wrong!") + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT(swapper_pg_dir - tramp_pg_dir == TRAMP_SWAPPER_OFFSET, + "TRAMP_SWAPPER_OFFSET is wrong!") +#endif + +#ifdef CONFIG_KEXEC_CORE +/* kexec relocation code should fit into one KEXEC_CONTROL_PAGE_SIZE */ +ASSERT(__relocate_new_kernel_end - __relocate_new_kernel_start <= SZ_4K, + "kexec relocation code is bigger than 4 KiB") +ASSERT(KEXEC_CONTROL_PAGE_SIZE >= SZ_4K, "KEXEC_CONTROL_PAGE_SIZE is broken") +#endif
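
For context on the vgettimeofday.c entry points added above, here is a minimal userspace sketch (not part of the patch, and assuming a libc such as glibc that routes these calls through the vDSO): on arm64 the standard clock_gettime() and gettimeofday() wrappers typically resolve to __kernel_clock_gettime/__kernel_gettimeofday (or the __vdso_* variants in the compat vDSO), so the calls below usually complete without entering the kernel.

    /* Hypothetical illustration only; file name and output format are the editor's. */
    #include <stdio.h>
    #include <time.h>
    #include <sys/time.h>

    int main(void)
    {
            struct timespec ts;
            struct timeval tv;

            /* Typically serviced by __kernel_clock_gettime in the vDSO. */
            if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
                    printf("CLOCK_MONOTONIC: %lld.%09ld\n",
                           (long long)ts.tv_sec, ts.tv_nsec);

            /* Typically serviced by __kernel_gettimeofday in the vDSO. */
            if (gettimeofday(&tv, NULL) == 0)
                    printf("gettimeofday:    %lld.%06ld\n",
                           (long long)tv.tv_sec, (long)tv.tv_usec);

            return 0;
    }

Running such a program under strace would, on a kernel built with this series, normally show no clock_gettime or gettimeofday syscalls, since the vDSO fast path handles them in userspace.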