Diffstat (limited to 'arch/riscv/kernel'): 122 files changed, 11255 insertions, 3106 deletions
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 33bb60a354cd..c7b542573407 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -7,10 +7,12 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif -CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_table.o += $(call cc-disable-warning, override-init) +CFLAGS_compat_syscall_table.o += $(call cc-disable-warning, override-init) -ifdef CONFIG_KEXEC +ifdef CONFIG_KEXEC_CORE AFLAGS_kexec_relocate.o := -mcmodel=medany $(call cc-option,-mno-relax) endif @@ -18,19 +20,32 @@ endif ifdef CONFIG_RISCV_ALTERNATIVE_EARLY CFLAGS_alternative.o := -mcmodel=medany CFLAGS_cpufeature.o := -mcmodel=medany +CFLAGS_sbi_ecall.o := -mcmodel=medany ifdef CONFIG_FTRACE CFLAGS_REMOVE_alternative.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_cpufeature.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sbi_ecall.o = $(CC_FLAGS_FTRACE) +endif +ifdef CONFIG_RELOCATABLE +CFLAGS_alternative.o += -fno-pie +CFLAGS_cpufeature.o += -fno-pie +CFLAGS_sbi_ecall.o += -fno-pie endif ifdef CONFIG_KASAN KASAN_SANITIZE_alternative.o := n KASAN_SANITIZE_cpufeature.o := n +KASAN_SANITIZE_sbi_ecall.o := n +endif +ifdef CONFIG_FORTIFY_SOURCE +CFLAGS_alternative.o += -D__NO_FORTIFY +CFLAGS_cpufeature.o += -D__NO_FORTIFY +CFLAGS_sbi_ecall.o += -D__NO_FORTIFY endif endif -extra-y += head.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds +obj-y += head.o obj-y += soc.o obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o obj-y += cpu.o @@ -40,21 +55,33 @@ obj-y += irq.o obj-y += process.o obj-y += ptrace.o obj-y += reset.o +obj-y += return_address.o obj-y += setup.o obj-y += signal.o obj-y += syscall_table.o obj-y += sys_riscv.o +obj-y += sys_hwprobe.o obj-y += time.o obj-y += traps.o obj-y += riscv_ksyms.o obj-y += stacktrace.o obj-y += cacheinfo.o obj-y += patch.o +obj-y += vendor_extensions.o +obj-y += vendor_extensions/ obj-y += probes/ +obj-y += tests/ obj-$(CONFIG_MMU) += vdso.o vdso/ -obj-$(CONFIG_RISCV_M_MODE) += traps_misaligned.o +obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o +obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o +obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o +obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS) += vec-copy-unaligned.o + obj-$(CONFIG_FPU) += fpu.o +obj-$(CONFIG_FPU) += kernel_mode_fpu.o +obj-$(CONFIG_RISCV_ISA_V) += vector.o +obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o @@ -64,27 +91,37 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o +obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o -obj-$(CONFIG_TRACE_IRQFLAGS) += trace_irq.o - obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o -obj-$(CONFIG_RISCV_SBI) += sbi.o +obj-$(CONFIG_RISCV_SBI) += sbi.o sbi_ecall.o ifeq ($(CONFIG_RISCV_SBI), y) +obj-$(CONFIG_SMP) += sbi-ipi.o obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o -obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o 
machine_kexec_file.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o kexec_image.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CFI_CLANG) += cfi.o + obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o obj-$(CONFIG_COMPAT) += compat_signal.o obj-$(CONFIG_COMPAT) += compat_vdso/ + +obj-$(CONFIG_64BIT) += pi/ +obj-$(CONFIG_ACPI) += acpi.o +obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o + +obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o diff --git a/arch/riscv/kernel/Makefile.syscalls b/arch/riscv/kernel/Makefile.syscalls new file mode 100644 index 000000000000..9668fd1faf60 --- /dev/null +++ b/arch/riscv/kernel/Makefile.syscalls @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +syscall_abis_32 += riscv memfd_secret +syscall_abis_64 += riscv rlimit memfd_secret diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c new file mode 100644 index 000000000000..3f6d5a6789e8 --- /dev/null +++ b/arch/riscv/kernel/acpi.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2013-2014, Linaro Ltd. + * Author: Al Stone <al.stone@linaro.org> + * Author: Graeme Gregory <graeme.gregory@linaro.org> + * Author: Hanjun Guo <hanjun.guo@linaro.org> + * Author: Tomasz Nowicki <tomasz.nowicki@linaro.org> + * Author: Naresh Bhat <naresh.bhat@linaro.org> + * + * Copyright (C) 2021-2023, Ventana Micro Systems Inc. + * Author: Sunil V L <sunilvl@ventanamicro.com> + */ + +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/io.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> +#include <linux/pci.h> +#include <linux/serial_core.h> + +int acpi_noirq = 1; /* skip ACPI IRQ initialization */ +int acpi_disabled = 1; +EXPORT_SYMBOL(acpi_disabled); + +int acpi_pci_disabled = 1; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); + +static bool param_acpi_off __initdata; +static bool param_acpi_on __initdata; +static bool param_acpi_force __initdata; + +static struct acpi_madt_rintc cpu_madt_rintc[NR_CPUS]; + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) + param_acpi_off = true; + else if (strcmp(arg, "on") == 0) /* prefer ACPI over DT */ + param_acpi_on = true; + else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ + param_acpi_force = true; + else + return -EINVAL; /* Core will print when we return error */ + + return 0; +} +early_param("acpi", parse_acpi); + +/* + * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity + * checks on it + * + * Return 0 on success, <0 on failure + */ +static int __init acpi_fadt_sanity_check(void) +{ + struct acpi_table_header *table; + struct acpi_table_fadt *fadt; + acpi_status status; + int ret = 0; + + /* + * FADT is required on riscv; retrieve it to check its presence + * and carry out revision and ACPI HW reduced compliancy tests + */ + status = acpi_get_table(ACPI_SIG_FADT, 0, &table); + if (ACPI_FAILURE(status)) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get FADT table, %s\n", msg); + return -ENODEV; + } + + fadt = (struct acpi_table_fadt *)table; + + /* + * The revision in the table header is the FADT's Major revision. The + * FADT also has a minor revision, which is stored in the FADT itself. 
+ * + * TODO: Currently, we check for 6.5 as the minimum version to check + * for HW_REDUCED flag. However, once RISC-V updates are released in + * the ACPI spec, we need to update this check for exact minor revision + */ + if (table->revision < 6 || (table->revision == 6 && fadt->minor_revision < 5)) + pr_err(FW_BUG "Unsupported FADT revision %d.%d, should be 6.5+\n", + table->revision, fadt->minor_revision); + + if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { + pr_err("FADT not ACPI hardware reduced compliant\n"); + ret = -EINVAL; + } + + /* + * acpi_get_table() creates FADT table mapping that + * should be released after parsing and before resuming boot + */ + acpi_put_table(table); + return ret; +} + +/* + * acpi_boot_table_init() called from setup_arch(), always. + * 1. find RSDP and get its address, and then find XSDT + * 2. extract all tables and checksums them all + * 3. check ACPI FADT HW reduced flag + * + * We can parse ACPI boot-time tables such as MADT after + * this function is called. + * + * On return ACPI is enabled if either: + * + * - ACPI tables are initialized and sanity checks passed + * - acpi=force was passed in the command line and ACPI was not disabled + * explicitly through acpi=off command line parameter + * + * ACPI is disabled on function return otherwise + */ +void __init acpi_boot_table_init(void) +{ + /* + * Enable ACPI instead of device tree unless + * - ACPI has been disabled explicitly (acpi=off), or + * - firmware has not populated ACPI ptr in EFI system table + * and ACPI has not been [force] enabled (acpi=on|force) + */ + if (param_acpi_off || + (!param_acpi_on && !param_acpi_force && + efi.acpi20 == EFI_INVALID_TABLE_ADDR)) + goto done; + + /* + * ACPI is disabled at this point. Enable it in order to parse + * the ACPI tables and carry out sanity checks + */ + enable_acpi(); + + /* + * If ACPI tables are initialized and FADT sanity checks passed, + * leave ACPI enabled and carry on booting; otherwise disable ACPI + * on initialization error. + * If acpi=force was passed on the command line it forces ACPI + * to be enabled even if its initialization failed. + */ + if (acpi_table_init() || acpi_fadt_sanity_check()) { + pr_err("Failed to init ACPI tables\n"); + if (!param_acpi_force) + disable_acpi(); + } + +done: + if (acpi_disabled) { + if (earlycon_acpi_spcr_enable) + early_init_dt_scan_chosen_stdout(); + } else { + acpi_parse_spcr(earlycon_acpi_spcr_enable, true); + } +} + +static int acpi_parse_madt_rintc(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_rintc *rintc = (struct acpi_madt_rintc *)header; + int cpuid; + + if (!(rintc->flags & ACPI_MADT_ENABLED)) + return 0; + + cpuid = riscv_hartid_to_cpuid(rintc->hart_id); + /* + * When CONFIG_SMP is disabled, mapping won't be created for + * all cpus. + * CPUs more than num_possible_cpus, will be ignored. + */ + if (cpuid >= 0 && cpuid < num_possible_cpus()) + cpu_madt_rintc[cpuid] = *rintc; + + return 0; +} + +/* + * Instead of parsing (and freeing) the ACPI table, cache + * the RINTC structures since they are frequently used + * like in cpuinfo. 
+ */ +void __init acpi_init_rintc_map(void) +{ + if (acpi_table_parse_madt(ACPI_MADT_TYPE_RINTC, acpi_parse_madt_rintc, 0) <= 0) { + pr_err("No valid RINTC entries exist\n"); + BUG(); + } +} + +struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu) +{ + return &cpu_madt_rintc[cpu]; +} + +/* + * __acpi_map_table() will be called before paging_init(), so early_ioremap() + * or early_memremap() should be called here to for ACPI table mapping. + */ +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) +{ + if (!size) + return NULL; + + return early_memremap(phys, size); +} + +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) +{ + if (!map || !size) + return; + + early_memunmap(map, size); +} + +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. + */ + prot = PAGE_KERNEL_IO; + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + if (memblock_is_map_memory(phys) || + !memblock_is_region_memory(phys, size)) { + pr_warn(FW_BUG "requested region covers kernel memory\n"); + return NULL; + } + + /* + * Mapping kernel memory is permitted if the region in + * question is covered by a single memblock with the + * NOMAP attribute set: this enables the use of ACPI + * table overrides passed via initramfs. + * This particular use case only requires read access. + */ + fallthrough; + + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. + */ + prot = PAGE_KERNEL_RO; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. + */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__va(phys); + fallthrough; + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if ((region->attribute & EFI_MEMORY_WC) || + (region->attribute & EFI_MEMORY_WT)) + prot = pgprot_writecombine(PAGE_KERNEL); + } + } + + return ioremap_prot(phys, size, prot); +} + +#ifdef CONFIG_PCI + +/* + * raw_pci_read/write - Platform-specific PCI config space access. 
+ */ +int raw_pci_read(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *val) +{ + struct pci_bus *b = pci_find_bus(domain, bus); + + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->read(b, devfn, reg, len, val); +} + +int raw_pci_write(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 val) +{ + struct pci_bus *b = pci_find_bus(domain, bus); + + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->write(b, devfn, reg, len, val); +} + +#endif /* CONFIG_PCI */ diff --git a/arch/riscv/kernel/acpi_numa.c b/arch/riscv/kernel/acpi_numa.c new file mode 100644 index 000000000000..130769e3a99c --- /dev/null +++ b/arch/riscv/kernel/acpi_numa.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI 6.6 based NUMA setup for RISCV + * Lots of code was borrowed from arch/arm64/kernel/acpi_numa.c + * + * Copyright 2004 Andi Kleen, SuSE Labs. + * Copyright (C) 2013-2016, Linaro Ltd. + * Author: Hanjun Guo <hanjun.guo@linaro.org> + * Copyright (C) 2024 Intel Corporation. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#define pr_fmt(fmt) "ACPI: NUMA: " fmt + +#include <linux/acpi.h> +#include <linux/bitmap.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/topology.h> + +#include <asm/numa.h> + +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; + +static int __init acpi_numa_get_nid(unsigned int cpu) +{ + return acpi_early_node_map[cpu]; +} + +static inline int get_cpu_for_acpi_id(u32 uid) +{ + int cpu; + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + if (uid == get_acpi_id_for_cpu(cpu)) + return cpu; + + return -EINVAL; +} + +static int __init acpi_parse_rintc_pxm(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_rintc_affinity *pa; + int cpu, pxm, node; + + if (srat_disabled()) + return -EINVAL; + + pa = (struct acpi_srat_rintc_affinity *)header; + if (!pa) + return -EINVAL; + + if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED)) + return 0; + + pxm = pa->proximity_domain; + node = pxm_to_node(pxm); + + /* + * If we can't map the UID to a logical cpu this + * means that the UID is not part of possible cpus + * so we do not need a NUMA mapping for it, skip + * the SRAT entry and keep parsing. + */ + cpu = get_cpu_for_acpi_id(pa->acpi_processor_uid); + if (cpu < 0) + return 0; + + acpi_early_node_map[cpu] = node; + pr_info("SRAT: PXM %d -> HARTID 0x%lx -> Node %d\n", pxm, + cpuid_to_hartid_map(cpu), node); + + return 0; +} + +void __init acpi_map_cpus_to_nodes(void) +{ + int i; + + /* + * In ACPI, SMP and CPU NUMA information is provided in separate + * static tables, namely the MADT and the SRAT. + * + * Thus, it is simpler to first create the cpu logical map through + * an MADT walk and then map the logical cpus to their node ids + * as separate steps. 
+ */ + acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_RINTC_AFFINITY, acpi_parse_rintc_pxm, 0); + + for (i = 0; i < nr_cpu_ids; i++) + early_map_cpu_to_node(i, acpi_numa_get_nid(i)); +} + +/* Callback for Proximity Domain -> logical node ID mapping */ +void __init acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) +{ + int pxm, node; + + if (srat_disabled()) + return; + + if (pa->header.length < sizeof(struct acpi_srat_rintc_affinity)) { + pr_err("SRAT: Invalid SRAT header length: %d\n", pa->header.length); + bad_srat(); + return; + } + + if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED)) + return; + + pxm = pa->proximity_domain; + node = acpi_map_pxm_to_node(pxm); + + if (node == NUMA_NO_NODE) { + pr_err("SRAT: Too many proximity domains %d\n", pxm); + bad_srat(); + return; + } + + node_set(node, numa_nodes_parsed); +} diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index c9d0d3c53223..7eb3cb1215c6 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -11,21 +11,25 @@ #include <linux/cpu.h> #include <linux/uaccess.h> #include <asm/alternative.h> +#include <asm/module.h> #include <asm/sections.h> +#include <asm/vdso.h> #include <asm/vendorid_list.h> #include <asm/sbi.h> #include <asm/csr.h> +#include <asm/insn.h> +#include <asm/text-patching.h> struct cpu_manufacturer_info_t { unsigned long vendor_id; unsigned long arch_id; unsigned long imp_id; - void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end, + void (*patch_func)(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); }; -static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) +static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) { #ifdef CONFIG_RISCV_M_MODE cpu_mfr_info->vendor_id = csr_read(CSR_MVENDORID); @@ -38,18 +42,105 @@ static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_inf #endif switch (cpu_mfr_info->vendor_id) { +#ifdef CONFIG_ERRATA_ANDES + case ANDES_VENDOR_ID: + cpu_mfr_info->patch_func = andes_errata_patch_func; + break; +#endif #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: - cpu_mfr_info->vendor_patch_func = sifive_errata_patch_func; + cpu_mfr_info->patch_func = sifive_errata_patch_func; break; #endif #ifdef CONFIG_ERRATA_THEAD case THEAD_VENDOR_ID: - cpu_mfr_info->vendor_patch_func = thead_errata_patch_func; + cpu_mfr_info->patch_func = thead_errata_patch_func; break; #endif default: - cpu_mfr_info->vendor_patch_func = NULL; + cpu_mfr_info->patch_func = NULL; + } +} + +static u32 riscv_instruction_at(void *p) +{ + u16 *parcel = p; + + return (u32)parcel[0] | (u32)parcel[1] << 16; +} + +static void riscv_alternative_fix_auipc_jalr(void *ptr, u32 auipc_insn, + u32 jalr_insn, int patch_offset) +{ + u32 call[2] = { auipc_insn, jalr_insn }; + s32 imm; + + /* get and adjust new target address */ + imm = riscv_insn_extract_utype_itype_imm(auipc_insn, jalr_insn); + imm -= patch_offset; + + /* update instructions */ + riscv_insn_insert_utype_itype_imm(&call[0], &call[1], imm); + + /* patch the call place again */ + patch_text_nosync(ptr, call, sizeof(u32) * 2); +} + +static void riscv_alternative_fix_jal(void *ptr, u32 jal_insn, int patch_offset) +{ + s32 imm; + + /* get and adjust new target address */ + imm = riscv_insn_extract_jtype_imm(jal_insn); + imm -= patch_offset; + + /* update instruction */ + 
riscv_insn_insert_jtype_imm(&jal_insn, imm); + + /* patch the call place again */ + patch_text_nosync(ptr, &jal_insn, sizeof(u32)); +} + +void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, + int patch_offset) +{ + int num_insn = len / sizeof(u32); + int i; + + for (i = 0; i < num_insn; i++) { + u32 insn = riscv_instruction_at(alt_ptr + i * sizeof(u32)); + + /* + * May be the start of an auipc + jalr pair + * Needs to check that at least one more instruction + * is in the list. + */ + if (riscv_insn_is_auipc(insn) && i < num_insn - 1) { + u32 insn2 = riscv_instruction_at(alt_ptr + (i + 1) * sizeof(u32)); + + if (!riscv_insn_is_jalr(insn2)) + continue; + + /* if instruction pair is a call, it will use the ra register */ + if (RV_EXTRACT_RD_REG(insn) != 1) + continue; + + riscv_alternative_fix_auipc_jalr(alt_ptr + i * sizeof(u32), + insn, insn2, patch_offset); + i++; + } + + if (riscv_insn_is_jal(insn)) { + s32 imm = riscv_insn_extract_jtype_imm(insn); + + /* Don't modify jumps inside the alternative block */ + if ((alt_ptr + i * sizeof(u32) + imm) >= alt_ptr && + (alt_ptr + i * sizeof(u32) + imm) < (alt_ptr + len)) + continue; + + riscv_alternative_fix_jal(alt_ptr + i * sizeof(u32), + insn, patch_offset); + } } } @@ -68,14 +159,39 @@ static void __init_or_module _apply_alternatives(struct alt_entry *begin, riscv_cpufeature_patch_func(begin, end, stage); - if (!cpu_mfr_info.vendor_patch_func) + if (!cpu_mfr_info.patch_func) + return; + + cpu_mfr_info.patch_func(begin, end, + cpu_mfr_info.arch_id, + cpu_mfr_info.imp_id, + stage); +} + +#ifdef CONFIG_MMU +static void __init apply_vdso_alternatives(void) +{ + const Elf_Ehdr *hdr; + const Elf_Shdr *shdr; + const Elf_Shdr *alt; + struct alt_entry *begin, *end; + + hdr = (Elf_Ehdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".alternative"); + if (!alt) return; - cpu_mfr_info.vendor_patch_func(begin, end, - cpu_mfr_info.arch_id, - cpu_mfr_info.imp_id, - stage); + begin = (void *)hdr + alt->sh_offset, + end = (void *)hdr + alt->sh_offset + alt->sh_size, + + _apply_alternatives((struct alt_entry *)begin, + (struct alt_entry *)end, + RISCV_ALTERNATIVES_BOOT); } +#else +static void __init apply_vdso_alternatives(void) { } +#endif void __init apply_boot_alternatives(void) { @@ -85,6 +201,8 @@ void __init apply_boot_alternatives(void) _apply_alternatives((struct alt_entry *)__alt_start, (struct alt_entry *)__alt_end, RISCV_ALTERNATIVES_BOOT); + + apply_vdso_alternatives(); } /* diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index df9444397908..6e8c0d6feae9 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -4,15 +4,16 @@ * Copyright (C) 2017 SiFive */ -#define GENERATING_ASM_OFFSETS - #include <linux/kbuild.h> #include <linux/mm.h> #include <linux/sched.h> +#include <linux/ftrace.h> +#include <linux/suspend.h> #include <asm/kvm_host.h> #include <asm/thread_info.h> #include <asm/ptrace.h> #include <asm/cpu_ops_sbi.h> +#include <asm/stacktrace.h> #include <asm/suspend.h> void asm_offsets(void); @@ -33,11 +34,22 @@ void asm_offsets(void) OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]); OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]); OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]); - OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags); + OFFSET(TASK_THREAD_SUM, task_struct, thread.sum); + + OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu); OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count); 
OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp); OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp); +#ifdef CONFIG_SHADOW_CALL_STACK + OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp); +#endif +#ifdef CONFIG_64BIT + OFFSET(TASK_TI_A0, task_struct, thread_info.a0); + OFFSET(TASK_TI_A1, task_struct, thread_info.a1); + OFFSET(TASK_TI_A2, task_struct, thread_info.a2); +#endif + OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu); OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]); OFFSET(TASK_THREAD_F1, task_struct, thread.fstate.f[1]); OFFSET(TASK_THREAD_F2, task_struct, thread.fstate.f[2]); @@ -116,6 +128,10 @@ void asm_offsets(void) OFFSET(SUSPEND_CONTEXT_REGS, suspend_context, regs); + OFFSET(HIBERN_PBE_ADDR, pbe, address); + OFFSET(HIBERN_PBE_ORIG, pbe, orig_address); + OFFSET(HIBERN_PBE_NEXT, pbe, next); + OFFSET(KVM_ARCH_GUEST_ZERO, kvm_vcpu_arch, guest_context.zero); OFFSET(KVM_ARCH_GUEST_RA, kvm_vcpu_arch, guest_context.ra); OFFSET(KVM_ARCH_GUEST_SP, kvm_vcpu_arch, guest_context.sp); @@ -331,6 +347,10 @@ void asm_offsets(void) offsetof(struct task_struct, thread.s[11]) - offsetof(struct task_struct, thread.ra) ); + DEFINE(TASK_THREAD_SUM_RA, + offsetof(struct task_struct, thread.sum) + - offsetof(struct task_struct, thread.ra) + ); DEFINE(TASK_THREAD_F0_F0, offsetof(struct task_struct, thread.fstate.f[0]) @@ -474,4 +494,38 @@ void asm_offsets(void) OFFSET(KERNEL_MAP_VIRT_ADDR, kernel_mapping, virt_addr); OFFSET(SBI_HART_BOOT_TASK_PTR_OFFSET, sbi_hart_boot_data, task_ptr); OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr); + + DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN)); + OFFSET(STACKFRAME_FP, stackframe, fp); + OFFSET(STACKFRAME_RA, stackframe, ra); +#ifdef CONFIG_FUNCTION_TRACER + DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS + DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN)); + DEFINE(FREGS_EPC, offsetof(struct __arch_ftrace_regs, epc)); + DEFINE(FREGS_RA, offsetof(struct __arch_ftrace_regs, ra)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_S0, offsetof(struct __arch_ftrace_regs, s0)); + DEFINE(FREGS_T1, offsetof(struct __arch_ftrace_regs, t1)); +#ifdef CONFIG_CC_IS_CLANG + DEFINE(FREGS_T2, offsetof(struct __arch_ftrace_regs, t2)); + DEFINE(FREGS_T3, offsetof(struct __arch_ftrace_regs, t3)); + DEFINE(FREGS_T4, offsetof(struct __arch_ftrace_regs, t4)); + DEFINE(FREGS_T5, offsetof(struct __arch_ftrace_regs, t5)); + DEFINE(FREGS_T6, offsetof(struct __arch_ftrace_regs, t6)); +#endif + DEFINE(FREGS_A0, offsetof(struct __arch_ftrace_regs, a0)); + DEFINE(FREGS_A1, offsetof(struct __arch_ftrace_regs, a1)); + DEFINE(FREGS_A2, offsetof(struct __arch_ftrace_regs, a2)); + DEFINE(FREGS_A3, offsetof(struct __arch_ftrace_regs, a3)); + DEFINE(FREGS_A4, offsetof(struct __arch_ftrace_regs, a4)); + DEFINE(FREGS_A5, offsetof(struct __arch_ftrace_regs, a5)); + DEFINE(FREGS_A6, offsetof(struct __arch_ftrace_regs, a6)); + DEFINE(FREGS_A7, offsetof(struct __arch_ftrace_regs, a7)); +#endif } diff --git a/arch/riscv/kernel/bugs.c b/arch/riscv/kernel/bugs.c new file mode 100644 index 000000000000..3655fe7d678c --- /dev/null +++ b/arch/riscv/kernel/bugs.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
Copyright (C) 2024 Rivos Inc. + */ + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/sprintf.h> + +#include <asm/bugs.h> +#include <asm/vendor_extensions/thead.h> + +static enum mitigation_state ghostwrite_state; + +void ghostwrite_set_vulnerable(void) +{ + ghostwrite_state = VULNERABLE; +} + +/* + * Vendor extension alternatives will use the value set at the time of boot + * alternative patching, thus this must be called before boot alternatives are + * patched (and after extension probing) to be effective. + * + * Returns true if mitgated, false otherwise. + */ +bool ghostwrite_enable_mitigation(void) +{ + if (IS_ENABLED(CONFIG_RISCV_ISA_XTHEADVECTOR) && + ghostwrite_state == VULNERABLE && !cpu_mitigations_off()) { + disable_xtheadvector(); + ghostwrite_state = MITIGATED; + return true; + } + + return false; +} + +enum mitigation_state ghostwrite_get_state(void) +{ + return ghostwrite_state; +} + +ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (IS_ENABLED(CONFIG_RISCV_ISA_XTHEADVECTOR)) { + switch (ghostwrite_state) { + case UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case MITIGATED: + return sprintf(buf, "Mitigation: xtheadvector disabled\n"); + case VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } + } else { + return sprintf(buf, "Not affected\n"); + } +} diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 90deabfe63ea..26b085dbdd07 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -3,9 +3,9 @@ * Copyright (C) 2017 SiFive */ +#include <linux/acpi.h> #include <linux/cpu.h> #include <linux/of.h> -#include <linux/of_device.h> #include <asm/cacheinfo.h> static struct riscv_cacheinfo_ops *rv_cache_ops; @@ -64,123 +64,73 @@ uintptr_t get_cache_geometry(u32 level, enum cache_type type) 0; } -static void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, - unsigned int level, unsigned int size, - unsigned int sets, unsigned int line_size) +static void ci_leaf_init(struct cacheinfo *this_leaf, + enum cache_type type, unsigned int level) { this_leaf->level = level; this_leaf->type = type; - this_leaf->size = size; - this_leaf->number_of_sets = sets; - this_leaf->coherency_line_size = line_size; - - /* - * If the cache is fully associative, there is no need to - * check the other properties. - */ - if (sets == 1) - return; - - /* - * Set the ways number for n-ways associative, make sure - * all properties are big than zero. 
- */ - if (sets > 0 && size > 0 && line_size > 0) - this_leaf->ways_of_associativity = (size / sets) / line_size; -} - -static void fill_cacheinfo(struct cacheinfo **this_leaf, - struct device_node *node, unsigned int level) -{ - unsigned int size, sets, line_size; - - if (!of_property_read_u32(node, "cache-size", &size) && - !of_property_read_u32(node, "cache-block-size", &line_size) && - !of_property_read_u32(node, "cache-sets", &sets)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_UNIFIED, level, size, sets, line_size); - } - - if (!of_property_read_u32(node, "i-cache-size", &size) && - !of_property_read_u32(node, "i-cache-sets", &sets) && - !of_property_read_u32(node, "i-cache-block-size", &line_size)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_INST, level, size, sets, line_size); - } - - if (!of_property_read_u32(node, "d-cache-size", &size) && - !of_property_read_u32(node, "d-cache-sets", &sets) && - !of_property_read_u32(node, "d-cache-block-size", &line_size)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_DATA, level, size, sets, line_size); - } } int init_cache_level(unsigned int cpu) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct device_node *np = of_cpu_device_node_get(cpu); - struct device_node *prev = NULL; - int levels = 0, leaves = 0, level; - - if (of_property_read_bool(np, "cache-size")) - ++leaves; - if (of_property_read_bool(np, "i-cache-size")) - ++leaves; - if (of_property_read_bool(np, "d-cache-size")) - ++leaves; - if (leaves > 0) - levels = 1; - - prev = np; - while ((np = of_find_next_cache_node(np))) { - of_node_put(prev); - prev = np; - if (!of_device_is_compatible(np, "cache")) - break; - if (of_property_read_u32(np, "cache-level", &level)) - break; - if (level <= levels) - break; - if (of_property_read_bool(np, "cache-size")) - ++leaves; - if (of_property_read_bool(np, "i-cache-size")) - ++leaves; - if (of_property_read_bool(np, "d-cache-size")) - ++leaves; - levels = level; - } - - of_node_put(np); - this_cpu_ci->num_levels = levels; - this_cpu_ci->num_leaves = leaves; - - return 0; + return init_of_cache_level(cpu); } int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct device_node *np = of_cpu_device_node_get(cpu); - struct device_node *prev = NULL; + struct device_node *np, *prev; int levels = 1, level = 1; - /* Level 1 caches in cpu node */ - fill_cacheinfo(&this_leaf, np, level); + if (!acpi_disabled) { + int ret, fw_levels, split_levels; + + ret = acpi_get_cache_info(cpu, &fw_levels, &split_levels); + if (ret) + return ret; + + BUG_ON((split_levels > fw_levels) || + (split_levels + fw_levels > this_cpu_ci->num_leaves)); + + for (; level <= this_cpu_ci->num_levels; level++) { + if (level <= split_levels) { + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + } else { + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); + } + } + return 0; + } + + np = of_cpu_device_node_get(cpu); + if (!np) + return -ENOENT; + + if (of_property_present(np, "cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); + if (of_property_present(np, "i-cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (of_property_present(np, "d-cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - /* Next level caches in cache nodes */ prev = np; while ((np = of_find_next_cache_node(np))) { of_node_put(prev); prev = np; - if 
(!of_device_is_compatible(np, "cache")) break; if (of_property_read_u32(np, "cache-level", &level)) break; if (level <= levels) break; - - fill_cacheinfo(&this_leaf, np, level); - + if (of_property_present(np, "cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); + if (of_property_present(np, "i-cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (of_property_present(np, "d-cache-size")) + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); levels = level; } of_node_put(np); diff --git a/arch/riscv/kernel/cfi.c b/arch/riscv/kernel/cfi.c new file mode 100644 index 000000000000..64bdd3e1ab8c --- /dev/null +++ b/arch/riscv/kernel/cfi.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2023 Google LLC + */ +#include <linux/cfi.h> +#include <asm/insn.h> + +/* + * Returns the target address and the expected type when regs->epc points + * to a compiler-generated CFI trap. + */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + unsigned long *regs_ptr = (unsigned long *)regs; + int rs1_num; + u32 insn; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + * Â lw t1, -4(<reg>) + * lui t2, <hi20> + * addiw t2, t2, <lo12> + * beq t1, t2, .Ltmp1 + * ebreak ; <- regs->epc + * .Ltmp1: + * jalr <reg> + * + * We can read the expected type and the target address from the + * registers passed to the beq/jalr instructions. + */ + if (get_kernel_nofault(insn, (void *)regs->epc - 4)) + return false; + if (!riscv_insn_is_beq(insn)) + return false; + + *type = (u32)regs_ptr[RV_EXTRACT_RS1_REG(insn)]; + + if (get_kernel_nofault(insn, (void *)regs->epc) || + get_kernel_nofault(insn, (void *)regs->epc + GET_INSN_LENGTH(insn))) + return false; + + if (riscv_insn_is_jalr(insn)) + rs1_num = RV_EXTRACT_RS1_REG(insn); + else if (riscv_insn_is_c_jalr(insn)) + rs1_num = RVC_EXTRACT_C2_RS1_REG(insn); + else + return false; + + *target = regs_ptr[rs1_num]; + + return true; +} + +/* + * Checks if the ebreak trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->epc)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->epc); + + return report_cfi_failure(regs, regs->epc, &target, type); +} + +#ifdef CONFIG_CFI_CLANG +struct bpf_insn; + +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ +extern unsigned int __bpf_prog_runX(const void *ctx, + const struct bpf_insn *insn); + +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. 
+ */ +__ADDRESSABLE(__bpf_prog_runX); + +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_hash,@object \n" +" .globl cfi_bpf_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_hash: \n" +" .word __kcfi_typeid___bpf_prog_runX \n" +" .size cfi_bpf_hash, 4 \n" +" .popsection \n" +); + +/* Must match bpf_callback_t */ +extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); + +__ADDRESSABLE(__bpf_callback_fn); + +/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_subprog_hash,@object \n" +" .globl cfi_bpf_subprog_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_subprog_hash: \n" +" .word __kcfi_typeid___bpf_callback_fn \n" +" .size cfi_bpf_subprog_hash, 4 \n" +" .popsection \n" +); + +u32 cfi_get_func_hash(void *func) +{ + u32 hash; + + if (get_kernel_nofault(hash, func - cfi_get_offset())) + return 0; + + return hash; +} +#endif diff --git a/arch/riscv/kernel/compat_syscall_table.c b/arch/riscv/kernel/compat_syscall_table.c index 651f2b009c28..e884c069e88f 100644 --- a/arch/riscv/kernel/compat_syscall_table.c +++ b/arch/riscv/kernel/compat_syscall_table.c @@ -8,12 +8,18 @@ #include <asm-generic/syscalls.h> #include <asm/syscall.h> +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) + +#undef __SYSCALL +#define __SYSCALL(nr, call) asmlinkage long __riscv_##call(const struct pt_regs *); +#include <asm/syscall_table_32.h> + #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), +#define __SYSCALL(nr, call) [nr] = __riscv_##call, asmlinkage long compat_sys_rt_sigreturn(void); void * const compat_sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls - 1] = sys_ni_syscall, -#include <asm/unistd.h> + [0 ... __NR_syscalls - 1] = __riscv_sys_ni_syscall, +#include <asm/syscall_table_32.h> }; diff --git a/arch/riscv/kernel/compat_vdso/Makefile b/arch/riscv/kernel/compat_vdso/Makefile index 260daf3236d3..24e37d1ef7ec 100644 --- a/arch/riscv/kernel/compat_vdso/Makefile +++ b/arch/riscv/kernel/compat_vdso/Makefile @@ -11,9 +11,19 @@ compat_vdso-syms += flush_icache COMPAT_CC := $(CC) COMPAT_LD := $(LD) -COMPAT_CC_FLAGS := -march=rv32g -mabi=ilp32 +# binutils 2.35 does not support the zifencei extension, but in the ISA +# spec 20191213, G stands for IMAFD_ZICSR_ZIFENCEI. +ifdef CONFIG_TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI + COMPAT_CC_FLAGS := -march=rv32g -mabi=ilp32 +else + COMPAT_CC_FLAGS := -march=rv32imafd -mabi=ilp32 +endif COMPAT_LD_FLAGS := -melf32lriscv +# Disable attributes, as they're useless and break the build. 
+COMPAT_CC_FLAGS += $(call cc-option,-mno-riscv-attribute) +COMPAT_CC_FLAGS += $(call as-option,-Wa$(comma)-mno-arch-attr) + # Files to link into the compat_vdso obj-compat_vdso = $(patsubst %, %.o, $(compat_vdso-syms)) note.o @@ -22,13 +32,7 @@ targets := $(obj-compat_vdso) compat_vdso.so compat_vdso.so.dbg compat_vdso.lds obj-compat_vdso := $(addprefix $(obj)/, $(obj-compat_vdso)) obj-y += compat_vdso.o -CPPFLAGS_compat_vdso.lds += -P -C -U$(ARCH) - -# Disable profiling and instrumentation for VDSO code -GCOV_PROFILE := n -KCOV_INSTRUMENT := n -KASAN_SANITIZE := n -UBSAN_SANITIZE := n +CPPFLAGS_compat_vdso.lds += -P -C -DCOMPAT_VDSO -U$(ARCH) # Force dependency $(obj)/compat_vdso.o: $(obj)/compat_vdso.so @@ -48,7 +52,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # Generate VDSO offsets using helper script -gen-compat_vdsosym := $(srctree)/$(src)/gen_compat_vdso_offsets.sh +gen-compat_vdsosym := $(src)/gen_compat_vdso_offsets.sh quiet_cmd_compat_vdsosym = VDSOSYM $@ cmd_compat_vdsosym = $(NM) $< | $(gen-compat_vdsosym) | LC_ALL=C sort > $@ @@ -64,15 +68,5 @@ quiet_cmd_compat_vdsold = VDSOLD $@ rm $@.tmp # actual build commands -quiet_cmd_compat_vdsoas = VDSOAS $@ +quiet_cmd_compat_vdsoas = VDSOAS $@ cmd_compat_vdsoas = $(COMPAT_CC) $(a_flags) $(COMPAT_CC_FLAGS) -c -o $@ $< - -# install commands for the unstripped file -quiet_cmd_compat_vdso_install = INSTALL $@ - cmd_compat_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/compat_vdso/$@ - -compat_vdso.so: $(obj)/compat_vdso.so.dbg - @mkdir -p $(MODLIB)/compat_vdso - $(call cmd,compat_vdso_install) - -compat_vdso_install: compat_vdso.so diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S new file mode 100644 index 000000000000..2b3d9398c113 --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.S @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos Inc. */ + +#include <linux/linkage.h> +#include <asm/asm.h> + + .text + +/* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using word loads and stores. */ +/* Note: The size is truncated to a multiple of 8 * SZREG */ +SYM_FUNC_START(__riscv_copy_words_unaligned) + andi a4, a2, ~((8*SZREG)-1) + beqz a4, 2f + add a3, a1, a4 +1: + REG_L a4, 0(a1) + REG_L a5, SZREG(a1) + REG_L a6, 2*SZREG(a1) + REG_L a7, 3*SZREG(a1) + REG_L t0, 4*SZREG(a1) + REG_L t1, 5*SZREG(a1) + REG_L t2, 6*SZREG(a1) + REG_L t3, 7*SZREG(a1) + REG_S a4, 0(a0) + REG_S a5, SZREG(a0) + REG_S a6, 2*SZREG(a0) + REG_S a7, 3*SZREG(a0) + REG_S t0, 4*SZREG(a0) + REG_S t1, 5*SZREG(a0) + REG_S t2, 6*SZREG(a0) + REG_S t3, 7*SZREG(a0) + addi a0, a0, 8*SZREG + addi a1, a1, 8*SZREG + bltu a1, a3, 1b + +2: + ret +SYM_FUNC_END(__riscv_copy_words_unaligned) + +/* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using only byte accesses. 
*/ +/* Note: The size is truncated to a multiple of 8 */ +SYM_FUNC_START(__riscv_copy_bytes_unaligned) + andi a4, a2, ~(8-1) + beqz a4, 2f + add a3, a1, a4 +1: + lb a4, 0(a1) + lb a5, 1(a1) + lb a6, 2(a1) + lb a7, 3(a1) + lb t0, 4(a1) + lb t1, 5(a1) + lb t2, 6(a1) + lb t3, 7(a1) + sb a4, 0(a0) + sb a5, 1(a0) + sb a6, 2(a0) + sb a7, 3(a0) + sb t0, 4(a0) + sb t1, 5(a0) + sb t2, 6(a0) + sb t3, 7(a0) + addi a0, a0, 8 + addi a1, a1, 8 + bltu a1, a3, 1b + +2: + ret +SYM_FUNC_END(__riscv_copy_bytes_unaligned) diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h new file mode 100644 index 000000000000..85d4d11450cb --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos, Inc. + */ +#ifndef __RISCV_KERNEL_COPY_UNALIGNED_H +#define __RISCV_KERNEL_COPY_UNALIGNED_H + +#include <linux/types.h> + +void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size); +void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size); + +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS +void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size); +void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size); +#endif + +#endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */ diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c index f7a832e3a1d1..a1e38ecfc8be 100644 --- a/arch/riscv/kernel/cpu-hotplug.c +++ b/arch/riscv/kernel/cpu-hotplug.c @@ -8,16 +8,17 @@ #include <linux/sched.h> #include <linux/err.h> #include <linux/irq.h> +#include <linux/cpuhotplug.h> #include <linux/cpu.h> #include <linux/sched/hotplug.h> #include <asm/irq.h> #include <asm/cpu_ops.h> #include <asm/numa.h> -#include <asm/sbi.h> +#include <asm/smp.h> bool cpu_has_hotplug(unsigned int cpu) { - if (cpu_ops[cpu]->cpu_stop) + if (cpu_ops->cpu_stop) return true; return false; @@ -28,56 +29,49 @@ bool cpu_has_hotplug(unsigned int cpu) */ int __cpu_disable(void) { - int ret = 0; unsigned int cpu = smp_processor_id(); - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_stop) + if (!cpu_ops->cpu_stop) return -EOPNOTSUPP; - if (cpu_ops[cpu]->cpu_disable) - ret = cpu_ops[cpu]->cpu_disable(cpu); - - if (ret) - return ret; - remove_cpu_topology(cpu); numa_remove_cpu(cpu); set_cpu_online(cpu, false); + riscv_ipi_disable(); irq_migrate_all_off_this_cpu(); - return ret; + return 0; } +#ifdef CONFIG_HOTPLUG_CPU /* - * Called on the thread which is asking for a CPU to be shutdown. + * Called on the thread which is asking for a CPU to be shutdown, if the + * CPU reported dead to the hotplug core. */ -void __cpu_die(unsigned int cpu) +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { int ret = 0; - if (!cpu_wait_death(cpu, 5)) { - pr_err("CPU %u: didn't die\n", cpu); - return; - } pr_notice("CPU%u: off\n", cpu); /* Verify from the firmware if the cpu is really stopped*/ - if (cpu_ops[cpu]->cpu_is_stopped) - ret = cpu_ops[cpu]->cpu_is_stopped(cpu); + if (cpu_ops->cpu_is_stopped) + ret = cpu_ops->cpu_is_stopped(cpu); if (ret) - pr_warn("CPU%d may not have stopped: %d\n", cpu, ret); + pr_warn("CPU%u may not have stopped: %d\n", cpu, ret); } /* * Called from the idle thread for the CPU which has been shutdown. 
*/ -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { idle_task_exit(); - (void)cpu_report_death(); + cpuhp_ap_report_dead(); - cpu_ops[smp_processor_id()]->cpu_stop(); + cpu_ops->cpu_stop(); /* It should never reach here */ BUG(); } +#endif diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index fba9e9f46a8c..f6b13e9f5e6c 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -3,48 +3,119 @@ * Copyright (C) 2012 Regents of the University of California */ +#include <linux/acpi.h> +#include <linux/cpu.h> +#include <linux/ctype.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/of.h> +#include <asm/acpi.h> +#include <asm/cpufeature.h> +#include <asm/csr.h> #include <asm/hwcap.h> +#include <asm/sbi.h> #include <asm/smp.h> #include <asm/pgtable.h> +#include <asm/vendor_extensions.h> + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_hartid_map(cpu); +} /* * Returns the hart ID of the given device tree node, or -ENODEV if the node * isn't an enabled and valid RISC-V hart node. */ -int riscv_of_processor_hartid(struct device_node *node) +int riscv_of_processor_hartid(struct device_node *node, unsigned long *hart) +{ + int cpu; + + *hart = (unsigned long)of_get_cpu_hwid(node, 0); + if (*hart == ~0UL) { + pr_warn("Found CPU without hart ID\n"); + return -ENODEV; + } + + cpu = riscv_hartid_to_cpuid(*hart); + if (cpu < 0) + return cpu; + + if (!cpu_possible(cpu)) + return -ENODEV; + + return 0; +} + +int __init riscv_early_of_processor_hartid(struct device_node *node, unsigned long *hart) { const char *isa; - u32 hart; if (!of_device_is_compatible(node, "riscv")) { pr_warn("Found incompatible CPU\n"); return -ENODEV; } - hart = of_get_cpu_hwid(node, 0); - if (hart == ~0U) { + *hart = (unsigned long)of_get_cpu_hwid(node, 0); + if (*hart == ~0UL) { pr_warn("Found CPU without hart ID\n"); return -ENODEV; } if (!of_device_is_available(node)) { - pr_info("CPU with hartid=%d is not available\n", hart); + pr_info("CPU with hartid=%lu is not available\n", *hart); + return -ENODEV; + } + + if (of_property_read_string(node, "riscv,isa-base", &isa)) + goto old_interface; + + if (IS_ENABLED(CONFIG_32BIT) && strncasecmp(isa, "rv32i", 5)) { + pr_warn("CPU with hartid=%lu does not support rv32i", *hart); + return -ENODEV; + } + + if (IS_ENABLED(CONFIG_64BIT) && strncasecmp(isa, "rv64i", 5)) { + pr_warn("CPU with hartid=%lu does not support rv64i", *hart); + return -ENODEV; + } + + if (!of_property_present(node, "riscv,isa-extensions")) + return -ENODEV; + + if (of_property_match_string(node, "riscv,isa-extensions", "i") < 0 || + of_property_match_string(node, "riscv,isa-extensions", "m") < 0 || + of_property_match_string(node, "riscv,isa-extensions", "a") < 0) { + pr_warn("CPU with hartid=%lu does not support ima", *hart); + return -ENODEV; + } + + return 0; + +old_interface: + if (!riscv_isa_fallback) { + pr_warn("CPU with hartid=%lu is invalid: this kernel does not parse \"riscv,isa\"", + *hart); return -ENODEV; } if (of_property_read_string(node, "riscv,isa", &isa)) { - pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart); + pr_warn("CPU with hartid=%lu has no \"riscv,isa-base\" or \"riscv,isa\" property\n", + *hart); return -ENODEV; } - if (isa[0] != 'r' || isa[1] != 'v') { - pr_warn("CPU with hartid=%d has an invalid ISA of \"%s\"\n", hart, isa); + + if (IS_ENABLED(CONFIG_32BIT) && strncasecmp(isa, "rv32ima", 7)) { + pr_warn("CPU with hartid=%lu does not support rv32ima", *hart); return 
-ENODEV; } - return hart; + if (IS_ENABLED(CONFIG_64BIT) && strncasecmp(isa, "rv64ima", 7)) { + pr_warn("CPU with hartid=%lu does not support rv64ima", *hart); + return -ENODEV; + } + + return 0; } /* @@ -53,111 +124,194 @@ int riscv_of_processor_hartid(struct device_node *node) * To achieve this, we walk up the DT tree until we find an active * RISC-V core (HART) node and extract the cpuid from it. */ -int riscv_of_parent_hartid(struct device_node *node) +int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid) { for (; node; node = node->parent) { - if (of_device_is_compatible(node, "riscv")) - return riscv_of_processor_hartid(node); + if (of_device_is_compatible(node, "riscv")) { + *hartid = (unsigned long)of_get_cpu_hwid(node, 0); + if (*hartid == ~0UL) { + pr_warn("Found CPU without hart ID\n"); + return -ENODEV; + } + return 0; + } } return -1; } -#ifdef CONFIG_PROC_FS -#define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ - { \ - .uprop = #UPROP, \ - .isa_ext_id = EXTID, \ - } -/* - * Here are the ordering rules of extension naming defined by RISC-V - * specification : - * 1. All extensions should be separated from other multi-letter extensions - * by an underscore. - * 2. The first letter following the 'Z' conventionally indicates the most - * closely related alphabetical extension category, IMAFDQLCBKJTPVH. - * If multiple 'Z' extensions are named, they should be ordered first - * by category, then alphabetically within a category. - * 3. Standard supervisor-level extensions (starts with 'S') should be - * listed after standard unprivileged extensions. If multiple - * supervisor-level extensions are listed, they should be ordered - * alphabetically. - * 4. Non-standard extensions (starts with 'X') must be listed after all - * standard extensions. They must be separated from other multi-letter - * extensions by an underscore. - */ -static struct riscv_isa_ext_data isa_ext_arr[] = { - __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), - __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), - __RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX), -}; +unsigned long __init riscv_get_marchid(void) +{ + struct riscv_cpuinfo *ci = this_cpu_ptr(&riscv_cpuinfo); + +#if IS_ENABLED(CONFIG_RISCV_SBI) + ci->marchid = sbi_spec_is_0_1() ? 0 : sbi_get_marchid(); +#elif IS_ENABLED(CONFIG_RISCV_M_MODE) + ci->marchid = csr_read(CSR_MARCHID); +#else + ci->marchid = 0; +#endif + return ci->marchid; +} -static void print_isa_ext(struct seq_file *f) +unsigned long __init riscv_get_mvendorid(void) { - struct riscv_isa_ext_data *edata; - int i = 0, arr_sz; + struct riscv_cpuinfo *ci = this_cpu_ptr(&riscv_cpuinfo); - arr_sz = ARRAY_SIZE(isa_ext_arr) - 1; +#if IS_ENABLED(CONFIG_RISCV_SBI) + ci->mvendorid = sbi_spec_is_0_1() ? 
0 : sbi_get_mvendorid(); +#elif IS_ENABLED(CONFIG_RISCV_M_MODE) + ci->mvendorid = csr_read(CSR_MVENDORID); +#else + ci->mvendorid = 0; +#endif + return ci->mvendorid; +} - /* No extension support available */ - if (arr_sz <= 0) - return; +DEFINE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); - for (i = 0; i <= arr_sz; i++) { - edata = &isa_ext_arr[i]; - if (!__riscv_isa_extension_available(NULL, edata->isa_ext_id)) - continue; - seq_printf(f, "_%s", edata->uprop); +unsigned long riscv_cached_mvendorid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->mvendorid; +} +EXPORT_SYMBOL(riscv_cached_mvendorid); + +unsigned long riscv_cached_marchid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->marchid; +} +EXPORT_SYMBOL(riscv_cached_marchid); + +unsigned long riscv_cached_mimpid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->mimpid; +} +EXPORT_SYMBOL(riscv_cached_mimpid); + +static int riscv_cpuinfo_starting(unsigned int cpu) +{ + struct riscv_cpuinfo *ci = this_cpu_ptr(&riscv_cpuinfo); + +#if IS_ENABLED(CONFIG_RISCV_SBI) + if (!ci->mvendorid) + ci->mvendorid = sbi_spec_is_0_1() ? 0 : sbi_get_mvendorid(); + if (!ci->marchid) + ci->marchid = sbi_spec_is_0_1() ? 0 : sbi_get_marchid(); + ci->mimpid = sbi_spec_is_0_1() ? 0 : sbi_get_mimpid(); +#elif IS_ENABLED(CONFIG_RISCV_M_MODE) + if (!ci->mvendorid) + ci->mvendorid = csr_read(CSR_MVENDORID); + if (!ci->marchid) + ci->marchid = csr_read(CSR_MARCHID); + ci->mimpid = csr_read(CSR_MIMPID); +#else + ci->mvendorid = 0; + ci->marchid = 0; + ci->mimpid = 0; +#endif + + return 0; +} + +static int __init riscv_cpuinfo_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "riscv/cpuinfo:starting", + riscv_cpuinfo_starting, NULL); + if (ret < 0) { + pr_err("cpuinfo: failed to register hotplug callbacks.\n"); + return ret; } + + return 0; } +arch_initcall(riscv_cpuinfo_init); -/* - * These are the only valid base (single letter) ISA extensions as per the spec. - * It also specifies the canonical order in which it appears in the spec. - * Some of the extension may just be a place holder for now (B, K, P, J). - * This should be updated once corresponding extensions are ratified. 
- */ -static const char base_riscv_exts[13] = "imafdqcbkjpvh"; +#ifdef CONFIG_PROC_FS + +#define ALL_CPUS -1 -static void print_isa(struct seq_file *f, const char *isa) +static void print_vendor_isa(struct seq_file *f, int cpu) { - int i; + struct riscv_isavendorinfo *vendor_bitmap; + struct riscv_isa_vendor_ext_data_list *ext_list; + const struct riscv_isa_ext_data *ext_data; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + ext_list = riscv_isa_vendor_ext_list[i]; + ext_data = riscv_isa_vendor_ext_list[i]->ext_data; + + if (cpu == ALL_CPUS) + vendor_bitmap = &ext_list->all_harts_isa_bitmap; + else + vendor_bitmap = &ext_list->per_hart_isa_bitmap[cpu]; - seq_puts(f, "isa\t\t: "); - /* Print the rv[64/32] part */ - seq_write(f, isa, 4); - for (i = 0; i < sizeof(base_riscv_exts); i++) { - if (__riscv_isa_extension_available(NULL, base_riscv_exts[i] - 'a')) - /* Print only enabled the base ISA extensions */ - seq_write(f, &base_riscv_exts[i], 1); + for (int j = 0; j < ext_list->ext_data_count; j++) { + if (!__riscv_isa_extension_available(vendor_bitmap->isa, ext_data[j].id)) + continue; + + seq_printf(f, "_%s", ext_data[j].name); + } } - print_isa_ext(f); +} + +static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap, int cpu) +{ + + if (IS_ENABLED(CONFIG_32BIT)) + seq_write(f, "rv32", 4); + else + seq_write(f, "rv64", 4); + + for (int i = 0; i < riscv_isa_ext_count; i++) { + if (!__riscv_isa_extension_available(isa_bitmap, riscv_isa_ext[i].id)) + continue; + + /* Only multi-letter extensions are split by underscores */ + if (strnlen(riscv_isa_ext[i].name, 2) != 1) + seq_puts(f, "_"); + + seq_printf(f, "%s", riscv_isa_ext[i].name); + } + + print_vendor_isa(f, cpu); + seq_puts(f, "\n"); } static void print_mmu(struct seq_file *f) { - char sv_type[16]; + const char *sv_type; #ifdef CONFIG_MMU #if defined(CONFIG_32BIT) - strncpy(sv_type, "sv32", 5); + sv_type = "sv32"; #elif defined(CONFIG_64BIT) if (pgtable_l5_enabled) - strncpy(sv_type, "sv57", 5); + sv_type = "sv57"; else if (pgtable_l4_enabled) - strncpy(sv_type, "sv48", 5); + sv_type = "sv48"; else - strncpy(sv_type, "sv39", 5); + sv_type = "sv39"; #endif #else - strncpy(sv_type, "none", 5); + sv_type = "none"; #endif /* CONFIG_MMU */ seq_printf(f, "mmu\t\t: %s\n", sv_type); } static void *c_start(struct seq_file *m, loff_t *pos) { + if (*pos == nr_cpu_ids) + return NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return (void *)(uintptr_t)(1 + *pos); @@ -177,19 +331,44 @@ static void c_stop(struct seq_file *m, void *v) static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; - struct device_node *node = of_get_cpu_node(cpu_id, NULL); - const char *compat, *isa; + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + struct device_node *node; + const char *compat; seq_printf(m, "processor\t: %lu\n", cpu_id); seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id)); - if (!of_property_read_string(node, "riscv,isa", &isa)) - print_isa(m, isa); + + /* + * For historical raisins, the isa: line is limited to the lowest common + * denominator of extensions supported across all harts. A true list of + * extensions supported on this hart is printed later in the hart isa: + * line. 
+ */ + seq_puts(m, "isa\t\t: "); + print_isa(m, NULL, ALL_CPUS); print_mmu(m); - if (!of_property_read_string(node, "compatible", &compat) - && strcmp(compat, "riscv")) - seq_printf(m, "uarch\t\t: %s\n", compat); + + if (acpi_disabled) { + node = of_get_cpu_node(cpu_id, NULL); + + if (!of_property_read_string(node, "compatible", &compat) && + strcmp(compat, "riscv")) + seq_printf(m, "uarch\t\t: %s\n", compat); + + of_node_put(node); + } + + seq_printf(m, "mvendorid\t: 0x%lx\n", ci->mvendorid); + seq_printf(m, "marchid\t\t: 0x%lx\n", ci->marchid); + seq_printf(m, "mimpid\t\t: 0x%lx\n", ci->mimpid); + + /* + * Print the ISA extensions specific to this hart, which may show + * additional extensions not present across all harts. + */ + seq_puts(m, "hart isa\t: "); + print_isa(m, hart_isa[cpu_id].isa, cpu_id); seq_puts(m, "\n"); - of_node_put(node); return 0; } diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 170d07e57721..6a8bd8f4db07 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -9,30 +9,25 @@ #include <linux/string.h> #include <linux/sched.h> #include <asm/cpu_ops.h> +#include <asm/cpu_ops_sbi.h> #include <asm/sbi.h> #include <asm/smp.h> -const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; +const struct cpu_operations *cpu_ops __ro_after_init = &cpu_ops_spinwait; extern const struct cpu_operations cpu_ops_sbi; -#ifdef CONFIG_RISCV_BOOT_SPINWAIT -extern const struct cpu_operations cpu_ops_spinwait; -#else +#ifndef CONFIG_RISCV_BOOT_SPINWAIT const struct cpu_operations cpu_ops_spinwait = { - .name = "", - .cpu_prepare = NULL, .cpu_start = NULL, }; #endif -void __init cpu_set_ops(int cpuid) +void __init cpu_set_ops(void) { #if IS_ENABLED(CONFIG_RISCV_SBI) - if (sbi_probe_extension(SBI_EXT_HSM) > 0) { - if (!cpuid) - pr_info("SBI HSM extension detected\n"); - cpu_ops[cpuid] = &cpu_ops_sbi; - } else + if (sbi_probe_extension(SBI_EXT_HSM)) { + pr_info("SBI HSM extension detected\n"); + cpu_ops = &cpu_ops_sbi; + } #endif - cpu_ops[cpuid] = &cpu_ops_spinwait; } diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 4f5a6f84e2a4..e6fbaaf54956 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -65,37 +65,21 @@ static int sbi_hsm_hart_get_status(unsigned long hartid) static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) { unsigned long boot_addr = __pa_symbol(secondary_start_sbi); - int hartid = cpuid_to_hartid_map(cpuid); + unsigned long hartid = cpuid_to_hartid_map(cpuid); unsigned long hsm_data; struct sbi_hart_boot_data *bdata = &per_cpu(boot_data, cpuid); /* Make sure tidle is updated */ smp_mb(); bdata->task_ptr = tidle; - bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + bdata->stack_ptr = task_pt_regs(tidle); /* Make sure boot data is updated */ smp_mb(); hsm_data = __pa(bdata); return sbi_hsm_hart_start(hartid, boot_addr, hsm_data); } -static int sbi_cpu_prepare(unsigned int cpuid) -{ - if (!cpu_ops_sbi.cpu_start) { - pr_err("cpu start method not defined for CPU [%d]\n", cpuid); - return -ENODEV; - } - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU -static int sbi_cpu_disable(unsigned int cpuid) -{ - if (!cpu_ops_sbi.cpu_stop) - return -EOPNOTSUPP; - return 0; -} - static void sbi_cpu_stop(void) { int ret; @@ -107,7 +91,7 @@ static void sbi_cpu_stop(void) static int sbi_cpu_is_stopped(unsigned int cpuid) { int rc; - int hartid = cpuid_to_hartid_map(cpuid); + unsigned long hartid = cpuid_to_hartid_map(cpuid); rc = 
sbi_hsm_hart_get_status(hartid); @@ -118,11 +102,8 @@ static int sbi_cpu_is_stopped(unsigned int cpuid) #endif const struct cpu_operations cpu_ops_sbi = { - .name = "sbi", - .cpu_prepare = sbi_cpu_prepare, .cpu_start = sbi_cpu_start, #ifdef CONFIG_HOTPLUG_CPU - .cpu_disable = sbi_cpu_disable, .cpu_stop = sbi_cpu_stop, .cpu_is_stopped = sbi_cpu_is_stopped, #endif diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index 346847f6c41c..24869eb88908 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -11,6 +11,8 @@ #include <asm/sbi.h> #include <asm/smp.h> +#include "head.h" + const struct cpu_operations cpu_ops_spinwait; void *__cpu_spinwait_stack_pointer[NR_CPUS] __section(".data"); void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data"); @@ -18,7 +20,7 @@ void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data"); static void cpu_update_secondary_bootdata(unsigned int cpuid, struct task_struct *tidle) { - int hartid = cpuid_to_hartid_map(cpuid); + unsigned long hartid = cpuid_to_hartid_map(cpuid); /* * The hartid must be less than NR_CPUS to avoid out-of-bound access @@ -27,25 +29,15 @@ static void cpu_update_secondary_bootdata(unsigned int cpuid, * spinwait booting is not the recommended approach for any platforms * booting Linux in S-mode and can be disabled in the future. */ - if (hartid == INVALID_HARTID || hartid >= NR_CPUS) + if (hartid == INVALID_HARTID || hartid >= (unsigned long) NR_CPUS) return; /* Make sure tidle is updated */ smp_mb(); - WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], task_pt_regs(tidle)); WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); } -static int spinwait_cpu_prepare(unsigned int cpuid) -{ - if (!cpu_ops_spinwait.cpu_start) { - pr_err("cpu start method not defined for CPU [%d]\n", cpuid); - return -ENODEV; - } - return 0; -} - static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) { /* @@ -62,7 +54,5 @@ static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) } const struct cpu_operations cpu_ops_spinwait = { - .name = "spinwait", - .cpu_prepare = spinwait_cpu_prepare, .cpu_start = spinwait_cpu_start, }; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 12b05ce164bb..743d53415572 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -6,30 +6,44 @@ * Copyright (C) 2017 SiFive */ +#include <linux/acpi.h> #include <linux/bitmap.h> +#include <linux/cpu.h> +#include <linux/cpuhotplug.h> #include <linux/ctype.h> -#include <linux/libfdt.h> +#include <linux/log2.h> +#include <linux/memory.h> #include <linux/module.h> #include <linux/of.h> +#include <asm/acpi.h> #include <asm/alternative.h> -#include <asm/errata_list.h> +#include <asm/bugs.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> #include <asm/hwcap.h> -#include <asm/patch.h> -#include <asm/pgtable.h> +#include <asm/text-patching.h> +#include <asm/hwprobe.h> #include <asm/processor.h> -#include <asm/smp.h> -#include <asm/switch_to.h> +#include <asm/sbi.h> +#include <asm/vector.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/thead.h> #define NUM_ALPHA_EXTS ('z' - 'a' + 1) +static bool any_cpu_has_zicboz; +static bool any_cpu_has_zicbop; +static bool any_cpu_has_zicbom; + unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) 
__read_mostly; -#ifdef CONFIG_FPU -__ro_after_init DEFINE_STATIC_KEY_FALSE(cpu_hwcap_fpu); -#endif +/* Per-cpu ISA extensions. */ +struct riscv_isainfo hart_isa[NR_CPUS]; + +u32 thead_vlenb_of; /** * riscv_isa_extension_base() - Get base extension word @@ -41,9 +55,7 @@ __ro_after_init DEFINE_STATIC_KEY_FALSE(cpu_hwcap_fpu); */ unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap) { - if (!isa_bitmap) - return riscv_isa[0]; - return isa_bitmap[0]; + return !isa_bitmap ? riscv_isa[0] : isa_bitmap[0]; } EXPORT_SYMBOL_GPL(riscv_isa_extension_base); @@ -57,152 +69,790 @@ EXPORT_SYMBOL_GPL(riscv_isa_extension_base); * * NOTE: If isa_bitmap is NULL then Host ISA bitmap will be used. */ -bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit) +bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit) { const unsigned long *bmap = (isa_bitmap) ? isa_bitmap : riscv_isa; if (bit >= RISCV_ISA_EXT_MAX) return false; - return test_bit(bit, bmap) ? true : false; + return test_bit(bit, bmap); } EXPORT_SYMBOL_GPL(__riscv_isa_extension_available); -void __init riscv_fill_hwcap(void) +static int riscv_ext_f_depends(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) { - struct device_node *node; - const char *isa; - char print_str[NUM_ALPHA_EXTS + 1]; - int i, j; - static unsigned long isa2hwcap[256] = {0}; + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_f)) + return 0; - isa2hwcap['i'] = isa2hwcap['I'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m'] = isa2hwcap['M'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a'] = isa2hwcap['A'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f'] = isa2hwcap['F'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d'] = isa2hwcap['D'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c'] = isa2hwcap['C'] = COMPAT_HWCAP_ISA_C; + return -EPROBE_DEFER; +} - elf_hwcap = 0; +static int riscv_ext_zicbom_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!riscv_cbom_block_size) { + pr_err("Zicbom detected in ISA string, disabling as no cbom-block-size found\n"); + return -EINVAL; + } + if (!is_power_of_2(riscv_cbom_block_size)) { + pr_err("Zicbom disabled as cbom-block-size present, but is not a power-of-2\n"); + return -EINVAL; + } - bitmap_zero(riscv_isa, RISCV_ISA_EXT_MAX); + any_cpu_has_zicbom = true; + return 0; +} - for_each_of_cpu_node(node) { - unsigned long this_hwcap = 0; - DECLARE_BITMAP(this_isa, RISCV_ISA_EXT_MAX); - const char *temp; +static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!riscv_cboz_block_size) { + pr_err("Zicboz detected in ISA string, disabling as no cboz-block-size found\n"); + return -EINVAL; + } + if (!is_power_of_2(riscv_cboz_block_size)) { + pr_err("Zicboz disabled as cboz-block-size present, but is not a power-of-2\n"); + return -EINVAL; + } + any_cpu_has_zicboz = true; + return 0; +} - if (riscv_of_processor_hartid(node) < 0) - continue; +static int riscv_ext_zicbop_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!riscv_cbop_block_size) { + pr_err("Zicbop detected in ISA string, disabling as no cbop-block-size found\n"); + return -EINVAL; + } + if (!is_power_of_2(riscv_cbop_block_size)) { + pr_err("Zicbop disabled as cbop-block-size present, but is not a power-of-2\n"); + return -EINVAL; + } + any_cpu_has_zicbop = true; + return 0; +} - if (of_property_read_string(node, "riscv,isa", &isa)) { - pr_warn("Unable to find \"riscv,isa\" devicetree 
entry\n"); - continue; +static int riscv_ext_f_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_FPU)) + return -EINVAL; + + /* + * Due to extension ordering, d is checked before f, so no deferral + * is required. + */ + if (!__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_d)) { + pr_warn_once("This kernel does not support systems with F but not D\n"); + return -EINVAL; + } + + return 0; +} + +static int riscv_ext_d_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_FPU)) + return -EINVAL; + + return 0; +} + +static int riscv_ext_vector_x_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_V)) + return -EINVAL; + + return 0; +} + +static int riscv_ext_vector_float_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_V)) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_FPU)) + return -EINVAL; + + /* + * The kernel doesn't support systems that don't implement both of + * F and D, so if any of the vector extensions that do floating point + * are to be usable, both floating point extensions need to be usable. + * + * Since this function validates vector only, and v/Zve* are probed + * after f/d, there's no need for a deferral here. + */ + if (!__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_d)) + return -EINVAL; + + return 0; +} + +static int riscv_ext_vector_crypto_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_V)) + return -EINVAL; + + /* + * It isn't the kernel's job to check that the binding is correct, so + * it should be enough to check that any of the vector extensions are + * enabled, which in-turn means that vector is usable in this kernel + */ + if (!__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZVE32X)) + return -EPROBE_DEFER; + + return 0; +} + +static int riscv_ext_zca_depends(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZCA)) + return 0; + + return -EPROBE_DEFER; +} +static int riscv_ext_zcd_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZCA) && + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_d)) + return 0; + + return -EPROBE_DEFER; +} + +static int riscv_ext_zcf_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return -EINVAL; + + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZCA) && + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_f)) + return 0; + + return -EPROBE_DEFER; +} + +static int riscv_vector_f_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_V)) + return -EINVAL; + + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZVE32F)) + return 0; + + return -EPROBE_DEFER; +} + +static int riscv_ext_zvfbfwma_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZFBFMIN) && + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_ZVFBFMIN)) + return 0; + + return -EPROBE_DEFER; +} + +static int riscv_ext_svadu_validate(const struct 
riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + /* SVADE has already been detected, use SVADE only */ + if (__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_SVADE)) + return -EOPNOTSUPP; + + return 0; +} + +static const unsigned int riscv_a_exts[] = { + RISCV_ISA_EXT_ZAAMO, + RISCV_ISA_EXT_ZALRSC, +}; + +static const unsigned int riscv_zk_bundled_exts[] = { + RISCV_ISA_EXT_ZBKB, + RISCV_ISA_EXT_ZBKC, + RISCV_ISA_EXT_ZBKX, + RISCV_ISA_EXT_ZKND, + RISCV_ISA_EXT_ZKNE, + RISCV_ISA_EXT_ZKR, + RISCV_ISA_EXT_ZKT, +}; + +static const unsigned int riscv_zkn_bundled_exts[] = { + RISCV_ISA_EXT_ZBKB, + RISCV_ISA_EXT_ZBKC, + RISCV_ISA_EXT_ZBKX, + RISCV_ISA_EXT_ZKND, + RISCV_ISA_EXT_ZKNE, + RISCV_ISA_EXT_ZKNH, +}; + +static const unsigned int riscv_zks_bundled_exts[] = { + RISCV_ISA_EXT_ZBKB, + RISCV_ISA_EXT_ZBKC, + RISCV_ISA_EXT_ZKSED, + RISCV_ISA_EXT_ZKSH +}; + +#define RISCV_ISA_EXT_ZVKN \ + RISCV_ISA_EXT_ZVKNED, \ + RISCV_ISA_EXT_ZVKNHB, \ + RISCV_ISA_EXT_ZVKB, \ + RISCV_ISA_EXT_ZVKT + +static const unsigned int riscv_zvkn_bundled_exts[] = { + RISCV_ISA_EXT_ZVKN +}; + +static const unsigned int riscv_zvknc_bundled_exts[] = { + RISCV_ISA_EXT_ZVKN, + RISCV_ISA_EXT_ZVBC +}; + +static const unsigned int riscv_zvkng_bundled_exts[] = { + RISCV_ISA_EXT_ZVKN, + RISCV_ISA_EXT_ZVKG +}; + +#define RISCV_ISA_EXT_ZVKS \ + RISCV_ISA_EXT_ZVKSED, \ + RISCV_ISA_EXT_ZVKSH, \ + RISCV_ISA_EXT_ZVKB, \ + RISCV_ISA_EXT_ZVKT + +static const unsigned int riscv_zvks_bundled_exts[] = { + RISCV_ISA_EXT_ZVKS +}; + +static const unsigned int riscv_zvksc_bundled_exts[] = { + RISCV_ISA_EXT_ZVKS, + RISCV_ISA_EXT_ZVBC +}; + +static const unsigned int riscv_zvksg_bundled_exts[] = { + RISCV_ISA_EXT_ZVKS, + RISCV_ISA_EXT_ZVKG +}; + +static const unsigned int riscv_zvbb_exts[] = { + RISCV_ISA_EXT_ZVKB +}; + +#define RISCV_ISA_EXT_ZVE64F_IMPLY_LIST \ + RISCV_ISA_EXT_ZVE64X, \ + RISCV_ISA_EXT_ZVE32F, \ + RISCV_ISA_EXT_ZVE32X + +#define RISCV_ISA_EXT_ZVE64D_IMPLY_LIST \ + RISCV_ISA_EXT_ZVE64F, \ + RISCV_ISA_EXT_ZVE64F_IMPLY_LIST + +#define RISCV_ISA_EXT_V_IMPLY_LIST \ + RISCV_ISA_EXT_ZVE64D, \ + RISCV_ISA_EXT_ZVE64D_IMPLY_LIST + +static const unsigned int riscv_zve32f_exts[] = { + RISCV_ISA_EXT_ZVE32X +}; + +static const unsigned int riscv_zve64f_exts[] = { + RISCV_ISA_EXT_ZVE64F_IMPLY_LIST +}; + +static const unsigned int riscv_zve64d_exts[] = { + RISCV_ISA_EXT_ZVE64D_IMPLY_LIST +}; + +static const unsigned int riscv_v_exts[] = { + RISCV_ISA_EXT_V_IMPLY_LIST +}; + +static const unsigned int riscv_zve64x_exts[] = { + RISCV_ISA_EXT_ZVE32X, + RISCV_ISA_EXT_ZVE64X +}; + +/* + * While the [ms]envcfg CSRs were not defined until version 1.12 of the RISC-V + * privileged ISA, the existence of the CSRs is implied by any extension which + * specifies [ms]envcfg bit(s). Hence, we define a custom ISA extension for the + * existence of the CSR, and treat it as a subset of those other extensions. + */ +static const unsigned int riscv_xlinuxenvcfg_exts[] = { + RISCV_ISA_EXT_XLINUXENVCFG +}; + +/* + * Zc* spec states that: + * - C always implies Zca + * - C+F implies Zcf (RV32 only) + * - C+D implies Zcd + * + * These extensions will be enabled and then validated depending on the + * availability of F/D RV32. + */ +static const unsigned int riscv_c_exts[] = { + RISCV_ISA_EXT_ZCA, + RISCV_ISA_EXT_ZCF, + RISCV_ISA_EXT_ZCD, +}; + +/* + * The canonical order of ISA extension names in the ISA string is defined in + * chapter 27 of the unprivileged specification. 
+ * + * Ordinarily, for in-kernel data structures, this order is unimportant but + * isa_ext_arr defines the order of the ISA string in /proc/cpuinfo. + * + * The specification uses vague wording, such as should, when it comes to + * ordering, so for our purposes the following rules apply: + * + * 1. All multi-letter extensions must be separated from other extensions by an + * underscore. + * + * 2. Additional standard extensions (starting with 'Z') must be sorted after + * single-letter extensions and before any higher-privileged extensions. + * + * 3. The first letter following the 'Z' conventionally indicates the most + * closely related alphabetical extension category, IMAFDQLCBKJTPVH. + * If multiple 'Z' extensions are named, they must be ordered first by + * category, then alphabetically within a category. + * + * 3. Standard supervisor-level extensions (starting with 'S') must be listed + * after standard unprivileged extensions. If multiple supervisor-level + * extensions are listed, they must be ordered alphabetically. + * + * 4. Standard machine-level extensions (starting with 'Zxm') must be listed + * after any lower-privileged, standard extensions. If multiple + * machine-level extensions are listed, they must be ordered + * alphabetically. + * + * 5. Non-standard extensions (starting with 'X') must be listed after all + * standard extensions. If multiple non-standard extensions are listed, they + * must be ordered alphabetically. + * + * An example string following the order is: + * rv64imadc_zifoo_zigoo_zafoo_sbar_scar_zxmbaz_xqux_xrux + * + * New entries to this struct should follow the ordering rules described above. + */ +const struct riscv_isa_ext_data riscv_isa_ext[] = { + __RISCV_ISA_EXT_DATA(i, RISCV_ISA_EXT_i), + __RISCV_ISA_EXT_DATA(m, RISCV_ISA_EXT_m), + __RISCV_ISA_EXT_SUPERSET(a, RISCV_ISA_EXT_a, riscv_a_exts), + __RISCV_ISA_EXT_DATA_VALIDATE(f, RISCV_ISA_EXT_f, riscv_ext_f_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(d, RISCV_ISA_EXT_d, riscv_ext_d_validate), + __RISCV_ISA_EXT_DATA(q, RISCV_ISA_EXT_q), + __RISCV_ISA_EXT_SUPERSET(c, RISCV_ISA_EXT_c, riscv_c_exts), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(v, RISCV_ISA_EXT_v, riscv_v_exts, riscv_ext_vector_float_validate), + __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts, riscv_ext_zicbom_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zicbop, RISCV_ISA_EXT_ZICBOP, riscv_ext_zicbop_validate), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts, riscv_ext_zicboz_validate), + __RISCV_ISA_EXT_DATA(ziccrse, RISCV_ISA_EXT_ZICCRSE), + __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), + __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND), + __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), + __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI), + __RISCV_ISA_EXT_DATA(zihintntl, RISCV_ISA_EXT_ZIHINTNTL), + __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), + __RISCV_ISA_EXT_DATA(zihpm, RISCV_ISA_EXT_ZIHPM), + __RISCV_ISA_EXT_DATA(zimop, RISCV_ISA_EXT_ZIMOP), + __RISCV_ISA_EXT_DATA(zaamo, RISCV_ISA_EXT_ZAAMO), + __RISCV_ISA_EXT_DATA(zabha, RISCV_ISA_EXT_ZABHA), + __RISCV_ISA_EXT_DATA(zacas, RISCV_ISA_EXT_ZACAS), + __RISCV_ISA_EXT_DATA(zalrsc, RISCV_ISA_EXT_ZALRSC), + __RISCV_ISA_EXT_DATA(zawrs, RISCV_ISA_EXT_ZAWRS), + __RISCV_ISA_EXT_DATA(zfa, RISCV_ISA_EXT_ZFA), + __RISCV_ISA_EXT_DATA_VALIDATE(zfbfmin, RISCV_ISA_EXT_ZFBFMIN, riscv_ext_f_depends), + __RISCV_ISA_EXT_DATA(zfh, RISCV_ISA_EXT_ZFH), + 
__RISCV_ISA_EXT_DATA(zfhmin, RISCV_ISA_EXT_ZFHMIN), + __RISCV_ISA_EXT_DATA(zca, RISCV_ISA_EXT_ZCA), + __RISCV_ISA_EXT_DATA_VALIDATE(zcb, RISCV_ISA_EXT_ZCB, riscv_ext_zca_depends), + __RISCV_ISA_EXT_DATA_VALIDATE(zcd, RISCV_ISA_EXT_ZCD, riscv_ext_zcd_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zcf, RISCV_ISA_EXT_ZCF, riscv_ext_zcf_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zcmop, RISCV_ISA_EXT_ZCMOP, riscv_ext_zca_depends), + __RISCV_ISA_EXT_DATA(zba, RISCV_ISA_EXT_ZBA), + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), + __RISCV_ISA_EXT_DATA(zbc, RISCV_ISA_EXT_ZBC), + __RISCV_ISA_EXT_DATA(zbkb, RISCV_ISA_EXT_ZBKB), + __RISCV_ISA_EXT_DATA(zbkc, RISCV_ISA_EXT_ZBKC), + __RISCV_ISA_EXT_DATA(zbkx, RISCV_ISA_EXT_ZBKX), + __RISCV_ISA_EXT_DATA(zbs, RISCV_ISA_EXT_ZBS), + __RISCV_ISA_EXT_BUNDLE(zk, riscv_zk_bundled_exts), + __RISCV_ISA_EXT_BUNDLE(zkn, riscv_zkn_bundled_exts), + __RISCV_ISA_EXT_DATA(zknd, RISCV_ISA_EXT_ZKND), + __RISCV_ISA_EXT_DATA(zkne, RISCV_ISA_EXT_ZKNE), + __RISCV_ISA_EXT_DATA(zknh, RISCV_ISA_EXT_ZKNH), + __RISCV_ISA_EXT_DATA(zkr, RISCV_ISA_EXT_ZKR), + __RISCV_ISA_EXT_BUNDLE(zks, riscv_zks_bundled_exts), + __RISCV_ISA_EXT_DATA(zkt, RISCV_ISA_EXT_ZKT), + __RISCV_ISA_EXT_DATA(zksed, RISCV_ISA_EXT_ZKSED), + __RISCV_ISA_EXT_DATA(zksh, RISCV_ISA_EXT_ZKSH), + __RISCV_ISA_EXT_DATA(ztso, RISCV_ISA_EXT_ZTSO), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zvbb, RISCV_ISA_EXT_ZVBB, riscv_zvbb_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvbc, RISCV_ISA_EXT_ZVBC, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zve32f, RISCV_ISA_EXT_ZVE32F, riscv_zve32f_exts, riscv_ext_vector_float_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zve32x, RISCV_ISA_EXT_ZVE32X, riscv_ext_vector_x_validate), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zve64d, RISCV_ISA_EXT_ZVE64D, riscv_zve64d_exts, riscv_ext_vector_float_validate), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zve64f, RISCV_ISA_EXT_ZVE64F, riscv_zve64f_exts, riscv_ext_vector_float_validate), + __RISCV_ISA_EXT_SUPERSET_VALIDATE(zve64x, RISCV_ISA_EXT_ZVE64X, riscv_zve64x_exts, riscv_ext_vector_x_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvfbfmin, RISCV_ISA_EXT_ZVFBFMIN, riscv_vector_f_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvfbfwma, RISCV_ISA_EXT_ZVFBFWMA, riscv_ext_zvfbfwma_validate), + __RISCV_ISA_EXT_DATA(zvfh, RISCV_ISA_EXT_ZVFH), + __RISCV_ISA_EXT_DATA(zvfhmin, RISCV_ISA_EXT_ZVFHMIN), + __RISCV_ISA_EXT_DATA_VALIDATE(zvkb, RISCV_ISA_EXT_ZVKB, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvkg, RISCV_ISA_EXT_ZVKG, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvkn, riscv_zvkn_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvknc, riscv_zvknc_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvkned, RISCV_ISA_EXT_ZVKNED, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvkng, riscv_zvkng_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvknha, RISCV_ISA_EXT_ZVKNHA, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvknhb, RISCV_ISA_EXT_ZVKNHB, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvks, riscv_zvks_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvksc, riscv_zvksc_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvksed, RISCV_ISA_EXT_ZVKSED, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvksh, RISCV_ISA_EXT_ZVKSH, 
riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_BUNDLE_VALIDATE(zvksg, riscv_zvksg_bundled_exts, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zvkt, RISCV_ISA_EXT_ZVKT, riscv_ext_vector_crypto_validate), + __RISCV_ISA_EXT_DATA(smaia, RISCV_ISA_EXT_SMAIA), + __RISCV_ISA_EXT_DATA(smmpm, RISCV_ISA_EXT_SMMPM), + __RISCV_ISA_EXT_SUPERSET(smnpm, RISCV_ISA_EXT_SMNPM, riscv_xlinuxenvcfg_exts), + __RISCV_ISA_EXT_DATA(smstateen, RISCV_ISA_EXT_SMSTATEEN), + __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), + __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), + __RISCV_ISA_EXT_SUPERSET(ssnpm, RISCV_ISA_EXT_SSNPM, riscv_xlinuxenvcfg_exts), + __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), + __RISCV_ISA_EXT_DATA(svade, RISCV_ISA_EXT_SVADE), + __RISCV_ISA_EXT_DATA_VALIDATE(svadu, RISCV_ISA_EXT_SVADU, riscv_ext_svadu_validate), + __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), + __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), + __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), + __RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC), +}; + +const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext); + +static void riscv_isa_set_ext(const struct riscv_isa_ext_data *ext, unsigned long *bitmap) +{ + if (ext->id != RISCV_ISA_EXT_INVALID) + set_bit(ext->id, bitmap); + + for (int i = 0; i < ext->subset_ext_size; i++) { + if (ext->subset_ext_ids[i] != RISCV_ISA_EXT_INVALID) + set_bit(ext->subset_ext_ids[i], bitmap); + } +} + +static const struct riscv_isa_ext_data *riscv_get_isa_ext_data(unsigned int ext_id) +{ + for (int i = 0; i < riscv_isa_ext_count; i++) { + if (riscv_isa_ext[i].id == ext_id) + return &riscv_isa_ext[i]; + } + + return NULL; +} + +/* + * "Resolve" a source ISA bitmap into one that matches kernel configuration as + * well as correct extension dependencies. Some extensions depends on specific + * kernel configuration to be usable (V needs CONFIG_RISCV_ISA_V for instance) + * and this function will actually validate all the extensions provided in + * source_isa into the resolved_isa based on extensions validate() callbacks. + */ +static void __init riscv_resolve_isa(unsigned long *source_isa, + unsigned long *resolved_isa, unsigned long *this_hwcap, + unsigned long *isa2hwcap) +{ + bool loop; + const struct riscv_isa_ext_data *ext; + DECLARE_BITMAP(prev_resolved_isa, RISCV_ISA_EXT_MAX); + int max_loop_count = riscv_isa_ext_count, ret; + unsigned int bit; + + do { + loop = false; + if (max_loop_count-- < 0) { + pr_err("Failed to reach a stable ISA state\n"); + return; } + bitmap_copy(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX); + for_each_set_bit(bit, source_isa, RISCV_ISA_EXT_MAX) { + ext = riscv_get_isa_ext_data(bit); - temp = isa; -#if IS_ENABLED(CONFIG_32BIT) - if (!strncmp(isa, "rv32", 4)) - isa += 4; -#elif IS_ENABLED(CONFIG_64BIT) - if (!strncmp(isa, "rv64", 4)) - isa += 4; -#endif - /* The riscv,isa DT property must start with rv64 or rv32 */ - if (temp == isa) - continue; - bitmap_zero(this_isa, RISCV_ISA_EXT_MAX); - for (; *isa; ++isa) { - const char *ext = isa++; - const char *ext_end = isa; - bool ext_long = false, ext_err = false; - - switch (*ext) { - case 's': - /** - * Workaround for invalid single-letter 's' & 'u'(QEMU). - * No need to set the bit in riscv_isa as 's' & 'u' are - * not valid ISA extensions. It works until multi-letter - * extension starting with "Su" appears. 
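riscv_resolve_isa() above converges by re-walking the source bitmap whenever a validate() callback returns -EPROBE_DEFER, meaning "my prerequisite has not been accepted yet", and it gives up after riscv_isa_ext_count passes. Stripped of the bitmap plumbing, the control flow is essentially the fixed-point loop sketched below; this is a toy model for illustration, not the kernel code:

#include <stdbool.h>

#define MAX_EXT 8

/* Toy resolver: extension i is accepted once all of its prerequisites are. */
static void resolve(const bool wanted[MAX_EXT],
		    const bool needs[MAX_EXT][MAX_EXT], bool accepted[MAX_EXT])
{
	bool progress;

	do {
		progress = false;

		for (int i = 0; i < MAX_EXT; i++) {
			bool ready = true;

			if (!wanted[i] || accepted[i])
				continue;

			for (int j = 0; j < MAX_EXT; j++)
				if (needs[i][j] && !accepted[j])
					ready = false;	/* the -EPROBE_DEFER case */

			if (ready) {
				accepted[i] = true;	/* like set_bit(bit, resolved_isa) */
				progress = true;
			}
		}
	} while (progress);	/* stop once a full pass accepts nothing new */
}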
- */ - if (ext[-1] != '_' && ext[1] == 'u') { - ++isa; - ext_err = true; - break; - } - fallthrough; - case 'x': - case 'z': - ext_long = true; - /* Multi-letter extension must be delimited */ - for (; *isa && *isa != '_'; ++isa) - if (unlikely(!islower(*isa) - && !isdigit(*isa))) - ext_err = true; - /* Parse backwards */ - ext_end = isa; - if (unlikely(ext_err)) - break; - if (!isdigit(ext_end[-1])) - break; - /* Skip the minor version */ - while (isdigit(*--ext_end)) - ; - if (ext_end[0] != 'p' - || !isdigit(ext_end[-1])) { - /* Advance it to offset the pre-decrement */ - ++ext_end; - break; + if (ext && ext->validate) { + ret = ext->validate(ext, resolved_isa); + if (ret == -EPROBE_DEFER) { + loop = true; + continue; + } else if (ret) { + /* Disable the extension entirely */ + clear_bit(bit, source_isa); + continue; } - /* Skip the major version */ - while (isdigit(*--ext_end)) - ; - ++ext_end; + } + + set_bit(bit, resolved_isa); + /* No need to keep it in source isa now that it is enabled */ + clear_bit(bit, source_isa); + + /* Single letter extensions get set in hwcap */ + if (bit < RISCV_ISA_EXT_BASE) + *this_hwcap |= isa2hwcap[bit]; + } + } while (loop && !bitmap_equal(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX)); +} + +static void __init match_isa_ext(const char *name, const char *name_end, unsigned long *bitmap) +{ + for (int i = 0; i < riscv_isa_ext_count; i++) { + const struct riscv_isa_ext_data *ext = &riscv_isa_ext[i]; + + if ((name_end - name == strlen(ext->name)) && + !strncasecmp(name, ext->name, name_end - name)) { + riscv_isa_set_ext(ext, bitmap); + break; + } + } +} + +static void __init riscv_parse_isa_string(const char *isa, unsigned long *bitmap) +{ + /* + * For all possible cpus, we have already validated in + * the boot process that they at least contain "rv" and + * whichever of "32"/"64" this kernel supports, and so this + * section can be skipped. + */ + isa += 4; + + while (*isa) { + const char *ext = isa++; + const char *ext_end = isa; + bool ext_err = false; + + switch (*ext) { + case 'x': + case 'X': + if (acpi_disabled) + pr_warn_once("Vendor extensions are ignored in riscv,isa. Use riscv,isa-extensions instead."); + /* + * To skip an extension, we find its end. + * As multi-letter extensions must be split from other multi-letter + * extensions with an "_", the end of a multi-letter extension will + * either be the null character or the "_" at the start of the next + * multi-letter extension. + */ + for (; *isa && *isa != '_'; ++isa) + ; + ext_err = true; + break; + case 's': + /* + * Workaround for invalid single-letter 's' & 'u' (QEMU). + * No need to set the bit in riscv_isa as 's' & 'u' are + * not valid ISA extensions. It works unless the first + * multi-letter extension in the ISA string begins with + * "Su" and is not prefixed with an underscore. + */ + if (ext[-1] != '_' && ext[1] == 'u') { + ++isa; + ext_err = true; break; - default: - if (unlikely(!islower(*ext))) { + } + fallthrough; + case 'S': + case 'z': + case 'Z': + /* + * Before attempting to parse the extension itself, we find its end. + * As multi-letter extensions must be split from other multi-letter + * extensions with an "_", the end of a multi-letter extension will + * either be the null character or the "_" at the start of the next + * multi-letter extension. + * + * Next, as the extensions version is currently ignored, we + * eliminate that portion. This is done by parsing backwards from + * the end of the extension, removing any numbers. 
This may be a + * major or minor number however, so the process is repeated if a + * minor number was found. + * + * ext_end is intended to represent the first character *after* the + * name portion of an extension, but will be decremented to the last + * character itself while eliminating the extensions version number. + * A simple re-increment solves this problem. + */ + for (; *isa && *isa != '_'; ++isa) + if (unlikely(!isalnum(*isa))) ext_err = true; - break; - } - /* Find next extension */ - if (!isdigit(*isa)) - break; - /* Skip the minor version */ - while (isdigit(*++isa)) - ; - if (*isa != 'p') - break; - if (!isdigit(*++isa)) { - --isa; - break; - } - /* Skip the major version */ - while (isdigit(*++isa)) - ; + + ext_end = isa; + if (unlikely(ext_err)) + break; + + if (!isdigit(ext_end[-1])) + break; + + while (isdigit(*--ext_end)) + ; + + if (tolower(ext_end[0]) != 'p' || !isdigit(ext_end[-1])) { + ++ext_end; break; } - if (*isa != '_') + + while (isdigit(*--ext_end)) + ; + + ++ext_end; + break; + default: + /* + * Things are a little easier for single-letter extensions, as they + * are parsed forwards. + * + * After checking that our starting position is valid, we need to + * ensure that, when isa was incremented at the start of the loop, + * that it arrived at the start of the next extension. + * + * If we are already on a non-digit, there is nothing to do. Either + * we have a multi-letter extension's _, or the start of an + * extension. + * + * Otherwise we have found the current extension's major version + * number. Parse past it, and a subsequent p/minor version number + * if present. The `p` extension must not appear immediately after + * a number, so there is no fear of missing it. + * + */ + if (unlikely(!isalpha(*ext))) { + ext_err = true; + break; + } + + if (!isdigit(*isa)) + break; + + while (isdigit(*++isa)) + ; + + if (tolower(*isa) != 'p') + break; + + if (!isdigit(*++isa)) { --isa; + break; + } -#define SET_ISA_EXT_MAP(name, bit) \ - do { \ - if ((ext_end - ext == sizeof(name) - 1) && \ - !memcmp(ext, name, sizeof(name) - 1)) \ - set_bit(bit, this_isa); \ - } while (false) \ + while (isdigit(*++isa)) + ; - if (unlikely(ext_err)) + break; + } + + /* + * The parser expects that at the start of an iteration isa points to the + * first character of the next extension. As we stop parsing an extension + * on meeting a non-alphanumeric character, an extra increment is needed + * where the succeeding extension is a multi-letter prefixed with an "_". 
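As a concrete illustration of the parsing rules laid out in the comments above, consider riscv_parse_isa_string() walking the example string "rv64imac_zicsr2p0_zifencei" (the string is made up for illustration):

 - "rv64" is skipped unconditionally before the loop starts.
 - 'i', 'm', 'a' and 'c' are consumed one character at a time and matched individually.
 - the '_' after 'c' triggers the extra increment below, so the next iteration starts on the 'z' of "zicsr2p0"; the forward scan stops at the next '_', the backward scan strips the "2p0" version (minor '0', separator 'p', major '2'), and match_isa_ext() sees exactly the name span "zicsr".
 - "zifencei" is handled the same way, with the scan terminating at the NUL instead of an underscore.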
+ */ + if (*isa == '_') + ++isa; + + if (unlikely(ext_err)) + continue; + + match_isa_ext(ext, ext_end, bitmap); + } +} + +static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) +{ + struct device_node *node; + const char *isa; + int rc; + struct acpi_table_header *rhct; + acpi_status status; + unsigned int cpu; + u64 boot_vendorid; + u64 boot_archid; + + if (!acpi_disabled) { + status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct); + if (ACPI_FAILURE(status)) + return; + } + + boot_vendorid = riscv_get_mvendorid(); + boot_archid = riscv_get_marchid(); + + for_each_possible_cpu(cpu) { + struct riscv_isainfo *isainfo = &hart_isa[cpu]; + unsigned long this_hwcap = 0; + DECLARE_BITMAP(source_isa, RISCV_ISA_EXT_MAX) = { 0 }; + + if (acpi_disabled) { + node = of_cpu_device_node_get(cpu); + if (!node) { + pr_warn("Unable to find cpu node\n"); + continue; + } + + rc = of_property_read_string(node, "riscv,isa", &isa); + of_node_put(node); + if (rc) { + pr_warn("Unable to find \"riscv,isa\" devicetree entry\n"); continue; - if (!ext_long) { - this_hwcap |= isa2hwcap[(unsigned char)(*ext)]; - set_bit(*ext - 'a', this_isa); - } else { - SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); - SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); } -#undef SET_ISA_EXT_MAP + } else { + rc = acpi_get_riscv_isa(rhct, cpu, &isa); + if (rc < 0) { + pr_warn("Unable to get ISA for the hart - %d\n", cpu); + continue; + } + } + + riscv_parse_isa_string(isa, source_isa); + + /* + * These ones were as they were part of the base ISA when the + * port & dt-bindings were upstreamed, and so can be set + * unconditionally where `i` is in riscv,isa on DT systems. + */ + if (acpi_disabled) { + set_bit(RISCV_ISA_EXT_ZICSR, source_isa); + set_bit(RISCV_ISA_EXT_ZIFENCEI, source_isa); + set_bit(RISCV_ISA_EXT_ZICNTR, source_isa); + set_bit(RISCV_ISA_EXT_ZIHPM, source_isa); + } + + /* + * "V" in ISA strings is ambiguous in practice: it should mean + * just the standard V-1.0 but vendors aren't well behaved. + * Many vendors with T-Head CPU cores which implement the 0.7.1 + * version of the vector specification put "v" into their DTs. + * CPU cores with the ratified spec will contain non-zero + * marchid. + */ + if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) { + this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + clear_bit(RISCV_ISA_EXT_v, source_isa); } + riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); + /* * All "okay" hart should have same isa. 
Set HWCAP based on * common capabilities of every "okay" hart, in case they don't @@ -214,18 +864,235 @@ void __init riscv_fill_hwcap(void) elf_hwcap = this_hwcap; if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) - bitmap_copy(riscv_isa, this_isa, RISCV_ISA_EXT_MAX); + bitmap_copy(riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + else + bitmap_and(riscv_isa, riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + } + + if (!acpi_disabled && rhct) + acpi_put_table((struct acpi_table_header *)rhct); +} + +static void __init riscv_fill_cpu_vendor_ext(struct device_node *cpu_node, int cpu) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i]; + + for (int j = 0; j < ext_list->ext_data_count; j++) { + const struct riscv_isa_ext_data ext = ext_list->ext_data[j]; + struct riscv_isavendorinfo *isavendorinfo = &ext_list->per_hart_isa_bitmap[cpu]; + + if (of_property_match_string(cpu_node, "riscv,isa-extensions", + ext.property) < 0) + continue; + + /* + * Assume that subset extensions are all members of the + * same vendor. + */ + if (ext.subset_ext_size) + for (int k = 0; k < ext.subset_ext_size; k++) + set_bit(ext.subset_ext_ids[k], isavendorinfo->isa); + + set_bit(ext.id, isavendorinfo->isa); + } + } +} + +/* + * Populate all_harts_isa_bitmap for each vendor with all of the extensions that + * are shared across CPUs for that vendor. + */ +static void __init riscv_fill_vendor_ext_list(int cpu) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i]; + + if (!ext_list->is_initialized) { + bitmap_copy(ext_list->all_harts_isa_bitmap.isa, + ext_list->per_hart_isa_bitmap[cpu].isa, + RISCV_ISA_VENDOR_EXT_MAX); + ext_list->is_initialized = true; + } else { + bitmap_and(ext_list->all_harts_isa_bitmap.isa, + ext_list->all_harts_isa_bitmap.isa, + ext_list->per_hart_isa_bitmap[cpu].isa, + RISCV_ISA_VENDOR_EXT_MAX); + } + } +} + +static int has_thead_homogeneous_vlenb(void) +{ + int cpu; + u32 prev_vlenb = 0; + u32 vlenb; + + /* Ignore thead,vlenb property if xtheavector is not enabled in the kernel */ + if (!IS_ENABLED(CONFIG_RISCV_ISA_XTHEADVECTOR)) + return 0; + + for_each_possible_cpu(cpu) { + struct device_node *cpu_node; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) { + pr_warn("Unable to find cpu node\n"); + return -ENOENT; + } + + if (of_property_read_u32(cpu_node, "thead,vlenb", &vlenb)) { + of_node_put(cpu_node); + + if (prev_vlenb) + return -ENOENT; + continue; + } + + if (prev_vlenb && vlenb != prev_vlenb) { + of_node_put(cpu_node); + return -ENOENT; + } + + prev_vlenb = vlenb; + of_node_put(cpu_node); + } + + thead_vlenb_of = vlenb; + return 0; +} + +static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) +{ + unsigned int cpu; + bool mitigated; + + for_each_possible_cpu(cpu) { + unsigned long this_hwcap = 0; + struct device_node *cpu_node; + struct riscv_isainfo *isainfo = &hart_isa[cpu]; + DECLARE_BITMAP(source_isa, RISCV_ISA_EXT_MAX) = { 0 }; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) { + pr_warn("Unable to find cpu node\n"); + continue; + } + + if (!of_property_present(cpu_node, "riscv,isa-extensions")) { + of_node_put(cpu_node); + continue; + } + + for (int i = 0; i < riscv_isa_ext_count; i++) { + const struct riscv_isa_ext_data *ext = &riscv_isa_ext[i]; + 
+ if (of_property_match_string(cpu_node, "riscv,isa-extensions", + ext->property) < 0) + continue; + + riscv_isa_set_ext(ext, source_isa); + } + + riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); + riscv_fill_cpu_vendor_ext(cpu_node, cpu); + + of_node_put(cpu_node); + + /* + * All "okay" harts should have same isa. Set HWCAP based on + * common capabilities of every "okay" hart, in case they don't. + */ + if (elf_hwcap) + elf_hwcap &= this_hwcap; + else + elf_hwcap = this_hwcap; + + if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) + bitmap_copy(riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); else - bitmap_and(riscv_isa, riscv_isa, this_isa, RISCV_ISA_EXT_MAX); + bitmap_and(riscv_isa, riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + + riscv_fill_vendor_ext_list(cpu); + } + + /* + * Execute ghostwrite mitigation immediately after detecting extensions + * to disable xtheadvector if necessary. + */ + mitigated = ghostwrite_enable_mitigation(); + + if (!mitigated && has_xtheadvector_no_alternatives() && has_thead_homogeneous_vlenb() < 0) { + pr_warn("Unsupported heterogeneous vlenb detected, vector extension disabled.\n"); + disable_xtheadvector(); + } + + if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) + return -ENOENT; + + return 0; +} + +#ifdef CONFIG_RISCV_ISA_FALLBACK +bool __initdata riscv_isa_fallback = true; +#else +bool __initdata riscv_isa_fallback; +static int __init riscv_isa_fallback_setup(char *__unused) +{ + riscv_isa_fallback = true; + return 1; +} +early_param("riscv_isa_fallback", riscv_isa_fallback_setup); +#endif + +void __init riscv_fill_hwcap(void) +{ + char print_str[NUM_ALPHA_EXTS + 1]; + unsigned long isa2hwcap[26] = {0}; + int i, j; + + isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; + isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; + isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; + isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; + isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; + isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; + isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; + + if (!acpi_disabled) { + riscv_fill_hwcap_from_isa_string(isa2hwcap); + } else { + int ret = riscv_fill_hwcap_from_ext_list(isa2hwcap); + + if (ret && riscv_isa_fallback) { + pr_info("Falling back to deprecated \"riscv,isa\"\n"); + riscv_fill_hwcap_from_isa_string(isa2hwcap); + } } - /* We don't support systems with F but without D, so mask those out - * here. */ + /* + * We don't support systems with F but without D, so mask those out + * here. 
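When CONFIG_RISCV_ISA_FALLBACK is not enabled, the early_param() above still lets the deprecated riscv,isa parsing be turned back on from the kernel command line, which riscv_fill_hwcap() then uses if riscv,isa-extensions probing fails. An illustrative bootargs fragment (the console parameter is just a placeholder):

	console=ttyS0 riscv_isa_fallback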
+ */ if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) { pr_info("This kernel does not support systems with F but not D\n"); elf_hwcap &= ~COMPAT_HWCAP_ISA_F; } + if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_ZVE32X) || + has_xtheadvector_no_alternatives()) { + /* + * This cannot fail when called on the boot hart + */ + riscv_v_setup_vsize(); + } + memset(print_str, 0, sizeof(print_str)); for (i = 0, j = 0; i < NUM_ALPHA_EXTS; i++) if (riscv_isa[0] & BIT_MASK(i)) @@ -237,77 +1104,124 @@ void __init riscv_fill_hwcap(void) if (elf_hwcap & BIT_MASK(i)) print_str[j++] = (char)('a' + i); pr_info("riscv: ELF capabilities %s\n", print_str); - -#ifdef CONFIG_FPU - if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D)) - static_branch_enable(&cpu_hwcap_fpu); -#endif } -#ifdef CONFIG_RISCV_ALTERNATIVE -struct cpufeature_info { - char name[ERRATA_STRING_LENGTH_MAX]; - bool (*check_func)(unsigned int stage); -}; - -static bool __init_or_module cpufeature_svpbmt_check_func(unsigned int stage) +unsigned long riscv_get_elf_hwcap(void) { -#ifdef CONFIG_RISCV_ISA_SVPBMT - switch (stage) { - case RISCV_ALTERNATIVES_EARLY_BOOT: - return false; - default: - return riscv_isa_extension_available(NULL, SVPBMT); - } -#endif + unsigned long hwcap; - return false; -} + hwcap = (elf_hwcap & ((1UL << RISCV_ISA_EXT_BASE) - 1)); -static const struct cpufeature_info __initdata_or_module -cpufeature_list[CPUFEATURE_NUMBER] = { - { - .name = "svpbmt", - .check_func = cpufeature_svpbmt_check_func - }, -}; + if (!riscv_v_vstate_ctrl_user_allowed()) + hwcap &= ~COMPAT_HWCAP_ISA_V; + + return hwcap; +} -static u32 __init_or_module cpufeature_probe(unsigned int stage) +void __init riscv_user_isa_enable(void) { - const struct cpufeature_info *info; - u32 cpu_req_feature = 0; - int idx; + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ)) + current->thread.envcfg |= ENVCFG_CBZE; + else if (any_cpu_has_zicboz) + pr_warn("Zicboz disabled as it is unavailable on some harts\n"); - for (idx = 0; idx < CPUFEATURE_NUMBER; idx++) { - info = &cpufeature_list[idx]; + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOM)) + current->thread.envcfg |= ENVCFG_CBCFE; + else if (any_cpu_has_zicbom) + pr_warn("Zicbom disabled as it is unavailable on some harts\n"); - if (info->check_func(stage)) - cpu_req_feature |= (1U << idx); + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOP) && + any_cpu_has_zicbop) + pr_warn("Zicbop disabled as it is unavailable on some harts\n"); +} + +#ifdef CONFIG_RISCV_ALTERNATIVE +/* + * Alternative patch sites consider 48 bits when determining when to patch + * the old instruction sequence with the new. These bits are broken into a + * 16-bit vendor ID and a 32-bit patch ID. A non-zero vendor ID means the + * patch site is for an erratum, identified by the 32-bit patch ID. When + * the vendor ID is zero, the patch site is for a cpufeature. cpufeatures + * further break down patch ID into two 16-bit numbers. The lower 16 bits + * are the cpufeature ID and the upper 16 bits are used for a value specific + * to the cpufeature and patch site. If the upper 16 bits are zero, then it + * implies no specific value is specified. cpufeatures that want to control + * patching on a per-site basis will provide non-zero values and implement + * checks here. The checks return true when patching should be done, and + * false otherwise. 
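For a worked example of the encoding described above, take a Zicboz patch site that must only be applied when the block size is at most 64 bytes. Only the PATCH_ID_CPUFEATURE_* accessors and riscv_cpufeature_patch_check() below come from this patch; the composition macro is an illustrative assumption:

/* Low 16 bits carry the cpufeature ID, high 16 bits the per-site value. */
#define EXAMPLE_CPUFEATURE_PATCH_ID(id, value)	(((u32)(value) << 16) | (id))

/*
 * The Zicboz value is the log2 of the largest supported block size, so 6
 * means "64-byte blocks or smaller":
 *
 *	patch_id = EXAMPLE_CPUFEATURE_PATCH_ID(RISCV_ISA_EXT_ZICBOZ, 6);
 *
 * riscv_cpufeature_patch_check(RISCV_ISA_EXT_ZICBOZ, 6) then accepts the
 * site exactly when riscv_cboz_block_size <= (1U << 6).
 */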
+ */ +static bool riscv_cpufeature_patch_check(u16 id, u16 value) +{ + if (!value) + return true; + + switch (id) { + case RISCV_ISA_EXT_ZICBOZ: + /* + * Zicboz alternative applications provide the maximum + * supported block size order, or zero when it doesn't + * matter. If the current block size exceeds the maximum, + * then the alternative cannot be applied. + */ + return riscv_cboz_block_size <= (1U << value); } - return cpu_req_feature; + return false; } void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage) { - u32 cpu_req_feature = cpufeature_probe(stage); struct alt_entry *alt; - u32 tmp; + void *oldptr, *altptr; + u16 id, value, vendor; + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return; for (alt = begin; alt < end; alt++) { - if (alt->vendor_id != 0) - continue; - if (alt->errata_id >= CPUFEATURE_NUMBER) { - WARN(1, "This feature id:%d is not in kernel cpufeature list", - alt->errata_id); + id = PATCH_ID_CPUFEATURE_ID(alt->patch_id); + vendor = PATCH_ID_CPUFEATURE_ID(alt->vendor_id); + + /* + * Any alternative with a patch_id that is less than + * RISCV_ISA_EXT_MAX is interpreted as a standard extension. + * + * Any alternative with patch_id that is greater than or equal + * to RISCV_VENDOR_EXT_ALTERNATIVES_BASE is interpreted as a + * vendor extension. + */ + if (id < RISCV_ISA_EXT_MAX) { + /* + * This patch should be treated as errata so skip + * processing here. + */ + if (alt->vendor_id != 0) + continue; + + if (!__riscv_isa_extension_available(NULL, id)) + continue; + + value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id); + if (!riscv_cpufeature_patch_check(id, value)) + continue; + } else if (id >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE) { + if (!__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, + id - RISCV_VENDOR_EXT_ALTERNATIVES_BASE)) + continue; + } else { + WARN(1, "This extension id:%d is not in ISA extension list", id); continue; } - tmp = (1U << alt->errata_id); - if (cpu_req_feature & tmp) - patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + oldptr = ALT_OLD_PTR(alt); + altptr = ALT_ALT_PTR(alt); + + mutex_lock(&text_mutex); + patch_text_nosync(oldptr, altptr, alt->alt_len); + riscv_alternative_fix_offsets(oldptr, alt->alt_len, oldptr - altptr); + mutex_unlock(&text_mutex); } } #endif diff --git a/arch/riscv/kernel/crash_save_regs.S b/arch/riscv/kernel/crash_save_regs.S index 7832fb763aba..b2a1908c0463 100644 --- a/arch/riscv/kernel/crash_save_regs.S +++ b/arch/riscv/kernel/crash_save_regs.S @@ -44,7 +44,7 @@ SYM_CODE_START(riscv_crash_save_regs) REG_S t6, PT_T6(a0) /* x31 */ csrr t1, CSR_STATUS - csrr t2, CSR_EPC + auipc t2, 0x0 csrr t3, CSR_TVAL csrr t4, CSR_CAUSE diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S index 8e733aa48ba6..2efc3aaf4a8c 100644 --- a/arch/riscv/kernel/efi-header.S +++ b/arch/riscv/kernel/efi-header.S @@ -6,9 +6,10 @@ #include <linux/pe.h> #include <linux/sizes.h> +#include <asm/set_memory.h> .macro __EFI_PE_HEADER - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE coff_header: #ifdef CONFIG_64BIT .short IMAGE_FILE_MACHINE_RISCV64 // Machine @@ -26,14 +27,18 @@ coff_header: optional_header: #ifdef CONFIG_64BIT - .short PE_OPT_MAGIC_PE32PLUS // PE32+ format + .short IMAGE_NT_OPTIONAL_HDR64_MAGIC // PE32+ format #else - .short PE_OPT_MAGIC_PE32 // PE32 format + .short IMAGE_NT_OPTIONAL_HDR32_MAGIC // PE32 format #endif .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion .long __pecoff_text_end - efi_header_end // 
SizeOfCode - .long __pecoff_data_virt_size // SizeOfInitializedData +#ifdef __clang__ + .long __pecoff_data_virt_size // SizeOfInitializedData +#else + .long __pecoff_data_virt_end - __pecoff_text_end // SizeOfInitializedData +#endif .long 0 // SizeOfUninitializedData .long __efistub_efi_pe_entry - _start // AddressOfEntryPoint .long efi_header_end - _start // BaseOfCode @@ -59,7 +64,7 @@ extra_header_fields: .long efi_header_end - _start // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short 0 // DllCharacteristics + .short IMAGE_DLLCHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve @@ -91,9 +96,17 @@ section_table: IMAGE_SCN_MEM_EXECUTE // Characteristics .ascii ".data\0\0\0" - .long __pecoff_data_virt_size // VirtualSize +#ifdef __clang__ + .long __pecoff_data_virt_size // VirtualSize +#else + .long __pecoff_data_virt_end - __pecoff_text_end // VirtualSize +#endif .long __pecoff_text_end - _start // VirtualAddress - .long __pecoff_data_raw_size // SizeOfRawData +#ifdef __clang__ + .long __pecoff_data_raw_size // SizeOfRawData +#else + .long __pecoff_data_raw_end - __pecoff_text_end // SizeOfRawData +#endif .long __pecoff_text_end - _start // PointerToRawData .long 0 // PointerToRelocations diff --git a/arch/riscv/kernel/efi.c b/arch/riscv/kernel/efi.c index 1aa540350abd..b64bf1624a05 100644 --- a/arch/riscv/kernel/efi.c +++ b/arch/riscv/kernel/efi.c @@ -60,7 +60,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); unsigned long val; if (md->attribute & EFI_MEMORY_RO) { @@ -78,7 +78,8 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) } int __init efi_set_mapping_permissions(struct mm_struct *mm, - efi_memory_desc_t *md) + efi_memory_desc_t *md, + bool ignored) { BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && md->type != EFI_RUNTIME_SERVICES_DATA); diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c deleted file mode 100644 index 0cb94992c15b..000000000000 --- a/arch/riscv/kernel/elf_kexec.c +++ /dev/null @@ -1,448 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Load ELF vmlinux file for the kexec_file_load syscall. - * - * Copyright (C) 2021 Huawei Technologies Co, Ltd. - * - * Author: Liao Chang (liaochang1@huawei.com) - * - * Based on kexec-tools' kexec-elf-riscv.c, heavily modified - * for kernel. 
- */ - -#define pr_fmt(fmt) "kexec_image: " fmt - -#include <linux/elf.h> -#include <linux/kexec.h> -#include <linux/slab.h> -#include <linux/of.h> -#include <linux/libfdt.h> -#include <linux/types.h> -#include <linux/memblock.h> -#include <asm/setup.h> - -static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, - struct kexec_elf_info *elf_info, unsigned long old_pbase, - unsigned long new_pbase) -{ - int i; - int ret = 0; - size_t size; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - - kbuf.image = image; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - size = phdr->p_filesz; - if (size > phdr->p_memsz) - size = phdr->p_memsz; - - kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; - kbuf.bufsz = size; - kbuf.buf_align = phdr->p_align; - kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; - kbuf.memsz = phdr->p_memsz; - kbuf.top_down = false; - ret = kexec_add_buffer(&kbuf); - if (ret) - break; - } - - return ret; -} - -/* - * Go through the available phsyical memory regions and find one that hold - * an image of the specified size. - */ -static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, - struct elfhdr *ehdr, struct kexec_elf_info *elf_info, - unsigned long *old_pbase, unsigned long *new_pbase) -{ - int i; - int ret; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - unsigned long lowest_paddr = ULONG_MAX; - unsigned long lowest_vaddr = ULONG_MAX; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - if (lowest_paddr > phdr->p_paddr) - lowest_paddr = phdr->p_paddr; - - if (lowest_vaddr > phdr->p_vaddr) - lowest_vaddr = phdr->p_vaddr; - } - - kbuf.image = image; - kbuf.buf_min = lowest_paddr; - kbuf.buf_max = ULONG_MAX; - kbuf.buf_align = PAGE_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); - kbuf.top_down = false; - ret = arch_kexec_locate_mem_hole(&kbuf); - if (!ret) { - *old_pbase = lowest_paddr; - *new_pbase = kbuf.mem; - image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; - } - return ret; -} - -static int get_nr_ram_ranges_callback(struct resource *res, void *arg) -{ - unsigned int *nr_ranges = arg; - - (*nr_ranges)++; - return 0; -} - -static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) -{ - struct crash_mem *cmem = arg; - - cmem->ranges[cmem->nr_ranges].start = res->start; - cmem->ranges[cmem->nr_ranges].end = res->end; - cmem->nr_ranges++; - - return 0; -} - -static int prepare_elf_headers(void **addr, unsigned long *sz) -{ - struct crash_mem *cmem; - unsigned int nr_ranges; - int ret; - - nr_ranges = 1; /* For exclusion of crashkernel region */ - walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); - - cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); - if (!cmem) - return -ENOMEM; - - cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; - ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); - if (ret) - goto out; - - /* Exclude crashkernel region */ - ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); - if (!ret) - ret = crash_prepare_elf64_headers(cmem, true, addr, sz); - -out: - kfree(cmem); - return ret; -} - -static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, - unsigned long cmdline_len) -{ - int elfcorehdr_strlen; - char *cmdline_ptr; - - cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); - if 
(!cmdline_ptr) - return NULL; - - elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", - image->elf_load_addr); - - if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { - pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n"); - kfree(cmdline_ptr); - return NULL; - } - - memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); - /* Ensure it's nul terminated */ - cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; - return cmdline_ptr; -} - -static void *elf_kexec_load(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len) -{ - int ret; - unsigned long old_kernel_pbase = ULONG_MAX; - unsigned long new_kernel_pbase = 0UL; - unsigned long initrd_pbase = 0UL; - unsigned long headers_sz; - unsigned long kernel_start; - void *fdt, *headers; - struct elfhdr ehdr; - struct kexec_buf kbuf; - struct kexec_elf_info elf_info; - char *modified_cmdline = NULL; - - ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); - if (ret) - return ERR_PTR(ret); - - ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, - &old_kernel_pbase, &new_kernel_pbase); - if (ret) - goto out; - kernel_start = image->start; - pr_notice("The entry point of kernel at 0x%lx\n", image->start); - - /* Add the kernel binary to the image */ - ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, - old_kernel_pbase, new_kernel_pbase); - if (ret) - goto out; - - kbuf.image = image; - kbuf.buf_min = new_kernel_pbase + kernel_len; - kbuf.buf_max = ULONG_MAX; - - /* Add elfcorehdr */ - if (image->type == KEXEC_TYPE_CRASH) { - ret = prepare_elf_headers(&headers, &headers_sz); - if (ret) { - pr_err("Preparing elf core header failed\n"); - goto out; - } - - kbuf.buffer = headers; - kbuf.bufsz = headers_sz; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = headers_sz; - kbuf.buf_align = ELF_CORE_HEADER_ALIGN; - kbuf.top_down = true; - - ret = kexec_add_buffer(&kbuf); - if (ret) { - vfree(headers); - goto out; - } - image->elf_headers = headers; - image->elf_load_addr = kbuf.mem; - image->elf_headers_sz = headers_sz; - - pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.memsz); - - /* Setup cmdline for kdump kernel case */ - modified_cmdline = setup_kdump_cmdline(image, cmdline, - cmdline_len); - if (!modified_cmdline) { - pr_err("Setting up cmdline for kdump kernel failed\n"); - ret = -EINVAL; - goto out; - } - cmdline = modified_cmdline; - } - -#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY - /* Add purgatory to the image */ - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_load_purgatory(image, &kbuf); - if (ret) { - pr_err("Error loading purgatory ret=%d\n", ret); - goto out; - } - ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", - &kernel_start, - sizeof(kernel_start), 0); - if (ret) - pr_err("Error update purgatory ret=%d\n", ret); -#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ - - /* Add the initrd to the image */ - if (initrd != NULL) { - kbuf.buffer = initrd; - kbuf.bufsz = kbuf.memsz = initrd_len; - kbuf.buf_align = PAGE_SIZE; - kbuf.top_down = false; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - initrd_pbase = kbuf.mem; - pr_notice("Loaded initrd at 0x%lx\n", initrd_pbase); - } - - /* Add the DTB to the image */ - fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, - initrd_len, cmdline, 0); - if (!fdt) { - pr_err("Error setting up the new device 
tree.\n"); - ret = -EINVAL; - goto out; - } - - fdt_pack(fdt); - kbuf.buffer = fdt; - kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); - kbuf.buf_align = PAGE_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.top_down = true; - ret = kexec_add_buffer(&kbuf); - if (ret) { - pr_err("Error add DTB kbuf ret=%d\n", ret); - goto out_free_fdt; - } - pr_notice("Loaded device tree at 0x%lx\n", kbuf.mem); - goto out; - -out_free_fdt: - kvfree(fdt); -out: - kfree(modified_cmdline); - kexec_free_elf_info(&elf_info); - return ret ? ERR_PTR(ret) : NULL; -} - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RISCV_IMM_BITS 12 -#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) -#define RISCV_CONST_HIGH_PART(x) \ - (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) -#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) - -#define ENCODE_ITYPE_IMM(x) \ - (RV_X(x, 0, 12) << 20) -#define ENCODE_BTYPE_IMM(x) \ - ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ - (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) -#define ENCODE_UTYPE_IMM(x) \ - (RV_X(x, 12, 20) << 12) -#define ENCODE_JTYPE_IMM(x) \ - ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ - (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31)) -#define ENCODE_CBTYPE_IMM(x) \ - ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) -#define ENCODE_CJTYPE_IMM(x) \ - ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ - (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) -#define ENCODE_UJTYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ - (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) -#define ENCODE_UITYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) - -#define CLEAN_IMM(type, x) \ - ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) - -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab) -{ - const char *strtab, *name, *shstrtab; - const Elf_Shdr *sechdrs; - Elf64_Rela *relas; - int i, r_type; - - /* String & section header string table */ - sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; - shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - - relas = (void *)pi->ehdr + relsec->sh_offset; - - for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { - const Elf_Sym *sym; /* symbol to relocate */ - unsigned long addr; /* final location after relocation */ - unsigned long val; /* relocated symbol value */ - unsigned long sec_base; /* relocated symbol value */ - void *loc; /* tmp location to modify */ - - sym = (void *)pi->ehdr + symtab->sh_offset; - sym += ELF64_R_SYM(relas[i].r_info); - - if (sym->st_name) - name = strtab + sym->st_name; - else - name = shstrtab + sechdrs[sym->st_shndx].sh_name; - - loc = pi->purgatory_buf; - loc += section->sh_offset; - loc += relas[i].r_offset; - - if (sym->st_shndx == SHN_ABS) - sec_base = 0; - else if (sym->st_shndx >= pi->ehdr->e_shnum) { - pr_err("Invalid section %d for symbol %s\n", - sym->st_shndx, name); - return -ENOEXEC; - } else - sec_base = pi->sechdrs[sym->st_shndx].sh_addr; - - val = sym->st_value; - val += sec_base; - val += relas[i].r_addend; - - addr = section->sh_addr + relas[i].r_offset; - - r_type = ELF64_R_TYPE(relas[i].r_info); - - switch (r_type) { - case R_RISCV_BRANCH: - *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | 
- ENCODE_BTYPE_IMM(val - addr); - break; - case R_RISCV_JAL: - *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | - ENCODE_JTYPE_IMM(val - addr); - break; - /* - * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I - * sym is expected to be next to R_RISCV_PCREL_HI20 - * in purgatory relsec. Handle it like R_RISCV_CALL - * sym, instead of searching the whole relsec. - */ - case R_RISCV_PCREL_HI20: - case R_RISCV_CALL: - *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | - ENCODE_UJTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_BRANCH: - *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | - ENCODE_CBTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_JUMP: - *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | - ENCODE_CJTYPE_IMM(val - addr); - break; - case R_RISCV_ADD32: - *(u32 *)loc += val; - break; - case R_RISCV_SUB32: - *(u32 *)loc -= val; - break; - /* It has been applied by R_RISCV_PCREL_HI20 sym */ - case R_RISCV_PCREL_LO12_I: - case R_RISCV_ALIGN: - case R_RISCV_RELAX: - break; - default: - pr_err("Unknown rela relocation: %d\n", r_type); - return -ENOEXEC; - } - } - return 0; -} - -const struct kexec_file_ops elf_kexec_ops = { - .probe = kexec_elf_probe, - .load = elf_kexec_load, -}; diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index b9eda3fcbd6d..75656afa2d6b 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -9,26 +9,114 @@ #include <asm/asm.h> #include <asm/csr.h> +#include <asm/scs.h> #include <asm/unistd.h> +#include <asm/page.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> #include <asm/errata_list.h> +#include <linux/sizes.h> + + .section .irqentry.text, "ax" + +.macro new_vmalloc_check + REG_S a0, TASK_TI_A0(tp) + csrr a0, CSR_CAUSE + /* Exclude IRQs */ + blt a0, zero, .Lnew_vmalloc_restore_context_a0 + + REG_S a1, TASK_TI_A1(tp) + /* Only check new_vmalloc if we are in page/protection fault */ + li a1, EXC_LOAD_PAGE_FAULT + beq a0, a1, .Lnew_vmalloc_kernel_address + li a1, EXC_STORE_PAGE_FAULT + beq a0, a1, .Lnew_vmalloc_kernel_address + li a1, EXC_INST_PAGE_FAULT + bne a0, a1, .Lnew_vmalloc_restore_context_a1 + +.Lnew_vmalloc_kernel_address: + /* Is it a kernel address? 
*/ + csrr a0, CSR_TVAL + bge a0, zero, .Lnew_vmalloc_restore_context_a1 + + /* Check if a new vmalloc mapping appeared that could explain the trap */ + REG_S a2, TASK_TI_A2(tp) + /* + * Computes: + * a0 = &new_vmalloc[BIT_WORD(cpu)] + * a1 = BIT_MASK(cpu) + */ + REG_L a2, TASK_TI_CPU(tp) + /* + * Compute the new_vmalloc element position: + * (cpu / 64) * 8 = (cpu >> 6) << 3 + */ + srli a1, a2, 6 + slli a1, a1, 3 + la a0, new_vmalloc + add a0, a0, a1 + /* + * Compute the bit position in the new_vmalloc element: + * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6 + * = cpu - ((cpu >> 6) << 3) << 3 + */ + slli a1, a1, 3 + sub a1, a2, a1 + /* Compute the "get mask": 1 << bit_pos */ + li a2, 1 + sll a1, a2, a1 + + /* Check the value of new_vmalloc for this cpu */ + REG_L a2, 0(a0) + and a2, a2, a1 + beq a2, zero, .Lnew_vmalloc_restore_context + + /* Atomically reset the current cpu bit in new_vmalloc */ + amoxor.d a0, a1, (a0) + + /* Only emit a sfence.vma if the uarch caches invalid entries */ + ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1) + + REG_L a0, TASK_TI_A0(tp) + REG_L a1, TASK_TI_A1(tp) + REG_L a2, TASK_TI_A2(tp) + csrw CSR_SCRATCH, x0 + sret -#if !IS_ENABLED(CONFIG_PREEMPTION) -.set resume_kernel, restore_all -#endif +.Lnew_vmalloc_restore_context: + REG_L a2, TASK_TI_A2(tp) +.Lnew_vmalloc_restore_context_a1: + REG_L a1, TASK_TI_A1(tp) +.Lnew_vmalloc_restore_context_a0: + REG_L a0, TASK_TI_A0(tp) +.endm -ENTRY(handle_exception) + +SYM_CODE_START(handle_exception) /* * If coming from userspace, preserve the user thread pointer and load * the kernel thread pointer. If we came from the kernel, the scratch * register will contain 0, and we should continue on the current TP. */ csrrw tp, CSR_SCRATCH, tp - bnez tp, _save_context + bnez tp, .Lsave_context -_restore_kernel_tpsp: +.Lrestore_kernel_tpsp: csrr tp, CSR_SCRATCH + +#ifdef CONFIG_64BIT + /* + * The RISC-V kernel does not eagerly emit a sfence.vma after each + * new vmalloc mapping, which may result in exceptions: + * - if the uarch caches invalid entries, the new mapping would not be + * observed by the page table walker and an invalidation is needed. + * - if the uarch does not cache invalid entries, a reordered access + * could "miss" the new mapping and traps: in that case, we only need + * to retry the access, no sfence.vma is required. + */ + new_vmalloc_check +#endif + REG_S sp, TASK_TI_KERNEL_SP(tp) #ifdef CONFIG_VMAP_STACK @@ -39,48 +127,23 @@ _restore_kernel_tpsp: REG_L sp, TASK_TI_KERNEL_SP(tp) #endif -_save_context: +.Lsave_context: REG_S sp, TASK_TI_USER_SP(tp) REG_L sp, TASK_TI_KERNEL_SP(tp) addi sp, sp, -(PT_SIZE_ON_STACK) REG_S x1, PT_RA(sp) REG_S x3, PT_GP(sp) REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) + save_from_x6_to_x31 /* * Disable user-mode memory access as it should only be set in the * actual user copy routines. * - * Disable the FPU to detect illegal usage of floating point in kernel - * space. 
+ * Disable the FPU/Vector to detect illegal usage of floating point + * or vector in kernel space. */ - li t0, SR_SUM | SR_FS + li t0, SR_SUM | SR_FS_VS REG_L s0, TASK_TI_USER_SP(tp) csrrc s1, CSR_STATUS, t0 @@ -102,23 +165,16 @@ _save_context: csrw CSR_SCRATCH, x0 /* Load the global pointer */ -.option push -.option norelax - la gp, __global_pointer$ -.option pop + load_global_pointer -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_off -#endif + /* Load the kernel shadow call stack pointer if coming from userspace */ + scs_load_current_if_task_changed s5 -#ifdef CONFIG_CONTEXT_TRACKING_USER - /* If previous state is in user mode, call user_exit_callable(). */ - li a0, SR_PP - and a0, s1, a0 - bnez a0, skip_context_tracking - call user_exit_callable -skip_context_tracking: +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_start #endif + move a0, sp /* pt_regs */ /* * MSB of cause differentiates between @@ -126,134 +182,35 @@ skip_context_tracking: */ bge s4, zero, 1f - la ra, ret_from_exception - /* Handle interrupts */ - move a0, sp /* pt_regs */ - la a1, generic_handle_arch_irq - jr a1 -1: - /* - * Exceptions run with interrupts enabled or disabled depending on the - * state of SR_PIE in m/sstatus. - */ - andi t0, s1, SR_PIE - beqz t0, 1f - /* kprobes, entered via ebreak, must have interrupts disabled. */ - li t0, EXC_BREAKPOINT - beq s4, t0, 1f -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_on -#endif - csrs CSR_STATUS, SR_IE - + call do_irq + j ret_from_exception 1: - la ra, ret_from_exception - /* Handle syscalls */ - li t0, EXC_SYSCALL - beq s4, t0, handle_syscall - /* Handle other exceptions */ slli t0, s4, RISCV_LGPTR la t1, excp_vect_table la t2, excp_vect_table_end - move a0, sp /* pt_regs */ add t0, t1, t0 /* Check if exception code lies within bounds */ - bgeu t0, t2, 1f - REG_L t0, 0(t0) - jr t0 -1: - tail do_trap_unknown - -handle_syscall: -#ifdef CONFIG_RISCV_M_MODE - /* - * When running is M-Mode (no MMU config), MPIE does not get set. - * As a result, we need to force enable interrupts here because - * handle_exception did not do set SR_IE as it always sees SR_PIE - * being cleared. - */ - csrs CSR_STATUS, SR_IE -#endif -#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER) - /* Recover a0 - a7 for system calls */ - REG_L a0, PT_A0(sp) - REG_L a1, PT_A1(sp) - REG_L a2, PT_A2(sp) - REG_L a3, PT_A3(sp) - REG_L a4, PT_A4(sp) - REG_L a5, PT_A5(sp) - REG_L a6, PT_A6(sp) - REG_L a7, PT_A7(sp) -#endif - /* save the initial A0 value (needed in signal handlers) */ - REG_S a0, PT_ORIG_A0(sp) - /* - * Advance SEPC to avoid executing the original - * scall instruction on sret - */ - addi s2, s2, 0x4 - REG_S s2, PT_EPC(sp) - /* Trace syscalls, but only if requested by the user. */ - REG_L t0, TASK_TI_FLAGS(tp) - andi t0, t0, _TIF_SYSCALL_WORK - bnez t0, handle_syscall_trace_enter -check_syscall_nr: - /* Check to make sure we don't jump to a bogus syscall number. */ - li t0, __NR_syscalls - la s0, sys_ni_syscall - /* - * Syscall number held in a7. - * If syscall number is above allowed value, redirect to ni_syscall. 
- */ - bgeu a7, t0, 3f -#ifdef CONFIG_COMPAT - REG_L s0, PT_STATUS(sp) - srli s0, s0, SR_UXL_SHIFT - andi s0, s0, (SR_UXL >> SR_UXL_SHIFT) - li t0, (SR_UXL_32 >> SR_UXL_SHIFT) - sub t0, s0, t0 - bnez t0, 1f - - /* Call compat_syscall */ - la s0, compat_sys_call_table - j 2f -1: -#endif - /* Call syscall */ - la s0, sys_call_table -2: - slli t0, a7, RISCV_LGPTR - add s0, s0, t0 - REG_L s0, 0(s0) + bgeu t0, t2, 3f + REG_L t1, 0(t0) +2: jalr t1 + j ret_from_exception 3: - jalr s0 -ret_from_syscall: - /* Set user a0 to kernel a0 */ - REG_S a0, PT_A0(sp) - /* - * We didn't execute the actual syscall. - * Seccomp already set return value for the current task pt_regs. - * (If it was configured with SECCOMP_RET_ERRNO/TRACE) - */ -ret_from_syscall_rejected: -#ifdef CONFIG_DEBUG_RSEQ - move a0, sp - call rseq_syscall -#endif - /* Trace syscalls, but only if requested by the user. */ - REG_L t0, TASK_TI_FLAGS(tp) - andi t0, t0, _TIF_SYSCALL_WORK - bnez t0, handle_syscall_trace_exit + la t1, do_trap_unknown + j 2b +SYM_CODE_END(handle_exception) +ASM_NOKPROBE(handle_exception) -ret_from_exception: +/* + * The ret_from_exception must be called with interrupt disabled. Here is the + * caller list: + * - handle_exception + * - ret_from_fork + */ +SYM_CODE_START_NOALIGN(ret_from_exception) REG_L s0, PT_STATUS(sp) - csrc CSR_STATUS, SR_IE -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_off -#endif #ifdef CONFIG_RISCV_M_MODE /* the MPP value is too large to be used as an immediate arg for addi */ li t0, SR_MPP @@ -261,38 +218,28 @@ ret_from_exception: #else andi s0, s0, SR_SPP #endif - bnez s0, resume_kernel + bnez s0, 1f -resume_userspace: - /* Interrupts must be disabled here so flags are checked atomically */ - REG_L s0, TASK_TI_FLAGS(tp) /* current_thread_info->flags */ - andi s1, s0, _TIF_WORK_MASK - bnez s1, work_pending - -#ifdef CONFIG_CONTEXT_TRACKING_USER - call user_enter_callable +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + call stackleak_erase_on_task_stack #endif /* Save unwound kernel stack pointer in thread_info */ addi s0, sp, PT_SIZE_ON_STACK REG_S s0, TASK_TI_KERNEL_SP(tp) + /* Save the kernel shadow call stack pointer */ + scs_save_current + /* * Save TP into the scratch register , so we can find the kernel data * structures again. 
*/ csrw CSR_SCRATCH, tp - -restore_all: -#ifdef CONFIG_TRACE_IRQFLAGS - REG_L s1, PT_STATUS(sp) - andi t0, s1, SR_PIE - beqz t0, 1f - call __trace_hardirqs_on - j 2f 1: - call __trace_hardirqs_off -2: +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_end #endif REG_L a0, PT_STATUS(sp) /* @@ -322,32 +269,7 @@ restore_all: REG_L x3, PT_GP(sp) REG_L x4, PT_TP(sp) REG_L x5, PT_T0(sp) - REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x8, PT_S0(sp) - REG_L x9, PT_S1(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x18, PT_S2(sp) - REG_L x19, PT_S3(sp) - REG_L x20, PT_S4(sp) - REG_L x21, PT_S5(sp) - REG_L x22, PT_S6(sp) - REG_L x23, PT_S7(sp) - REG_L x24, PT_S8(sp) - REG_L x25, PT_S9(sp) - REG_L x26, PT_S10(sp) - REG_L x27, PT_S11(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) + restore_from_x6_to_x31 REG_L x2, PT_SP(sp) @@ -356,134 +278,28 @@ restore_all: #else sret #endif - -#if IS_ENABLED(CONFIG_PREEMPTION) -resume_kernel: - REG_L s0, TASK_TI_PREEMPT_COUNT(tp) - bnez s0, restore_all - REG_L s0, TASK_TI_FLAGS(tp) - andi s0, s0, _TIF_NEED_RESCHED - beqz s0, restore_all - call preempt_schedule_irq - j restore_all -#endif - -work_pending: - /* Enter slow path for supplementary processing */ - la ra, ret_from_exception - andi s1, s0, _TIF_NEED_RESCHED - bnez s1, work_resched -work_notifysig: - /* Handle pending signals and notify-resume requests */ - csrs CSR_STATUS, SR_IE /* Enable interrupts for do_notify_resume() */ - move a0, sp /* pt_regs */ - move a1, s0 /* current_thread_info->flags */ - tail do_notify_resume -work_resched: - tail schedule - -/* Slow paths for ptrace. 
*/ -handle_syscall_trace_enter: - move a0, sp - call do_syscall_trace_enter - move t0, a0 - REG_L a0, PT_A0(sp) - REG_L a1, PT_A1(sp) - REG_L a2, PT_A2(sp) - REG_L a3, PT_A3(sp) - REG_L a4, PT_A4(sp) - REG_L a5, PT_A5(sp) - REG_L a6, PT_A6(sp) - REG_L a7, PT_A7(sp) - bnez t0, ret_from_syscall_rejected - j check_syscall_nr -handle_syscall_trace_exit: - move a0, sp - call do_syscall_trace_exit - j ret_from_exception +SYM_INNER_LABEL(ret_from_exception_end, SYM_L_GLOBAL) +SYM_CODE_END(ret_from_exception) +ASM_NOKPROBE(ret_from_exception) #ifdef CONFIG_VMAP_STACK -handle_kernel_stack_overflow: - la sp, shadow_stack - addi sp, sp, SHADOW_OVERFLOW_STACK_SIZE +SYM_CODE_START_LOCAL(handle_kernel_stack_overflow) + /* we reach here from kernel context, sscratch must be 0 */ + csrrw x31, CSR_SCRATCH, x31 + asm_per_cpu sp, overflow_stack, x31 + li x31, OVERFLOW_STACK_SIZE + add sp, sp, x31 + /* zero out x31 again and restore x31 */ + xor x31, x31, x31 + csrrw x31, CSR_SCRATCH, x31 - //save caller register to shadow stack - addi sp, sp, -(PT_SIZE_ON_STACK) - REG_S x1, PT_RA(sp) - REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) - - la ra, restore_caller_reg - tail get_overflow_stack - -restore_caller_reg: - //save per-cpu overflow stack - REG_S a0, -8(sp) - //restore caller register from shadow_stack - REG_L x1, PT_RA(sp) - REG_L x5, PT_T0(sp) - REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) - - //load per-cpu overflow stack - REG_L sp, -8(sp) addi sp, sp, -(PT_SIZE_ON_STACK) //save context to overflow stack REG_S x1, PT_RA(sp) REG_S x3, PT_GP(sp) REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) + save_from_x6_to_x31 REG_L s0, TASK_TI_KERNEL_SP(tp) csrr s1, CSR_STATUS @@ -499,23 +315,62 @@ restore_caller_reg: REG_S s5, PT_TP(sp) move a0, sp tail handle_bad_stack +SYM_CODE_END(handle_kernel_stack_overflow) +ASM_NOKPROBE(handle_kernel_stack_overflow) #endif -END(handle_exception) - -ENTRY(ret_from_fork) - la ra, ret_from_exception - tail schedule_tail -ENDPROC(ret_from_fork) +SYM_CODE_START(ret_from_fork_kernel_asm) + call schedule_tail + move a0, s1 /* fn_arg */ + move a1, s0 /* fn */ + move a2, sp /* pt_regs */ + call ret_from_fork_kernel + j ret_from_exception +SYM_CODE_END(ret_from_fork_kernel_asm) -ENTRY(ret_from_kernel_thread) +SYM_CODE_START(ret_from_fork_user_asm) call schedule_tail - /* Call fn(arg) */ - la ra, ret_from_exception - move a0, s1 - jr s0 -ENDPROC(ret_from_kernel_thread) + 
move a0, sp /* pt_regs */ + call ret_from_fork_user + j ret_from_exception +SYM_CODE_END(ret_from_fork_user_asm) +#ifdef CONFIG_IRQ_STACKS +/* + * void call_on_irq_stack(struct pt_regs *regs, + * void (*func)(struct pt_regs *)); + * + * Calls func(regs) using the per-CPU IRQ stack. + */ +SYM_FUNC_START(call_on_irq_stack) + /* Create a frame record to save ra and s0 (fp) */ + addi sp, sp, -STACKFRAME_SIZE_ON_STACK + REG_S ra, STACKFRAME_RA(sp) + REG_S s0, STACKFRAME_FP(sp) + addi s0, sp, STACKFRAME_SIZE_ON_STACK + + /* Switch to the per-CPU shadow call stack */ + scs_save_current + scs_load_irq_stack t0 + + /* Switch to the per-CPU IRQ stack and call the handler */ + load_per_cpu t0, irq_stack_ptr, t1 + li t1, IRQ_STACK_SIZE + add sp, t0, t1 + jalr a1 + + /* Switch back to the thread shadow call stack */ + scs_load_current + + /* Switch back to the thread stack and restore ra and s0 */ + addi sp, s0, -STACKFRAME_SIZE_ON_STACK + REG_L ra, STACKFRAME_RA(sp) + REG_L s0, STACKFRAME_FP(sp) + addi sp, sp, STACKFRAME_SIZE_ON_STACK + + ret +SYM_FUNC_END(call_on_irq_stack) +#endif /* CONFIG_IRQ_STACKS */ /* * Integer register context switch @@ -527,7 +382,7 @@ ENDPROC(ret_from_kernel_thread) * The value of a0 and a1 must be preserved by this function, as that's how * arguments are passed to schedule_tail. */ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to) /* Save context into prev->thread */ li a4, TASK_THREAD_RA add a3, a0, a4 @@ -546,7 +401,18 @@ ENTRY(__switch_to) REG_S s9, TASK_THREAD_S9_RA(a3) REG_S s10, TASK_THREAD_S10_RA(a3) REG_S s11, TASK_THREAD_S11_RA(a3) + + /* save the user space access flag */ + csrr s0, CSR_STATUS + REG_S s0, TASK_THREAD_SUM_RA(a3) + + /* Save the kernel shadow call stack pointer */ + scs_save_current /* Restore context from next->thread */ + REG_L s0, TASK_THREAD_SUM_RA(a4) + li s1, SR_SUM + and s0, s0, s1 + csrs CSR_STATUS, s0 REG_L ra, TASK_THREAD_RA_RA(a4) REG_L sp, TASK_THREAD_SP_RA(a4) REG_L s0, TASK_THREAD_S0_RA(a4) @@ -563,8 +429,10 @@ ENTRY(__switch_to) REG_L s11, TASK_THREAD_S11_RA(a4) /* The offset of thread_info in task_struct is zero. 
*/ move tp, a1 + /* Switch to the next shadow call stack */ + scs_load_current ret -ENDPROC(__switch_to) +SYM_FUNC_END(__switch_to) #ifndef CONFIG_MMU #define do_page_fault do_trap_unknown @@ -573,7 +441,7 @@ ENDPROC(__switch_to) .section ".rodata" .align LGREG /* Exception vector table */ -ENTRY(excp_vect_table) +SYM_DATA_START_LOCAL(excp_vect_table) RISCV_PTR do_trap_insn_misaligned ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault) RISCV_PTR do_trap_insn_illegal @@ -582,7 +450,7 @@ ENTRY(excp_vect_table) RISCV_PTR do_trap_load_fault RISCV_PTR do_trap_store_misaligned RISCV_PTR do_trap_store_fault - RISCV_PTR do_trap_ecall_u /* system call, gets intercepted */ + RISCV_PTR do_trap_ecall_u /* system call */ RISCV_PTR do_trap_ecall_s RISCV_PTR do_trap_unknown RISCV_PTR do_trap_ecall_m @@ -591,12 +459,11 @@ ENTRY(excp_vect_table) RISCV_PTR do_page_fault /* load page fault */ RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ -excp_vect_table_end: -END(excp_vect_table) +SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end) #ifndef CONFIG_MMU -ENTRY(__user_rt_sigreturn) +SYM_DATA_START(__user_rt_sigreturn) li a7, __NR_rt_sigreturn - scall -END(__user_rt_sigreturn) + ecall +SYM_DATA_END(__user_rt_sigreturn) #endif diff --git a/arch/riscv/kernel/fpu.S b/arch/riscv/kernel/fpu.S index dd2205473de7..f74f6b60e347 100644 --- a/arch/riscv/kernel/fpu.S +++ b/arch/riscv/kernel/fpu.S @@ -19,7 +19,7 @@ #include <asm/csr.h> #include <asm/asm-offsets.h> -ENTRY(__fstate_save) +SYM_FUNC_START(__fstate_save) li a2, TASK_THREAD_F0 add a0, a0, a2 li t1, SR_FS @@ -60,9 +60,9 @@ ENTRY(__fstate_save) sw t0, TASK_THREAD_FCSR_F0(a0) csrc CSR_STATUS, t1 ret -ENDPROC(__fstate_save) +SYM_FUNC_END(__fstate_save) -ENTRY(__fstate_restore) +SYM_FUNC_START(__fstate_restore) li a2, TASK_THREAD_F0 add a0, a0, a2 li t1, SR_FS @@ -103,4 +103,125 @@ ENTRY(__fstate_restore) fscsr t0 csrc CSR_STATUS, t1 ret -ENDPROC(__fstate_restore) +SYM_FUNC_END(__fstate_restore) + +#define get_f32(which) fmv.x.s a0, which; j 2f +#define put_f32(which) fmv.s.x which, a1; j 2f +#if __riscv_xlen == 64 +# define get_f64(which) fmv.x.d a0, which; j 2f +# define put_f64(which) fmv.d.x which, a1; j 2f +#else +# define get_f64(which) fsd which, 0(a1); j 2f +# define put_f64(which) fld which, 0(a1); j 2f +#endif + +.macro fp_access_prologue + /* + * Compute jump offset to store the correct FP register since we don't + * have indirect FP register access + */ + sll t0, a0, 3 + la t2, 1f + add t0, t0, t2 + li t1, SR_FS + csrs CSR_STATUS, t1 + jr t0 +1: +.endm + +.macro fp_access_epilogue +2: + csrc CSR_STATUS, t1 + ret +.endm + +#define fp_access_body(__access_func) \ + __access_func(f0); \ + __access_func(f1); \ + __access_func(f2); \ + __access_func(f3); \ + __access_func(f4); \ + __access_func(f5); \ + __access_func(f6); \ + __access_func(f7); \ + __access_func(f8); \ + __access_func(f9); \ + __access_func(f10); \ + __access_func(f11); \ + __access_func(f12); \ + __access_func(f13); \ + __access_func(f14); \ + __access_func(f15); \ + __access_func(f16); \ + __access_func(f17); \ + __access_func(f18); \ + __access_func(f19); \ + __access_func(f20); \ + __access_func(f21); \ + __access_func(f22); \ + __access_func(f23); \ + __access_func(f24); \ + __access_func(f25); \ + __access_func(f26); \ + __access_func(f27); \ + __access_func(f28); \ + __access_func(f29); \ + __access_func(f30); \ + __access_func(f31) + + +#ifdef CONFIG_RISCV_SCALAR_MISALIGNED + +/* + * Disable compressed instructions set to keep a constant offset 
between FP + * load/store/move instructions + */ +.option norvc +/* + * put_f32_reg - Set a FP register from a register containing the value + * a0 = FP register index to be set + * a1 = value to be loaded in the FP register + */ +SYM_FUNC_START(put_f32_reg) + fp_access_prologue + fp_access_body(put_f32) + fp_access_epilogue +SYM_FUNC_END(put_f32_reg) + +/* + * get_f32_reg - Get a FP register value and return it + * a0 = FP register index to be retrieved + */ +SYM_FUNC_START(get_f32_reg) + fp_access_prologue + fp_access_body(get_f32) + fp_access_epilogue +SYM_FUNC_END(get_f32_reg) + +/* + * put_f64_reg - Set a 64 bits FP register from a value or a pointer. + * a0 = FP register index to be set + * a1 = value/pointer to be loaded in the FP register (when xlen == 32 bits, we + * load the value to a pointer). + */ +SYM_FUNC_START(put_f64_reg) + fp_access_prologue + fp_access_body(put_f64) + fp_access_epilogue +SYM_FUNC_END(put_f64_reg) + +/* + * get_f64_reg - Get a 64 bits FP register value and returned it or store it to + * a pointer. + * a0 = FP register index to be retrieved + * a1 = If xlen == 32, pointer which should be loaded with the FP register value + * or unused if xlen == 64. In which case the FP register value is returned + * through a0 + */ +SYM_FUNC_START(get_f64_reg) + fp_access_prologue + fp_access_body(get_f64) + fp_access_epilogue +SYM_FUNC_END(get_f64_reg) + +#endif /* CONFIG_RISCV_SCALAR_MISALIGNED */ diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 2086f6585773..4c6c24380cfd 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -8,121 +8,134 @@ #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/memory.h> +#include <linux/irqflags.h> +#include <linux/stop_machine.h> #include <asm/cacheflush.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE -void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) +unsigned long ftrace_call_adjust(unsigned long addr) { - mutex_lock(&text_mutex); + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return addr + 8 + MCOUNT_AUIPC_SIZE; + + return addr + MCOUNT_AUIPC_SIZE; +} + +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + return fentry_ip - MCOUNT_AUIPC_SIZE; } -void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) +void arch_ftrace_update_code(int command) { + mutex_lock(&text_mutex); + command |= FTRACE_MAY_SLEEP; + ftrace_modify_all_code(command); mutex_unlock(&text_mutex); + flush_icache_all(); } -static int ftrace_check_current_call(unsigned long hook_pos, - unsigned int *expected) +static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate) { + unsigned int call[2], offset; unsigned int replaced[2]; - unsigned int nops[2] = {NOP4, NOP4}; - /* we expect nops at the hook position */ - if (!expected) - expected = nops; - - /* - * Read the text we want to modify; - * return must be -EFAULT on read error - */ - if (copy_from_kernel_nofault(replaced, (void *)hook_pos, - MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* - * Make sure it is what we expect it to be; - * return must be -EINVAL on failed comparison - */ - if (memcmp(expected, replaced, sizeof(replaced))) { - pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n", - (void *)hook_pos, expected[0], expected[1], replaced[0], - replaced[1]); - return -EINVAL; + offset = target - source; + call[1] = to_jalr_t0(offset); + + if (validate) { + call[0] = to_auipc_t0(offset); + /* + * Read the text we want to 
modify; + * return must be -EFAULT on read error + */ + if (copy_from_kernel_nofault(replaced, (void *)source, 2 * MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (replaced[0] != call[0]) { + pr_err("%p: expected (%08x) but got (%08x)\n", + (void *)source, call[0], replaced[0]); + return -EINVAL; + } } + /* Replace the jalr at once. Return -EPERM on write error. */ + if (patch_insn_write((void *)(source + MCOUNT_AUIPC_SIZE), call + 1, MCOUNT_JALR_SIZE)) + return -EPERM; + return 0; } -static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, - bool enable) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec) { - unsigned int call[2]; - unsigned int nops[2] = {NOP4, NOP4}; + const struct ftrace_ops *ops = NULL; - make_call(hook_pos, target, call); + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); + } - /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */ - if (patch_text_nosync - ((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE)) - return -EPERM; + if (!ops) + ops = &ftrace_list_ops; - return 0; + return ops; } -/* - * Put 5 instructions with 16 bytes at the front of function within - * patchable function entry nops' area. - * - * 0: REG_S ra, -SZREG(sp) - * 1: auipc ra, 0x? - * 2: jalr -?(ra) - * 3: REG_L ra, -SZREG(sp) - * - * So the opcodes is: - * 0: 0xfe113c23 (sd)/0xfe112e23 (sw) - * 1: 0x???????? -> auipc - * 2: 0x???????? -> jalr - * 3: 0xff813083 (ld)/0xffc12083 (lw) - */ -#if __riscv_xlen == 64 -#define INSN0 0xfe113c23 -#define INSN3 0xff813083 -#elif __riscv_xlen == 32 -#define INSN0 0xfe112e23 -#define INSN3 0xffc12083 -#endif +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, const struct ftrace_ops *ops) +{ + unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8); -#define FUNC_ENTRY_SIZE 16 -#define FUNC_ENTRY_JMP 4 + return patch_text_nosync((void *)literal, &ops, sizeof(ops)); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec)); +} +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned int call[4] = {INSN0, 0, 0, INSN3}; - unsigned long target = addr; - unsigned long caller = rec->ip + FUNC_ENTRY_JMP; + unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE; + int ret; - call[1] = to_auipc_insn((unsigned int)(target - caller)); - call[2] = to_jalr_insn((unsigned int)(target - caller)); + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; - if (patch_text_nosync((void *)rec->ip, call, FUNC_ENTRY_SIZE)) - return -EPERM; + orig_addr = (unsigned long)&ftrace_caller; + distance = addr > orig_addr ? 
addr - orig_addr : orig_addr - addr; + if (distance > JALR_RANGE) + addr = FTRACE_ADDR; - return 0; + return __ftrace_modify_call(pc, addr, false); } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int nops[4] = {NOP4, NOP4, NOP4, NOP4}; + u32 nop4 = RISCV_INSN_NOP4; + int ret; + + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; - if (patch_text_nosync((void *)rec->ip, nops, FUNC_ENTRY_SIZE)) + if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE)) return -EPERM; return 0; } - /* * This is called early on, and isn't wrapped by * ftrace_arch_code_modify_{prepare,post_process}() and therefore doesn't hold @@ -132,43 +145,71 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, */ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { - int out; + unsigned long pc = rec->ip - MCOUNT_AUIPC_SIZE; + unsigned int nops[2], offset; + int ret; + + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; + + offset = (unsigned long) &ftrace_caller - pc; + nops[0] = to_auipc_t0(offset); + nops[1] = RISCV_INSN_NOP4; - ftrace_arch_code_modify_prepare(); - out = ftrace_make_nop(mod, rec, MCOUNT_ADDR); - ftrace_arch_code_modify_post_process(); + mutex_lock(&text_mutex); + ret = patch_insn_write((void *)pc, nops, 2 * MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); - return out; + return ret; } +ftrace_func_t ftrace_call_dest = ftrace_stub; int ftrace_update_ftrace_func(ftrace_func_t func) { - int ret = __ftrace_modify_call((unsigned long)&ftrace_call, - (unsigned long)func, true); - if (!ret) { - ret = __ftrace_modify_call((unsigned long)&ftrace_regs_call, - (unsigned long)func, true); - } + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. + */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; - return ret; + WRITE_ONCE(ftrace_call_dest, func); + /* + * The data fence ensures that the update to ftrace_call_dest happens + * before the write to function_trace_op later in the generic ftrace code. + * If the sequence is not enforced, then an old ftrace_call_dest may + * race loading a new function_trace_op set in ftrace_modify_all_code. + */ + smp_wmb(); + /* + * Updating ftrace does not take the stop_machine path, so irqs should not + * be disabled. 
+ */ + WARN_ON(irqs_disabled()); + smp_call_function(ftrace_sync_ipi, NULL, 1); + return 0; } -#endif -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#else /* CONFIG_DYNAMIC_FTRACE */ +unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { - unsigned int call[2]; - unsigned long caller = rec->ip + FUNC_ENTRY_JMP; + unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE; int ret; - make_call(caller, old_addr, call); - ret = ftrace_check_current_call(caller, call); - + ret = ftrace_rec_update_ops(rec); if (ret) return ret; - return __ftrace_modify_call(caller, addr, true); + return __ftrace_modify_call(caller, FTRACE_ADDR, true); } #endif @@ -196,32 +237,25 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } #ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); -extern void ftrace_graph_regs_call(void); -int ftrace_enable_ftrace_graph_caller(void) +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { - int ret; - - ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, true); - if (ret) - return ret; - - return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, - (unsigned long)&prepare_ftrace_return, true); -} + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long frame_pointer = arch_ftrace_regs(fregs)->s0; + unsigned long *parent = &arch_ftrace_regs(fregs)->ra; + unsigned long old; -int ftrace_disable_ftrace_graph_caller(void) -{ - int ret; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; - ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, false); - if (ret) - return ret; + /* + * We don't suffer access faults, so no extra fault-recovery assembly + * is needed here. + */ + old = *parent; - return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, - (unsigned long)&prepare_ftrace_return, false); + if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs)) + *parent = return_hooker; } #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index b865046e4dbb..bdf3352acf4c 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -11,14 +11,14 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/csr.h> -#include <asm/cpu_ops_sbi.h> #include <asm/hwcap.h> #include <asm/image.h> +#include <asm/scs.h> #include <asm/xip_fixup.h> #include "efi-header.S" __HEAD -ENTRY(_start) +SYM_CODE_START(_start) /* * Image header expected by Linux boot-loaders. The image header data * structure is described in asm/image.h. @@ -88,6 +88,7 @@ relocate_enable_mmu: /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT la a1, satp_mode + XIP_FIXUP_OFFSET a1 REG_L a1, 0(a1) or a2, a2, a1 @@ -110,10 +111,7 @@ relocate_enable_mmu: csrw CSR_TVEC, a0 /* Reload the global pointer */ -.option push -.option norelax - la gp, __global_pointer$ -.option pop + load_global_pointer /* * Switch to kernel page tables. 
A full fence is necessary in order to @@ -133,17 +131,20 @@ secondary_start_sbi: csrw CSR_IE, zero csrw CSR_IP, zero +#ifndef CONFIG_RISCV_M_MODE + /* Enable time CSR */ + li t0, 0x2 + csrw CSR_SCOUNTEREN, t0 +#endif + /* Load the global pointer */ - .option push - .option norelax - la gp, __global_pointer$ - .option pop + load_global_pointer /* - * Disable FPU to detect illegal usage of - * floating point in kernel space + * Disable FPU & VECTOR to detect illegal usage of + * floating point or vector in kernel space */ - li t0, SR_FS + li t0, SR_FS_VS csrc CSR_STATUS, t0 /* Set trap vector to spin forever to help debug */ @@ -168,12 +169,24 @@ secondary_start_sbi: XIP_FIXUP_OFFSET a0 call relocate_enable_mmu #endif - call setup_trap_vector - tail smp_callin + call .Lsetup_trap_vector + scs_load_current + call smp_callin #endif /* CONFIG_SMP */ .align 2 -setup_trap_vector: +.Lsecondary_park: + /* + * Park this hart if we: + * - have too many harts on CONFIG_RISCV_BOOT_SPINWAIT + * - receive an early trap, before setup_trap_vector finished + * - fail in smp_callin(), as a successful one wouldn't return + */ + wfi + j .Lsecondary_park + +.align 2 +.Lsetup_trap_vector: /* Set trap vector to exception handler */ la a0, handle_exception csrw CSR_TVEC, a0 @@ -185,15 +198,9 @@ setup_trap_vector: csrw CSR_SCRATCH, zero ret -.align 2 -.Lsecondary_park: - /* We lack SMP support or have too many harts, so park this hart */ - wfi - j .Lsecondary_park - -END(_start) +SYM_CODE_END(_start) -ENTRY(_start_kernel) +SYM_CODE_START(_start_kernel) /* Mask all interrupts */ csrw CSR_IE, zero csrw CSR_IP, zero @@ -210,7 +217,7 @@ ENTRY(_start_kernel) * not implement PMPs, so we set up a quick trap handler to just skip * touching the PMPs on any trap. */ - la a0, pmp_done + la a0, .Lpmp_done csrw CSR_TVEC, a0 li a0, -1 @@ -218,26 +225,27 @@ ENTRY(_start_kernel) li a0, (PMP_A_NAPOT | PMP_R | PMP_W | PMP_X) csrw CSR_PMPCFG0, a0 .align 2 -pmp_done: +.Lpmp_done: /* * The hartid in a0 is expected later on, and we have no firmware * to hand it to us. 
*/ csrr a0, CSR_MHARTID +#else + /* Enable time CSR */ + li t0, 0x2 + csrw CSR_SCOUNTEREN, t0 #endif /* CONFIG_RISCV_M_MODE */ /* Load the global pointer */ -.option push -.option norelax - la gp, __global_pointer$ -.option pop + load_global_pointer /* - * Disable FPU to detect illegal usage of - * floating point in kernel space + * Disable FPU & VECTOR to detect illegal usage of + * floating point or vector in kernel space */ - li t0, SR_FS + li t0, SR_FS_VS csrc CSR_STATUS, t0 #ifdef CONFIG_RISCV_BOOT_SPINWAIT @@ -272,27 +280,25 @@ pmp_done: la sp, _end + THREAD_SIZE XIP_FIXUP_OFFSET sp mv s0, a0 + mv s1, a1 call __copy_data - /* Restore a0 copy */ + /* Restore a0 & a1 copy */ mv a0, s0 + mv a1, s1 #endif #ifndef CONFIG_XIP_KERNEL /* Clear BSS for flat non-ELF images */ la a3, __bss_start la a4, __bss_stop - ble a4, a3, clear_bss_done -clear_bss: + ble a4, a3, .Lclear_bss_done +.Lclear_bss: REG_S zero, (a3) add a3, a3, RISCV_SZPTR - blt a3, a4, clear_bss -clear_bss_done: + blt a3, a4, .Lclear_bss +.Lclear_bss_done: #endif - /* Save hart ID and DTB physical address */ - mv s0, a0 - mv s1, a1 - la a2, boot_cpu_hartid XIP_FIXUP_OFFSET a2 REG_S a0, (a2) @@ -301,12 +307,17 @@ clear_bss_done: la tp, init_task la sp, init_thread_union + THREAD_SIZE XIP_FIXUP_OFFSET sp + addi sp, sp, -PT_SIZE_ON_STACK + scs_load_init_stack #ifdef CONFIG_BUILTIN_DTB la a0, __dtb_start XIP_FIXUP_OFFSET a0 #else - mv a0, s1 + mv a0, a1 #endif /* CONFIG_BUILTIN_DTB */ + /* Set trap vector to spin forever to help debug */ + la a3, .Lsecondary_park + csrw CSR_TVEC, a3 call setup_vm #ifdef CONFIG_MMU la a0, early_pg_dir @@ -314,10 +325,12 @@ clear_bss_done: call relocate_enable_mmu #endif /* CONFIG_MMU */ - call setup_trap_vector + call .Lsetup_trap_vector /* Restore C environment */ la tp, init_task la sp, init_thread_union + THREAD_SIZE + addi sp, sp, -PT_SIZE_ON_STACK + scs_load_current #ifdef CONFIG_KASAN call kasan_early_init @@ -326,7 +339,7 @@ clear_bss_done: call soc_early_init tail start_kernel -#if CONFIG_RISCV_BOOT_SPINWAIT +#ifdef CONFIG_RISCV_BOOT_SPINWAIT .Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park @@ -355,10 +368,10 @@ clear_bss_done: tail .Lsecondary_start_common #endif /* CONFIG_RISCV_BOOT_SPINWAIT */ -END(_start_kernel) +SYM_CODE_END(_start_kernel) #ifdef CONFIG_RISCV_M_MODE -ENTRY(reset_regs) +SYM_CODE_START_LOCAL(reset_regs) li sp, 0 li gp, 0 li tp, 0 @@ -392,7 +405,7 @@ ENTRY(reset_regs) #ifdef CONFIG_FPU csrr t0, CSR_MISA andi t0, t0, (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D) - beqz t0, .Lreset_regs_done + beqz t0, .Lreset_regs_done_fpu li t1, SR_FS csrs CSR_STATUS, t1 @@ -430,8 +443,31 @@ ENTRY(reset_regs) fmv.s.x f31, zero csrw fcsr, 0 /* note that the caller must clear SR_FS */ +.Lreset_regs_done_fpu: #endif /* CONFIG_FPU */ -.Lreset_regs_done: + +#ifdef CONFIG_RISCV_ISA_V + csrr t0, CSR_MISA + li t1, COMPAT_HWCAP_ISA_V + and t0, t0, t1 + beqz t0, .Lreset_regs_done_vector + + /* + * Clear vector registers and reset vcsr + * VLMAX has a defined value, VLEN is a constant, + * and this form of vsetvli is defined to set vl to VLMAX. 
+ */ + li t1, SR_VS + csrs CSR_STATUS, t1 + csrs CSR_VCSR, x0 + vsetvli t1, x0, e8, m8, ta, ma + vmv.v.i v0, 0 + vmv.v.i v8, 0 + vmv.v.i v16, 0 + vmv.v.i v24, 0 + /* note that the caller must clear SR_VS */ +.Lreset_regs_done_vector: +#endif /* CONFIG_RISCV_ISA_V */ ret -END(reset_regs) +SYM_CODE_END(reset_regs) #endif /* CONFIG_RISCV_M_MODE */ diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 726731ada534..a556fdaafed9 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -10,7 +10,6 @@ extern atomic_t hart_lottery; -asmlinkage void do_page_fault(struct pt_regs *regs); asmlinkage void __init setup_vm(uintptr_t dtb_pa); #ifdef CONFIG_XIP_KERNEL asmlinkage void __init __copy_data(void); diff --git a/arch/riscv/kernel/hibernate-asm.S b/arch/riscv/kernel/hibernate-asm.S new file mode 100644 index 000000000000..d040dcf4add4 --- /dev/null +++ b/arch/riscv/kernel/hibernate-asm.S @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Hibernation low level support for RISCV. + * + * Copyright (C) 2023 StarFive Technology Co., Ltd. + * + * Author: Jee Heng Sia <jeeheng.sia@starfivetech.com> + */ + +#include <asm/asm.h> +#include <asm/asm-offsets.h> +#include <asm/assembler.h> +#include <asm/csr.h> + +#include <linux/linkage.h> + +/* + * int __hibernate_cpu_resume(void) + * Switch back to the hibernated image's page table prior to restoring the CPU + * context. + * + * Always returns 0 + */ +SYM_FUNC_START(__hibernate_cpu_resume) + /* switch to hibernated image's page table. */ + csrw CSR_SATP, s0 + sfence.vma + + REG_L a0, hibernate_cpu_context + + suspend_restore_regs + + /* Return zero value. */ + mv a0, zero + + ret +SYM_FUNC_END(__hibernate_cpu_resume) + +/* + * Prepare to restore the image. + * a0: satp of saved page tables. + * a1: satp of temporary page tables. + * a2: cpu_resume. + */ +SYM_FUNC_START(hibernate_restore_image) + mv s0, a0 + mv s1, a1 + mv s2, a2 + REG_L s4, restore_pblist + REG_L a1, relocated_restore_code + + jr a1 +SYM_FUNC_END(hibernate_restore_image) + +/* + * The below code will be executed from a 'safe' page. + * It first switches to the temporary page table, then starts to copy the pages + * back to the original memory location. Finally, it jumps to __hibernate_cpu_resume() + * to restore the CPU context. + */ +SYM_FUNC_START(hibernate_core_restore_code) + /* switch to temp page table. */ + csrw satp, s1 + sfence.vma +.Lcopy: + /* The below code will restore the hibernated image. */ + REG_L a1, HIBERN_PBE_ADDR(s4) + REG_L a0, HIBERN_PBE_ORIG(s4) + + copy_page a0, a1 + + REG_L s4, HIBERN_PBE_NEXT(s4) + bnez s4, .Lcopy + + jr s2 +SYM_FUNC_END(hibernate_core_restore_code) diff --git a/arch/riscv/kernel/hibernate.c b/arch/riscv/kernel/hibernate.c new file mode 100644 index 000000000000..671b686c0158 --- /dev/null +++ b/arch/riscv/kernel/hibernate.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Hibernation support for RISCV + * + * Copyright (C) 2023 StarFive Technology Co., Ltd. 
+ * + * Author: Jee Heng Sia <jeeheng.sia@starfivetech.com> + */ + +#include <asm/barrier.h> +#include <asm/cacheflush.h> +#include <asm/mmu_context.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/set_memory.h> +#include <asm/smp.h> +#include <asm/suspend.h> + +#include <linux/cpu.h> +#include <linux/memblock.h> +#include <linux/pm.h> +#include <linux/sched.h> +#include <linux/suspend.h> +#include <linux/utsname.h> + +/* The logical cpu number we should resume on, initialised to a non-cpu number. */ +static int sleep_cpu = -EINVAL; + +/* Pointer to the temporary resume page table. */ +static pgd_t *resume_pg_dir; + +/* CPU context to be saved. */ +struct suspend_context *hibernate_cpu_context; +EXPORT_SYMBOL_GPL(hibernate_cpu_context); + +unsigned long relocated_restore_code; +EXPORT_SYMBOL_GPL(relocated_restore_code); + +/** + * struct arch_hibernate_hdr_invariants - container to store kernel build version. + * @uts_version: to save the build number and date so that we do not resume with + * a different kernel. + */ +struct arch_hibernate_hdr_invariants { + char uts_version[__NEW_UTS_LEN + 1]; +}; + +/** + * struct arch_hibernate_hdr - helper parameters that help us to restore the image. + * @invariants: container to store kernel build version. + * @hartid: to make sure same boot_cpu executes the hibernate/restore code. + * @saved_satp: original page table used by the hibernated image. + * @restore_cpu_addr: the kernel's image address to restore the CPU context. + */ +static struct arch_hibernate_hdr { + struct arch_hibernate_hdr_invariants invariants; + unsigned long hartid; + unsigned long saved_satp; + unsigned long restore_cpu_addr; +} resume_hdr; + +static void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i) +{ + memset(i, 0, sizeof(*i)); + memcpy(i->uts_version, init_utsname()->version, sizeof(i->uts_version)); +} + +/* + * Check if the given pfn is in the 'nosave' section. + */ +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = sym_to_pfn(&__nosave_begin); + unsigned long nosave_end_pfn = sym_to_pfn(&__nosave_end - 1); + + return ((pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn)); +} + +void notrace save_processor_state(void) +{ +} + +void notrace restore_processor_state(void) +{ +} + +/* + * Helper parameters need to be saved to the hibernation image header. + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct arch_hibernate_hdr *hdr = addr; + + if (max_size < sizeof(*hdr)) + return -EOVERFLOW; + + arch_hdr_invariants(&hdr->invariants); + + hdr->hartid = cpuid_to_hartid_map(sleep_cpu); + hdr->saved_satp = csr_read(CSR_SATP); + hdr->restore_cpu_addr = (unsigned long)__hibernate_cpu_resume; + + return 0; +} +EXPORT_SYMBOL_GPL(arch_hibernation_header_save); + +/* + * Retrieve the helper parameters from the hibernation image header. 
+ */ +int arch_hibernation_header_restore(void *addr) +{ + struct arch_hibernate_hdr_invariants invariants; + struct arch_hibernate_hdr *hdr = addr; + int ret = 0; + + arch_hdr_invariants(&invariants); + + if (memcmp(&hdr->invariants, &invariants, sizeof(invariants))) { + pr_crit("Hibernate image not generated by this kernel!\n"); + return -EINVAL; + } + + sleep_cpu = riscv_hartid_to_cpuid(hdr->hartid); + if (sleep_cpu < 0) { + pr_crit("Hibernated on a CPU not known to this kernel!\n"); + sleep_cpu = -EINVAL; + return -EINVAL; + } + +#ifdef CONFIG_SMP + ret = bringup_hibernate_cpu(sleep_cpu); + if (ret) { + sleep_cpu = -EINVAL; + return ret; + } +#endif + resume_hdr = *hdr; + + return ret; +} +EXPORT_SYMBOL_GPL(arch_hibernation_header_restore); + +int swsusp_arch_suspend(void) +{ + int ret = 0; + + if (__cpu_suspend_enter(hibernate_cpu_context)) { + sleep_cpu = smp_processor_id(); + suspend_save_csrs(hibernate_cpu_context); + ret = swsusp_save(); + } else { + suspend_restore_csrs(hibernate_cpu_context); + flush_tlb_all(); + flush_icache_all(); + + /* + * Tell the hibernation core that we've just restored the memory. + */ + in_suspend = 0; + sleep_cpu = -EINVAL; + } + + return ret; +} + +static int temp_pgtable_map_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, + unsigned long end, pgprot_t prot) +{ + pte_t *src_ptep; + pte_t *dst_ptep; + + if (pmd_none(READ_ONCE(*dst_pmdp))) { + dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_ptep) + return -ENOMEM; + + pmd_populate_kernel(NULL, dst_pmdp, dst_ptep); + } + + dst_ptep = pte_offset_kernel(dst_pmdp, start); + src_ptep = pte_offset_kernel(src_pmdp, start); + + do { + pte_t pte = READ_ONCE(*src_ptep); + + if (pte_present(pte)) + set_pte(dst_ptep, __pte(pte_val(pte) | pgprot_val(prot))); + } while (dst_ptep++, src_ptep++, start += PAGE_SIZE, start < end); + + return 0; +} + +static int temp_pgtable_map_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, + unsigned long end, pgprot_t prot) +{ + unsigned long next; + unsigned long ret; + pmd_t *src_pmdp; + pmd_t *dst_pmdp; + + if (pud_none(READ_ONCE(*dst_pudp))) { + dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmdp) + return -ENOMEM; + + pud_populate(NULL, dst_pudp, dst_pmdp); + } + + dst_pmdp = pmd_offset(dst_pudp, start); + src_pmdp = pmd_offset(src_pudp, start); + + do { + pmd_t pmd = READ_ONCE(*src_pmdp); + + next = pmd_addr_end(start, end); + + if (pmd_none(pmd)) + continue; + + if (pmd_leaf(pmd)) { + set_pmd(dst_pmdp, __pmd(pmd_val(pmd) | pgprot_val(prot))); + } else { + ret = temp_pgtable_map_pte(dst_pmdp, src_pmdp, start, next, prot); + if (ret) + return -ENOMEM; + } + } while (dst_pmdp++, src_pmdp++, start = next, start != end); + + return 0; +} + +static int temp_pgtable_map_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, + unsigned long end, pgprot_t prot) +{ + unsigned long next; + unsigned long ret; + pud_t *dst_pudp; + pud_t *src_pudp; + + if (p4d_none(READ_ONCE(*dst_p4dp))) { + dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pudp) + return -ENOMEM; + + p4d_populate(NULL, dst_p4dp, dst_pudp); + } + + dst_pudp = pud_offset(dst_p4dp, start); + src_pudp = pud_offset(src_p4dp, start); + + do { + pud_t pud = READ_ONCE(*src_pudp); + + next = pud_addr_end(start, end); + + if (pud_none(pud)) + continue; + + if (pud_leaf(pud)) { + set_pud(dst_pudp, __pud(pud_val(pud) | pgprot_val(prot))); + } else { + ret = temp_pgtable_map_pmd(dst_pudp, src_pudp, start, next, prot); + if (ret) + return -ENOMEM; + } + } while (dst_pudp++, 
src_pudp++, start = next, start != end); + + return 0; +} + +static int temp_pgtable_map_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end, pgprot_t prot) +{ + unsigned long next; + unsigned long ret; + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_p4dp = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!dst_p4dp) + return -ENOMEM; + + pgd_populate(NULL, dst_pgdp, dst_p4dp); + } + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + + do { + p4d_t p4d = READ_ONCE(*src_p4dp); + + next = p4d_addr_end(start, end); + + if (p4d_none(p4d)) + continue; + + if (p4d_leaf(p4d)) { + set_p4d(dst_p4dp, __p4d(p4d_val(p4d) | pgprot_val(prot))); + } else { + ret = temp_pgtable_map_pud(dst_p4dp, src_p4dp, start, next, prot); + if (ret) + return -ENOMEM; + } + } while (dst_p4dp++, src_p4dp++, start = next, start != end); + + return 0; +} + +static int temp_pgtable_mapping(pgd_t *pgdp, unsigned long start, unsigned long end, pgprot_t prot) +{ + pgd_t *dst_pgdp = pgd_offset_pgd(pgdp, start); + pgd_t *src_pgdp = pgd_offset_k(start); + unsigned long next; + unsigned long ret; + + do { + pgd_t pgd = READ_ONCE(*src_pgdp); + + next = pgd_addr_end(start, end); + + if (pgd_none(pgd)) + continue; + + if (pgd_leaf(pgd)) { + set_pgd(dst_pgdp, __pgd(pgd_val(pgd) | pgprot_val(prot))); + } else { + ret = temp_pgtable_map_p4d(dst_pgdp, src_pgdp, start, next, prot); + if (ret) + return -ENOMEM; + } + } while (dst_pgdp++, src_pgdp++, start = next, start != end); + + return 0; +} + +static unsigned long relocate_restore_code(void) +{ + void *page = (void *)get_safe_page(GFP_ATOMIC); + + if (!page) + return -ENOMEM; + + copy_page(page, hibernate_core_restore_code); + + /* Make the page containing the relocated code executable. */ + set_memory_x((unsigned long)page, 1); + + return (unsigned long)page; +} + +int swsusp_arch_resume(void) +{ + unsigned long end = (unsigned long)pfn_to_virt(max_low_pfn); + unsigned long start = PAGE_OFFSET; + int ret; + + /* + * Memory allocated by get_safe_page() will be dealt with by the hibernation core; + * we don't need to free it here. + */ + resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!resume_pg_dir) + return -ENOMEM; + + /* + * Create a temporary page table and map the whole linear region as executable and + * writable. + */ + ret = temp_pgtable_mapping(resume_pg_dir, start, end, __pgprot(_PAGE_WRITE | _PAGE_EXEC)); + if (ret) + return ret; + + /* Move the restore code to a new page so that it doesn't get overwritten by itself. */ + relocated_restore_code = relocate_restore_code(); + if (relocated_restore_code == -ENOMEM) + return -ENOMEM; + + /* + * Map the __hibernate_cpu_resume() address into the temporary page table so that the + * restore code can jump to it once it has finished restoring the image. This way, the + * code that executes next does not find itself in a different address space after + * switching over to the original page table used by the hibernated image. + * The __hibernate_cpu_resume() mapping is unnecessary for RV32 since the kernel and + * linear addresses are identical, but different for RV64. To ensure consistency, we + * map it for both RV32 and RV64 kernels. + * Additionally, we should ensure that the page is writable before restoring the image. 
+ */ + start = (unsigned long)resume_hdr.restore_cpu_addr; + end = start + PAGE_SIZE; + + ret = temp_pgtable_mapping(resume_pg_dir, start, end, __pgprot(_PAGE_WRITE)); + if (ret) + return ret; + + hibernate_restore_image(resume_hdr.saved_satp, (PFN_DOWN(__pa(resume_pg_dir)) | satp_mode), + resume_hdr.restore_cpu_addr); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP_SMP +int hibernate_resume_nonboot_cpu_disable(void) +{ + if (sleep_cpu < 0) { + pr_err("Failing to resume from hibernate on an unknown CPU\n"); + return -ENODEV; + } + + return freeze_secondary_cpus(sleep_cpu); +} +#endif + +static int __init riscv_hibernate_init(void) +{ + hibernate_cpu_context = kzalloc(sizeof(*hibernate_cpu_context), GFP_KERNEL); + + if (WARN_ON(!hibernate_cpu_context)) + return -ENOMEM; + + return 0; +} + +early_initcall(riscv_hibernate_init); diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h index 71a76a623257..3df30dd1c458 100644 --- a/arch/riscv/kernel/image-vars.h +++ b/arch/riscv/kernel/image-vars.h @@ -23,28 +23,14 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = memcmp; -__efistub_memchr = memchr; -__efistub_memcpy = memcpy; -__efistub_memmove = memmove; -__efistub_memset = memset; -__efistub_strlen = strlen; -__efistub_strnlen = strnlen; -__efistub_strcmp = strcmp; -__efistub_strncmp = strncmp; -__efistub_strrchr = strrchr; - -#ifdef CONFIG_KASAN -__efistub___memcpy = memcpy; -__efistub___memmove = memmove; -__efistub___memset = memset; -#endif - __efistub__start = _start; __efistub__start_kernel = _start_kernel; __efistub__end = _end; __efistub__edata = _edata; +__efistub___init_text_end = __init_text_end; +#if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) __efistub_screen_info = screen_info; +#endif #endif diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index 7207fa08d78f..9ceda02507ca 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -7,8 +7,97 @@ #include <linux/interrupt.h> #include <linux/irqchip.h> +#include <linux/irqdomain.h> +#include <linux/module.h> +#include <linux/scs.h> #include <linux/seq_file.h> +#include <asm/sbi.h> #include <asm/smp.h> +#include <asm/softirq_stack.h> +#include <asm/stacktrace.h> + +static struct fwnode_handle *(*__get_intc_node)(void); + +void riscv_set_intc_hwnode_fn(struct fwnode_handle *(*fn)(void)) +{ + __get_intc_node = fn; +} + +struct fwnode_handle *riscv_get_intc_hwnode(void) +{ + if (__get_intc_node) + return __get_intc_node(); + + return NULL; +} +EXPORT_SYMBOL_GPL(riscv_get_intc_hwnode); + +#ifdef CONFIG_IRQ_STACKS +#include <asm/irq_stack.h> + +DECLARE_PER_CPU(ulong *, irq_shadow_call_stack_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(ulong *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) +{ + int cpu; + + if (!scs_is_enabled()) + return; + + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(cpu_to_node(cpu)); +} + +DEFINE_PER_CPU(ulong *, irq_stack_ptr); + +#ifdef CONFIG_VMAP_STACK +static void init_irq_stacks(void) +{ + int cpu; + ulong *p; + + for_each_possible_cpu(cpu) { + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); + per_cpu(irq_stack_ptr, cpu) = p; + } +} +#else +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ +DEFINE_PER_CPU_ALIGNED(ulong [IRQ_STACK_SIZE/sizeof(ulong)], irq_stack); + +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} +#endif /* CONFIG_VMAP_STACK */ + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static void ___do_softirq(struct pt_regs *regs) +{ + __do_softirq(); +} + +void do_softirq_own_stack(void) +{ + if (on_thread_stack()) + call_on_irq_stack(NULL, ___do_softirq); + else + __do_softirq(); +} +#endif /* CONFIG_SOFTIRQ_ON_OWN_STACK */ + +#else +static void init_irq_scs(void) {} +static void init_irq_stacks(void) {} +#endif /* CONFIG_IRQ_STACKS */ int arch_show_interrupts(struct seq_file *p, int prec) { @@ -18,7 +107,10 @@ int arch_show_interrupts(struct seq_file *p, int prec) void __init init_IRQ(void) { + init_irq_scs(); + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); + sbi_ipi_init(); } diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c index e6694759dbd0..b4c1a6a3fbd2 100644 --- a/arch/riscv/kernel/jump_label.c +++ b/arch/riscv/kernel/jump_label.c @@ -9,13 +9,14 @@ #include <linux/memory.h> #include <linux/mutex.h> #include <asm/bug.h> -#include <asm/patch.h> +#include <asm/cacheflush.h> +#include <asm/text-patching.h> +#include <asm/insn-def.h> -#define RISCV_INSN_NOP 0x00000013U #define RISCV_INSN_JAL 0x0000006fU -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { void *addr = (void *)jump_entry_code(entry); u32 insn; @@ -24,7 +25,7 @@ void arch_jump_label_transform(struct jump_entry *entry, long offset = jump_entry_target(entry) - jump_entry_code(entry); if (WARN_ON(offset & 1 || offset < -524288 || offset >= 524288)) - return; + return true; insn = RISCV_INSN_JAL | (((u32)offset & GENMASK(19, 12)) << (12 - 12)) | @@ -32,10 +33,23 @@ void arch_jump_label_transform(struct jump_entry *entry, (((u32)offset & GENMASK(10, 1)) << (21 - 1)) | (((u32)offset & GENMASK(20, 20)) << (31 - 20)); } else { - insn = RISCV_INSN_NOP; + insn = RISCV_INSN_NOP4; } - mutex_lock(&text_mutex); - patch_text_nosync(addr, &insn, sizeof(insn)); - mutex_unlock(&text_mutex); + if (early_boot_irqs_disabled) { + riscv_patch_in_stop_machine = 1; + patch_insn_write(addr, &insn, sizeof(insn)); + riscv_patch_in_stop_machine = 0; + } else { + mutex_lock(&text_mutex); + patch_insn_write(addr, &insn, sizeof(insn)); + mutex_unlock(&text_mutex); + } + + return true; +} + +void arch_jump_label_transform_apply(void) +{ + flush_icache_all(); } diff --git a/arch/riscv/kernel/kernel_mode_fpu.c b/arch/riscv/kernel/kernel_mode_fpu.c new file mode 100644 index 000000000000..0ac8348876c4 --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_fpu.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 SiFive + */ + +#include <linux/export.h> +#include <linux/preempt.h> + +#include <asm/csr.h> +#include <asm/fpu.h> +#include <asm/processor.h> +#include <asm/switch_to.h> + +void kernel_fpu_begin(void) +{ + preempt_disable(); + fstate_save(current, task_pt_regs(current)); + csr_set(CSR_SSTATUS, SR_FS); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + csr_clear(CSR_SSTATUS, SR_FS); + fstate_restore(current, task_pt_regs(current)); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c new 
file mode 100644 index 000000000000..99972a48e86b --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Catalin Marinas <catalin.marinas@arm.com> + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2021 SiFive + */ +#include <linux/compiler.h> +#include <linux/irqflags.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/types.h> + +#include <asm/vector.h> +#include <asm/switch_to.h> +#include <asm/simd.h> +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +#include <asm/asm-prototypes.h> +#endif + +static inline void riscv_v_flags_set(u32 flags) +{ + WRITE_ONCE(current->thread.riscv_v_flags, flags); +} + +static inline void riscv_v_start(u32 flags) +{ + int orig; + + orig = riscv_v_flags(); + BUG_ON((orig & flags) != 0); + riscv_v_flags_set(orig | flags); + barrier(); +} + +static inline void riscv_v_stop(u32 flags) +{ + int orig; + + barrier(); + orig = riscv_v_flags(); + BUG_ON((orig & flags) == 0); + riscv_v_flags_set(orig & ~flags); +} + +/* + * Claim ownership of the CPU vector context for use by the calling context. + * + * The caller may freely manipulate the vector context metadata until + * put_cpu_vector_context() is called. + */ +void get_cpu_vector_context(void) +{ + /* + * disable softirqs so it is impossible for softirqs to nest + * get_cpu_vector_context() when kernel is actively using Vector. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); + + riscv_v_start(RISCV_KERNEL_MODE_V); +} + +/* + * Release the CPU vector context. + * + * Must be called from a context in which get_cpu_vector_context() was + * previously called, with no call to put_cpu_vector_context() in the + * meantime. 
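riscv_v_start()/riscv_v_stop() above enforce a simple claim/release protocol on a per-task flags word: claiming a bit that is already set, or releasing one that is not, is a bug. A minimal model of that invariant, with KERNEL_MODE_V standing in for RISCV_KERNEL_MODE_V and assert() standing in for BUG_ON():

#include <assert.h>
#include <stdio.h>

#define KERNEL_MODE_V 0x1u		/* stand-in for RISCV_KERNEL_MODE_V */

static unsigned int v_flags;		/* stand-in for thread.riscv_v_flags */

static void v_start(unsigned int flags)
{
	assert((v_flags & flags) == 0);	/* nested claim is a bug */
	v_flags |= flags;
}

static void v_stop(unsigned int flags)
{
	assert((v_flags & flags) != 0);	/* unbalanced release is a bug */
	v_flags &= ~flags;
}

int main(void)
{
	v_start(KERNEL_MODE_V);		/* get_cpu_vector_context() */
	/* ... kernel-mode vector work ... */
	v_stop(KERNEL_MODE_V);		/* put_cpu_vector_context() */
	printf("flags back to %#x\n", v_flags);
	return 0;
}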
+ */ +void put_cpu_vector_context(void) +{ + riscv_v_stop(RISCV_KERNEL_MODE_V); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); +} + +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static __always_inline u32 *riscv_v_flags_ptr(void) +{ + return ¤t->thread.riscv_v_flags; +} + +static inline void riscv_preempt_v_set_dirty(void) +{ + *riscv_v_flags_ptr() |= RISCV_PREEMPT_V_DIRTY; +} + +static inline void riscv_preempt_v_reset_flags(void) +{ + *riscv_v_flags_ptr() &= ~(RISCV_PREEMPT_V_DIRTY | RISCV_PREEMPT_V_NEED_RESTORE); +} + +static inline void riscv_v_ctx_depth_inc(void) +{ + *riscv_v_flags_ptr() += RISCV_V_CTX_UNIT_DEPTH; +} + +static inline void riscv_v_ctx_depth_dec(void) +{ + *riscv_v_flags_ptr() -= RISCV_V_CTX_UNIT_DEPTH; +} + +static inline u32 riscv_v_ctx_get_depth(void) +{ + return *riscv_v_flags_ptr() & RISCV_V_CTX_DEPTH_MASK; +} + +static int riscv_v_stop_kernel_context(void) +{ + if (riscv_v_ctx_get_depth() != 0 || !riscv_preempt_v_started(current)) + return 1; + + riscv_preempt_v_clear_dirty(current); + riscv_v_stop(RISCV_PREEMPT_V); + return 0; +} + +static int riscv_v_start_kernel_context(bool *is_nested) +{ + struct __riscv_v_ext_state *kvstate, *uvstate; + + kvstate = ¤t->thread.kernel_vstate; + if (!kvstate->datap) + return -ENOENT; + + if (riscv_preempt_v_started(current)) { + WARN_ON(riscv_v_ctx_get_depth() == 0); + *is_nested = true; + get_cpu_vector_context(); + if (riscv_preempt_v_dirty(current)) { + __riscv_v_vstate_save(kvstate, kvstate->datap); + riscv_preempt_v_clear_dirty(current); + } + riscv_preempt_v_set_restore(current); + return 0; + } + + /* Transfer the ownership of V from user to kernel, then save */ + riscv_v_start(RISCV_PREEMPT_V | RISCV_PREEMPT_V_DIRTY); + if (__riscv_v_vstate_check(task_pt_regs(current)->status, DIRTY)) { + uvstate = ¤t->thread.vstate; + __riscv_v_vstate_save(uvstate, uvstate->datap); + } + riscv_preempt_v_clear_dirty(current); + return 0; +} + +/* low-level V context handling code, called with irq disabled */ +asmlinkage void riscv_v_context_nesting_start(struct pt_regs *regs) +{ + int depth; + + if (!riscv_preempt_v_started(current)) + return; + + depth = riscv_v_ctx_get_depth(); + if (depth == 0 && __riscv_v_vstate_check(regs->status, DIRTY)) + riscv_preempt_v_set_dirty(); + + riscv_v_ctx_depth_inc(); +} + +asmlinkage void riscv_v_context_nesting_end(struct pt_regs *regs) +{ + struct __riscv_v_ext_state *vstate = ¤t->thread.kernel_vstate; + u32 depth; + + WARN_ON(!irqs_disabled()); + + if (!riscv_preempt_v_started(current)) + return; + + riscv_v_ctx_depth_dec(); + depth = riscv_v_ctx_get_depth(); + if (depth == 0) { + if (riscv_preempt_v_restore(current)) { + __riscv_v_vstate_restore(vstate, vstate->datap); + __riscv_v_vstate_clean(regs); + riscv_preempt_v_reset_flags(); + } + } +} +#else +#define riscv_v_start_kernel_context(nested) (-ENOENT) +#define riscv_v_stop_kernel_context() (-ENOENT) +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + +/* + * kernel_vector_begin(): obtain the CPU vector registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the vector registers is saved back to memory as necessary. + * + * A matching call to kernel_vector_end() must be made before returning from the + * calling context. + * + * The caller may freely use the vector registers until kernel_vector_end() is + * called. 
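The preemptible kernel-mode Vector code earlier in this hunk packs an interrupt-nesting counter into the upper bits of the same per-task flags word, so dirty/restore decisions are only taken once the depth drops back to zero. A sketch of that packing; the bit layout below is illustrative, not the kernel's exact RISCV_V_CTX_* values:

#include <stdio.h>

#define V_DIRTY		0x1u		/* state flag, low bits */
#define UNIT_DEPTH	0x10000u	/* one nesting level, upper bits */
#define DEPTH_MASK	0xff0000u

static unsigned int flags;

static void depth_inc(void) { flags += UNIT_DEPTH; }
static void depth_dec(void) { flags -= UNIT_DEPTH; }
static unsigned int depth(void) { return (flags & DEPTH_MASK) / UNIT_DEPTH; }

int main(void)
{
	flags |= V_DIRTY;	/* state flags and depth coexist in one word */
	depth_inc();		/* nesting_start() in an irq */
	depth_inc();		/* ... nested irq ... */
	depth_dec();
	printf("depth=%u dirty=%u\n", depth(), flags & V_DIRTY);
	depth_dec();
	printf("depth=%u -> restore/clean decisions happen only here\n", depth());
	return 0;
}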
+ */ +void kernel_vector_begin(void) +{ + bool nested = false; + + if (WARN_ON(!(has_vector() || has_xtheadvector()))) + return; + + BUG_ON(!may_use_simd()); + + if (riscv_v_start_kernel_context(&nested)) { + get_cpu_vector_context(); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + } + + if (!nested) + riscv_v_vstate_set_restore(current, task_pt_regs(current)); + + riscv_v_enable(); +} +EXPORT_SYMBOL_GPL(kernel_vector_begin); + +/* + * kernel_vector_end(): give the CPU vector registers back to the current task + * + * Must be called from a context in which kernel_vector_begin() was previously + * called, with no call to kernel_vector_end() in the meantime. + * + * The caller must not use the vector registers after this function is called, + * unless kernel_vector_begin() is called again in the meantime. + */ +void kernel_vector_end(void) +{ + if (WARN_ON(!(has_vector() || has_xtheadvector()))) + return; + + riscv_v_disable(); + + if (riscv_v_stop_kernel_context()) + put_cpu_vector_context(); +} +EXPORT_SYMBOL_GPL(kernel_vector_end); diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c new file mode 100644 index 000000000000..f4755d49b89e --- /dev/null +++ b/arch/riscv/kernel/kexec_elf.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2021 Huawei Technologies Co, Ltd. + * + * Author: Liao Chang (liaochang1@huawei.com) + * + * Based on kexec-tools' kexec-elf-riscv.c, heavily modified + * for kernel. + */ + +#define pr_fmt(fmt) "kexec_image: " fmt + +#include <linux/elf.h> +#include <linux/kexec.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/libfdt.h> +#include <linux/types.h> +#include <linux/memblock.h> +#include <asm/setup.h> + +static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info, unsigned long old_pbase, + unsigned long new_pbase) +{ + int i; + int ret = 0; + size_t size; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + + kbuf.image = image; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + size = phdr->p_filesz; + if (size > phdr->p_memsz) + size = phdr->p_memsz; + + kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; + kbuf.bufsz = size; + kbuf.buf_align = phdr->p_align; + kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; + kbuf.memsz = phdr->p_memsz; + kbuf.top_down = false; + ret = kexec_add_buffer(&kbuf); + if (ret) + break; + } + + return ret; +} + +/* + * Go through the available phsyical memory regions and find one that hold + * an image of the specified size. 
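riscv_kexec_elf_load() above (and elf_find_pbase() just below) keep every PT_LOAD segment at its original offset from the lowest load address and rebase the entry point the same way. A userspace sketch of that arithmetic; all addresses in the sample data are made up:

#include <stdio.h>

struct phdr { unsigned long long p_paddr, p_vaddr, p_filesz, p_memsz; };

int main(void)
{
	struct phdr loads[] = {
		{ 0x80200000ULL, 0xffffffff80000000ULL, 0x800000, 0x900000 },
		{ 0x80b00000ULL, 0xffffffff80900000ULL, 0x100000, 0x180000 },
	};
	unsigned long long e_entry = 0xffffffff80000000ULL;	/* made-up entry VA */
	unsigned long long old_pbase = loads[0].p_paddr;	/* lowest p_paddr */
	unsigned long long lowest_vaddr = loads[0].p_vaddr;	/* lowest p_vaddr */
	unsigned long long new_pbase = 0x84000000ULL;		/* hole found for the image */

	for (unsigned int i = 0; i < 2; i++) {
		unsigned long long sz = loads[i].p_filesz < loads[i].p_memsz ?
					loads[i].p_filesz : loads[i].p_memsz;

		printf("segment %u: copy %#llx bytes to %#llx\n", i, sz,
		       loads[i].p_paddr - old_pbase + new_pbase);
	}
	printf("new entry: %#llx\n", e_entry - lowest_vaddr + new_pbase);
	return 0;
}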
+ */ +static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, + struct elfhdr *ehdr, struct kexec_elf_info *elf_info, + unsigned long *old_pbase, unsigned long *new_pbase) +{ + int i; + int ret; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + unsigned long lowest_paddr = ULONG_MAX; + unsigned long lowest_vaddr = ULONG_MAX; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + if (lowest_paddr > phdr->p_paddr) + lowest_paddr = phdr->p_paddr; + + if (lowest_vaddr > phdr->p_vaddr) + lowest_vaddr = phdr->p_vaddr; + } + + kbuf.image = image; + kbuf.buf_min = lowest_paddr; + kbuf.buf_max = ULONG_MAX; + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 + * + */ + kbuf.buf_align = PMD_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.top_down = false; + ret = arch_kexec_locate_mem_hole(&kbuf); + if (!ret) { + *old_pbase = lowest_paddr; + *new_pbase = kbuf.mem; + image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; + } + return ret; +} + +static void *elf_kexec_load(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + unsigned long old_kernel_pbase = ULONG_MAX; + unsigned long new_kernel_pbase = 0UL; + struct elfhdr ehdr; + struct kexec_elf_info elf_info; + + ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); + if (ret) + return ERR_PTR(ret); + + ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, + &old_kernel_pbase, &new_kernel_pbase); + if (ret) + goto out; + + /* Add the kernel binary to the image */ + ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, + old_kernel_pbase, new_kernel_pbase); + if (ret) + goto out; + + ret = load_extra_segments(image, image->start, kernel_len, + initrd, initrd_len, cmdline, cmdline_len); +out: + kexec_free_elf_info(&elf_info); + return ret ? ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops elf_kexec_ops = { + .probe = kexec_elf_probe, + .load = elf_kexec_load, +}; diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c new file mode 100644 index 000000000000..26a81774a78a --- /dev/null +++ b/arch/riscv/kernel/kexec_image.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V Kexec image loader + * + */ + +#define pr_fmt(fmt) "kexec_file(Image): " fmt + +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/pe.h> +#include <linux/string.h> +#include <asm/byteorder.h> +#include <asm/image.h> + +static int image_probe(const char *kernel_buf, unsigned long kernel_len) +{ + const struct riscv_image_header *h = (const struct riscv_image_header *)kernel_buf; + + if (!h || kernel_len < sizeof(*h)) + return -EINVAL; + + /* According to Documentation/riscv/boot-image-header.rst, + * use "magic2" field to check when version >= 0.2. 
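The probe rule this comment describes (and which the check just below implements) is: once the header version is new enough to carry magic2, that field must match or the loader rejects the buffer. A hedged sketch of the rule; the struct layout and magic value here are illustrative, not the exact boot-image-header definition:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct img_hdr {			/* simplified stand-in */
	unsigned int version;
	char magic2[4];
};

#define MIN_VERSION	0x2u		/* stand-in for RISCV_HEADER_VERSION */
#define MAGIC2		"RSC\x05"	/* illustrative magic value */

static int probe(const struct img_hdr *h, size_t len)
{
	if (!h || len < sizeof(*h))
		return -EINVAL;
	if (h->version >= MIN_VERSION && memcmp(h->magic2, MAGIC2, sizeof(h->magic2)))
		return -EINVAL;		/* versioned header, wrong magic */
	return 0;
}

int main(void)
{
	struct img_hdr good = { .version = 2, .magic2 = "RSC\x05" };
	struct img_hdr bad  = { .version = 2, .magic2 = "XXXX" };

	printf("good: %d, bad: %d\n", probe(&good, sizeof(good)),
	       probe(&bad, sizeof(bad)));
	return 0;
}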
+ */ + + if (h->version >= RISCV_HEADER_VERSION && + memcmp(&h->magic2, RISCV_IMAGE_MAGIC2, sizeof(h->magic2))) + return -EINVAL; + + return 0; +} + +static void *image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct riscv_image_header *h; + u64 flags; + bool be_image, be_kernel; + struct kexec_buf kbuf; + int ret; + + /* Check Image header */ + h = (struct riscv_image_header *)kernel; + if (!h->image_size) { + ret = -EINVAL; + goto out; + } + + /* Check endianness */ + flags = le64_to_cpu(h->flags); + be_image = riscv_image_flag_field(flags, RISCV_IMAGE_FLAG_BE); + be_kernel = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + if (be_image != be_kernel) { + ret = -EINVAL; + goto out; + } + + /* Load the kernel image */ + kbuf.image = image; + kbuf.buf_min = 0; + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = false; + + kbuf.buffer = kernel; + kbuf.bufsz = kernel_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = le64_to_cpu(h->image_size); + kbuf.buf_align = le64_to_cpu(h->text_offset); + + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add kernel image ret=%d\n", ret); + goto out; + } + + image->start = kbuf.mem; + + pr_info("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); + + ret = load_extra_segments(image, kbuf.mem, kbuf.memsz, + initrd, initrd_len, cmdline, cmdline_len); + +out: + return ret ? ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops image_kexec_ops = { + .probe = image_probe, + .load = image_load, +}; diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S index 059c5e216ae7..de0a4b35d01e 100644 --- a/arch/riscv/kernel/kexec_relocate.S +++ b/arch/riscv/kernel/kexec_relocate.S @@ -17,27 +17,17 @@ SYM_CODE_START(riscv_kexec_relocate) * s1: (const) Phys address to jump to after relocation * s2: (const) Phys address of the FDT image * s3: (const) The hartid of the current hart - * s4: Pointer to the destination address for the relocation - * s5: (const) Number of words per page - * s6: (const) 1, used for subtraction - * s7: (const) kernel_map.va_pa_offset, used when switching MMU off - * s8: (const) Physical address of the main loop - * s9: (debug) indirection page counter - * s10: (debug) entry counter - * s11: (debug) copied words counter + * s4: (const) kernel_map.va_pa_offset, used when switching MMU off + * s5: Pointer to the destination address for the relocation + * s6: (const) Physical address of the main loop */ mv s0, a0 mv s1, a1 mv s2, a2 mv s3, a3 - mv s4, zero - li s5, (PAGE_SIZE / RISCV_SZPTR) - li s6, 1 - mv s7, a4 - mv s8, zero - mv s9, zero - mv s10, zero - mv s11, zero + mv s4, a4 + mv s5, zero + mv s6, zero /* Disable / cleanup interrupts */ csrw CSR_SIE, zero @@ -52,21 +42,27 @@ SYM_CODE_START(riscv_kexec_relocate) * the start of the loop below so that we jump there in * any case. */ - la s8, 1f - sub s8, s8, s7 - csrw CSR_STVEC, s8 + la s6, 1f + sub s6, s6, s4 + csrw CSR_STVEC, s6 + + /* + * With C-extension, here we get 42 Bytes and the next + * .align directive would pad zeros here up to 44 Bytes. + * So manually put a nop here to avoid zeros padding. + */ + nop /* Process entries in a loop */ .align 2 1: - addi s10, s10, 1 REG_L t0, 0(s0) /* t0 = *image->entry */ addi s0, s0, RISCV_SZPTR /* image->entry++ */ /* IND_DESTINATION entry ? 
-> save destination address */ andi t1, t0, 0x1 beqz t1, 2f - andi s4, t0, ~0x1 + andi s5, t0, ~0x1 j 1b 2: @@ -74,9 +70,8 @@ SYM_CODE_START(riscv_kexec_relocate) andi t1, t0, 0x2 beqz t1, 2f andi s0, t0, ~0x2 - addi s9, s9, 1 csrw CSR_SATP, zero - jalr zero, s8, 0 + jr s6 2: /* IND_DONE entry ? -> jump to done label */ @@ -92,14 +87,13 @@ SYM_CODE_START(riscv_kexec_relocate) andi t1, t0, 0x8 beqz t1, 1b /* Unknown entry type, ignore it */ andi t0, t0, ~0x8 - mv t3, s5 /* i = num words per page */ + li t3, (PAGE_SIZE / RISCV_SZPTR) /* i = num words per page */ 3: /* copy loop */ REG_L t1, (t0) /* t1 = *src_ptr */ - REG_S t1, (s4) /* *dst_ptr = *src_ptr */ + REG_S t1, (s5) /* *dst_ptr = *src_ptr */ addi t0, t0, RISCV_SZPTR /* stc_ptr++ */ - addi s4, s4, RISCV_SZPTR /* dst_ptr++ */ - sub t3, t3, s6 /* i-- */ - addi s11, s11, 1 /* c++ */ + addi s5, s5, RISCV_SZPTR /* dst_ptr++ */ + addi t3, t3, -0x1 /* i-- */ beqz t3, 1b /* copy done ? */ j 3b @@ -146,7 +140,7 @@ SYM_CODE_START(riscv_kexec_relocate) */ fence.i - jalr zero, a2, 0 + jr a2 SYM_CODE_END(riscv_kexec_relocate) riscv_kexec_relocate_end: diff --git a/arch/riscv/kernel/kgdb.c b/arch/riscv/kernel/kgdb.c index 963ed7edcff2..9f3db3503dab 100644 --- a/arch/riscv/kernel/kgdb.c +++ b/arch/riscv/kernel/kgdb.c @@ -11,7 +11,7 @@ #include <linux/string.h> #include <asm/cacheflush.h> #include <asm/gdb_xml.h> -#include <asm/parse_asm.h> +#include <asm/insn.h> enum { NOT_KGDB_BREAK = 0, @@ -23,27 +23,6 @@ enum { static unsigned long stepped_address; static unsigned int stepped_opcode; -#if __riscv_xlen == 32 -/* C.JAL is an RV32C-only instruction */ -DECLARE_INSN(c_jal, MATCH_C_JAL, MASK_C_JAL) -#else -#define is_c_jal_insn(opcode) 0 -#endif -DECLARE_INSN(jalr, MATCH_JALR, MASK_JALR) -DECLARE_INSN(jal, MATCH_JAL, MASK_JAL) -DECLARE_INSN(c_jr, MATCH_C_JR, MASK_C_JR) -DECLARE_INSN(c_jalr, MATCH_C_JALR, MASK_C_JALR) -DECLARE_INSN(c_j, MATCH_C_J, MASK_C_J) -DECLARE_INSN(beq, MATCH_BEQ, MASK_BEQ) -DECLARE_INSN(bne, MATCH_BNE, MASK_BNE) -DECLARE_INSN(blt, MATCH_BLT, MASK_BLT) -DECLARE_INSN(bge, MATCH_BGE, MASK_BGE) -DECLARE_INSN(bltu, MATCH_BLTU, MASK_BLTU) -DECLARE_INSN(bgeu, MATCH_BGEU, MASK_BGEU) -DECLARE_INSN(c_beqz, MATCH_C_BEQZ, MASK_C_BEQZ) -DECLARE_INSN(c_bnez, MATCH_C_BNEZ, MASK_C_BNEZ) -DECLARE_INSN(sret, MATCH_SRET, MASK_SRET) - static int decode_register_index(unsigned long opcode, int offset) { return (opcode >> offset) & 0x1F; @@ -65,23 +44,25 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) if (get_kernel_nofault(op_code, (void *)pc)) return -EINVAL; if ((op_code & __INSN_LENGTH_MASK) != __INSN_LENGTH_GE_32) { - if (is_c_jalr_insn(op_code) || is_c_jr_insn(op_code)) { + if (riscv_insn_is_c_jalr(op_code) || + riscv_insn_is_c_jr(op_code)) { rs1_num = decode_register_index(op_code, RVC_C2_RS1_OPOFF); *next_addr = regs_ptr[rs1_num]; - } else if (is_c_j_insn(op_code) || is_c_jal_insn(op_code)) { - *next_addr = EXTRACT_RVC_J_IMM(op_code) + pc; - } else if (is_c_beqz_insn(op_code)) { + } else if (riscv_insn_is_c_j(op_code) || + riscv_insn_is_c_jal(op_code)) { + *next_addr = RVC_EXTRACT_JTYPE_IMM(op_code) + pc; + } else if (riscv_insn_is_c_beqz(op_code)) { rs1_num = decode_register_index_short(op_code, RVC_C1_RS1_OPOFF); if (!rs1_num || regs_ptr[rs1_num] == 0) - *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc; + *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc; else *next_addr = pc + 2; - } else if (is_c_bnez_insn(op_code)) { + } else if (riscv_insn_is_c_bnez(op_code)) { rs1_num = decode_register_index_short(op_code, 
RVC_C1_RS1_OPOFF); if (rs1_num && regs_ptr[rs1_num] != 0) - *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc; + *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc; else *next_addr = pc + 2; } else { @@ -90,7 +71,7 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) } else { if ((op_code & __INSN_OPCODE_MASK) == __INSN_BRANCH_OPCODE) { bool result = false; - long imm = EXTRACT_BTYPE_IMM(op_code); + long imm = RV_EXTRACT_BTYPE_IMM(op_code); unsigned long rs1_val = 0, rs2_val = 0; rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF); @@ -100,34 +81,34 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) if (rs2_num) rs2_val = regs_ptr[rs2_num]; - if (is_beq_insn(op_code)) + if (riscv_insn_is_beq(op_code)) result = (rs1_val == rs2_val) ? true : false; - else if (is_bne_insn(op_code)) + else if (riscv_insn_is_bne(op_code)) result = (rs1_val != rs2_val) ? true : false; - else if (is_blt_insn(op_code)) + else if (riscv_insn_is_blt(op_code)) result = ((long)rs1_val < (long)rs2_val) ? true : false; - else if (is_bge_insn(op_code)) + else if (riscv_insn_is_bge(op_code)) result = ((long)rs1_val >= (long)rs2_val) ? true : false; - else if (is_bltu_insn(op_code)) + else if (riscv_insn_is_bltu(op_code)) result = (rs1_val < rs2_val) ? true : false; - else if (is_bgeu_insn(op_code)) + else if (riscv_insn_is_bgeu(op_code)) result = (rs1_val >= rs2_val) ? true : false; if (result) *next_addr = imm + pc; else *next_addr = pc + 4; - } else if (is_jal_insn(op_code)) { - *next_addr = EXTRACT_JTYPE_IMM(op_code) + pc; - } else if (is_jalr_insn(op_code)) { + } else if (riscv_insn_is_jal(op_code)) { + *next_addr = RV_EXTRACT_JTYPE_IMM(op_code) + pc; + } else if (riscv_insn_is_jalr(op_code)) { rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF); if (rs1_num) *next_addr = ((unsigned long *)regs)[rs1_num]; - *next_addr += EXTRACT_ITYPE_IMM(op_code); - } else if (is_sret_insn(op_code)) { + *next_addr += RV_EXTRACT_ITYPE_IMM(op_code); + } else if (riscv_insn_is_sret(op_code)) { *next_addr = pc; } else { *next_addr = pc + 4; @@ -273,6 +254,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) regs->epc = pc; } +noinline void arch_kgdb_breakpoint(void) +{ + asm(".global kgdb_compiled_break\n" + "kgdb_compiled_break: ebreak\n"); +} + void kgdb_arch_handle_qxfer_pkt(char *remcom_in_buffer, char *remcom_out_buffer) { diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index df8e24559035..2306ce3e5f22 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -15,30 +15,8 @@ #include <linux/compiler.h> /* For unreachable() */ #include <linux/cpu.h> /* For cpu_down() */ #include <linux/reboot.h> - -/* - * kexec_image_info - Print received image details - */ -static void -kexec_image_info(const struct kimage *image) -{ - unsigned long i; - - pr_debug("Kexec image info:\n"); - pr_debug("\ttype: %d\n", image->type); - pr_debug("\tstart: %lx\n", image->start); - pr_debug("\thead: %lx\n", image->head); - pr_debug("\tnr_segments: %lu\n", image->nr_segments); - - for (i = 0; i < image->nr_segments; i++) { - pr_debug("\t segment[%lu]: %016lx - %016lx", i, - image->segment[i].mem, - image->segment[i].mem + image->segment[i].memsz); - pr_debug("\t\t0x%lx bytes, %lu pages\n", - (unsigned long) image->segment[i].memsz, - (unsigned long) image->segment[i].memsz / PAGE_SIZE); - } -} +#include <linux/interrupt.h> +#include <linux/irq.h> /* * machine_kexec_prepare - Initialize kexec @@ -58,8 +36,6 @@ 
machine_kexec_prepare(struct kimage *image) unsigned int control_code_buffer_sz = 0; int i = 0; - kexec_image_info(image); - /* Find the Flattened Device Tree and save its physical address */ for (i = 0; i < image->nr_segments; i++) { if (image->segment[i].memsz <= sizeof(fdt)) @@ -142,15 +118,19 @@ void machine_shutdown(void) * machine_crash_shutdown - Prepare to kexec after a kernel crash * * This function is called by crash_kexec just before machine_kexec - * below and its goal is similar to machine_shutdown, but in case of - * a kernel crash. Since we don't handle such cases yet, this function - * is empty. + * and its goal is to shutdown non-crashing cpus and save registers. */ void machine_crash_shutdown(struct pt_regs *regs) { + local_irq_disable(); + + /* shutdown non-crashing cpus */ + crash_smp_send_stop(); + crash_save_cpu(regs, smp_processor_id()); - machine_shutdown(); + machine_kexec_mask_interrupts(); + pr_info("Starting crashdump kernel...\n"); } @@ -171,12 +151,17 @@ machine_kexec(struct kimage *image) struct kimage_arch *internal = &image->arch; unsigned long jump_addr = (unsigned long) image->start; unsigned long first_ind_entry = (unsigned long) &image->head; - unsigned long this_cpu_id = smp_processor_id(); + unsigned long this_cpu_id = __smp_processor_id(); unsigned long this_hart_id = cpuid_to_hartid_map(this_cpu_id); unsigned long fdt_addr = internal->fdt_addr; void *control_code_buffer = page_address(image->control_code_page); riscv_kexec_method kexec_method = NULL; +#ifdef CONFIG_SMP + WARN(smp_crash_stop_failed(), + "Some CPUs may be stale, kdump will be unreliable.\n"); +#endif + if (image->type != KEXEC_TYPE_CRASH) kexec_method = control_code_buffer; else diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index b0bf8c1722c0..e36104af2e24 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -7,8 +7,369 @@ * Author: Liao Chang (liaochang1@huawei.com) */ #include <linux/kexec.h> +#include <linux/elf.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/libfdt.h> +#include <linux/types.h> +#include <linux/memblock.h> +#include <linux/vmalloc.h> +#include <asm/setup.h> const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, + &image_kexec_ops, NULL }; + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + kvfree(image->arch.fdt); + image->arch.fdt = NULL; + + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} + +#ifdef CONFIG_CRASH_DUMP +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) +{ + unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) +{ + struct crash_mem *cmem = arg; + + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; + + return 0; +} + +static int prepare_elf_headers(void **addr, unsigned long *sz) +{ + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + + nr_ranges = 1; /* For exclusion of crashkernel region */ + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); + if (ret) + goto 
out; + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (!ret) + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); + +out: + kfree(cmem); + return ret; +} + +static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, + unsigned long cmdline_len) +{ + int elfcorehdr_strlen; + char *cmdline_ptr; + + cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); + if (!cmdline_ptr) + return NULL; + + elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", + image->elf_load_addr); + + if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { + pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n"); + kfree(cmdline_ptr); + return NULL; + } + + memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); + /* Ensure it's nul terminated */ + cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; + return cmdline_ptr; +} +#endif + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RISCV_IMM_BITS 12 +#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) +#define RISCV_CONST_HIGH_PART(x) \ + (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) +#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) + +#define ENCODE_ITYPE_IMM(x) \ + (RV_X(x, 0, 12) << 20) +#define ENCODE_BTYPE_IMM(x) \ + ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ + (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) +#define ENCODE_UTYPE_IMM(x) \ + (RV_X(x, 12, 20) << 12) +#define ENCODE_JTYPE_IMM(x) \ + ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ + (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31)) +#define ENCODE_CBTYPE_IMM(x) \ + ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) +#define ENCODE_CJTYPE_IMM(x) \ + ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ + (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) +#define ENCODE_UJTYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ + (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) +#define ENCODE_UITYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) + +#define CLEAN_IMM(type, x) \ + ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) + +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab) +{ + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + Elf64_Rela *relas; + int i, r_type; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + relas = (void *)pi->ehdr + relsec->sh_offset; + + for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { + const Elf_Sym *sym; /* symbol to relocate */ + unsigned long addr; /* final location after relocation */ + unsigned long val; /* relocated symbol value */ + unsigned long sec_base; /* relocated symbol value */ + void *loc; /* tmp location to modify */ + + sym = (void *)pi->ehdr + symtab->sh_offset; + sym += ELF64_R_SYM(relas[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + loc = pi->purgatory_buf; + loc += section->sh_offset; + loc += relas[i].r_offset; + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= pi->ehdr->e_shnum) { + pr_err("Invalid section %d for symbol 
%s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; + + val = sym->st_value; + val += sec_base; + val += relas[i].r_addend; + + addr = section->sh_addr + relas[i].r_offset; + + r_type = ELF64_R_TYPE(relas[i].r_info); + + switch (r_type) { + case R_RISCV_BRANCH: + *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | + ENCODE_BTYPE_IMM(val - addr); + break; + case R_RISCV_JAL: + *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | + ENCODE_JTYPE_IMM(val - addr); + break; + /* + * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I + * sym is expected to be next to R_RISCV_PCREL_HI20 + * in purgatory relsec. Handle it like R_RISCV_CALL + * sym, instead of searching the whole relsec. + */ + case R_RISCV_PCREL_HI20: + case R_RISCV_CALL_PLT: + case R_RISCV_CALL: + *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | + ENCODE_UJTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_BRANCH: + *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | + ENCODE_CBTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_JUMP: + *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | + ENCODE_CJTYPE_IMM(val - addr); + break; + case R_RISCV_ADD16: + *(u16 *)loc += val; + break; + case R_RISCV_SUB16: + *(u16 *)loc -= val; + break; + case R_RISCV_ADD32: + *(u32 *)loc += val; + break; + case R_RISCV_SUB32: + *(u32 *)loc -= val; + break; + /* It has been applied by R_RISCV_PCREL_HI20 sym */ + case R_RISCV_PCREL_LO12_I: + case R_RISCV_ALIGN: + case R_RISCV_RELAX: + break; + case R_RISCV_64: + *(u64 *)loc = val; + break; + default: + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } + } + return 0; +} + + +int load_extra_segments(struct kimage *image, unsigned long kernel_start, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + void *fdt; + unsigned long initrd_pbase = 0UL; + struct kexec_buf kbuf; + char *modified_cmdline = NULL; + + kbuf.image = image; + kbuf.buf_min = kernel_start + kernel_len; + kbuf.buf_max = ULONG_MAX; + +#ifdef CONFIG_CRASH_DUMP + /* Add elfcorehdr */ + if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + + /* Setup cmdline for kdump kernel case */ + modified_cmdline = setup_kdump_cmdline(image, cmdline, + cmdline_len); + if (!modified_cmdline) { + pr_err("Setting up cmdline for kdump kernel failed\n"); + ret = -EINVAL; + goto out; + } + cmdline = modified_cmdline; + } +#endif + +#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY + /* Add purgatory to the image */ + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_load_purgatory(image, &kbuf); + if (ret) { + pr_err("Error loading purgatory ret=%d\n", ret); + goto out; + } + kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem); + + ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", + &kernel_start, + sizeof(kernel_start), 0); + if (ret) + pr_err("Error 
update purgatory ret=%d\n", ret); +#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ + + /* Add the initrd to the image */ + if (initrd != NULL) { + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out; + initrd_pbase = kbuf.mem; + kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase); + } + + /* Add the DTB to the image */ + fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, + initrd_len, cmdline, 0); + if (!fdt) { + pr_err("Error setting up the new device tree.\n"); + ret = -EINVAL; + goto out; + } + + fdt_pack(fdt); + kbuf.buffer = fdt; + kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); + kbuf.buf_align = PAGE_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.top_down = true; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add DTB kbuf ret=%d\n", ret); + goto out_free_fdt; + } + /* Cache the fdt buffer address for memory cleanup */ + image->arch.fdt = fdt; + kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem); + goto out; + +out_free_fdt: + kvfree(fdt); +out: + kfree(modified_cmdline); + return ret; +} diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index d171eca623b6..48f6c4f7dca0 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -3,18 +3,17 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <linux/export.h> #include <asm/asm.h> #include <asm/csr.h> #include <asm/unistd.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> -#include <asm-generic/export.h> #include <asm/ftrace.h> .text -#define FENTRY_RA_OFFSET 12 -#define ABI_SIZE_ON_STACK 72 +#define ABI_SIZE_ON_STACK 80 #define ABI_A0 0 #define ABI_A1 8 #define ABI_A2 16 @@ -23,10 +22,10 @@ #define ABI_A5 40 #define ABI_A6 48 #define ABI_A7 56 -#define ABI_RA 64 +#define ABI_T0 64 +#define ABI_RA 72 .macro SAVE_ABI - addi sp, sp, -SZREG addi sp, sp, -ABI_SIZE_ON_STACK REG_S a0, ABI_A0(sp) @@ -37,6 +36,7 @@ REG_S a5, ABI_A5(sp) REG_S a6, ABI_A6(sp) REG_S a7, ABI_A7(sp) + REG_S t0, ABI_T0(sp) REG_S ra, ABI_RA(sp) .endm @@ -49,153 +49,162 @@ REG_L a5, ABI_A5(sp) REG_L a6, ABI_A6(sp) REG_L a7, ABI_A7(sp) + REG_L t0, ABI_T0(sp) REG_L ra, ABI_RA(sp) addi sp, sp, ABI_SIZE_ON_STACK - addi sp, sp, SZREG .endm -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - .macro SAVE_ALL - addi sp, sp, -SZREG - addi sp, sp, -PT_SIZE_ON_STACK - - REG_S x1, PT_EPC(sp) - addi sp, sp, PT_SIZE_ON_STACK - REG_L x1, (sp) - addi sp, sp, -PT_SIZE_ON_STACK - REG_S x1, PT_RA(sp) - REG_L x1, PT_EPC(sp) - - REG_S x2, PT_SP(sp) - REG_S x3, PT_GP(sp) - REG_S x4, PT_TP(sp) - REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) - .endm - - .macro RESTORE_ALL - REG_L x1, PT_RA(sp) - addi sp, sp, PT_SIZE_ON_STACK - REG_S x1, (sp) - addi sp, sp, -PT_SIZE_ON_STACK - REG_L x1, PT_EPC(sp) - REG_L x2, PT_SP(sp) - REG_L x3, PT_GP(sp) - REG_L x4, PT_TP(sp) - REG_L x5, PT_T0(sp) - 
REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x8, PT_S0(sp) - REG_L x9, PT_S1(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x18, PT_S2(sp) - REG_L x19, PT_S3(sp) - REG_L x20, PT_S4(sp) - REG_L x21, PT_S5(sp) - REG_L x22, PT_S6(sp) - REG_L x23, PT_S7(sp) - REG_L x24, PT_S8(sp) - REG_L x25, PT_S9(sp) - REG_L x26, PT_S10(sp) - REG_L x27, PT_S11(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) - - addi sp, sp, PT_SIZE_ON_STACK - addi sp, sp, SZREG +/** +* SAVE_ABI_REGS - save regs against the ftrace_regs struct +* +* After the stack is established, +* +* 0(sp) stores the PC of the traced function which can be accessed +* by &(fregs)->epc in tracing function. +* +* 8(sp) stores the function return address (i.e. parent IP) that +* can be accessed by &(fregs)->ra in tracing function. +* +* The other regs are saved at the respective localtion and accessed +* by the respective ftrace_regs member. +* +* Here is the layout of stack for your reference. +* +* PT_SIZE_ON_STACK -> +++++++++ +* + ..... + +* + a0-a7 + --++++-> ftrace_caller saved +* + t1 + --++++-> direct tramp address +* + s0 + --+ // frame pointer +* + sp + + +* + ra + --+ // parent IP +* sp -> + epc + --+ // PC +* +++++++++ +**/ + .macro SAVE_ABI_REGS + addi sp, sp, -FREGS_SIZE_ON_STACK + REG_S t0, FREGS_EPC(sp) + REG_S x1, FREGS_RA(sp) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + REG_S x8, FREGS_S0(sp) +#endif + REG_S x6, FREGS_T1(sp) +#ifdef CONFIG_CC_IS_CLANG + REG_S x7, FREGS_T2(sp) + REG_S x28, FREGS_T3(sp) + REG_S x29, FREGS_T4(sp) + REG_S x30, FREGS_T5(sp) + REG_S x31, FREGS_T6(sp) +#endif + // save the arguments + REG_S x10, FREGS_A0(sp) + REG_S x11, FREGS_A1(sp) + REG_S x12, FREGS_A2(sp) + REG_S x13, FREGS_A3(sp) + REG_S x14, FREGS_A4(sp) + REG_S x15, FREGS_A5(sp) + REG_S x16, FREGS_A6(sp) + REG_S x17, FREGS_A7(sp) + mv a0, sp + addi a0, a0, FREGS_SIZE_ON_STACK + REG_S a0, FREGS_SP(sp) // Put original SP on stack .endm -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ - -ENTRY(ftrace_caller) - SAVE_ABI - - addi a0, ra, -FENTRY_RA_OFFSET - la a1, function_trace_op - REG_L a2, 0(a1) - REG_L a1, ABI_SIZE_ON_STACK(sp) - mv a3, sp -ftrace_call: - .global ftrace_call - call ftrace_stub - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, ABI_SIZE_ON_STACK - REG_L a1, ABI_RA(sp) - addi a1, a1, -FENTRY_RA_OFFSET + .macro RESTORE_ABI_REGS + REG_L t0, FREGS_EPC(sp) + REG_L x1, FREGS_RA(sp) #ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a2, s0 + REG_L x8, FREGS_S0(sp) #endif -ftrace_graph_call: - .global ftrace_graph_call - call ftrace_stub + REG_L x6, FREGS_T1(sp) +#ifdef CONFIG_CC_IS_CLANG + REG_L x7, FREGS_T2(sp) + REG_L x28, FREGS_T3(sp) + REG_L x29, FREGS_T4(sp) + REG_L x30, FREGS_T5(sp) + REG_L x31, FREGS_T6(sp) #endif - RESTORE_ABI - ret -ENDPROC(ftrace_caller) - -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS -ENTRY(ftrace_regs_caller) - SAVE_ALL + // restore the arguments + REG_L x10, FREGS_A0(sp) + REG_L x11, FREGS_A1(sp) + REG_L x12, FREGS_A2(sp) + REG_L x13, FREGS_A3(sp) + REG_L x14, FREGS_A4(sp) + REG_L x15, FREGS_A5(sp) + REG_L x16, FREGS_A6(sp) + REG_L x17, FREGS_A7(sp) + + addi sp, sp, FREGS_SIZE_ON_STACK + .endm - addi a0, ra, -FENTRY_RA_OFFSET + .macro PREPARE_ARGS + addi a0, t0, -MCOUNT_JALR_SIZE // ip (callsite's jalr insn) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + mv a1, ra // parent_ip + REG_L a2, -16(t0) // op + REG_L ra, 
FTRACE_OPS_FUNC(a2) // op->func +#else la a1, function_trace_op - REG_L a2, 0(a1) - REG_L a1, PT_SIZE_ON_STACK(sp) - mv a3, sp - -ftrace_regs_call: - .global ftrace_regs_call - call ftrace_stub + REG_L a2, 0(a1) // op + mv a1, ra // parent_ip +#endif + mv a3, sp // regs + .endm -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, PT_RA - REG_L a1, PT_EPC(sp) - addi a1, a1, -FENTRY_RA_OFFSET -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a2, s0 +SYM_FUNC_START(ftrace_caller) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* + * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the + * function entry, these are later overwritten with the pointer to the + * associated struct ftrace_ops. + * + * -8: &ftrace_ops of the associated tracer function. + *<ftrace enable>: + * 0: auipc t0/ra, 0x? + * 4: jalr t0/ra, ?(t0/ra) + * + * -8: &ftrace_nop_ops + *<ftrace disable>: + * 0: nop + * 4: nop + * + * t0 is set to ip+8 after the jalr is executed at the callsite, + * so we find the associated op at t0-16. + */ + REG_L t1, -16(t0) // op Should be SZ_REG instead of 16 + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * If the op has a direct call, handle it immediately without + * saving/restoring registers. + */ + REG_L t1, FTRACE_OPS_DIRECT_CALL(t1) + bnez t1, ftrace_caller_direct +#endif +#endif + SAVE_ABI_REGS + PREPARE_ARGS + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + jalr ra +#else +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + REG_L ra, ftrace_call_dest + jalr ra, 0(ra) +#endif + RESTORE_ABI_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bnez t1, ftrace_caller_direct #endif -ftrace_graph_regs_call: - .global ftrace_graph_regs_call - call ftrace_stub + jr t0 +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL) + jr t1 #endif +SYM_FUNC_END(ftrace_caller) - RESTORE_ALL - ret -ENDPROC(ftrace_regs_caller) -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + jr t0 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 6d462681c9c0..da4a4000e57e 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -3,20 +3,21 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <linux/export.h> #include <asm/asm.h> #include <asm/csr.h> #include <asm/unistd.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> -#include <asm-generic/export.h> #include <asm/ftrace.h> .text .macro SAVE_ABI_STATE addi sp, sp, -16 - sd s0, 0(sp) - sd ra, 8(sp) + REG_S s0, 0*SZREG(sp) + REG_S ra, 1*SZREG(sp) addi s0, sp, 16 .endm @@ -25,36 +26,42 @@ * register if a0 was not saved. 
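The CALL_OPS comment above reduces to address arithmetic: the associated struct ftrace_ops pointer is patched into the 8 bytes before the function entry, and since t0 holds entry + 8 after the callsite's jalr, the slot is found at t0 - 16. A userspace illustration of that recovery; the ops struct and addresses are made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ftrace_ops_stub { const char *name; };

int main(void)
{
	/* [ ops pointer (8B) ][ auipc (4B) ][ jalr (4B) ][ rest of function ] */
	uint8_t text[32];
	struct ftrace_ops_stub my_ops = { "my_tracer" };
	struct ftrace_ops_stub *ops = &my_ops;
	uintptr_t entry = (uintptr_t)text + 8;	/* function entry follows the slot */

	memcpy(text, &ops, sizeof(ops));	/* patched in at entry - 8 */

	uintptr_t t0 = entry + 8;		/* value of t0 after the jalr */
	struct ftrace_ops_stub *found;

	memcpy(&found, (void *)(t0 - 16), sizeof(found));
	printf("recovered ops: %s\n", found->name);
	return 0;
}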
*/ .macro SAVE_RET_ABI_STATE - addi sp, sp, -32 - sd s0, 16(sp) - sd ra, 24(sp) - sd a0, 8(sp) - addi s0, sp, 32 + addi sp, sp, -FREGS_SIZE_ON_STACK + REG_S ra, FREGS_RA(sp) + REG_S s0, FREGS_S0(sp) + REG_S a0, FREGS_A0(sp) + REG_S a1, FREGS_A1(sp) + addi s0, sp, FREGS_SIZE_ON_STACK .endm .macro RESTORE_ABI_STATE - ld ra, 8(sp) - ld s0, 0(sp) + REG_L ra, 1*SZREG(sp) + REG_L s0, 0*SZREG(sp) addi sp, sp, 16 .endm .macro RESTORE_RET_ABI_STATE - ld ra, 24(sp) - ld s0, 16(sp) - ld a0, 8(sp) - addi sp, sp, 32 + REG_L ra, FREGS_RA(sp) + REG_L s0, FREGS_S0(sp) + REG_L a0, FREGS_A0(sp) + REG_L a1, FREGS_A1(sp) + addi sp, sp, FREGS_SIZE_ON_STACK .endm -ENTRY(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE - .global MCOUNT_NAME - .set MCOUNT_NAME, ftrace_stub + .global _mcount + .set _mcount, ftrace_stub #endif ret -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + +SYM_FUNC_START(return_to_handler) /* * On implementing the frame point test, the ideal way is to compare the * s0 (frame pointer, if enabled) on entry and the sp (stack pointer) on return. @@ -63,36 +70,31 @@ ENTRY(return_to_handler) * So alternatively we check the *old* frame pointer position, that is, the * value stored in -16(s0) on entry, and the s0 on return. */ -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv t6, s0 -#endif SAVE_RET_ABI_STATE -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a0, t6 -#endif + mv a0, sp call ftrace_return_to_handler - mv a1, a0 + mv a2, a0 RESTORE_RET_ABI_STATE - jalr a1 -ENDPROC(return_to_handler) + jalr a2 +SYM_FUNC_END(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -ENTRY(MCOUNT_NAME) +SYM_FUNC_START(_mcount) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return - ld t1, 0(t0) - bne t1, t4, do_ftrace_graph_caller + REG_L t1, 0(t0) + bne t1, t4, .Ldo_ftrace_graph_caller la t3, ftrace_graph_entry - ld t2, 0(t3) + REG_L t2, 0(t3) la t6, ftrace_graph_entry_stub - bne t2, t6, do_ftrace_graph_caller + bne t2, t6, .Ldo_ftrace_graph_caller #endif la t3, ftrace_trace_function - ld t5, 0(t3) - bne t5, t4, do_trace + REG_L t5, 0(t3) + bne t5, t4, .Ldo_trace ret #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -100,11 +102,11 @@ ENTRY(MCOUNT_NAME) * A pseudo representation for the function graph tracer: * prepare_to_return(&ra_to_caller_of_caller, ra_to_caller) */ -do_ftrace_graph_caller: - addi a0, s0, -8 +.Ldo_ftrace_graph_caller: + addi a0, s0, -SZREG mv a1, ra #ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld a2, -16(s0) + REG_L a2, -2*SZREG(s0) #endif SAVE_ABI_STATE call prepare_ftrace_return @@ -116,14 +118,14 @@ do_ftrace_graph_caller: * A pseudo representation for the function tracer: * (*ftrace_trace_function)(ra_to_caller, ra_to_caller_of_caller) */ -do_trace: - ld a1, -8(s0) +.Ldo_trace: + REG_L a1, -SZREG(s0) mv a0, ra SAVE_ABI_STATE jalr t5 RESTORE_ABI_STATE ret -ENDPROC(MCOUNT_NAME) +SYM_FUNC_END(_mcount) #endif -EXPORT_SYMBOL(MCOUNT_NAME) +EXPORT_SYMBOL(_mcount) diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index e264e59e596e..75551ac6504c 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/sort.h> unsigned long module_emit_got_entry(struct module *mod, unsigned long val) { @@ -55,43 +56,70 @@ unsigned long 
module_emit_plt_entry(struct module *mod, unsigned long val) return (unsigned long)&plt[i]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; + const Elf_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(x->r_info, y->r_info); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; } static bool duplicate_rela(const Elf_Rela *rela, int idx) { - int i; - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - return false; + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding slot. + */ + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0; } -static void count_max_entries(Elf_Rela *relas, int num, +static void count_max_entries(const Elf_Rela *relas, size_t num, unsigned int *plts, unsigned int *gots) { - unsigned int type, i; - - for (i = 0; i < num; i++) { - type = ELF_RISCV_R_TYPE(relas[i].r_info); - if (type == R_RISCV_CALL_PLT) { - if (!duplicate_rela(relas, i)) - (*plts)++; - } else if (type == R_RISCV_GOT_HI20) { - if (!duplicate_rela(relas, i)) - (*gots)++; + for (size_t i = 0; i < num; i++) { + if (duplicate_rela(relas, i)) + continue; + + switch (ELF_R_TYPE(relas[i].r_info)) { + case R_RISCV_CALL_PLT: + case R_RISCV_PLT32: + (*plts)++; + break; + case R_RISCV_GOT_HI20: + (*gots)++; + break; + default: + unreachable(); } } } +static bool rela_needs_plt_got_entry(const Elf_Rela *rela) +{ + switch (ELF_R_TYPE(rela->r_info)) { + case R_RISCV_CALL_PLT: + case R_RISCV_GOT_HI20: + case R_RISCV_PLT32: + return true; + default: + return false; + } +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { + size_t num_scratch_relas = 0; unsigned int num_plts = 0; unsigned int num_gots = 0; + Elf_Rela *scratch = NULL; + size_t scratch_size = 0; int i; /* @@ -121,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, /* Calculate the maxinum number of entries */ for (i = 0; i < ehdr->e_shnum; i++) { + size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset; - int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info; + size_t scratch_size_needed; if (sechdrs[i].sh_type != SHT_RELA) continue; @@ -132,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts, &num_gots); + /* + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being + * close together, so sort a copy of the section to avoid interfering. 
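The sort-then-count scheme above makes duplicate detection O(1): after sorting by type, symbol index and addend, a duplicate can only be the immediately preceding entry. A standalone model using qsort(); the relocation layout is reduced to the two compared fields and the type numbers in the sample data are just sample values:

#include <stdio.h>
#include <stdlib.h>

struct rela { unsigned long r_info; long r_addend; };

#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))

static int cmp_rela(const void *a, const void *b)
{
	const struct rela *x = a, *y = b;
	int i = cmp_3way(x->r_info, y->r_info);

	return i ? i : cmp_3way(x->r_addend, y->r_addend);
}

int main(void)
{
	struct rela relas[] = {
		{ 19, 0 }, { 20, 8 }, { 19, 0 }, { 20, 0 }, { 19, 4 },
	};
	size_t n = sizeof(relas) / sizeof(relas[0]), unique = 0;

	qsort(relas, n, sizeof(relas[0]), cmp_rela);
	for (size_t i = 0; i < n; i++)
		if (i == 0 || cmp_rela(&relas[i], &relas[i - 1]) != 0)
			unique++;	/* duplicates are now adjacent */

	printf("%zu relocations, %zu unique PLT/GOT entries needed\n", n, unique);
	return 0;
}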
+ */ + scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch); + if (scratch_size_needed > scratch_size) { + scratch_size = scratch_size_needed; + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + } + + for (size_t j = 0; j < num_relas; j++) + if (rela_needs_plt_got_entry(&relas[j])) + scratch[num_scratch_relas++] = relas[j]; + } + + if (scratch) { + /* sort the accumulated PLT/GOT relocations so duplicates are adjacent */ + sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL); + count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots); + kvfree(scratch); } mod->arch.plt.shdr->sh_type = SHT_NOBITS; diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 91fe16bfaa07..7f6147c18033 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -7,13 +7,38 @@ #include <linux/elf.h> #include <linux/err.h> #include <linux/errno.h> +#include <linux/hashtable.h> +#include <linux/kernel.h> +#include <linux/log2.h> #include <linux/moduleloader.h> -#include <linux/vmalloc.h> #include <linux/sizes.h> #include <linux/pgtable.h> #include <asm/alternative.h> #include <asm/sections.h> +struct used_bucket { + struct list_head head; + struct hlist_head *bucket; +}; + +struct relocation_head { + struct hlist_node node; + struct list_head rel_entry; + void *location; +}; + +struct relocation_entry { + struct list_head head; + Elf_Addr value; + unsigned int type; +}; + +struct relocation_handlers { + int (*reloc_handler)(struct module *me, void *location, Elf_Addr v); + int (*accumulate_handler)(struct module *me, void *location, + long buffer); +}; + /* * The auipc+jalr instruction pair can reach any PC-relative offset * in the range [-2^31 - 2^11, 2^31 - 2^11) @@ -27,68 +52,90 @@ static bool riscv_insn_valid_32bit_offset(ptrdiff_t val) #endif } -static int apply_r_riscv_32_rela(struct module *me, u32 *location, Elf_Addr v) +static int riscv_insn_rmw(void *location, u32 keep, u32 set) +{ + __le16 *parcel = location; + u32 insn = (u32)le16_to_cpu(parcel[0]) | (u32)le16_to_cpu(parcel[1]) << 16; + + insn &= keep; + insn |= set; + + parcel[0] = cpu_to_le16(insn); + parcel[1] = cpu_to_le16(insn >> 16); + return 0; +} + +static int riscv_insn_rvc_rmw(void *location, u16 keep, u16 set) +{ + __le16 *parcel = location; + u16 insn = le16_to_cpu(*parcel); + + insn &= keep; + insn |= set; + + *parcel = cpu_to_le16(insn); + return 0; +} + +static int apply_r_riscv_32_rela(struct module *me, void *location, Elf_Addr v) { if (v != (u32)v) { pr_err("%s: value %016llx out of range for 32-bit field\n", me->name, (long long)v); return -EINVAL; } - *location = v; + *(u32 *)location = v; return 0; } -static int apply_r_riscv_64_rela(struct module *me, u32 *location, Elf_Addr v) +static int apply_r_riscv_64_rela(struct module *me, void *location, Elf_Addr v) { *(u64 *)location = v; return 0; } -static int apply_r_riscv_branch_rela(struct module *me, u32 *location, +static int apply_r_riscv_branch_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u32 imm12 = (offset & 0x1000) << (31 - 12); u32 imm11 = (offset & 0x800) >> (11 - 7); u32 imm10_5 = (offset & 0x7e0) << (30 - 10); u32 imm4_1 = (offset & 0x1e) << (11 - 4); - *location = (*location & 0x1fff07f) | imm12 | imm11 | imm10_5 | imm4_1; - return 0; + return riscv_insn_rmw(location, 0x1fff07f, imm12 | imm11 | imm10_5 | imm4_1); } -static int 
apply_r_riscv_jal_rela(struct module *me, u32 *location, +static int apply_r_riscv_jal_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u32 imm20 = (offset & 0x100000) << (31 - 20); u32 imm19_12 = (offset & 0xff000); u32 imm11 = (offset & 0x800) << (20 - 11); u32 imm10_1 = (offset & 0x7fe) << (30 - 10); - *location = (*location & 0xfff) | imm20 | imm19_12 | imm11 | imm10_1; - return 0; + return riscv_insn_rmw(location, 0xfff, imm20 | imm19_12 | imm11 | imm10_1); } -static int apply_r_riscv_rvc_branch_rela(struct module *me, u32 *location, +static int apply_r_riscv_rvc_branch_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u16 imm8 = (offset & 0x100) << (12 - 8); u16 imm7_6 = (offset & 0xc0) >> (6 - 5); u16 imm5 = (offset & 0x20) >> (5 - 2); u16 imm4_3 = (offset & 0x18) << (12 - 5); u16 imm2_1 = (offset & 0x6) << (12 - 10); - *(u16 *)location = (*(u16 *)location & 0xe383) | - imm8 | imm7_6 | imm5 | imm4_3 | imm2_1; - return 0; + return riscv_insn_rvc_rmw(location, 0xe383, + imm8 | imm7_6 | imm5 | imm4_3 | imm2_1); } -static int apply_r_riscv_rvc_jump_rela(struct module *me, u32 *location, +static int apply_r_riscv_rvc_jump_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u16 imm11 = (offset & 0x800) << (12 - 11); u16 imm10 = (offset & 0x400) >> (10 - 8); u16 imm9_8 = (offset & 0x300) << (12 - 11); @@ -98,16 +145,14 @@ static int apply_r_riscv_rvc_jump_rela(struct module *me, u32 *location, u16 imm4 = (offset & 0x10) << (12 - 5); u16 imm3_1 = (offset & 0xe) << (12 - 10); - *(u16 *)location = (*(u16 *)location & 0xe003) | - imm11 | imm10 | imm9_8 | imm7 | imm6 | imm5 | imm4 | imm3_1; - return 0; + return riscv_insn_rvc_rmw(location, 0xe003, + imm11 | imm10 | imm9_8 | imm7 | imm6 | imm5 | imm4 | imm3_1); } -static int apply_r_riscv_pcrel_hi20_rela(struct module *me, u32 *location, +static int apply_r_riscv_pcrel_hi20_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; - s32 hi20; + ptrdiff_t offset = (void *)v - location; if (!riscv_insn_valid_32bit_offset(offset)) { pr_err( @@ -116,23 +161,20 @@ static int apply_r_riscv_pcrel_hi20_rela(struct module *me, u32 *location, return -EINVAL; } - hi20 = (offset + 0x800) & 0xfffff000; - *location = (*location & 0xfff) | hi20; - return 0; + return riscv_insn_rmw(location, 0xfff, (offset + 0x800) & 0xfffff000); } -static int apply_r_riscv_pcrel_lo12_i_rela(struct module *me, u32 *location, +static int apply_r_riscv_pcrel_lo12_i_rela(struct module *me, void *location, Elf_Addr v) { /* * v is the lo12 value to fill. It is calculated before calling this * handler. 
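The PCREL_HI20/LO12 handlers above split a 32-bit PC-relative offset into a 20-bit upper immediate and a sign-extended 12-bit lower immediate; the +0x800 rounds the upper part so the signed low part can reach back down. A quick standalone check of that identity on a few sample offsets:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t offsets[] = { 0x12345678, -0x2468, 0x7ff, 0x800 };

	for (unsigned int i = 0; i < 4; i++) {
		int32_t off = offsets[i];
		int32_t hi20 = (off + 0x800) & 0xfffff000;
		int32_t lo12 = off - hi20;	/* always within [-2048, 2047] */

		printf("off=%#x hi20=%#x lo12=%d ok=%d\n",
		       (unsigned)off, (unsigned)hi20, lo12, hi20 + lo12 == off);
	}
	return 0;
}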
*/ - *location = (*location & 0xfffff) | ((v & 0xfff) << 20); - return 0; + return riscv_insn_rmw(location, 0xfffff, (v & 0xfff) << 20); } -static int apply_r_riscv_pcrel_lo12_s_rela(struct module *me, u32 *location, +static int apply_r_riscv_pcrel_lo12_s_rela(struct module *me, void *location, Elf_Addr v) { /* @@ -142,15 +184,12 @@ static int apply_r_riscv_pcrel_lo12_s_rela(struct module *me, u32 *location, u32 imm11_5 = (v & 0xfe0) << (31 - 11); u32 imm4_0 = (v & 0x1f) << (11 - 4); - *location = (*location & 0x1fff07f) | imm11_5 | imm4_0; - return 0; + return riscv_insn_rmw(location, 0x1fff07f, imm11_5 | imm4_0); } -static int apply_r_riscv_hi20_rela(struct module *me, u32 *location, +static int apply_r_riscv_hi20_rela(struct module *me, void *location, Elf_Addr v) { - s32 hi20; - if (IS_ENABLED(CONFIG_CMODEL_MEDLOW)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", @@ -158,22 +197,20 @@ static int apply_r_riscv_hi20_rela(struct module *me, u32 *location, return -EINVAL; } - hi20 = ((s32)v + 0x800) & 0xfffff000; - *location = (*location & 0xfff) | hi20; - return 0; + return riscv_insn_rmw(location, 0xfff, ((s32)v + 0x800) & 0xfffff000); } -static int apply_r_riscv_lo12_i_rela(struct module *me, u32 *location, +static int apply_r_riscv_lo12_i_rela(struct module *me, void *location, Elf_Addr v) { /* Skip medlow checking because of filtering by HI20 already */ s32 hi20 = ((s32)v + 0x800) & 0xfffff000; s32 lo12 = ((s32)v - hi20); - *location = (*location & 0xfffff) | ((lo12 & 0xfff) << 20); - return 0; + + return riscv_insn_rmw(location, 0xfffff, (lo12 & 0xfff) << 20); } -static int apply_r_riscv_lo12_s_rela(struct module *me, u32 *location, +static int apply_r_riscv_lo12_s_rela(struct module *me, void *location, Elf_Addr v) { /* Skip medlow checking because of filtering by HI20 already */ @@ -181,20 +218,18 @@ static int apply_r_riscv_lo12_s_rela(struct module *me, u32 *location, s32 lo12 = ((s32)v - hi20); u32 imm11_5 = (lo12 & 0xfe0) << (31 - 11); u32 imm4_0 = (lo12 & 0x1f) << (11 - 4); - *location = (*location & 0x1fff07f) | imm11_5 | imm4_0; - return 0; + + return riscv_insn_rmw(location, 0x1fff07f, imm11_5 | imm4_0); } -static int apply_r_riscv_got_hi20_rela(struct module *me, u32 *location, +static int apply_r_riscv_got_hi20_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; - s32 hi20; + ptrdiff_t offset = (void *)v - location; /* Always emit the got entry */ if (IS_ENABLED(CONFIG_MODULE_SECTIONS)) { - offset = module_emit_got_entry(me, v); - offset = (void *)offset - (void *)location; + offset = (void *)module_emit_got_entry(me, v) - location; } else { pr_err( "%s: can not generate the GOT entry for symbol = %016llx from PC = %p\n", @@ -202,22 +237,19 @@ static int apply_r_riscv_got_hi20_rela(struct module *me, u32 *location, return -EINVAL; } - hi20 = (offset + 0x800) & 0xfffff000; - *location = (*location & 0xfff) | hi20; - return 0; + return riscv_insn_rmw(location, 0xfff, (offset + 0x800) & 0xfffff000); } -static int apply_r_riscv_call_plt_rela(struct module *me, u32 *location, +static int apply_r_riscv_call_plt_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u32 hi20, lo12; if (!riscv_insn_valid_32bit_offset(offset)) { /* Only emit the plt entry if offset over 32-bit range */ if (IS_ENABLED(CONFIG_MODULE_SECTIONS)) { - offset = module_emit_plt_entry(me, v); - offset = (void *)offset - 
(void *)location; + offset = (void *)module_emit_plt_entry(me, v) - location; } else { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", @@ -228,15 +260,14 @@ static int apply_r_riscv_call_plt_rela(struct module *me, u32 *location, hi20 = (offset + 0x800) & 0xfffff000; lo12 = (offset - hi20) & 0xfff; - *location = (*location & 0xfff) | hi20; - *(location + 1) = (*(location + 1) & 0xfffff) | (lo12 << 20); - return 0; + riscv_insn_rmw(location, 0xfff, hi20); + return riscv_insn_rmw(location + 4, 0xfffff, lo12 << 20); } -static int apply_r_riscv_call_rela(struct module *me, u32 *location, +static int apply_r_riscv_call_rela(struct module *me, void *location, Elf_Addr v) { - ptrdiff_t offset = (void *)v - (void *)location; + ptrdiff_t offset = (void *)v - location; u32 hi20, lo12; if (!riscv_insn_valid_32bit_offset(offset)) { @@ -248,18 +279,17 @@ static int apply_r_riscv_call_rela(struct module *me, u32 *location, hi20 = (offset + 0x800) & 0xfffff000; lo12 = (offset - hi20) & 0xfff; - *location = (*location & 0xfff) | hi20; - *(location + 1) = (*(location + 1) & 0xfffff) | (lo12 << 20); - return 0; + riscv_insn_rmw(location, 0xfff, hi20); + return riscv_insn_rmw(location + 4, 0xfffff, lo12 << 20); } -static int apply_r_riscv_relax_rela(struct module *me, u32 *location, +static int apply_r_riscv_relax_rela(struct module *me, void *location, Elf_Addr v) { return 0; } -static int apply_r_riscv_align_rela(struct module *me, u32 *location, +static int apply_r_riscv_align_rela(struct module *me, void *location, Elf_Addr v) { pr_err( @@ -268,75 +298,499 @@ static int apply_r_riscv_align_rela(struct module *me, u32 *location, return -EINVAL; } -static int apply_r_riscv_add32_rela(struct module *me, u32 *location, +static int apply_r_riscv_add8_rela(struct module *me, void *location, Elf_Addr v) +{ + *(u8 *)location += (u8)v; + return 0; +} + +static int apply_r_riscv_add16_rela(struct module *me, void *location, + Elf_Addr v) +{ + *(u16 *)location += (u16)v; + return 0; +} + +static int apply_r_riscv_add32_rela(struct module *me, void *location, Elf_Addr v) { *(u32 *)location += (u32)v; return 0; } -static int apply_r_riscv_add64_rela(struct module *me, u32 *location, +static int apply_r_riscv_add64_rela(struct module *me, void *location, Elf_Addr v) { *(u64 *)location += (u64)v; return 0; } -static int apply_r_riscv_sub32_rela(struct module *me, u32 *location, +static int apply_r_riscv_sub8_rela(struct module *me, void *location, Elf_Addr v) +{ + *(u8 *)location -= (u8)v; + return 0; +} + +static int apply_r_riscv_sub16_rela(struct module *me, void *location, + Elf_Addr v) +{ + *(u16 *)location -= (u16)v; + return 0; +} + +static int apply_r_riscv_sub32_rela(struct module *me, void *location, Elf_Addr v) { *(u32 *)location -= (u32)v; return 0; } -static int apply_r_riscv_sub64_rela(struct module *me, u32 *location, +static int apply_r_riscv_sub64_rela(struct module *me, void *location, Elf_Addr v) { *(u64 *)location -= (u64)v; return 0; } -static int (*reloc_handlers_rela[]) (struct module *me, u32 *location, - Elf_Addr v) = { - [R_RISCV_32] = apply_r_riscv_32_rela, - [R_RISCV_64] = apply_r_riscv_64_rela, - [R_RISCV_BRANCH] = apply_r_riscv_branch_rela, - [R_RISCV_JAL] = apply_r_riscv_jal_rela, - [R_RISCV_RVC_BRANCH] = apply_r_riscv_rvc_branch_rela, - [R_RISCV_RVC_JUMP] = apply_r_riscv_rvc_jump_rela, - [R_RISCV_PCREL_HI20] = apply_r_riscv_pcrel_hi20_rela, - [R_RISCV_PCREL_LO12_I] = apply_r_riscv_pcrel_lo12_i_rela, - [R_RISCV_PCREL_LO12_S] = 
apply_r_riscv_pcrel_lo12_s_rela, - [R_RISCV_HI20] = apply_r_riscv_hi20_rela, - [R_RISCV_LO12_I] = apply_r_riscv_lo12_i_rela, - [R_RISCV_LO12_S] = apply_r_riscv_lo12_s_rela, - [R_RISCV_GOT_HI20] = apply_r_riscv_got_hi20_rela, - [R_RISCV_CALL_PLT] = apply_r_riscv_call_plt_rela, - [R_RISCV_CALL] = apply_r_riscv_call_rela, - [R_RISCV_RELAX] = apply_r_riscv_relax_rela, - [R_RISCV_ALIGN] = apply_r_riscv_align_rela, - [R_RISCV_ADD32] = apply_r_riscv_add32_rela, - [R_RISCV_ADD64] = apply_r_riscv_add64_rela, - [R_RISCV_SUB32] = apply_r_riscv_sub32_rela, - [R_RISCV_SUB64] = apply_r_riscv_sub64_rela, +static int dynamic_linking_not_supported(struct module *me, void *location, + Elf_Addr v) +{ + pr_err("%s: Dynamic linking not supported in kernel modules PC = %p\n", + me->name, location); + return -EINVAL; +} + +static int tls_not_supported(struct module *me, void *location, Elf_Addr v) +{ + pr_err("%s: Thread local storage not supported in kernel modules PC = %p\n", + me->name, location); + return -EINVAL; +} + +static int apply_r_riscv_sub6_rela(struct module *me, void *location, Elf_Addr v) +{ + u8 *byte = location; + u8 value = v; + + *byte = (*byte - (value & 0x3f)) & 0x3f; + return 0; +} + +static int apply_r_riscv_set6_rela(struct module *me, void *location, Elf_Addr v) +{ + u8 *byte = location; + u8 value = v; + + *byte = (*byte & 0xc0) | (value & 0x3f); + return 0; +} + +static int apply_r_riscv_set8_rela(struct module *me, void *location, Elf_Addr v) +{ + *(u8 *)location = (u8)v; + return 0; +} + +static int apply_r_riscv_set16_rela(struct module *me, void *location, + Elf_Addr v) +{ + *(u16 *)location = (u16)v; + return 0; +} + +static int apply_r_riscv_set32_rela(struct module *me, void *location, + Elf_Addr v) +{ + *(u32 *)location = (u32)v; + return 0; +} + +static int apply_r_riscv_32_pcrel_rela(struct module *me, void *location, + Elf_Addr v) +{ + *(u32 *)location = v - (uintptr_t)location; + return 0; +} + +static int apply_r_riscv_plt32_rela(struct module *me, void *location, + Elf_Addr v) +{ + ptrdiff_t offset = (void *)v - location; + + if (!riscv_insn_valid_32bit_offset(offset)) { + /* Only emit the plt entry if offset over 32-bit range */ + if (IS_ENABLED(CONFIG_MODULE_SECTIONS)) { + offset = (void *)module_emit_plt_entry(me, v) - location; + } else { + pr_err("%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", + me->name, (long long)v, location); + return -EINVAL; + } + } + + *(u32 *)location = (u32)offset; + return 0; +} + +static int apply_r_riscv_set_uleb128(struct module *me, void *location, Elf_Addr v) +{ + *(long *)location = v; + return 0; +} + +static int apply_r_riscv_sub_uleb128(struct module *me, void *location, Elf_Addr v) +{ + *(long *)location -= v; + return 0; +} + +static int apply_6_bit_accumulation(struct module *me, void *location, long buffer) +{ + u8 *byte = location; + u8 value = buffer; + + if (buffer > 0x3f) { + pr_err("%s: value %ld out of range for 6-bit relocation.\n", + me->name, buffer); + return -EINVAL; + } + + *byte = (*byte & 0xc0) | (value & 0x3f); + return 0; +} + +static int apply_8_bit_accumulation(struct module *me, void *location, long buffer) +{ + if (buffer > U8_MAX) { + pr_err("%s: value %ld out of range for 8-bit relocation.\n", + me->name, buffer); + return -EINVAL; + } + *(u8 *)location = (u8)buffer; + return 0; +} + +static int apply_16_bit_accumulation(struct module *me, void *location, long buffer) +{ + if (buffer > U16_MAX) { + pr_err("%s: value %ld out of range for 16-bit relocation.\n", + me->name, 
buffer); + return -EINVAL; + } + *(u16 *)location = (u16)buffer; + return 0; +} + +static int apply_32_bit_accumulation(struct module *me, void *location, long buffer) +{ + if (buffer > U32_MAX) { + pr_err("%s: value %ld out of range for 32-bit relocation.\n", + me->name, buffer); + return -EINVAL; + } + *(u32 *)location = (u32)buffer; + return 0; +} + +static int apply_64_bit_accumulation(struct module *me, void *location, long buffer) +{ + *(u64 *)location = (u64)buffer; + return 0; +} + +static int apply_uleb128_accumulation(struct module *me, void *location, long buffer) +{ + /* + * ULEB128 is a variable length encoding. Encode the buffer into + * the ULEB128 data format. + */ + u8 *p = location; + + while (buffer != 0) { + u8 value = buffer & 0x7f; + + buffer >>= 7; + value |= (!!buffer) << 7; + + *p++ = value; + } + return 0; +} + +/* + * Relocations defined in the riscv-elf-psabi-doc. + * This handles static linking only. + */ +static const struct relocation_handlers reloc_handlers[] = { + [R_RISCV_32] = { .reloc_handler = apply_r_riscv_32_rela }, + [R_RISCV_64] = { .reloc_handler = apply_r_riscv_64_rela }, + [R_RISCV_RELATIVE] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_COPY] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_JUMP_SLOT] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_DTPMOD32] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_DTPMOD64] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_DTPREL32] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_DTPREL64] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_TPREL32] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_TLS_TPREL64] = { .reloc_handler = dynamic_linking_not_supported }, + /* 12-15 undefined */ + [R_RISCV_BRANCH] = { .reloc_handler = apply_r_riscv_branch_rela }, + [R_RISCV_JAL] = { .reloc_handler = apply_r_riscv_jal_rela }, + [R_RISCV_CALL] = { .reloc_handler = apply_r_riscv_call_rela }, + [R_RISCV_CALL_PLT] = { .reloc_handler = apply_r_riscv_call_plt_rela }, + [R_RISCV_GOT_HI20] = { .reloc_handler = apply_r_riscv_got_hi20_rela }, + [R_RISCV_TLS_GOT_HI20] = { .reloc_handler = tls_not_supported }, + [R_RISCV_TLS_GD_HI20] = { .reloc_handler = tls_not_supported }, + [R_RISCV_PCREL_HI20] = { .reloc_handler = apply_r_riscv_pcrel_hi20_rela }, + [R_RISCV_PCREL_LO12_I] = { .reloc_handler = apply_r_riscv_pcrel_lo12_i_rela }, + [R_RISCV_PCREL_LO12_S] = { .reloc_handler = apply_r_riscv_pcrel_lo12_s_rela }, + [R_RISCV_HI20] = { .reloc_handler = apply_r_riscv_hi20_rela }, + [R_RISCV_LO12_I] = { .reloc_handler = apply_r_riscv_lo12_i_rela }, + [R_RISCV_LO12_S] = { .reloc_handler = apply_r_riscv_lo12_s_rela }, + [R_RISCV_TPREL_HI20] = { .reloc_handler = tls_not_supported }, + [R_RISCV_TPREL_LO12_I] = { .reloc_handler = tls_not_supported }, + [R_RISCV_TPREL_LO12_S] = { .reloc_handler = tls_not_supported }, + [R_RISCV_TPREL_ADD] = { .reloc_handler = tls_not_supported }, + [R_RISCV_ADD8] = { .reloc_handler = apply_r_riscv_add8_rela, + .accumulate_handler = apply_8_bit_accumulation }, + [R_RISCV_ADD16] = { .reloc_handler = apply_r_riscv_add16_rela, + .accumulate_handler = apply_16_bit_accumulation }, + [R_RISCV_ADD32] = { .reloc_handler = apply_r_riscv_add32_rela, + .accumulate_handler = apply_32_bit_accumulation }, + [R_RISCV_ADD64] = { .reloc_handler = apply_r_riscv_add64_rela, + .accumulate_handler = apply_64_bit_accumulation }, + [R_RISCV_SUB8] = { .reloc_handler = 
apply_r_riscv_sub8_rela, + .accumulate_handler = apply_8_bit_accumulation }, + [R_RISCV_SUB16] = { .reloc_handler = apply_r_riscv_sub16_rela, + .accumulate_handler = apply_16_bit_accumulation }, + [R_RISCV_SUB32] = { .reloc_handler = apply_r_riscv_sub32_rela, + .accumulate_handler = apply_32_bit_accumulation }, + [R_RISCV_SUB64] = { .reloc_handler = apply_r_riscv_sub64_rela, + .accumulate_handler = apply_64_bit_accumulation }, + /* 41-42 reserved for future standard use */ + [R_RISCV_ALIGN] = { .reloc_handler = apply_r_riscv_align_rela }, + [R_RISCV_RVC_BRANCH] = { .reloc_handler = apply_r_riscv_rvc_branch_rela }, + [R_RISCV_RVC_JUMP] = { .reloc_handler = apply_r_riscv_rvc_jump_rela }, + /* 46-50 reserved for future standard use */ + [R_RISCV_RELAX] = { .reloc_handler = apply_r_riscv_relax_rela }, + [R_RISCV_SUB6] = { .reloc_handler = apply_r_riscv_sub6_rela, + .accumulate_handler = apply_6_bit_accumulation }, + [R_RISCV_SET6] = { .reloc_handler = apply_r_riscv_set6_rela, + .accumulate_handler = apply_6_bit_accumulation }, + [R_RISCV_SET8] = { .reloc_handler = apply_r_riscv_set8_rela, + .accumulate_handler = apply_8_bit_accumulation }, + [R_RISCV_SET16] = { .reloc_handler = apply_r_riscv_set16_rela, + .accumulate_handler = apply_16_bit_accumulation }, + [R_RISCV_SET32] = { .reloc_handler = apply_r_riscv_set32_rela, + .accumulate_handler = apply_32_bit_accumulation }, + [R_RISCV_32_PCREL] = { .reloc_handler = apply_r_riscv_32_pcrel_rela }, + [R_RISCV_IRELATIVE] = { .reloc_handler = dynamic_linking_not_supported }, + [R_RISCV_PLT32] = { .reloc_handler = apply_r_riscv_plt32_rela }, + [R_RISCV_SET_ULEB128] = { .reloc_handler = apply_r_riscv_set_uleb128, + .accumulate_handler = apply_uleb128_accumulation }, + [R_RISCV_SUB_ULEB128] = { .reloc_handler = apply_r_riscv_sub_uleb128, + .accumulate_handler = apply_uleb128_accumulation }, + /* 62-191 reserved for future standard use */ + /* 192-255 nonstandard ABI extensions */ }; +static void +process_accumulated_relocations(struct module *me, + struct hlist_head **relocation_hashtable, + struct list_head *used_buckets_list) +{ + /* + * Only ADD/SUB/SET/ULEB128 should end up here. + * + * Each bucket may have more than one relocation location. All + * relocations for a location are stored in a list in a bucket. + * + * Relocations are applied to a temp variable before being stored to the + * provided location to check for overflow. This also allows ULEB128 to + * properly decide how many entries are needed before storing to + * location. The final value is stored into location using the handler + * for the last relocation to an address. 
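apply_uleb128_accumulation() above re-encodes the accumulated value as ULEB128: seven payload bits per byte, with bit 7 as a continuation flag. A small self-contained sketch of that encoding (the do/while variant here also emits a byte for the value 0, which the kernel's while (buffer != 0) loop simply leaves untouched):

#include <stdint.h>
#include <stdio.h>

/* Encode `value` as ULEB128 into `out`; returns the number of bytes used. */
static unsigned int uleb128_encode(uint64_t value, uint8_t *out)
{
        unsigned int n = 0;

        do {
                uint8_t byte = value & 0x7f;    /* low seven payload bits */

                value >>= 7;
                if (value)
                        byte |= 0x80;           /* continuation bit: more bytes follow */
                out[n++] = byte;
        } while (value);

        return n;
}

int main(void)
{
        uint8_t buf[10];
        unsigned int n = uleb128_encode(624485, buf);   /* expected bytes: e5 8e 26 */

        for (unsigned int i = 0; i < n; i++)
                printf("%02x ", buf[i]);
        printf("\n");
        return 0;
}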
+ * + * Three layers of indexing: + * - Each of the buckets in use + * - Groups of relocations in each bucket by location address + * - Each relocation entry for a location address + */ + struct used_bucket *bucket_iter; + struct used_bucket *bucket_iter_tmp; + struct relocation_head *rel_head_iter; + struct hlist_node *rel_head_iter_tmp; + struct relocation_entry *rel_entry_iter; + struct relocation_entry *rel_entry_iter_tmp; + int curr_type; + void *location; + long buffer; + + list_for_each_entry_safe(bucket_iter, bucket_iter_tmp, + used_buckets_list, head) { + hlist_for_each_entry_safe(rel_head_iter, rel_head_iter_tmp, + bucket_iter->bucket, node) { + buffer = 0; + location = rel_head_iter->location; + list_for_each_entry_safe(rel_entry_iter, + rel_entry_iter_tmp, + &rel_head_iter->rel_entry, + head) { + curr_type = rel_entry_iter->type; + reloc_handlers[curr_type].reloc_handler( + me, &buffer, rel_entry_iter->value); + kfree(rel_entry_iter); + } + reloc_handlers[curr_type].accumulate_handler( + me, location, buffer); + kfree(rel_head_iter); + } + kfree(bucket_iter); + } + + kvfree(*relocation_hashtable); +} + +static int add_relocation_to_accumulate(struct module *me, int type, + void *location, + unsigned int hashtable_bits, Elf_Addr v, + struct hlist_head *relocation_hashtable, + struct list_head *used_buckets_list) +{ + struct relocation_entry *entry; + struct relocation_head *rel_head; + struct hlist_head *current_head; + struct used_bucket *bucket; + unsigned long hash; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->head); + entry->type = type; + entry->value = v; + + hash = hash_min((uintptr_t)location, hashtable_bits); + + current_head = &relocation_hashtable[hash]; + + /* + * Search for the relocation_head for the relocations that happen at the + * provided location + */ + bool found = false; + struct relocation_head *rel_head_iter; + + hlist_for_each_entry(rel_head_iter, current_head, node) { + if (rel_head_iter->location == location) { + found = true; + rel_head = rel_head_iter; + break; + } + } + + /* + * If there has not yet been any relocations at the provided location, + * create a relocation_head for that location and populate it with this + * relocation_entry. + */ + if (!found) { + rel_head = kmalloc(sizeof(*rel_head), GFP_KERNEL); + + if (!rel_head) { + kfree(entry); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rel_head->rel_entry); + rel_head->location = location; + INIT_HLIST_NODE(&rel_head->node); + if (!current_head->first) { + bucket = + kmalloc(sizeof(struct used_bucket), GFP_KERNEL); + + if (!bucket) { + kfree(entry); + kfree(rel_head); + return -ENOMEM; + } + + INIT_LIST_HEAD(&bucket->head); + bucket->bucket = current_head; + list_add(&bucket->head, used_buckets_list); + } + hlist_add_head(&rel_head->node, current_head); + } + + /* Add relocation to head of discovered rel_head */ + list_add_tail(&entry->head, &rel_head->rel_entry); + + return 0; +} + +static unsigned int +initialize_relocation_hashtable(unsigned int num_relocations, + struct hlist_head **relocation_hashtable) +{ + /* Can safely assume that bits is not greater than sizeof(long) */ + unsigned long hashtable_size = roundup_pow_of_two(num_relocations); + /* + * When hashtable_size == 1, hashtable_bits == 0. + * This is valid because the hashing algorithm returns 0 in this case. + */ + unsigned int hashtable_bits = ilog2(hashtable_size); + + /* + * Double size of hashtable if num_relocations * 1.25 is greater than + * hashtable_size. 
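The hashtable sizing described in the comment above rounds the relocation count up to a power of two and doubles once more whenever the count would exceed roughly 80% of the table (num + num/4 > size). A standalone sketch of that rule, with open-coded stand-ins for roundup_pow_of_two() and ilog2():

#include <stdio.h>

static unsigned int hashtable_bits(unsigned long num_relocations)
{
        unsigned long size = 1;
        unsigned int bits = 0;

        /* open-coded roundup_pow_of_two() + ilog2() */
        while (size < num_relocations) {
                size <<= 1;
                bits++;
        }

        /* keep the load factor at or below ~0.8 */
        if (num_relocations + (num_relocations >> 2) > size) {
                size <<= 1;
                bits++;
        }

        return bits;
}

int main(void)
{
        /* 1 -> 0 bits, 100 -> 7 bits (128 slots), 1000 -> 11 bits (2048 slots) */
        printf("%u %u %u\n", hashtable_bits(1), hashtable_bits(100),
               hashtable_bits(1000));
        return 0;
}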
+ */ + int should_double_size = ((num_relocations + (num_relocations >> 2)) > (hashtable_size)); + + hashtable_bits += should_double_size; + + hashtable_size <<= should_double_size; + + /* Number of relocations may be large, so kvmalloc it */ + *relocation_hashtable = kvmalloc_array(hashtable_size, + sizeof(**relocation_hashtable), + GFP_KERNEL); + if (!*relocation_hashtable) + return 0; + + __hash_init(*relocation_hashtable, hashtable_size); + + return hashtable_bits; +} + int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, struct module *me) { Elf_Rela *rel = (void *) sechdrs[relsec].sh_addr; - int (*handler)(struct module *me, u32 *location, Elf_Addr v); + int (*handler)(struct module *me, void *location, Elf_Addr v); Elf_Sym *sym; - u32 *location; + void *location; unsigned int i, type; + unsigned int j_idx = 0; Elf_Addr v; int res; + unsigned int num_relocations = sechdrs[relsec].sh_size / sizeof(*rel); + struct hlist_head *relocation_hashtable; + unsigned int hashtable_bits; + LIST_HEAD(used_buckets_list); + + hashtable_bits = initialize_relocation_hashtable(num_relocations, + &relocation_hashtable); + + if (!relocation_hashtable) + return -ENOMEM; pr_debug("Applying relocate section %u to %u\n", relsec, sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + for (i = 0; i < num_relocations; i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + rel[i].r_offset; @@ -354,8 +808,8 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, type = ELF_RISCV_R_TYPE(rel[i].r_info); - if (type < ARRAY_SIZE(reloc_handlers_rela)) - handler = reloc_handlers_rela[type]; + if (type < ARRAY_SIZE(reloc_handlers)) + handler = reloc_handlers[type].reloc_handler; else handler = NULL; @@ -368,9 +822,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, v = sym->st_value + rel[i].r_addend; if (type == R_RISCV_PCREL_LO12_I || type == R_RISCV_PCREL_LO12_S) { - unsigned int j; + unsigned int j = j_idx; + bool found = false; - for (j = 0; j < sechdrs[relsec].sh_size / sizeof(*rel); j++) { + do { unsigned long hi20_loc = sechdrs[sechdrs[relsec].sh_info].sh_addr + rel[j].r_offset; @@ -399,49 +854,43 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, hi20 = (offset + 0x800) & 0xfffff000; lo12 = offset - hi20; v = lo12; + found = true; break; } - } - if (j == sechdrs[relsec].sh_size / sizeof(*rel)) { + + j++; + if (j == num_relocations) + j = 0; + + } while (j_idx != j); + + if (!found) { pr_err( "%s: Can not find HI20 relocation information\n", me->name); return -EINVAL; } + + /* Record the previous j-loop end index */ + j_idx = j; } - res = handler(me, location, v); + if (reloc_handlers[type].accumulate_handler) + res = add_relocation_to_accumulate(me, type, location, + hashtable_bits, v, + relocation_hashtable, + &used_buckets_list); + else + res = handler(me, location, v); if (res) return res; } - return 0; -} - -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) -void *module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, MODULES_VADDR, - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#endif - -static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - const char *name) -{ - const Elf_Shdr *s, *se; - const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + process_accumulated_relocations(me, &relocation_hashtable, + 
&used_buckets_list); - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { - if (strcmp(name, secstrs + s->sh_name) == 0) - return s; - } - - return NULL; + return 0; } int module_finalize(const Elf_Ehdr *hdr, diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c new file mode 100644 index 000000000000..fa6b0339a65d --- /dev/null +++ b/arch/riscv/kernel/paravirt.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "riscv-pv: " fmt + +#include <linux/cpuhotplug.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <linux/kconfig.h> +#include <linux/kernel.h> +#include <linux/percpu-defs.h> +#include <linux/printk.h> +#include <linux/static_call.h> +#include <linux/types.h> + +#include <asm/barrier.h> +#include <asm/page.h> +#include <asm/paravirt.h> +#include <asm/sbi.h> + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +static DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); + +static bool __init has_pv_steal_clock(void) +{ + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_STA) > 0) { + pr_info("SBI STA extension detected\n"); + return true; + } + + return false; +} + +static int sbi_sta_steal_time_set_shmem(unsigned long lo, unsigned long hi, + unsigned long flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_STA, SBI_EXT_STA_STEAL_TIME_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + if (ret.error) { + if (lo == SBI_SHMEM_DISABLE && hi == SBI_SHMEM_DISABLE) + pr_warn("Failed to disable steal-time shmem"); + else + pr_warn("Failed to set steal-time shmem"); + return sbi_err_map_linux_errno(ret.error); + } + + return 0; +} + +static int pv_time_cpu_online(unsigned int cpu) +{ + struct sbi_sta_struct *st = this_cpu_ptr(&steal_time); + phys_addr_t pa = __pa(st); + unsigned long lo = (unsigned long)pa; + unsigned long hi = IS_ENABLED(CONFIG_32BIT) ? upper_32_bits((u64)pa) : 0; + + return sbi_sta_steal_time_set_shmem(lo, hi, 0); +} + +static int pv_time_cpu_down_prepare(unsigned int cpu) +{ + return sbi_sta_steal_time_set_shmem(SBI_SHMEM_DISABLE, + SBI_SHMEM_DISABLE, 0); +} + +static u64 pv_time_steal_clock(int cpu) +{ + struct sbi_sta_struct *st = per_cpu_ptr(&steal_time, cpu); + __le32 sequence; + __le64 steal; + + /* + * Check the sequence field before and after reading the steal + * field. Repeat the read if it is different or odd. 
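The comment above describes a seqcount-style lock-free read: the hypervisor bumps the sequence to an odd value while it updates the record, so the guest retries whenever it observes an odd or changed sequence. A minimal userspace sketch of the same pattern (the field layout and names are illustrative only; the real code reads the SBI STA shared memory and uses virt_rmb() barriers rather than relying on volatile):

#include <stdint.h>

struct steal_record {
        volatile uint32_t sequence;     /* odd while an update is in flight */
        volatile uint64_t steal;        /* nanoseconds of stolen time */
};

static uint64_t read_steal(const struct steal_record *rec)
{
        uint32_t seq;
        uint64_t steal;

        do {
                seq = rec->sequence;    /* snapshot before the read ... */
                steal = rec->steal;
                /* ... and recheck after: retry on odd or changed sequence */
        } while ((seq & 1) || seq != rec->sequence);

        return steal;
}

int main(void)
{
        struct steal_record rec = { .sequence = 2, .steal = 123456 };

        return read_steal(&rec) == 123456 ? 0 : 1;
}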
+ */ + do { + sequence = READ_ONCE(st->sequence); + virt_rmb(); + steal = READ_ONCE(st->steal); + virt_rmb(); + } while ((le32_to_cpu(sequence) & 1) || + sequence != READ_ONCE(st->sequence)); + + return le64_to_cpu(steal); +} + +int __init pv_time_init(void) +{ + int ret; + + if (!has_pv_steal_clock()) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "riscv/pv_time:online", + pv_time_cpu_online, + pv_time_cpu_down_prepare); + if (ret < 0) + return ret; + + static_call_update(pv_steal_clock, pv_time_steal_clock); + + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + + pr_info("Computing paravirt steal-time\n"); + + return 0; +} diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 765004b60513..db13c9ddf9e3 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -6,20 +6,34 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memory.h> +#include <linux/string.h> #include <linux/uaccess.h> #include <linux/stop_machine.h> #include <asm/kprobes.h> #include <asm/cacheflush.h> #include <asm/fixmap.h> -#include <asm/patch.h> +#include <asm/ftrace.h> +#include <asm/text-patching.h> +#include <asm/sections.h> struct patch_insn { void *addr; - u32 insn; + u32 *insns; + size_t len; atomic_t cpu_count; }; +int riscv_patch_in_stop_machine = false; + #ifdef CONFIG_MMU + +static inline bool is_kernel_exittext(uintptr_t addr) +{ + return system_state < SYSTEM_RUNNING && + addr >= (uintptr_t)__exittext_begin && + addr < (uintptr_t)__exittext_end; +} + /* * The fix_to_virt(, idx) needs a const value (not a dynamic variable of * reg-a0) or BUILD_BUG_ON failed with "idx >= __end_of_fixed_addresses". @@ -30,7 +44,7 @@ static __always_inline void *patch_map(void *addr, const unsigned int fixmap) uintptr_t uintaddr = (uintptr_t) addr; struct page *page; - if (core_kernel_text(uintaddr)) + if (core_kernel_text(uintaddr) || is_kernel_exittext(uintaddr)) page = phys_to_page(__pa_symbol(addr)); else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) page = vmalloc_to_page(addr); @@ -40,7 +54,7 @@ static __always_inline void *patch_map(void *addr, const unsigned int fixmap) BUG_ON(!page); return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); + offset_in_page(addr)); } static void patch_unmap(int fixmap) @@ -49,51 +63,186 @@ static void patch_unmap(int fixmap) } NOKPROBE_SYMBOL(patch_unmap); -static int patch_insn_write(void *addr, const void *insn, size_t len) +static int __patch_insn_set(void *addr, u8 c, size_t len) { + bool across_pages = (offset_in_page(addr) + len) > PAGE_SIZE; void *waddr = addr; - bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE; - int ret; /* + * Only two pages can be mapped at a time for writing. + */ + if (len + offset_in_page(addr) > 2 * PAGE_SIZE) + return -EINVAL; + /* * Before reaching here, it was expected to lock the text_mutex * already, so we don't need to give another lock here and could * ensure that it was safe between each cores. */ lockdep_assert_held(&text_mutex); + preempt_disable(); + if (across_pages) - patch_map(addr + len, FIX_TEXT_POKE1); + patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); + + waddr = patch_map(addr, FIX_TEXT_POKE0); + + memset(waddr, c, len); + + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. 
+ */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + + patch_unmap(FIX_TEXT_POKE0); + + if (across_pages) + patch_unmap(FIX_TEXT_POKE1); + + preempt_enable(); + + return 0; +} +NOKPROBE_SYMBOL(__patch_insn_set); + +static int __patch_insn_write(void *addr, const void *insn, size_t len) +{ + bool across_pages = (offset_in_page(addr) + len) > PAGE_SIZE; + void *waddr = addr; + int ret; + + /* + * Only two pages can be mapped at a time for writing. + */ + if (len + offset_in_page(addr) > 2 * PAGE_SIZE) + return -EINVAL; + + /* + * Before reaching here, it was expected to lock the text_mutex + * already, so we don't need to give another lock here and could + * ensure that it was safe between each cores. + * + * We're currently using stop_machine() for ftrace & kprobes, and while + * that ensures text_mutex is held before installing the mappings it + * does not ensure text_mutex is held by the calling thread. That's + * safe but triggers a lockdep failure, so just elide it for that + * specific case. + */ + if (!riscv_patch_in_stop_machine) + lockdep_assert_held(&text_mutex); + + preempt_disable(); + + if (across_pages) + patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); waddr = patch_map(addr, FIX_TEXT_POKE0); ret = copy_to_kernel_nofault(waddr, insn, len); + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. + */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + patch_unmap(FIX_TEXT_POKE0); if (across_pages) patch_unmap(FIX_TEXT_POKE1); + preempt_enable(); + return ret; } -NOKPROBE_SYMBOL(patch_insn_write); +NOKPROBE_SYMBOL(__patch_insn_write); #else -static int patch_insn_write(void *addr, const void *insn, size_t len) +static int __patch_insn_set(void *addr, u8 c, size_t len) +{ + memset(addr, c, len); + + return 0; +} +NOKPROBE_SYMBOL(__patch_insn_set); + +static int __patch_insn_write(void *addr, const void *insn, size_t len) { return copy_to_kernel_nofault(addr, insn, len); } -NOKPROBE_SYMBOL(patch_insn_write); +NOKPROBE_SYMBOL(__patch_insn_write); #endif /* CONFIG_MMU */ -int patch_text_nosync(void *addr, const void *insns, size_t len) +static int patch_insn_set(void *addr, u8 c, size_t len) +{ + size_t size; + int ret; + + /* + * __patch_insn_set() can only work on 2 pages at a time so call it in a + * loop with len <= 2 * PAGE_SIZE. + */ + while (len) { + size = min(len, PAGE_SIZE * 2 - offset_in_page(addr)); + ret = __patch_insn_set(addr, c, size); + if (ret) + return ret; + + addr += size; + len -= size; + } + + return 0; +} +NOKPROBE_SYMBOL(patch_insn_set); + +int patch_text_set_nosync(void *addr, u8 c, size_t len) +{ + int ret; + + ret = patch_insn_set(addr, c, len); + if (!ret) + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); + + return ret; +} +NOKPROBE_SYMBOL(patch_text_set_nosync); + +int patch_insn_write(void *addr, const void *insn, size_t len) { - u32 *tp = addr; + size_t size; int ret; - ret = patch_insn_write(tp, insns, len); + /* + * Copy the instructions to the destination address, two pages at a time + * because __patch_insn_write() can only handle len <= 2 * PAGE_SIZE. 
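patch_insn_write() below (and patch_insn_set() above it) cut long patches at two-page boundaries because only the FIX_TEXT_POKE0/1 fixmap slots can be mapped at once. A quick standalone check of the chunk arithmetic, with PAGE_SIZE assumed to be 4 KiB:

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE               4096UL
#define offset_in_page(addr)    ((unsigned long)(addr) & (PAGE_SIZE - 1))

static void show_chunks(unsigned long addr, size_t len)
{
        while (len) {
                /* never cross the end of the second page containing addr */
                size_t size = 2 * PAGE_SIZE - offset_in_page(addr);

                if (size > len)
                        size = len;
                printf("patch %5zu bytes at 0x%lx\n", size, addr);
                addr += size;
                len -= size;
        }
}

int main(void)
{
        /* starts 0xf00 bytes into a page, so the first chunk is two pages minus 0xf00 */
        show_chunks(0x1000f00UL, 3 * PAGE_SIZE);
        return 0;
}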
+ */ + while (len) { + size = min(len, PAGE_SIZE * 2 - offset_in_page(addr)); + ret = __patch_insn_write(addr, insn, size); + if (ret) + return ret; + + addr += size; + insn += size; + len -= size; + } + + return 0; +} +NOKPROBE_SYMBOL(patch_insn_write); +int patch_text_nosync(void *addr, const void *insns, size_t len) +{ + int ret; + + ret = patch_insn_write(addr, insns, len); if (!ret) - flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len); + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return ret; } @@ -105,29 +254,48 @@ static int patch_text_cb(void *data) int ret = 0; if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { - ret = - patch_text_nosync(patch->addr, &patch->insn, - GET_INSN_LENGTH(patch->insn)); - atomic_inc(&patch->cpu_count); + ret = patch_insn_write(patch->addr, patch->insns, patch->len); + /* + * Make sure the patching store is effective *before* we + * increment the counter which releases all waiting CPUs + * by using the release variant of atomic increment. The + * release pairs with the call to local_flush_icache_all() + * on the waiting CPU. + */ + atomic_inc_return_release(&patch->cpu_count); } else { while (atomic_read(&patch->cpu_count) <= num_online_cpus()) cpu_relax(); - smp_mb(); + + local_flush_icache_all(); } return ret; } NOKPROBE_SYMBOL(patch_text_cb); -int patch_text(void *addr, u32 insn) +int patch_text(void *addr, u32 *insns, size_t len) { + int ret; struct patch_insn patch = { .addr = addr, - .insn = insn, + .insns = insns, + .len = len, .cpu_count = ATOMIC_INIT(0), }; - return stop_machine_cpuslocked(patch_text_cb, - &patch, cpu_online_mask); + /* + * kprobes takes text_mutex, before calling patch_text(), but as we call + * calls stop_machine(), the lockdep assertion in patch_insn_write() + * gets confused by the context in which the lock is taken. + * Instead, ensure the lock is held before calling stop_machine(), and + * set riscv_patch_in_stop_machine to skip the check in + * patch_insn_write(). + */ + lockdep_assert_held(&text_mutex); + riscv_patch_in_stop_machine = true; + ret = stop_machine_cpuslocked(patch_text_cb, &patch, cpu_online_mask); + riscv_patch_in_stop_machine = false; + return ret; } NOKPROBE_SYMBOL(patch_text); diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 3348a61de7d9..b465bc9eb870 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -6,37 +6,9 @@ #include <asm/stacktrace.h> -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. 
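The removed user_backtrace() (and the arch_stack_walk_user() helper that replaces it) walks frame-pointer records: with frame pointers enabled, each function stores the caller's fp and ra immediately below its own frame pointer. A synthetic, safe-to-run illustration of that walk over a fake stack (the {fp, ra} layout follows the kernel's struct stackframe; the addresses are made up):

#include <stdint.h>
#include <stdio.h>

struct stackframe {
        uintptr_t fp;   /* caller's frame pointer */
        uintptr_t ra;   /* return address into the caller */
};

int main(void)
{
        /* fake stack: each record sits immediately below the fp that reaches it */
        struct stackframe frames[3] = {
                { 0, 0 },                               /* outermost frame: terminates the walk */
                { (uintptr_t)&frames[1], 0x10400 },     /* fp points just past frames[0] */
                { (uintptr_t)&frames[2], 0x10800 },     /* fp points just past frames[1] */
        };
        uintptr_t fp = (uintptr_t)&frames[3];           /* innermost fp: just past frames[2] */

        while (fp) {
                const struct stackframe *tail =
                        (const struct stackframe *)(fp - sizeof(struct stackframe));

                if (!tail->ra)
                        break;
                printf("ra = 0x%lx\n", (unsigned long)tail->ra);
                fp = tail->fp;
        }
        return 0;
}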
- */ -static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, - unsigned long fp, unsigned long reg_ra) +static bool fill_callchain(void *entry, unsigned long pc) { - struct stackframe buftail; - unsigned long ra = 0; - unsigned long __user *user_frame_tail = - (unsigned long __user *)(fp - sizeof(struct stackframe)); - - /* Check accessibility of one struct frame_tail beyond */ - if (!access_ok(user_frame_tail, sizeof(buftail))) - return 0; - if (__copy_from_user_inatomic(&buftail, user_frame_tail, - sizeof(buftail))) - return 0; - - if (reg_ra != 0) - ra = reg_ra; - else - ra = buftail.ra; - - fp = buftail.fp; - if (ra != 0) - perf_callchain_store(entry, ra); - else - return 0; - - return fp; + return perf_callchain_store(entry, pc) == 0; } /* @@ -56,23 +28,21 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - unsigned long fp = 0; + if (perf_guest_state()) { + /* TODO: We don't support guest os callchain now */ + return; + } - fp = regs->s0; - perf_callchain_store(entry, regs->epc); - - fp = user_backtrace(entry, fp, regs->ra); - while (fp && !(fp & 0x3) && entry->nr < entry->max_stack) - fp = user_backtrace(entry, fp, 0); -} - -static bool fill_callchain(void *entry, unsigned long pc) -{ - return perf_callchain_store(entry, pc) == 0; + arch_stack_walk_user(fill_callchain, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + if (perf_guest_state()) { + /* TODO: We don't support guest os callchain now */ + return; + } + walk_stackframe(NULL, regs, fill_callchain, entry); } diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile new file mode 100644 index 000000000000..81d69d45c06c --- /dev/null +++ b/arch/riscv/kernel/pi/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 +# This file was copied from arm64/kernel/pi/Makefile. 
+ +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + $(call cc-option,-mbranch-protection=none) \ + -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ + -include $(srctree)/include/linux/hidden.h \ + -D__DISABLE_EXPORTS -ffreestanding \ + -fno-asynchronous-unwind-tables -fno-unwind-tables \ + $(call cc-option,-fno-addrsig) + +# Disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + +KBUILD_CFLAGS += -mcmodel=medany + +CFLAGS_cmdline_early.o += -D__NO_FORTIFY +CFLAGS_fdt_early.o += -D__NO_FORTIFY +# lib/string.c already defines __NO_FORTIFY +CFLAGS_ctype.o += -D__NO_FORTIFY +CFLAGS_lib-fdt.o += -D__NO_FORTIFY +CFLAGS_lib-fdt_ro.o += -D__NO_FORTIFY +CFLAGS_archrandom_early.o += -D__NO_FORTIFY + +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ + --remove-section=.note.gnu.property \ + --prefix-alloc-sections=.init.pi +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/string.o: $(srctree)/lib/string.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE + $(call if_changed_rule,cc_o_c) + +obj-y := cmdline_early.pi.o fdt_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o archrandom_early.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/riscv/kernel/pi/archrandom_early.c b/arch/riscv/kernel/pi/archrandom_early.c new file mode 100644 index 000000000000..3f05d3cf3b7b --- /dev/null +++ b/arch/riscv/kernel/pi/archrandom_early.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/csr.h> +#include <linux/processor.h> + +#include "pi.h" + +/* + * To avoid rewriting code include asm/archrandom.h and create macros + * for the functions that won't be included. + */ +#undef riscv_has_extension_unlikely +#define riscv_has_extension_likely(...) false +#undef pr_err_once +#define pr_err_once(...) 
+ +#include <asm/archrandom.h> + +u64 get_kaslr_seed_zkr(const uintptr_t dtb_pa) +{ + unsigned long seed = 0; + + if (!fdt_early_match_extension_isa((const void *)dtb_pa, "zkr")) + return 0; + + if (!csr_seed_long(&seed)) + return 0; + + return seed; +} diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c new file mode 100644 index 000000000000..fbcdc9e4e143 --- /dev/null +++ b/arch/riscv/kernel/pi/cmdline_early.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/types.h> +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/string.h> +#include <asm/pgtable.h> +#include <asm/setup.h> + +#include "pi.h" + +static char early_cmdline[COMMAND_LINE_SIZE]; + +static char *get_early_cmdline(uintptr_t dtb_pa) +{ + const char *fdt_cmdline = NULL; + unsigned int fdt_cmdline_size = 0; + int chosen_node; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + chosen_node = fdt_path_offset((void *)dtb_pa, "/chosen"); + if (chosen_node >= 0) { + fdt_cmdline = fdt_getprop((void *)dtb_pa, chosen_node, + "bootargs", NULL); + if (fdt_cmdline) { + fdt_cmdline_size = strlen(fdt_cmdline); + strscpy(early_cmdline, fdt_cmdline, + COMMAND_LINE_SIZE); + } + } + } + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + fdt_cmdline_size == 0 /* CONFIG_CMDLINE_FALLBACK */) { + strlcat(early_cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + } + + return early_cmdline; +} + +static u64 match_noXlvl(char *cmdline) +{ + if (strstr(cmdline, "no4lvl")) + return SATP_MODE_48; + else if (strstr(cmdline, "no5lvl")) + return SATP_MODE_57; + + return 0; +} + +u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa) +{ + char *cmdline = get_early_cmdline(dtb_pa); + + return match_noXlvl(cmdline); +} + +static bool match_nokaslr(char *cmdline) +{ + return strstr(cmdline, "nokaslr"); +} + +bool set_nokaslr_from_cmdline(uintptr_t dtb_pa) +{ + char *cmdline = get_early_cmdline(dtb_pa); + + return match_nokaslr(cmdline); +} diff --git a/arch/riscv/kernel/pi/fdt_early.c b/arch/riscv/kernel/pi/fdt_early.c new file mode 100644 index 000000000000..9bdee2fafe47 --- /dev/null +++ b/arch/riscv/kernel/pi/fdt_early.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/types.h> +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/ctype.h> + +#include "pi.h" + +u64 get_kaslr_seed(uintptr_t dtb_pa) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset((void *)dtb_pa, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w((void *)dtb_pa, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +/** + * fdt_device_is_available - check if a device is available for use + * + * @fdt: pointer to the device tree blob + * @node: offset of the node whose property to find + * + * Returns true if the status property is absent or set to "okay" or "ok", + * false otherwise + */ +static bool fdt_device_is_available(const void *fdt, int node) +{ + const char *status; + int statlen; + + status = fdt_getprop(fdt, node, "status", &statlen); + if (!status) + return true; + + if (statlen > 0) { + if (!strcmp(status, "okay") || !strcmp(status, "ok")) + return true; + } + + return false; +} + +/* Copy of fdt_nodename_eq_ */ +static int fdt_node_name_eq(const void *fdt, int offset, + const char *s) +{ + int olen; + int len = strlen(s); + const char *p = fdt_get_name(fdt, offset, &olen); + + if (!p || olen < len) + /* short match */ + 
return 0; + + if (memcmp(p, s, len) != 0) + return 0; + + if (p[len] == '\0') + return 1; + else if (!memchr(s, '@', len) && (p[len] == '@')) + return 1; + else + return 0; +} + +/** + * isa_string_contains - check if isa string contains an extension + * + * @isa_str: isa string to search + * @ext_name: the extension to search for + * + * Returns true if the extension is in the given isa string, + * false otherwise + */ +static bool isa_string_contains(const char *isa_str, const char *ext_name) +{ + size_t i, single_end, len = strlen(ext_name); + char ext_end; + + /* Error must contain rv32/64 */ + if (strlen(isa_str) < 4) + return false; + + if (len == 1) { + single_end = strcspn(isa_str, "sSxXzZ"); + /* Search for single chars between rv32/64 and multi-letter extensions */ + for (i = 4; i < single_end; i++) { + if (tolower(isa_str[i]) == ext_name[0]) + return true; + } + return false; + } + + /* Skip to start of multi-letter extensions */ + isa_str = strpbrk(isa_str, "sSxXzZ"); + while (isa_str) { + if (strncasecmp(isa_str, ext_name, len) == 0) { + ext_end = isa_str[len]; + /* Check if matches the whole extension. */ + if (ext_end == '\0' || ext_end == '_') + return true; + } + /* Multi-letter extensions must be split from other multi-letter + * extensions with an "_", the end of a multi-letter extension will + * either be the null character or the "_" at the start of the next + * multi-letter extension. + */ + isa_str = strchr(isa_str, '_'); + if (isa_str) + isa_str++; + } + + return false; +} + +/** + * early_cpu_isa_ext_available - check if cpu node has an extension + * + * @fdt: pointer to the device tree blob + * @node: offset of the cpu node + * @ext_name: the extension to search for + * + * Returns true if the cpu node has the extension, + * false otherwise + */ +static bool early_cpu_isa_ext_available(const void *fdt, int node, const char *ext_name) +{ + const void *prop; + int len; + + prop = fdt_getprop(fdt, node, "riscv,isa-extensions", &len); + if (prop && fdt_stringlist_contains(prop, len, ext_name)) + return true; + + prop = fdt_getprop(fdt, node, "riscv,isa", &len); + if (prop && isa_string_contains(prop, ext_name)) + return true; + + return false; +} + +/** + * fdt_early_match_extension_isa - check if all cpu nodes have an extension + * + * @fdt: pointer to the device tree blob + * @ext_name: the extension to search for + * + * Returns true if the all available the cpu nodes have the extension, + * false otherwise + */ +bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name) +{ + int node, parent; + bool ret = false; + + parent = fdt_path_offset(fdt, "/cpus"); + if (parent < 0) + return false; + + fdt_for_each_subnode(node, fdt, parent) { + if (!fdt_node_name_eq(fdt, node, "cpu")) + continue; + + if (!fdt_device_is_available(fdt, node)) + continue; + + if (!early_cpu_isa_ext_available(fdt, node, ext_name)) + return false; + + ret = true; + } + + return ret; +} diff --git a/arch/riscv/kernel/pi/pi.h b/arch/riscv/kernel/pi/pi.h new file mode 100644 index 000000000000..21141d84fea6 --- /dev/null +++ b/arch/riscv/kernel/pi/pi.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RISCV_PI_H_ +#define _RISCV_PI_H_ + +#include <linux/types.h> + +/* + * The following functions are exported (but prefixed). Declare them here so + * that LLVM does not complain it lacks the 'static' keyword (which, if + * added, makes LLVM complain because the function is unused). 
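isa_string_contains() above has to cope with both single-letter extensions packed right after the "rv32"/"rv64" prefix and multi-letter extensions separated by underscores. A standalone sketch of the same matching rules (simplified: it assumes a lower-case, well-formed ISA string):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool isa_has_ext(const char *isa, const char *ext)
{
        size_t len = strlen(ext);
        const char *multi = strpbrk(isa, "sxz");        /* first multi-letter group */
        size_t single_end = multi ? (size_t)(multi - isa) : strlen(isa);

        if (len == 1) {
                /* single letters live between "rv32"/"rv64" and the first multi-letter ext */
                for (size_t i = 4; i < single_end; i++)
                        if (isa[i] == ext[0])
                                return true;
                return false;
        }

        /* multi-letter extensions are '_'-separated */
        while (multi) {
                if (!strncmp(multi, ext, len) &&
                    (multi[len] == '\0' || multi[len] == '_'))
                        return true;
                multi = strchr(multi, '_');
                if (multi)
                        multi++;
        }
        return false;
}

int main(void)
{
        const char *isa = "rv64imafdc_zicsr_zkr";

        printf("c:   %d\n", isa_has_ext(isa, "c"));     /* 1 */
        printf("zkr: %d\n", isa_has_ext(isa, "zkr"));   /* 1 */
        printf("zba: %d\n", isa_has_ext(isa, "zba"));   /* 0 */
        return 0;
}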
+ */ + +u64 get_kaslr_seed(uintptr_t dtb_pa); +u64 get_kaslr_seed_zkr(const uintptr_t dtb_pa); +bool set_nokaslr_from_cmdline(uintptr_t dtb_pa); +u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa); + +bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name); + +#endif /* _RISCV_PI_H_ */ diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile index 7f0840dcc31b..d2129f2c61b8 100644 --- a/arch/riscv/kernel/probes/Makefile +++ b/arch/riscv/kernel/probes/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o -obj-$(CONFIG_KPROBES) += kprobes_trampoline.o -obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o +obj-$(CONFIG_RETHOOK) += rethook.o rethook_trampoline.o obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o simulate-insn.o CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook_trampoline.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 64f6183b4717..65d9590bfb9f 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -29,13 +29,14 @@ riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) * TODO: the REJECTED ones below need to be implemented */ #ifdef CONFIG_RISCV_ISA_C - RISCV_INSN_REJECTED(c_j, insn); - RISCV_INSN_REJECTED(c_jr, insn); RISCV_INSN_REJECTED(c_jal, insn); - RISCV_INSN_REJECTED(c_jalr, insn); - RISCV_INSN_REJECTED(c_beqz, insn); - RISCV_INSN_REJECTED(c_bnez, insn); RISCV_INSN_REJECTED(c_ebreak, insn); + + RISCV_INSN_SET_SIMULATE(c_j, insn); + RISCV_INSN_SET_SIMULATE(c_jr, insn); + RISCV_INSN_SET_SIMULATE(c_jalr, insn); + RISCV_INSN_SET_SIMULATE(c_beqz, insn); + RISCV_INSN_SET_SIMULATE(c_bnez, insn); #endif RISCV_INSN_SET_SIMULATE(jal, insn); diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c deleted file mode 100644 index 7142ec42e889..000000000000 --- a/arch/riscv/kernel/probes/ftrace.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/kprobes.h> - -/* Ftrace callback handler for kprobes -- called under preepmt disabled */ -void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) -{ - struct kprobe *p; - struct pt_regs *regs; - struct kprobe_ctlblk *kcb; - int bit; - - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) - return; - - p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) - goto out; - - regs = ftrace_get_regs(fregs); - kcb = get_kprobe_ctlblk(); - if (kprobe_running()) { - kprobes_inc_nmissed_count(p); - } else { - unsigned long orig_ip = instruction_pointer(regs); - - instruction_pointer_set(regs, ip); - - __this_cpu_write(current_kprobe, p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) { - /* - * Emulate singlestep (and also recover regs->pc) - * as if there is a nop - */ - instruction_pointer_set(regs, - (unsigned long)p->addr + MCOUNT_INSN_SIZE); - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - instruction_pointer_set(regs, orig_ip); - } - - /* - * If pre_handler returns !0, it changes regs->pc. We have to - * skip emulating post_handler. 
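The kprobes.c hunk further down copies a probed instruction as one or two little-endian 16-bit parcels, using GET_INSN_LENGTH() to tell compressed from full-size encodings. A standalone sketch of that rule (assuming the standard encoding in which only 32-bit instructions have both low bits set; the kernel additionally converts each parcel with le16_to_cpu()):

#include <stdint.h>
#include <stdio.h>

/* 32-bit instructions have both low bits set; anything else is 16-bit RVC. */
static unsigned int insn_length(uint16_t first_parcel)
{
        return (first_parcel & 0x3) == 0x3 ? 4 : 2;
}

static uint32_t read_insn(const uint16_t *parcel)
{
        uint32_t insn = parcel[0];

        if (insn_length(parcel[0]) == 4)
                insn |= (uint32_t)parcel[1] << 16;      /* second parcel = upper half */
        return insn;
}

int main(void)
{
        uint16_t code[] = { 0x0513, 0x0000 };   /* addi a0, zero, 0 -> 0x00000513 */
        uint16_t c_nop = 0x0001;                /* c.nop, a 16-bit instruction */

        printf("len=%u insn=0x%08x\n", insn_length(code[0]), read_insn(code));
        printf("len=%u insn=0x%04x\n", insn_length(c_nop), (unsigned int)c_nop);
        return 0;
}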
- */ - __this_cpu_write(current_kprobe, NULL); - } -out: - ftrace_test_recursion_unlock(bit); -} -NOKPROBE_SYMBOL(kprobe_ftrace_handler); - -int arch_prepare_kprobe_ftrace(struct kprobe *p) -{ - p->ainsn.api.insn = NULL; - return 0; -} diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index e6e950b7cf32..c0738d6c6498 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -6,12 +6,13 @@ #include <linux/extable.h> #include <linux/slab.h> #include <linux/stop_machine.h> +#include <linux/vmalloc.h> #include <asm/ptrace.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/bug.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include "decode-insn.h" @@ -23,13 +24,13 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - unsigned long offset = GET_INSN_LENGTH(p->opcode); + size_t len = GET_INSN_LENGTH(p->opcode); + u32 insn = __BUG_INSN_32; - p->ainsn.api.restore = (unsigned long)p->addr + offset; + p->ainsn.api.restore = (unsigned long)p->addr + len; - patch_text(p->ainsn.api.insn, p->opcode); - patch_text((void *)((unsigned long)(p->ainsn.api.insn) + offset), - __BUG_INSN_32); + patch_text_nosync(p->ainsn.api.insn, &p->opcode, len); + patch_text_nosync((void *)p->ainsn.api.insn + len, &insn, GET_INSN_LENGTH(insn)); } static void __kprobes arch_prepare_simulate(struct kprobe *p) @@ -48,15 +49,35 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) post_kprobe_handler(p, kcb, regs); } +static bool __kprobes arch_check_kprobe(struct kprobe *p) +{ + unsigned long tmp = (unsigned long)p->addr - p->offset; + unsigned long addr = (unsigned long)p->addr; + + while (tmp <= addr) { + if (tmp == addr) + return true; + + tmp += GET_INSN_LENGTH(*(u16 *)tmp); + } + + return false; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { - unsigned long probe_addr = (unsigned long)p->addr; + u16 *insn = (u16 *)p->addr; - if (probe_addr & 0x1) + if ((unsigned long)insn & 0x1) + return -EILSEQ; + + if (!arch_check_kprobe(p)) return -EILSEQ; /* copy instruction */ - p->opcode = *p->addr; + p->opcode = (kprobe_opcode_t)(*insn++); + if (GET_INSN_LENGTH(p->opcode) == 4) + p->opcode |= (kprobe_opcode_t)(*insn) << 16; /* decode instruction */ switch (riscv_probe_decode_insn(p->addr, &p->ainsn.api)) { @@ -83,29 +104,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -#ifdef CONFIG_MMU -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_READ_EXEC, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#endif - /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { - if ((p->opcode & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) - patch_text(p->addr, __BUG_INSN_32); - else - patch_text(p->addr, __BUG_INSN_16); + size_t len = GET_INSN_LENGTH(p->opcode); + u32 insn = len == 4 ? 
__BUG_INSN_32 : __BUG_INSN_16; + + patch_text(p->addr, &insn, len); } /* remove breakpoint from text */ void __kprobes arch_disarm_kprobe(struct kprobe *p) { - patch_text(p->addr, p->opcode); + size_t len = GET_INSN_LENGTH(p->opcode); + + patch_text(p->addr, &p->opcode, len); } void __kprobes arch_remove_kprobe(struct kprobe *p) @@ -345,19 +358,6 @@ int __init arch_populate_kprobe_blacklist(void) return ret; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) -{ - return (void *)kretprobe_trampoline_handler(regs, NULL); -} - -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->ra; - ri->fp = NULL; - regs->ra = (unsigned long) &__kretprobe_trampoline; -} - int __kprobes arch_trampoline_kprobe(struct kprobe *p) { return 0; diff --git a/arch/riscv/kernel/probes/rethook.c b/arch/riscv/kernel/probes/rethook.c new file mode 100644 index 000000000000..5c27c1f50989 --- /dev/null +++ b/arch/riscv/kernel/probes/rethook.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generic return hook for riscv. + */ + +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include "rethook.h" + +/* This is called from arch_rethook_trampoline() */ +unsigned long __used arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->s0); +} + +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +void arch_rethook_prepare(struct rethook_node *rhn, struct pt_regs *regs, bool mcount) +{ + rhn->ret_addr = regs->ra; + rhn->frame = regs->s0; + + /* replace return addr with trampoline */ + regs->ra = (unsigned long)arch_rethook_trampoline; +} + +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/riscv/kernel/probes/rethook.h b/arch/riscv/kernel/probes/rethook.h new file mode 100644 index 000000000000..4758f7e3ce88 --- /dev/null +++ b/arch/riscv/kernel/probes/rethook.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __RISCV_RETHOOK_H +#define __RISCV_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); +void arch_rethook_prepare(struct rethook_node *rhn, struct pt_regs *regs, bool mcount); + +#endif diff --git a/arch/riscv/kernel/probes/kprobes_trampoline.S b/arch/riscv/kernel/probes/rethook_trampoline.S index 7bdb09ded39b..f2cd83d9b0f0 100644 --- a/arch/riscv/kernel/probes/kprobes_trampoline.S +++ b/arch/riscv/kernel/probes/rethook_trampoline.S @@ -75,13 +75,13 @@ REG_L x31, PT_T6(sp) .endm -ENTRY(__kretprobe_trampoline) +SYM_CODE_START(arch_rethook_trampoline) addi sp, sp, -(PT_SIZE_ON_STACK) save_all_base_regs move a0, sp /* pt_regs */ - call trampoline_probe_handler + call arch_rethook_trampoline_callback /* use the result as the return-address */ move ra, a0 @@ -90,4 +90,4 @@ ENTRY(__kretprobe_trampoline) addi sp, sp, PT_SIZE_ON_STACK ret -ENDPROC(__kretprobe_trampoline) +SYM_CODE_END(arch_rethook_trampoline) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index d73e96f6ed7c..6c166029079c 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -24,7 +24,7 @@ static inline bool rv_insn_reg_set_val(struct pt_regs *regs, u32 index, unsigned long val) { if (index == 0) - return false; + return true; else if (index <= 31) *((unsigned long *)regs + index) = val; else @@ -71,11 +71,11 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg u32 rd_index = (opcode >> 7) & 0x1f; 
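simulate_c_j() further down reassembles the scrambled C.J immediate (instruction bits 12..2 hold offset[11|4|9:8|10|6|7|3:1|5]). A quick standalone check of that bit shuffle against a known encoding, 0xa011, which encodes c.j with offset +4:

#include <stdint.h>
#include <stdio.h>

/* Reassemble the C.J/C.JAL jump offset from a 16-bit opcode, mirroring the
 * bit shuffle in simulate_c_j(); the immediate is a signed 12-bit value. */
static int32_t c_j_offset(uint16_t opcode)
{
        uint32_t offset = 0;

        offset |= ((opcode >> 3) & 0x7) << 1;   /* insn[5:3]  -> offset[3:1] */
        offset |= ((opcode >> 11) & 0x1) << 4;  /* insn[11]   -> offset[4] */
        offset |= ((opcode >> 2) & 0x1) << 5;   /* insn[2]    -> offset[5] */
        offset |= ((opcode >> 7) & 0x1) << 6;   /* insn[7]    -> offset[6] */
        offset |= ((opcode >> 6) & 0x1) << 7;   /* insn[6]    -> offset[7] */
        offset |= ((opcode >> 9) & 0x3) << 8;   /* insn[10:9] -> offset[9:8] */
        offset |= ((opcode >> 8) & 0x1) << 10;  /* insn[8]    -> offset[10] */
        offset |= ((opcode >> 12) & 0x1) << 11; /* insn[12]   -> offset[11] */

        return (int32_t)(offset << 20) >> 20;   /* sign-extend from bit 11 */
}

int main(void)
{
        printf("%d\n", c_j_offset(0xa011));     /* c.j +4 -> 4 */
        printf("%d\n", c_j_offset(0xa001));     /* c.j +0 -> 0 */
        return 0;
}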
u32 rs1_index = (opcode >> 15) & 0x1f; - ret = rv_insn_reg_set_val(regs, rd_index, addr + 4); + ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); if (!ret) return ret; - ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); + ret = rv_insn_reg_set_val(regs, rd_index, addr + 4); if (!ret) return ret; @@ -136,13 +136,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re #define branch_offset(opcode) \ sign_extend32((branch_imm(opcode)), 12) -#define BRANCH_BEQ 0x0 -#define BRANCH_BNE 0x1 -#define BRANCH_BLT 0x4 -#define BRANCH_BGE 0x5 -#define BRANCH_BLTU 0x6 -#define BRANCH_BGEU 0x7 - bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -169,22 +162,22 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r offset_tmp = branch_offset(opcode); switch (branch_funct3(opcode)) { - case BRANCH_BEQ: + case RVG_FUNCT3_BEQ: offset = (rs1_val == rs2_val) ? offset_tmp : 4; break; - case BRANCH_BNE: + case RVG_FUNCT3_BNE: offset = (rs1_val != rs2_val) ? offset_tmp : 4; break; - case BRANCH_BLT: + case RVG_FUNCT3_BLT: offset = ((long)rs1_val < (long)rs2_val) ? offset_tmp : 4; break; - case BRANCH_BGE: + case RVG_FUNCT3_BGE: offset = ((long)rs1_val >= (long)rs2_val) ? offset_tmp : 4; break; - case BRANCH_BLTU: + case RVG_FUNCT3_BLTU: offset = (rs1_val < rs2_val) ? offset_tmp : 4; break; - case BRANCH_BGEU: + case RVG_FUNCT3_BGEU: offset = (rs1_val >= rs2_val) ? offset_tmp : 4; break; default: @@ -195,3 +188,108 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r return true; } + +bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * 15 13 12 2 1 0 + * | funct3 | offset[11|4|9:8|10|6|7|3:1|5] | opcode | + * 3 11 2 + */ + + s32 offset; + + offset = ((opcode >> 3) & 0x7) << 1; + offset |= ((opcode >> 11) & 0x1) << 4; + offset |= ((opcode >> 2) & 0x1) << 5; + offset |= ((opcode >> 7) & 0x1) << 6; + offset |= ((opcode >> 6) & 0x1) << 7; + offset |= ((opcode >> 9) & 0x3) << 8; + offset |= ((opcode >> 8) & 0x1) << 10; + offset |= ((opcode >> 12) & 0x1) << 11; + + instruction_pointer_set(regs, addr + sign_extend32(offset, 11)); + + return true; +} + +static bool __kprobes simulate_c_jr_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs, + bool is_jalr) +{ + /* + * 15 12 11 7 6 2 1 0 + * | funct4 | rs1 | rs2 | op | + * 4 5 5 2 + */ + + unsigned long jump_addr; + + u32 rs1 = (opcode >> 7) & 0x1f; + + if (rs1 == 0) /* C.JR is only valid when rs1 != x0 */ + return false; + + if (!rv_insn_reg_get_val(regs, rs1, &jump_addr)) + return false; + + if (is_jalr && !rv_insn_reg_set_val(regs, 1, addr + 2)) + return false; + + instruction_pointer_set(regs, jump_addr); + + return true; +} + +bool __kprobes simulate_c_jr(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_jr_jalr(opcode, addr, regs, false); +} + +bool __kprobes simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_jr_jalr(opcode, addr, regs, true); +} + +static bool __kprobes simulate_c_bnez_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs, + bool is_bnez) +{ + /* + * 15 13 12 10 9 7 6 2 1 0 + * | funct3 | offset[8|4:3] | rs1' | offset[7:6|2:1|5] | op | + * 3 3 3 5 2 + */ + + s32 offset; + u32 rs1; + unsigned long rs1_val; + + rs1 = 0x8 | ((opcode >> 7) & 0x7); + + if (!rv_insn_reg_get_val(regs, rs1, &rs1_val)) + return false; + + if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) { + offset = 
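		/*
		 * offset[2:1] come from opcode bits 4:3, offset[4:3] from bits
		 * 11:10, offset[5] from bit 2, offset[7:6] from bits 6:5 and
		 * offset[8] from bit 12; the 9-bit result is sign-extended
		 * below. For example, opcode 0xc901 decodes as "c.beqz a0, +16".
		 */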
((opcode >> 3) & 0x3) << 1; + offset |= ((opcode >> 10) & 0x3) << 3; + offset |= ((opcode >> 2) & 0x1) << 5; + offset |= ((opcode >> 5) & 0x3) << 6; + offset |= ((opcode >> 12) & 0x1) << 8; + offset = sign_extend32(offset, 8); + } else { + offset = 2; + } + + instruction_pointer_set(regs, addr + offset); + + return true; +} + +bool __kprobes simulate_c_bnez(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_bnez_beqz(opcode, addr, regs, true); +} + +bool __kprobes simulate_c_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_bnez_beqz(opcode, addr, regs, false); +} diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index cb6ff7dccb92..44ebbc444db9 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -3,14 +3,7 @@ #ifndef _RISCV_KERNEL_PROBES_SIMULATE_INSN_H #define _RISCV_KERNEL_PROBES_SIMULATE_INSN_H -#define __RISCV_INSN_FUNCS(name, mask, val) \ -static __always_inline bool riscv_insn_is_##name(probe_opcode_t code) \ -{ \ - BUILD_BUG_ON(~(mask) & (val)); \ - return (code & (mask)) == (val); \ -} \ -bool simulate_##name(u32 opcode, unsigned long addr, \ - struct pt_regs *regs) +#include <asm/insn.h> #define RISCV_INSN_REJECTED(name, code) \ do { \ @@ -19,9 +12,6 @@ bool simulate_##name(u32 opcode, unsigned long addr, \ } \ } while (0) -__RISCV_INSN_FUNCS(system, 0x7f, 0x73); -__RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); - #define RISCV_INSN_SET_SIMULATE(name, code) \ do { \ if (riscv_insn_is_##name(code)) { \ @@ -30,18 +20,14 @@ __RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); } \ } while (0) -__RISCV_INSN_FUNCS(c_j, 0xe003, 0xa001); -__RISCV_INSN_FUNCS(c_jr, 0xf007, 0x8002); -__RISCV_INSN_FUNCS(c_jal, 0xe003, 0x2001); -__RISCV_INSN_FUNCS(c_jalr, 0xf007, 0x9002); -__RISCV_INSN_FUNCS(c_beqz, 0xe003, 0xc001); -__RISCV_INSN_FUNCS(c_bnez, 0xe003, 0xe001); -__RISCV_INSN_FUNCS(c_ebreak, 0xffff, 0x9002); - -__RISCV_INSN_FUNCS(auipc, 0x7f, 0x17); -__RISCV_INSN_FUNCS(branch, 0x7f, 0x63); - -__RISCV_INSN_FUNCS(jal, 0x7f, 0x6f); -__RISCV_INSN_FUNCS(jalr, 0x707f, 0x67); +bool simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_jr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_bnez(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs); #endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c index 7a057b5f0adc..cc15f7ca6cc1 100644 --- a/arch/riscv/kernel/probes/uprobes.c +++ b/arch/riscv/kernel/probes/uprobes.c @@ -3,6 +3,7 @@ #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/uprobes.h> +#include <asm/insn.h> #include "decode-insn.h" @@ -17,6 +18,11 @@ bool is_swbp_insn(uprobe_opcode_t *insn) #endif } +bool is_trap_insn(uprobe_opcode_t *insn) +{ + return riscv_insn_is_ebreak(*insn) || riscv_insn_is_c_ebreak(*insn); +} + unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs); @@ -59,8 +65,6 @@ int arch_uprobe_pre_xol(struct 
arch_uprobe *auprobe, struct pt_regs *regs) instruction_pointer_set(regs, utask->xol_vaddr); - regs->status &= ~SR_SPIE; - return 0; } @@ -69,11 +73,10 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.bad_cause != UPROBE_TRAP_NR); + current->thread.bad_cause = utask->autask.saved_cause; instruction_pointer_set(regs, utask->vaddr + auprobe->insn_size); - regs->status |= SR_SPIE; - return 0; } @@ -106,13 +109,12 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; + current->thread.bad_cause = utask->autask.saved_cause; /* * Task has received a fatal signal, so reset back to probbed * address. */ instruction_pointer_set(regs, utask->vaddr); - - regs->status &= ~SR_SPIE; } bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, @@ -165,6 +167,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, /* Initialize the slot */ void *kaddr = kmap_atomic(page); void *dst = kaddr + (vaddr & ~PAGE_MASK); + unsigned long start = (unsigned long)dst; memcpy(dst, src, len); @@ -174,13 +177,6 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, *(uprobe_opcode_t *)dst = __BUG_INSN_32; } + flush_icache_range(start, start + len); kunmap_atomic(kaddr); - - /* - * We probably need flush_icache_user_page() but it needs vma. - * This should work on most of architectures by default. If - * architecture needs to do something different it can define - * its own version of the function. - */ - flush_dcache_page(page); } diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index ceb9ebab6558..a0a40889d79a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -7,6 +7,7 @@ * Copyright (C) 2017 SiFive */ +#include <linux/bitfield.h> #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -15,7 +16,10 @@ #include <linux/tick.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/personality.h> +#include <linux/entry-common.h> +#include <asm/asm-prototypes.h> #include <asm/unistd.h> #include <asm/processor.h> #include <asm/csr.h> @@ -24,8 +28,9 @@ #include <asm/switch_to.h> #include <asm/thread_info.h> #include <asm/cpuidle.h> - -register unsigned long gp_in_global __asm__("gp"); +#include <asm/vector.h> +#include <asm/cpufeature.h> +#include <asm/exec.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> @@ -33,13 +38,29 @@ unsigned long __stack_chk_guard __read_mostly; EXPORT_SYMBOL(__stack_chk_guard); #endif -extern asmlinkage void ret_from_fork(void); -extern asmlinkage void ret_from_kernel_thread(void); +extern asmlinkage void ret_from_fork_kernel_asm(void); +extern asmlinkage void ret_from_fork_user_asm(void); -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { cpu_do_idle(); - raw_local_irq_enable(); +} + +int set_unalign_ctl(struct task_struct *tsk, unsigned int val) +{ + if (!unaligned_ctl_available()) + return -EINVAL; + + tsk->thread.align_ctl = val; + return 0; +} + +int get_unalign_ctl(struct task_struct *tsk, unsigned long adr) +{ + if (!unaligned_ctl_available()) + return -EINVAL; + + return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr); } void __show_regs(struct pt_regs *regs) @@ -84,6 +105,13 @@ void show_regs(struct pt_regs *regs) dump_backtrace(regs, NULL, KERN_DEFAULT); } +unsigned long 
arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_u32_below(PAGE_SIZE); + return sp & ~0xf; +} + #ifdef CONFIG_COMPAT static bool compat_mode_supported __read_mostly; @@ -105,7 +133,7 @@ static int __init compat_mode_detect(void) csr_write(CSR_STATUS, tmp); pr_info("riscv: ELF compat mode %s", - compat_mode_supported ? "supported" : "failed"); + compat_mode_supported ? "supported" : "unsupported"); return 0; } @@ -148,15 +176,51 @@ void flush_thread(void) fstate_off(current, task_pt_regs(current)); memset(¤t->thread.fstate, 0, sizeof(current->thread.fstate)); #endif +#ifdef CONFIG_RISCV_ISA_V + /* Reset vector state */ + riscv_v_vstate_ctrl_init(current); + riscv_v_vstate_off(task_pt_regs(current)); + kfree(current->thread.vstate.datap); + memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(current, TIF_RISCV_V_DEFER_RESTORE); +#endif +#ifdef CONFIG_RISCV_ISA_SUPM + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + envcfg_update_bits(current, ENVCFG_PMM, ENVCFG_PMM_PMLEN_0); +#endif +} + +void arch_release_task_struct(struct task_struct *tsk) +{ + /* Free the vector context of datap. */ + if (has_vector() || has_xtheadvector()) + riscv_v_thread_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { fstate_save(src, task_pt_regs(src)); *dst = *src; + /* clear entire V context, including datap for a new task */ + memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + memset(&dst->thread.kernel_vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(dst, TIF_RISCV_V_DEFER_RESTORE); + return 0; } +asmlinkage void ret_from_fork_kernel(void *fn_arg, int (*fn)(void *), struct pt_regs *regs) +{ + fn(fn_arg); + + syscall_exit_to_user_mode(regs); +} + +asmlinkage void ret_from_fork_user(struct pt_regs *regs) +{ + syscall_exit_to_user_mode(regs); +} + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; @@ -164,26 +228,192 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + /* Ensure all threads in this mm have the same pointer masking mode. 
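	 * A second thread that shares the mm (CLONE_VM) locks the PMLEN for the
	 * whole address space here; a later prctl(PR_SET_TAGGED_ADDR_CTRL) that
	 * would change it is then refused with -EBUSY in set_tagged_addr_ctrl().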
*/ + if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM) && p->mm && (clone_flags & CLONE_VM)) + set_bit(MM_CONTEXT_LOCK_PMLEN, &p->mm->context.flags); + + memset(&p->thread.s, 0, sizeof(p->thread.s)); + /* p->thread holds context to be restored by __switch_to() */ if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gp = gp_in_global; /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; - p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = (unsigned long)args->fn; p->thread.s[1] = (unsigned long)args->fn_arg; + p->thread.ra = (unsigned long)ret_from_fork_kernel_asm; } else { *childregs = *(current_pt_regs()); + /* Turn off status.VS */ + riscv_v_vstate_off(childregs); if (usp) /* User fork */ childregs->sp = usp; if (clone_flags & CLONE_SETTLS) childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ - p->thread.ra = (unsigned long)ret_from_fork; + p->thread.ra = (unsigned long)ret_from_fork_user_asm; } + p->thread.riscv_v_flags = 0; + if (has_vector() || has_xtheadvector()) + riscv_v_thread_alloc(p); p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; } + +void __init arch_task_cache_init(void) +{ + riscv_v_setup_ctx_cache(); +} + +#ifdef CONFIG_RISCV_ISA_SUPM +enum { + PMLEN_0 = 0, + PMLEN_7 = 7, + PMLEN_16 = 16, +}; + +static bool have_user_pmlen_7; +static bool have_user_pmlen_16; + +/* + * Control the relaxed ABI allowing tagged user addresses into the kernel. + */ +static unsigned int tagged_addr_disabled; + +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) +{ + unsigned long valid_mask = PR_PMLEN_MASK | PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); + struct mm_struct *mm = task->mm; + unsigned long pmm; + u8 pmlen; + + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + + if (is_compat_thread(ti)) + return -EINVAL; + + if (arg & ~valid_mask) + return -EINVAL; + + /* + * Prefer the smallest PMLEN that satisfies the user's request, + * in case choosing a larger PMLEN has a performance impact. + */ + pmlen = FIELD_GET(PR_PMLEN_MASK, arg); + if (pmlen == PMLEN_0) { + pmm = ENVCFG_PMM_PMLEN_0; + } else if (pmlen <= PMLEN_7 && have_user_pmlen_7) { + pmlen = PMLEN_7; + pmm = ENVCFG_PMM_PMLEN_7; + } else if (pmlen <= PMLEN_16 && have_user_pmlen_16) { + pmlen = PMLEN_16; + pmm = ENVCFG_PMM_PMLEN_16; + } else { + return -EINVAL; + } + + /* + * Do not allow the enabling of the tagged address ABI if globally + * disabled via sysctl abi.tagged_addr_disabled, if pointer masking + * is disabled for userspace. + */ + if (arg & PR_TAGGED_ADDR_ENABLE && (tagged_addr_disabled || !pmlen)) + return -EINVAL; + + if (!(arg & PR_TAGGED_ADDR_ENABLE)) + pmlen = PMLEN_0; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (test_bit(MM_CONTEXT_LOCK_PMLEN, &mm->context.flags) && mm->context.pmlen != pmlen) { + mmap_write_unlock(mm); + return -EBUSY; + } + + envcfg_update_bits(task, ENVCFG_PMM, pmm); + mm->context.pmlen = pmlen; + + mmap_write_unlock(mm); + + return 0; +} + +long get_tagged_addr_ctrl(struct task_struct *task) +{ + struct thread_info *ti = task_thread_info(task); + long ret = 0; + + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + + if (is_compat_thread(ti)) + return -EINVAL; + + /* + * The mm context's pmlen is set only when the tagged address ABI is + * enabled, so the effective PMLEN must be extracted from envcfg.PMM. 
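	 * Userspace reads the current setting with
	 * prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0) and enables the ABI with
	 * PR_SET_TAGGED_ADDR_CTRL, passing PR_TAGGED_ADDR_ENABLE together with
	 * the requested minimum number of tag bits encoded in the
	 * PR_PMLEN_MASK field of the argument.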
+ */ + switch (task->thread.envcfg & ENVCFG_PMM) { + case ENVCFG_PMM_PMLEN_7: + ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_7); + break; + case ENVCFG_PMM_PMLEN_16: + ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_16); + break; + } + + if (task->mm->context.pmlen) + ret |= PR_TAGGED_ADDR_ENABLE; + + return ret; +} + +static bool try_to_set_pmm(unsigned long value) +{ + csr_set(CSR_ENVCFG, value); + return (csr_read_clear(CSR_ENVCFG, ENVCFG_PMM) & ENVCFG_PMM) == value; +} + +/* + * Global sysctl to disable the tagged user addresses support. This control + * only prevents the tagged address ABI enabling via prctl() and does not + * disable it for tasks that already opted in to the relaxed ABI. + */ + +static const struct ctl_table tagged_addr_sysctl_table[] = { + { + .procname = "tagged_addr_disabled", + .mode = 0644, + .data = &tagged_addr_disabled, + .maxlen = sizeof(int), + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init tagged_addr_init(void) +{ + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return 0; + + /* + * envcfg.PMM is a WARL field. Detect which values are supported. + * Assume the supported PMLEN values are the same on all harts. + */ + csr_clear(CSR_ENVCFG, ENVCFG_PMM); + have_user_pmlen_7 = try_to_set_pmm(ENVCFG_PMM_PMLEN_7); + have_user_pmlen_16 = try_to_set_pmm(ENVCFG_PMM_PMLEN_16); + + if (!register_sysctl("abi", tagged_addr_sysctl_table)) + return -EINVAL; + + return 0; +} +core_initcall(tagged_addr_init); +#endif /* CONFIG_RISCV_ISA_SUPM */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2ae8280ae475..ea67e9fb7a58 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -7,6 +7,7 @@ * Copied from arch/tile/kernel/ptrace.c */ +#include <asm/vector.h> #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/thread_info.h> @@ -19,14 +20,17 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - enum riscv_regset { REGSET_X, #ifdef CONFIG_FPU REGSET_F, #endif +#ifdef CONFIG_RISCV_ISA_V + REGSET_V, +#endif +#ifdef CONFIG_RISCV_ISA_SUPM + REGSET_TAGGED_ADDR_CTRL, +#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -83,6 +87,103 @@ static int riscv_fpr_set(struct task_struct *target, } #endif +#ifdef CONFIG_RISCV_ISA_V +static int riscv_vr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + struct __riscv_v_regset_state ptrace_vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* + * Ensure the vector registers have been saved to the memory before + * copying them to membuf. + */ + if (target == current) { + get_cpu_vector_context(); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + put_cpu_vector_context(); + } + + ptrace_vstate.vstart = vstate->vstart; + ptrace_vstate.vl = vstate->vl; + ptrace_vstate.vtype = vstate->vtype; + ptrace_vstate.vcsr = vstate->vcsr; + ptrace_vstate.vlenb = vstate->vlenb; + + /* Copy vector header from vstate. */ + membuf_write(&to, &ptrace_vstate, sizeof(struct __riscv_v_regset_state)); + + /* Copy all the vector registers from vstate. 
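	 * datap holds the raw register file itself (riscv_v_vsize bytes, i.e.
	 * 32 registers of vlenb bytes each), so the regset a debugger reads is
	 * the fixed-size header immediately followed by the raw V contents.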
*/ + return membuf_write(&to, vstate->datap, riscv_v_vsize); +} + +static int riscv_vr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + struct __riscv_v_regset_state ptrace_vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* Copy rest of the vstate except datap */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ptrace_vstate, 0, + sizeof(struct __riscv_v_regset_state)); + if (unlikely(ret)) + return ret; + + if (vstate->vlenb != ptrace_vstate.vlenb) + return -EINVAL; + + vstate->vstart = ptrace_vstate.vstart; + vstate->vl = ptrace_vstate.vl; + vstate->vtype = ptrace_vstate.vtype; + vstate->vcsr = ptrace_vstate.vcsr; + + /* Copy all the vector registers. */ + pos = 0; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate->datap, + 0, riscv_v_vsize); + return ret; +} +#endif + +#ifdef CONFIG_RISCV_ISA_SUPM +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + static const struct user_regset riscv_user_regset[] = { [REGSET_X] = { .core_note_type = NT_PRSTATUS, @@ -102,6 +203,27 @@ static const struct user_regset riscv_user_regset[] = { .set = riscv_fpr_set, }, #endif +#ifdef CONFIG_RISCV_ISA_V + [REGSET_V] = { + .core_note_type = NT_RISCV_VECTOR, + .align = 16, + .n = ((32 * RISCV_MAX_VLENB) + + sizeof(struct __riscv_v_regset_state)) / sizeof(__u32), + .size = sizeof(__u32), + .regset_get = riscv_vr_get, + .set = riscv_vr_set, + }, +#endif +#ifdef CONFIG_RISCV_ISA_SUPM + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_RISCV_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view riscv_user_native_view = { @@ -212,7 +334,6 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) void ptrace_disable(struct task_struct *child) { - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } long arch_ptrace(struct task_struct *child, long request, @@ -229,46 +350,6 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -/* - * Allows PTRACE_SYSCALL to work. These are called from entry.S in - * {handle,ret_from}_syscall. - */ -__visible int do_syscall_trace_enter(struct pt_regs *regs) -{ - if (test_thread_flag(TIF_SYSCALL_TRACE)) - if (ptrace_report_syscall_entry(regs)) - return -1; - - /* - * Do the secure computing after ptrace; failures should be fast. - * If this fails we might have return value in a0 from seccomp - * (via SECCOMP_RET_ERRNO/TRACE). 
- */ - if (secure_computing() == -1) - return -1; - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_enter(regs, syscall_get_nr(current, regs)); -#endif - - audit_syscall_entry(regs->a7, regs->a0, regs->a1, regs->a2, regs->a3); - return 0; -} - -__visible void do_syscall_trace_exit(struct pt_regs *regs) -{ - audit_syscall_exit(regs); - - if (test_thread_flag(TIF_SYSCALL_TRACE)) - ptrace_report_syscall_exit(regs, 0); - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_exit(regs, regs_return_value(regs)); -#endif -} - #ifdef CONFIG_COMPAT static int compat_riscv_gpr_get(struct task_struct *target, const struct user_regset *regset, @@ -338,14 +419,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, return ret; } +#else +static const struct user_regset_view compat_riscv_user_native_view = {}; #endif /* CONFIG_COMPAT */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (is_compat_thread(&task->thread_info)) return &compat_riscv_user_native_view; else -#endif return &riscv_user_native_view; } diff --git a/arch/riscv/kernel/return_address.c b/arch/riscv/kernel/return_address.c new file mode 100644 index 000000000000..c8115ec8fb30 --- /dev/null +++ b/arch/riscv/kernel/return_address.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code come from arch/arm64/kernel/return_address.c + * + * Copyright (C) 2023 SiFive. + */ + +#include <linux/export.h> +#include <linux/kprobes.h> +#include <linux/stacktrace.h> + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static bool save_return_addr(void *d, unsigned long pc) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)pc; + return false; + } + + --data->level; + + return true; +} +NOKPROBE_SYMBOL(save_return_addr); + +noinline void *return_address(unsigned int level) +{ + struct return_address_data data; + + data.level = level + 3; + data.addr = NULL; + + arch_stack_walk(save_return_addr, &data, current, NULL); + + if (!data.level) + return data.addr; + else + return NULL; + +} +EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); diff --git a/arch/riscv/kernel/sbi-ipi.c b/arch/riscv/kernel/sbi-ipi.c new file mode 100644 index 000000000000..0cc5559c08d8 --- /dev/null +++ b/arch/riscv/kernel/sbi-ipi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Multiplex several IPIs over a single HW IPI. + * + * Copyright (c) 2022 Ventana Micro Systems Inc. 
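 *
 * Each logical kernel IPI becomes one bit in the generic ipi-mux per-cpu
 * pending mask (up to BITS_PER_BYTE of them here); the single SBI software
 * interrupt only tells the receiving hart to drain that mask via
 * ipi_mux_process().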
+ */ + +#define pr_fmt(fmt) "riscv: " fmt +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/irqchip/chained_irq.h> +#include <linux/irqdomain.h> +#include <asm/sbi.h> + +DEFINE_STATIC_KEY_FALSE(riscv_sbi_for_rfence); +EXPORT_SYMBOL_GPL(riscv_sbi_for_rfence); + +static int sbi_ipi_virq; + +static void sbi_ipi_handle(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + chained_irq_enter(chip, desc); + + csr_clear(CSR_IP, IE_SIE); + ipi_mux_process(); + + chained_irq_exit(chip, desc); +} + +static int sbi_ipi_starting_cpu(unsigned int cpu) +{ + enable_percpu_irq(sbi_ipi_virq, irq_get_trigger_type(sbi_ipi_virq)); + return 0; +} + +void __init sbi_ipi_init(void) +{ + int virq; + struct irq_domain *domain; + + if (riscv_ipi_have_virq_range()) + return; + + domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), + DOMAIN_BUS_ANY); + if (!domain) { + pr_err("unable to find INTC IRQ domain\n"); + return; + } + + sbi_ipi_virq = irq_create_mapping(domain, RV_IRQ_SOFT); + if (!sbi_ipi_virq) { + pr_err("unable to create INTC IRQ mapping\n"); + return; + } + + virq = ipi_mux_create(BITS_PER_BYTE, sbi_send_ipi); + if (virq <= 0) { + pr_err("unable to create muxed IPIs\n"); + irq_dispose_mapping(sbi_ipi_virq); + return; + } + + irq_set_chained_handler(sbi_ipi_virq, sbi_ipi_handle); + + /* + * Don't disable IPI when CPU goes offline because + * the masking/unmasking of virtual IPIs is done + * via generic IPI-Mux + */ + cpuhp_setup_state(CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING, + "irqchip/sbi-ipi:starting", + sbi_ipi_starting_cpu, NULL); + + riscv_ipi_set_virq_range(virq, BITS_PER_BYTE); + pr_info("providing IPIs using SBI IPI extension\n"); + + /* + * Use the SBI remote fence extension to avoid + * the extra context switch needed to handle IPIs. 
+ */ + static_branch_enable(&riscv_sbi_for_rfence); +} diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 775d3322b422..53836a9235e3 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -7,66 +7,23 @@ #include <linux/bits.h> #include <linux/init.h> +#include <linux/mm.h> #include <linux/pm.h> #include <linux/reboot.h> #include <asm/sbi.h> #include <asm/smp.h> +#include <asm/tlbflush.h> /* default SBI version is 0.1 */ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; -static int (*__sbi_send_ipi)(const struct cpumask *cpu_mask) __ro_after_init; +static void (*__sbi_send_ipi)(unsigned int cpu) __ro_after_init; static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; -struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, - unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5) -{ - struct sbiret ret; - - register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); - register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); - register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); - register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); - register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); - register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); - register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); - register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); - asm volatile ("ecall" - : "+r" (a0), "+r" (a1) - : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) - : "memory"); - ret.error = a0; - ret.value = a1; - - return ret; -} -EXPORT_SYMBOL(sbi_ecall); - -int sbi_err_map_linux_errno(int err) -{ - switch (err) { - case SBI_SUCCESS: - return 0; - case SBI_ERR_DENIED: - return -EPERM; - case SBI_ERR_INVALID_PARAM: - return -EINVAL; - case SBI_ERR_INVALID_ADDRESS: - return -EFAULT; - case SBI_ERR_NOT_SUPPORTED: - case SBI_ERR_FAILURE: - default: - return -ENOTSUPP; - }; -} -EXPORT_SYMBOL(sbi_err_map_linux_errno); - #ifdef CONFIG_RISCV_SBI_V01 static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) { @@ -131,17 +88,6 @@ void sbi_shutdown(void) EXPORT_SYMBOL(sbi_shutdown); /** - * sbi_clear_ipi() - Clear any pending IPIs for the calling hart. - * - * Return: None - */ -void sbi_clear_ipi(void) -{ - sbi_ecall(SBI_EXT_0_1_CLEAR_IPI, 0, 0, 0, 0, 0, 0, 0); -} -EXPORT_SYMBOL(sbi_clear_ipi); - -/** * __sbi_set_timer_v01() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. 
* @@ -157,17 +103,12 @@ static void __sbi_set_timer_v01(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) +static void __sbi_send_ipi_v01(unsigned int cpu) { - unsigned long hart_mask; - - if (!cpu_mask || cpumask_empty(cpu_mask)) - cpu_mask = cpu_online_mask; - hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); - + unsigned long hart_mask = + __sbi_v01_cpumask_to_hartmask(cpumask_of(cpu)); sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)(&hart_mask), 0, 0, 0, 0, 0); - return 0; } static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, @@ -216,12 +157,10 @@ static void __sbi_set_timer_v01(uint64_t stime_value) sbi_major_version(), sbi_minor_version()); } -static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) +static void __sbi_send_ipi_v01(unsigned int cpu) { pr_warn("IPI extension is not available in SBI v%lu.%lu\n", sbi_major_version(), sbi_minor_version()); - - return 0; } static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, @@ -248,55 +187,18 @@ static void __sbi_set_timer_v02(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) +static void __sbi_send_ipi_v02(unsigned int cpu) { - unsigned long hartid, cpuid, hmask = 0, hbase = 0, htop = 0; - struct sbiret ret = {0}; int result; + struct sbiret ret = {0}; - if (!cpu_mask || cpumask_empty(cpu_mask)) - cpu_mask = cpu_online_mask; - - for_each_cpu(cpuid, cpu_mask) { - hartid = cpuid_to_hartid_map(cpuid); - if (hmask) { - if (hartid + BITS_PER_LONG <= htop || - hbase + BITS_PER_LONG <= hartid) { - ret = sbi_ecall(SBI_EXT_IPI, - SBI_EXT_IPI_SEND_IPI, hmask, - hbase, 0, 0, 0, 0); - if (ret.error) - goto ecall_failed; - hmask = 0; - } else if (hartid < hbase) { - /* shift the mask to fit lower hartid */ - hmask <<= hbase - hartid; - hbase = hartid; - } - } - if (!hmask) { - hbase = hartid; - htop = hartid; - } else if (hartid > htop) { - htop = hartid; - } - hmask |= BIT(hartid - hbase); - } - - if (hmask) { - ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask, hbase, 0, 0, 0, 0); - if (ret.error) - goto ecall_failed; + ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, + 1UL, cpuid_to_hartid_map(cpu), 0, 0, 0, 0); + if (ret.error) { + result = sbi_err_map_linux_errno(ret.error); + pr_err("%s: hbase = [%lu] failed (error [%d])\n", + __func__, cpuid_to_hartid_map(cpu), result); } - - return 0; - -ecall_failed: - result = sbi_err_map_linux_errno(ret.error); - pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask, result); - return result; } static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask, @@ -397,6 +299,76 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, return 0; } +static bool sbi_fwft_supported; + +struct fwft_set_req { + u32 feature; + unsigned long value; + unsigned long flags; + atomic_t error; +}; + +static void cpu_sbi_fwft_set(void *arg) +{ + struct fwft_set_req *req = arg; + int ret; + + ret = sbi_fwft_set(req->feature, req->value, req->flags); + if (ret) + atomic_set(&req->error, ret); +} + +/** + * sbi_fwft_set() - Set a feature on the local hart + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. 
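 * Note: this programs the feature on the calling hart only; use
 * sbi_fwft_set_cpumask() to apply the same setting across a set of CPUs.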
+ */ +int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags) +{ + struct sbiret ret; + + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + + ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET, + feature, value, flags, 0, 0, 0); + + return sbi_err_map_linux_errno(ret.error); +} + +/** + * sbi_fwft_set_cpumask() - Set a feature for the specified cpumask + * @mask: CPU mask of cpus that need the feature to be set + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, + unsigned long value, unsigned long flags) +{ + struct fwft_set_req req = { + .feature = feature, + .value = value, + .flags = flags, + .error = ATOMIC_INIT(0), + }; + + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + + if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT) + return -EINVAL; + + on_each_cpu_mask(mask, cpu_sbi_fwft_set, &req, 1); + + return atomic_read(&req.error); +} + /** * sbi_set_timer() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. @@ -410,13 +382,11 @@ void sbi_set_timer(uint64_t stime_value) /** * sbi_send_ipi() - Send an IPI to any hart. - * @cpu_mask: A cpu mask containing all the target harts. - * - * Return: 0 on success, appropriate linux error code otherwise. + * @cpu: Logical id of the target CPU. */ -int sbi_send_ipi(const struct cpumask *cpu_mask) +void sbi_send_ipi(unsigned int cpu) { - return __sbi_send_ipi(cpu_mask); + __sbi_send_ipi(cpu); } EXPORT_SYMBOL(sbi_send_ipi); @@ -434,31 +404,14 @@ int sbi_remote_fence_i(const struct cpumask *cpu_mask) EXPORT_SYMBOL(sbi_remote_fence_i); /** - * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote - * harts for the specified virtual address range. - * @cpu_mask: A cpu mask containing all the target harts. - * @start: Start of the virtual address - * @size: Total size of the virtual address range. - * - * Return: 0 on success, appropriate linux error code otherwise. - */ -int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, - unsigned long start, - unsigned long size) -{ - return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, - cpu_mask, start, size, 0, 0); -} -EXPORT_SYMBOL(sbi_remote_sfence_vma); - -/** * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given - * remote harts for a virtual address range belonging to a specific ASID. + * remote harts for a virtual address range belonging to a specific ASID or not. * * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. - * @asid: The value of address space identifier (ASID). + * @asid: The value of address space identifier (ASID), or FLUSH_TLB_NO_ASID + * for flushing all address spaces. * * Return: 0 on success, appropriate linux error code otherwise. 
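 * Note: passing FLUSH_TLB_NO_ASID selects the plain SFENCE.VMA rfence call,
 * which covers what the separate sbi_remote_sfence_vma() helper used to do.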
*/ @@ -467,8 +420,12 @@ int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long size, unsigned long asid) { - return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - cpu_mask, start, size, asid, 0); + if (asid == FLUSH_TLB_NO_ASID) + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, + cpu_mask, start, size, 0, 0); + else + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); @@ -581,33 +538,21 @@ static void sbi_srst_power_off(void) * sbi_probe_extension() - Check if an SBI extension ID is supported or not. * @extid: The extension ID to be probed. * - * Return: Extension specific nonzero value f yes, -ENOTSUPP otherwise. + * Return: 1 or an extension specific nonzero value if yes, 0 otherwise. */ -int sbi_probe_extension(int extid) +long sbi_probe_extension(int extid) { struct sbiret ret; ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_PROBE_EXT, extid, 0, 0, 0, 0, 0); if (!ret.error) - if (ret.value) - return ret.value; + return ret.value; - return -ENOTSUPP; + return 0; } EXPORT_SYMBOL(sbi_probe_extension); -static long __sbi_base_ecall(int fid) -{ - struct sbiret ret; - - ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); - if (!ret.error) - return ret.value; - else - return sbi_err_map_linux_errno(ret.error); -} - static inline long sbi_get_spec_version(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION); @@ -627,25 +572,79 @@ long sbi_get_mvendorid(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_MVENDORID); } +EXPORT_SYMBOL_GPL(sbi_get_mvendorid); long sbi_get_marchid(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_MARCHID); } +EXPORT_SYMBOL_GPL(sbi_get_marchid); long sbi_get_mimpid(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_MIMPID); } +EXPORT_SYMBOL_GPL(sbi_get_mimpid); -static void sbi_send_cpumask_ipi(const struct cpumask *target) +bool sbi_debug_console_available; + +int sbi_debug_console_write(const char *bytes, unsigned int num_bytes) { - sbi_send_ipi(target); + phys_addr_t base_addr; + struct sbiret ret; + + if (!sbi_debug_console_available) + return -EOPNOTSUPP; + + if (is_vmalloc_addr(bytes)) + base_addr = page_to_phys(vmalloc_to_page(bytes)) + + offset_in_page(bytes); + else + base_addr = __pa(bytes); + if (PAGE_SIZE < (offset_in_page(bytes) + num_bytes)) + num_bytes = PAGE_SIZE - offset_in_page(bytes); + + if (IS_ENABLED(CONFIG_32BIT)) + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_WRITE, + num_bytes, lower_32_bits(base_addr), + upper_32_bits(base_addr), 0, 0, 0); + else + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_WRITE, + num_bytes, base_addr, 0, 0, 0, 0); + + if (ret.error == SBI_ERR_FAILURE) + return -EIO; + return ret.error ? 
sbi_err_map_linux_errno(ret.error) : ret.value; } -static const struct riscv_ipi_ops sbi_ipi_ops = { - .ipi_inject = sbi_send_cpumask_ipi -}; +int sbi_debug_console_read(char *bytes, unsigned int num_bytes) +{ + phys_addr_t base_addr; + struct sbiret ret; + + if (!sbi_debug_console_available) + return -EOPNOTSUPP; + + if (is_vmalloc_addr(bytes)) + base_addr = page_to_phys(vmalloc_to_page(bytes)) + + offset_in_page(bytes); + else + base_addr = __pa(bytes); + if (PAGE_SIZE < (offset_in_page(bytes) + num_bytes)) + num_bytes = PAGE_SIZE - offset_in_page(bytes); + + if (IS_ENABLED(CONFIG_32BIT)) + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_READ, + num_bytes, lower_32_bits(base_addr), + upper_32_bits(base_addr), 0, 0, 0); + else + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_READ, + num_bytes, base_addr, 0, 0, 0, 0); + + if (ret.error == SBI_ERR_FAILURE) + return -EIO; + return ret.error ? sbi_err_map_linux_errno(ret.error) : ret.value; +} void __init sbi_init(void) { @@ -662,37 +661,45 @@ void __init sbi_init(void) if (!sbi_spec_is_0_1()) { pr_info("SBI implementation ID=0x%lx Version=0x%lx\n", sbi_get_firmware_id(), sbi_get_firmware_version()); - if (sbi_probe_extension(SBI_EXT_TIME) > 0) { + if (sbi_probe_extension(SBI_EXT_TIME)) { __sbi_set_timer = __sbi_set_timer_v02; pr_info("SBI TIME extension detected\n"); } else { __sbi_set_timer = __sbi_set_timer_v01; } - if (sbi_probe_extension(SBI_EXT_IPI) > 0) { + if (sbi_probe_extension(SBI_EXT_IPI)) { __sbi_send_ipi = __sbi_send_ipi_v02; pr_info("SBI IPI extension detected\n"); } else { __sbi_send_ipi = __sbi_send_ipi_v01; } - if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) { + if (sbi_probe_extension(SBI_EXT_RFENCE)) { __sbi_rfence = __sbi_rfence_v02; pr_info("SBI RFENCE extension detected\n"); } else { __sbi_rfence = __sbi_rfence_v01; } - if ((sbi_spec_version >= sbi_mk_version(0, 3)) && - (sbi_probe_extension(SBI_EXT_SRST) > 0)) { + if (sbi_spec_version >= sbi_mk_version(0, 3) && + sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); pm_power_off = sbi_srst_power_off; sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); } + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_DBCN) > 0) { + pr_info("SBI DBCN extension detected\n"); + sbi_debug_console_available = true; + } + if (sbi_spec_version >= sbi_mk_version(3, 0) && + sbi_probe_extension(SBI_EXT_FWFT)) { + pr_info("SBI FWFT extension detected\n"); + sbi_fwft_supported = true; + } } else { __sbi_set_timer = __sbi_set_timer_v01; __sbi_send_ipi = __sbi_send_ipi_v01; __sbi_rfence = __sbi_rfence_v01; } - - riscv_set_ipi_ops(&sbi_ipi_ops); } diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c new file mode 100644 index 000000000000..24aabb4fbde3 --- /dev/null +++ b/arch/riscv/kernel/sbi_ecall.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Rivos Inc. 
*/ + +#include <asm/sbi.h> +#define CREATE_TRACE_POINTS +#include <asm/trace.h> + +long __sbi_base_ecall(int fid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); + if (!ret.error) + return ret.value; + else + return sbi_err_map_linux_errno(ret.error); +} +EXPORT_SYMBOL(__sbi_base_ecall); + +struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + int fid, int ext) +{ + struct sbiret ret; + + trace_sbi_call(ext, fid); + + register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); + register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); + register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); + register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); + register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); + register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); + register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); + asm volatile ("ecall" + : "+r" (a0), "+r" (a1) + : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) + : "memory"); + ret.error = a0; + ret.value = a1; + + trace_sbi_return(ext, ret.error, ret.value); + + return ret; +} +EXPORT_SYMBOL(__sbi_ecall); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f0f36a4a0e9b..f7c9a1caa83e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -8,21 +8,24 @@ * Nick Kossifidis <mick@ics.forth.gr> */ +#include <linux/acpi.h> +#include <linux/cpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/console.h> -#include <linux/screen_info.h> #include <linux/of_fdt.h> -#include <linux/of_platform.h> #include <linux/sched/task.h> #include <linux/smp.h> #include <linux/efi.h> #include <linux/crash_dump.h> +#include <linux/panic_notifier.h> +#include <asm/acpi.h> #include <asm/alternative.h> -#include <asm/cpu_ops.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> #include <asm/early_ioremap.h> #include <asm/pgtable.h> #include <asm/setup.h> @@ -36,17 +39,6 @@ #include "head.h" -#if defined(CONFIG_DUMMY_CONSOLE) || defined(CONFIG_EFI) -struct screen_info screen_info __section(".data") = { - .orig_video_lines = 30, - .orig_video_cols = 80, - .orig_video_mode = 0, - .orig_video_ega_bx = 0, - .orig_video_isVGA = 1, - .orig_video_points = 8 -}; -#endif - /* * The lucky hart to first increment this variable will boot the other cores. 
* This is used before the kernel initializes the BSS so it can't be in the @@ -58,7 +50,6 @@ atomic_t hart_lottery __section(".sdata") #endif ; unsigned long boot_cpu_hartid; -static DEFINE_PER_CPU(struct cpu, cpu_devices); /* * Place kernel memory regions on the resource tree so that @@ -75,6 +66,9 @@ static struct resource bss_res = { .name = "Kernel bss", }; static struct resource elfcorehdr_res = { .name = "ELF Core hdr", }; #endif +static int num_standard_resources; +static struct resource *standard_resources; + static int __init add_resource(struct resource *parent, struct resource *res) { @@ -148,7 +142,7 @@ static void __init init_resources(void) struct resource *res = NULL; struct resource *mem_res = NULL; size_t mem_res_sz = 0; - int num_resources = 0, res_idx = 0; + int num_resources = 0, res_idx = 0, non_resv_res = 0; int ret = 0; /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */ @@ -156,9 +150,7 @@ static void __init init_resources(void) res_idx = num_resources - 1; mem_res_sz = num_resources * sizeof(*mem_res); - mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); - if (!mem_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES); /* * Start by adding the reserved regions, if they overlap @@ -169,14 +161,6 @@ static void __init init_resources(void) if (ret < 0) goto error; -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.start != crashk_res.end) { - ret = add_resource(&iomem_resource, &crashk_res); - if (ret < 0) - goto error; - } -#endif - #ifdef CONFIG_CRASH_DUMP if (elfcorehdr_size > 0) { elfcorehdr_res.start = elfcorehdr_addr; @@ -212,6 +196,7 @@ static void __init init_resources(void) /* Add /memory regions to the resource tree */ for_each_mem_region(region) { res = &mem_res[res_idx--]; + non_resv_res++; if (unlikely(memblock_is_nomap(region))) { res->name = "Reserved"; @@ -229,6 +214,9 @@ static void __init init_resources(void) goto error; } + num_standard_resources = non_resv_res; + standard_resources = &mem_res[res_idx + 1]; + /* Clean-up any unused pre-allocated resources */ if (res_idx >= 0) memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res)); @@ -240,27 +228,87 @@ static void __init init_resources(void) memblock_free(mem_res, mem_res_sz); } +static int __init reserve_memblock_reserved_regions(void) +{ + u64 i, j; + + for (i = 0; i < num_standard_resources; i++) { + struct resource *mem = &standard_resources[i]; + phys_addr_t r_start, r_end, mem_size = resource_size(mem); + + if (!memblock_is_region_reserved(mem->start, mem_size)) + continue; + + for_each_reserved_mem_range(j, &r_start, &r_end) { + resource_size_t start, end; + + start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); + end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end); + + if (start > mem->end || end < mem->start) + continue; + + reserve_region_with_split(mem, start, end, "Reserved"); + } + } + + return 0; +} +arch_initcall(reserve_memblock_reserved_regions); static void __init parse_dtb(void) { /* Early scan of device tree from init memory */ - if (early_init_dt_scan(dtb_early_va)) { + if (early_init_dt_scan(dtb_early_va, dtb_early_pa)) { const char *name = of_flat_dt_get_machine_name(); if (name) { pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } + } else { + pr_err("No DTB passed to the kernel\n"); + } +} + +#if defined(CONFIG_RISCV_COMBO_SPINLOCKS) +DEFINE_STATIC_KEY_TRUE(qspinlock_key); +EXPORT_SYMBOL(qspinlock_key); +#endif + +static void __init 
riscv_spinlock_init(void) +{ + char *using_ext = NULL; + + if (IS_ENABLED(CONFIG_RISCV_TICKET_SPINLOCKS)) { + pr_info("Ticket spinlock: enabled\n"); return; } - pr_err("No DTB passed to the kernel\n"); -#ifdef CONFIG_CMDLINE_FORCE - strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); - pr_info("Forcing kernel command line to: %s\n", boot_command_line); + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && + IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && + riscv_isa_extension_available(NULL, ZABHA) && + riscv_isa_extension_available(NULL, ZACAS)) { + using_ext = "using Zabha"; + } else if (riscv_isa_extension_available(NULL, ZICCRSE)) { + using_ext = "using Ziccrse"; + } +#if defined(CONFIG_RISCV_COMBO_SPINLOCKS) + else { + static_branch_disable(&qspinlock_key); + pr_info("Ticket spinlock: enabled\n"); + return; + } #endif + + if (!using_ext) + pr_err("Queued spinlock without Zabha or Ziccrse"); + else + pr_info("Queued spinlock %s: enabled\n", using_ext); } +extern void __init init_rt_signal_env(void); + void __init setup_arch(char **cmdline_p) { parse_dtb(); @@ -269,23 +317,24 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; early_ioremap_setup(); + sbi_init(); jump_label_init(); parse_early_param(); efi_init(); paging_init(); + + /* Parse the ACPI tables for possible boot-time configuration */ + acpi_boot_table_init(); + #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) - unflatten_device_tree(); - else - pr_err("No DTB found in kernel mappings\n"); + unflatten_device_tree(); #endif misc_mem_init(); init_resources(); - sbi_init(); #ifdef CONFIG_KASAN kasan_init(); @@ -295,34 +344,61 @@ void __init setup_arch(char **cmdline_p) setup_smp(); #endif + if (!acpi_disabled) { + acpi_init_rintc_map(); + acpi_map_cpus_to_nodes(); + } + + riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); apply_boot_alternatives(); + init_rt_signal_env(); + + if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && + riscv_isa_extension_available(NULL, ZICBOM)) + riscv_noncoherent_supported(); + riscv_set_dma_cache_alignment(); + + riscv_user_isa_enable(); + riscv_spinlock_init(); } -static int __init topology_init(void) +bool arch_cpu_is_hotpluggable(int cpu) { - int i, ret; - - for_each_possible_cpu(i) { - struct cpu *cpu = &per_cpu(cpu_devices, i); + return cpu_has_hotplug(cpu); +} - cpu->hotpluggable = cpu_has_hotplug(i); - ret = register_cpu(cpu, i); - if (unlikely(ret)) - pr_warn("Warning: %s: register_cpu %d failed (%d)\n", - __func__, i, ret); +void free_initmem(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { + set_kernel_memory(lm_alias(__init_begin), lm_alias(__init_end), set_memory_rw_nx); + if (IS_ENABLED(CONFIG_64BIT)) + set_kernel_memory(__init_begin, __init_end, set_memory_nx); } + free_initmem_default(POISON_FREE_INITMEM); +} + +static int dump_kernel_offset(struct notifier_block *self, + unsigned long v, void *p) +{ + pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", + kernel_map.virt_offset, + KERNEL_LINK_ADDR); + return 0; } -subsys_initcall(topology_init); -void free_initmem(void) +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) { - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) - set_kernel_memory(lm_alias(__init_begin), lm_alias(__init_end), - IS_ENABLED(CONFIG_64BIT) ? 
- set_memory_rw : set_memory_rw_nx); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); - free_initmem_default(POISON_FREE_INITMEM); + return 0; } +device_initcall(register_kernel_offset_dumper); diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 38b05ca6fe66..08378fea3a11 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -12,14 +12,21 @@ #include <linux/syscalls.h> #include <linux/resume_user_mode.h> #include <linux/linkage.h> +#include <linux/entry-common.h> #include <asm/ucontext.h> #include <asm/vdso.h> +#include <asm/signal.h> #include <asm/signal32.h> #include <asm/switch_to.h> +#include <asm/vector.h> #include <asm/csr.h> +#include <asm/cacheflush.h> + +unsigned long signal_minsigstksz __ro_after_init; extern u32 __user_rt_sigreturn[2]; +static size_t riscv_v_sc_size __ro_after_init; #define DEBUG_SIG 0 @@ -37,26 +44,13 @@ static long restore_fp_state(struct pt_regs *regs, { long err; struct __riscv_d_ext_state __user *state = &sc_fpregs->d; - size_t i; err = __copy_from_user(¤t->thread.fstate, state, sizeof(*state)); if (unlikely(err)) return err; fstate_restore(current, regs); - - /* We support no other extension state at this time. */ - for (i = 0; i < ARRAY_SIZE(sc_fpregs->q.reserved); i++) { - u32 value; - - err = __get_user(value, &sc_fpregs->q.reserved[i]); - if (unlikely(err)) - break; - if (value != 0) - return -EINVAL; - } - - return err; + return 0; } static long save_fp_state(struct pt_regs *regs, @@ -64,52 +58,184 @@ static long save_fp_state(struct pt_regs *regs, { long err; struct __riscv_d_ext_state __user *state = &sc_fpregs->d; - size_t i; fstate_save(current, regs); err = __copy_to_user(state, ¤t->thread.fstate, sizeof(*state)); + return err; +} +#else +#define save_fp_state(task, regs) (0) +#define restore_fp_state(task, regs) (0) +#endif + +#ifdef CONFIG_RISCV_ISA_V + +static long save_v_state(struct pt_regs *regs, void __user **sc_vec) +{ + struct __riscv_ctx_hdr __user *hdr; + struct __sc_riscv_v_state __user *state; + void __user *datap; + long err; + + hdr = *sc_vec; + /* Place state to the user's signal context space after the hdr */ + state = (struct __sc_riscv_v_state __user *)(hdr + 1); + /* Point datap right after the end of __sc_riscv_v_state */ + datap = state + 1; + + /* datap is designed to be 16 byte aligned for better performance */ + WARN_ON(!IS_ALIGNED((unsigned long)datap, 16)); + + get_cpu_vector_context(); + riscv_v_vstate_save(¤t->thread.vstate, regs); + put_cpu_vector_context(); + + /* Copy everything of vstate but datap. */ + err = __copy_to_user(&state->v_state, ¤t->thread.vstate, + offsetof(struct __riscv_v_ext_state, datap)); + /* Copy the pointer datap itself. */ + err |= __put_user((__force void *)datap, &state->v_state.datap); + /* Copy the whole vector content to user space datap. */ + err |= __copy_to_user(datap, current->thread.vstate.datap, riscv_v_vsize); + /* Copy magic to the user space after saving all vector conetext */ + err |= __put_user(RISCV_V_MAGIC, &hdr->magic); + err |= __put_user(riscv_v_sc_size, &hdr->size); if (unlikely(err)) return err; - /* We support no other extension state at this time. 
*/ - for (i = 0; i < ARRAY_SIZE(sc_fpregs->q.reserved); i++) { - err = __put_user(0, &sc_fpregs->q.reserved[i]); - if (unlikely(err)) - break; - } + /* Only progress the sv_vec if everything has done successfully */ + *sc_vec += riscv_v_sc_size; + return 0; +} - return err; +/* + * Restore Vector extension context from the user's signal frame. This function + * assumes a valid extension header. So magic and size checking must be done by + * the caller. + */ +static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) +{ + long err; + struct __sc_riscv_v_state __user *state = sc_vec; + void __user *datap; + + /* + * Mark the vstate as clean prior performing the actual copy, + * to avoid getting the vstate incorrectly clobbered by the + * discarded vector state. + */ + riscv_v_vstate_set_restore(current, regs); + + /* Copy everything of __sc_riscv_v_state except datap. */ + err = __copy_from_user(¤t->thread.vstate, &state->v_state, + offsetof(struct __riscv_v_ext_state, datap)); + if (unlikely(err)) + return err; + + /* Copy the pointer datap itself. */ + err = __get_user(datap, &state->v_state.datap); + if (unlikely(err)) + return err; + /* + * Copy the whole vector content from user space datap. Use + * copy_from_user to prevent information leak. + */ + return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } #else -#define save_fp_state(task, regs) (0) -#define restore_fp_state(task, regs) (0) +#define save_v_state(task, regs) (0) +#define __restore_v_state(task, regs) (0) #endif static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { + void __user *sc_ext_ptr = &sc->sc_extdesc.hdr; + __u32 rsvd; long err; /* sc_regs is structured the same as the start of pt_regs */ err = __copy_from_user(regs, &sc->sc_regs, sizeof(sc->sc_regs)); + if (unlikely(err)) + return err; + /* Restore the floating-point state. 
*/ - if (has_fpu()) - err |= restore_fp_state(regs, &sc->sc_fpregs); + if (has_fpu()) { + err = restore_fp_state(regs, &sc->sc_fpregs); + if (unlikely(err)) + return err; + } + + /* Check the reserved word before extensions parsing */ + err = __get_user(rsvd, &sc->sc_extdesc.reserved); + if (unlikely(err)) + return err; + if (unlikely(rsvd)) + return -EINVAL; + + while (!err) { + __u32 magic, size; + struct __riscv_ctx_hdr __user *head = sc_ext_ptr; + + err |= __get_user(magic, &head->magic); + err |= __get_user(size, &head->size); + if (unlikely(err)) + return err; + + sc_ext_ptr += sizeof(*head); + switch (magic) { + case END_MAGIC: + if (size != END_HDR_SIZE) + return -EINVAL; + + return 0; + case RISCV_V_MAGIC: + if (!(has_vector() || has_xtheadvector()) || !riscv_v_vstate_query(regs) || + size != riscv_v_sc_size) + return -EINVAL; + + err = __restore_v_state(regs, sc_ext_ptr); + break; + default: + return -EINVAL; + } + sc_ext_ptr = (void __user *)head + size; + } return err; } +static size_t get_rt_frame_size(bool cal_all) +{ + struct rt_sigframe __user *frame; + size_t frame_size; + size_t total_context_size = 0; + + frame_size = sizeof(*frame); + + if (has_vector() || has_xtheadvector()) { + if (cal_all || riscv_v_vstate_query(task_pt_regs(current))) + total_context_size += riscv_v_sc_size; + } + + frame_size += total_context_size; + + frame_size = round_up(frame_size, 16); + return frame_size; +} + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; struct task_struct *task; sigset_t set; + size_t frame_size = get_rt_frame_size(false); /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; - if (!access_ok(frame, sizeof(*frame))) + if (!access_ok(frame, frame_size)) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -123,6 +249,8 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + regs->cause = -1UL; + return regs->a0; badframe: @@ -141,12 +269,23 @@ static long setup_sigcontext(struct rt_sigframe __user *frame, struct pt_regs *regs) { struct sigcontext __user *sc = &frame->uc.uc_mcontext; + struct __riscv_ctx_hdr __user *sc_ext_ptr = &sc->sc_extdesc.hdr; long err; + /* sc_regs is structured the same as the start of pt_regs */ err = __copy_to_user(&sc->sc_regs, regs, sizeof(sc->sc_regs)); /* Save the floating-point state. */ if (has_fpu()) err |= save_fp_state(regs, &sc->sc_fpregs); + /* Save the vector state. */ + if ((has_vector() || has_xtheadvector()) && riscv_v_vstate_query(regs)) + err |= save_v_state(regs, (void __user **)&sc_ext_ptr); + /* Write zero to fp-reserved space and check it on restore_sigcontext */ + err |= __put_user(0, &sc->sc_extdesc.reserved); + /* And put END __riscv_ctx_hdr at the end. 
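For reference, a minimal userspace-style sketch of the same magic/size walk that restore_sigcontext() performs above. The struct is a local mirror of the UAPI __riscv_ctx_hdr; the real magic constants (RISCV_V_MAGIC, END_MAGIC) live in the kernel's exported headers, so no concrete values are assumed here:

#include <stddef.h>
#include <stdint.h>

// Local mirror of the UAPI extension header (__riscv_ctx_hdr).
struct ctx_hdr {
        uint32_t magic;
        uint32_t size;
};

// Walk the extension records appended after the fixed part of the
// sigcontext: each record is a header followed by its payload, and
// hdr->size covers both. Returns the payload of the record matching
// 'wanted', or NULL once the end marker is reached.
static void *find_ext_record(void *ext_area, uint32_t wanted, uint32_t end_magic)
{
        struct ctx_hdr *hdr = ext_area;

        for (;;) {
                if (hdr->magic == end_magic)
                        return NULL;
                if (hdr->magic == wanted)
                        return hdr + 1;          // payload starts right after the header
                if (hdr->size < sizeof(*hdr))
                        return NULL;             // malformed record, stop
                hdr = (struct ctx_hdr *)((char *)hdr + hdr->size);
        }
}

restore_sigcontext() enforces the same contract in the other direction: an unknown magic, a bad size, or a non-zero reserved word makes the whole sigreturn fail with -EINVAL.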
*/ + err |= __put_user(END_MAGIC, &sc_ext_ptr->magic); + err |= __put_user(END_HDR_SIZE, &sc_ext_ptr->size); + return err; } @@ -178,9 +317,11 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, { struct rt_sigframe __user *frame; long err = 0; + unsigned long __maybe_unused addr; + size_t frame_size = get_rt_frame_size(false); - frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(frame, sizeof(*frame))) + frame = get_sigframe(ksig, regs, frame_size); + if (!access_ok(frame, frame_size)) return -EFAULT; err |= copy_siginfo_to_user(&frame->info, &ksig->info); @@ -206,7 +347,12 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (copy_to_user(&frame->sigreturn_code, __user_rt_sigreturn, sizeof(frame->sigreturn_code))) return -EFAULT; - regs->ra = (unsigned long)&frame->sigreturn_code; + + addr = (unsigned long)&frame->sigreturn_code; + /* Make sure the two instructions are pushed to icache. */ + flush_icache_range(addr, addr + sizeof(frame->sigreturn_code)); + + regs->ra = addr; #endif /* CONFIG_MMU */ /* @@ -236,30 +382,6 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) sigset_t *oldset = sigmask_to_save(); int ret; - /* Are we from a system call? */ - if (regs->cause == EXC_SYSCALL) { - /* Avoid additional syscall restarting via ret_from_exception */ - regs->cause = -1UL; - /* If so, check system call restarting.. */ - switch (regs->a0) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->a0 = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { - regs->a0 = -EINTR; - break; - } - fallthrough; - case -ERESTARTNOINTR: - regs->a0 = regs->orig_a0; - regs->epc -= 0x4; - break; - } - } - rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ @@ -271,58 +393,91 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(ret, ksig, 0); } -static void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { + unsigned long continue_addr = 0, restart_addr = 0; + int retval = 0; struct ksignal ksig; + bool syscall = (regs->cause == EXC_SYSCALL); - if (get_signal(&ksig)) { - /* Actually deliver the signal */ - handle_signal(&ksig, regs); - return; - } + /* If we were from a system call, check for system call restarting */ + if (syscall) { + continue_addr = regs->epc; + restart_addr = continue_addr - 4; + retval = regs->a0; - /* Did we come from a system call? */ - if (regs->cause == EXC_SYSCALL) { /* Avoid additional syscall restarting via ret_from_exception */ regs->cause = -1UL; - /* Restart the system call - no handlers present */ - switch (regs->a0) { + /* + * Prepare for system call restart. We do this here so that a + * debugger will see the already changed PC. + */ + switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->a0 = regs->orig_a0; - regs->epc -= 0x4; - break; case -ERESTART_RESTARTBLOCK: - regs->a0 = regs->orig_a0; - regs->a7 = __NR_restart_syscall; - regs->epc -= 0x4; + regs->a0 = regs->orig_a0; + regs->epc = restart_addr; break; } } /* + * Get the signal to deliver. When running under ptrace, at this point + * the debugger may change all of our registers. + */ + if (get_signal(&ksig)) { + /* + * Depending on the signal settings, we may need to revert the + * decision to restart the system call, but skip this if a + * debugger has chosen to restart at a different PC. 
+ */ + if (regs->epc == restart_addr && + (retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK || + (retval == -ERESTARTSYS && + !(ksig.ka.sa.sa_flags & SA_RESTART)))) { + regs->a0 = -EINTR; + regs->epc = continue_addr; + } + + /* Actually deliver the signal */ + handle_signal(&ksig, regs); + return; + } + + /* + * Handle restarting a different system call. As above, if a debugger + * has chosen to restart at a different PC, ignore the restart. + */ + if (syscall && regs->epc == restart_addr && retval == -ERESTART_RESTARTBLOCK) + regs->a7 = __NR_restart_syscall; + + /* * If there is no signal to deliver, we just put the saved * sigmask back. */ restore_saved_sigmask(); } -/* - * notification of userspace execution resumption - * - triggered by the _TIF_WORK_MASK flags - */ -asmlinkage __visible void do_notify_resume(struct pt_regs *regs, - unsigned long thread_info_flags) +void init_rt_signal_env(void); +void __init init_rt_signal_env(void) { - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* Handle pending signal delivery */ - if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); + riscv_v_sc_size = sizeof(struct __riscv_ctx_hdr) + + sizeof(struct __sc_riscv_v_state) + riscv_v_vsize; + /* + * Determine the stack space required for guaranteed signal delivery. + * The signal_minsigstksz will be populated into the AT_MINSIGSTKSZ entry + * in the auxiliary array at process startup. + */ + signal_minsigstksz = get_rt_frame_size(true); +} - if (thread_info_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); +#ifdef CONFIG_DYNAMIC_SIGFRAME +bool sigaltstack_size_valid(size_t ss_size) +{ + return ss_size > get_rt_frame_size(false); } +#endif /* CONFIG_DYNAMIC_SIGFRAME */ diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index b5d30ea92292..e650dec44817 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -12,42 +12,52 @@ #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/kexec.h> +#include <linux/kgdb.h> +#include <linux/percpu.h> #include <linux/profile.h> #include <linux/smp.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/delay.h> +#include <linux/irq.h> #include <linux/irq_work.h> +#include <linux/nmi.h> -#include <asm/sbi.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> +#include <asm/cpu_ops.h> enum ipi_message_type { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, + IPI_CPU_CRASH_STOP, IPI_IRQ_WORK, IPI_TIMER, + IPI_CPU_BACKTRACE, + IPI_KGDB_ROUNDUP, IPI_MAX }; unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = INVALID_HARTID }; +EXPORT_SYMBOL_GPL(__cpuid_to_hartid_map); void __init smp_setup_processor_id(void) { cpuid_to_hartid_map(0) = boot_cpu_hartid; + + pr_info("Booting Linux on hartid %lu\n", boot_cpu_hartid); } -/* A collection of single bit ipi messages. 
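Because init_rt_signal_env() above feeds signal_minsigstksz into AT_MINSIGSTKSZ, userspace should size alternate signal stacks from the auxiliary vector rather than from the legacy SIGSTKSZ constant alone. A minimal sketch, assuming a libc that exposes getauxval() and the AT_MINSIGSTKSZ constant:

#include <elf.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>

int main(void)
{
        unsigned long min = getauxval(AT_MINSIGSTKSZ);   // 0 if the kernel does not provide it
        size_t want = SIGSTKSZ;
        stack_t ss = { 0 };

        // Use the kernel-reported minimum (which accounts for the
        // VLEN-dependent vector state) plus some headroom.
        if (min > want)
                want = min;
        ss.ss_size = want + 4096;
        ss.ss_sp = malloc(ss.ss_size);
        if (!ss.ss_sp)
                return 1;

        if (sigaltstack(&ss, NULL)) {
                perror("sigaltstack");           // EINVAL or ENOMEM if the stack is unusable
                return 1;
        }
        printf("installed a %zu byte alternate signal stack\n", ss.ss_size);
        return 0;
}

With CONFIG_DYNAMIC_SIGFRAME, sigaltstack_size_valid() above makes the kernel reject stacks that cannot hold the current frame, which is why relying on SIGSTKSZ alone may stop working on large-VLEN parts.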
*/ -static struct { - unsigned long stats[IPI_MAX] ____cacheline_aligned; - unsigned long bits ____cacheline_aligned; -} ipi_data[NR_CPUS] __cacheline_aligned; +static DEFINE_PER_CPU_READ_MOSTLY(int, ipi_dummy_dev); +static int ipi_virq_base __ro_after_init; +static int nr_ipi __ro_after_init = IPI_MAX; +static struct irq_desc *ipi_desc[IPI_MAX] __read_mostly; -int riscv_hartid_to_cpuid(int hartid) +int riscv_hartid_to_cpuid(unsigned long hartid) { int i; @@ -55,21 +65,9 @@ int riscv_hartid_to_cpuid(int hartid) if (cpuid_to_hartid_map(i) == hartid) return i; - pr_err("Couldn't find cpu id for hartid [%d]\n", hartid); return -ENOENT; } -bool arch_match_cpu_phys_id(int cpu, u64 phys_id) -{ - return phys_id == cpuid_to_hartid_map(cpu); -} - -/* Unsupported */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static void ipi_stop(void) { set_cpu_online(smp_processor_id(), false); @@ -77,48 +75,40 @@ static void ipi_stop(void) wait_for_interrupt(); } -static const struct riscv_ipi_ops *ipi_ops __ro_after_init; +#ifdef CONFIG_KEXEC_CORE +static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) +static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { - ipi_ops = ops; -} -EXPORT_SYMBOL_GPL(riscv_set_ipi_ops); + crash_save_cpu(regs, cpu); -void riscv_clear_ipi(void) -{ - if (ipi_ops && ipi_ops->ipi_clear) - ipi_ops->ipi_clear(); + atomic_dec(&waiting_for_crash_ipi); + + local_irq_disable(); - csr_clear(CSR_IP, IE_SIE); +#ifdef CONFIG_HOTPLUG_CPU + if (cpu_has_hotplug(cpu)) + cpu_ops->cpu_stop(); +#endif + + for(;;) + wait_for_interrupt(); +} +#else +static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) +{ + unreachable(); } -EXPORT_SYMBOL_GPL(riscv_clear_ipi); +#endif static void send_ipi_mask(const struct cpumask *mask, enum ipi_message_type op) { - int cpu; - - smp_mb__before_atomic(); - for_each_cpu(cpu, mask) - set_bit(op, &ipi_data[cpu].bits); - smp_mb__after_atomic(); - - if (ipi_ops && ipi_ops->ipi_inject) - ipi_ops->ipi_inject(mask); - else - pr_warn("SMP: IPI inject method not available\n"); + __ipi_send_mask(ipi_desc[op], mask); } static void send_ipi_single(int cpu, enum ipi_message_type op) { - smp_mb__before_atomic(); - set_bit(op, &ipi_data[cpu].bits); - smp_mb__after_atomic(); - - if (ipi_ops && ipi_ops->ipi_inject) - ipi_ops->ipi_inject(cpumask_of(cpu)); - else - pr_warn("SMP: IPI inject method not available\n"); + __ipi_send_mask(ipi_desc[op], cpumask_of(cpu)); } #ifdef CONFIG_IRQ_WORK @@ -128,62 +118,107 @@ void arch_irq_work_raise(void) } #endif -void handle_IPI(struct pt_regs *regs) +static irqreturn_t handle_IPI(int irq, void *data) { - unsigned long *pending_ipis = &ipi_data[smp_processor_id()].bits; - unsigned long *stats = ipi_data[smp_processor_id()].stats; + unsigned int cpu = smp_processor_id(); + int ipi = irq - ipi_virq_base; + + switch (ipi) { + case IPI_RESCHEDULE: + scheduler_ipi(); + break; + case IPI_CALL_FUNC: + generic_smp_call_function_interrupt(); + break; + case IPI_CPU_STOP: + ipi_stop(); + break; + case IPI_CPU_CRASH_STOP: + ipi_cpu_crash_stop(cpu, get_irq_regs()); + break; + case IPI_IRQ_WORK: + irq_work_run(); + break; +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + case IPI_TIMER: + tick_receive_broadcast(); + break; +#endif + case IPI_CPU_BACKTRACE: + nmi_cpu_backtrace(get_irq_regs()); + break; + case IPI_KGDB_ROUNDUP: + kgdb_nmicallback(cpu, get_irq_regs()); + break; + default: + pr_warn("CPU%d: unhandled IPI%d\n", cpu, 
ipi); + break; + } + + return IRQ_HANDLED; +} - riscv_clear_ipi(); +void riscv_ipi_enable(void) +{ + int i; + + if (WARN_ON_ONCE(!ipi_virq_base)) + return; + + for (i = 0; i < nr_ipi; i++) + enable_percpu_irq(ipi_virq_base + i, 0); +} - while (true) { - unsigned long ops; +void riscv_ipi_disable(void) +{ + int i; - /* Order bit clearing and data access. */ - mb(); + if (WARN_ON_ONCE(!ipi_virq_base)) + return; - ops = xchg(pending_ipis, 0); - if (ops == 0) - return; + for (i = 0; i < nr_ipi; i++) + disable_percpu_irq(ipi_virq_base + i); +} - if (ops & (1 << IPI_RESCHEDULE)) { - stats[IPI_RESCHEDULE]++; - scheduler_ipi(); - } +bool riscv_ipi_have_virq_range(void) +{ + return (ipi_virq_base) ? true : false; +} - if (ops & (1 << IPI_CALL_FUNC)) { - stats[IPI_CALL_FUNC]++; - generic_smp_call_function_interrupt(); - } +void riscv_ipi_set_virq_range(int virq, int nr) +{ + int i, err; - if (ops & (1 << IPI_CPU_STOP)) { - stats[IPI_CPU_STOP]++; - ipi_stop(); - } + if (WARN_ON(ipi_virq_base)) + return; - if (ops & (1 << IPI_IRQ_WORK)) { - stats[IPI_IRQ_WORK]++; - irq_work_run(); - } + WARN_ON(nr < IPI_MAX); + nr_ipi = min(nr, IPI_MAX); + ipi_virq_base = virq; -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - if (ops & (1 << IPI_TIMER)) { - stats[IPI_TIMER]++; - tick_receive_broadcast(); - } -#endif - BUG_ON((ops >> IPI_MAX) != 0); + /* Request IPIs */ + for (i = 0; i < nr_ipi; i++) { + err = request_percpu_irq(ipi_virq_base + i, handle_IPI, + "IPI", &ipi_dummy_dev); + WARN_ON(err); - /* Order data access and bit testing. */ - mb(); + ipi_desc[i] = irq_to_desc(ipi_virq_base + i); + irq_set_status_flags(ipi_virq_base + i, IRQ_HIDDEN); } + + /* Enabled IPIs for boot CPU immediately */ + riscv_ipi_enable(); } static const char * const ipi_names[] = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", + [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", [IPI_TIMER] = "Timer broadcast interrupts", + [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", + [IPI_KGDB_ROUNDUP] = "KGDB roundup interrupts", }; void show_ipi_stats(struct seq_file *p, int prec) @@ -194,7 +229,7 @@ void show_ipi_stats(struct seq_file *p, int prec) seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10lu ", ipi_data[cpu].stats[i]); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_names[i]); } } @@ -241,8 +276,92 @@ void smp_send_stop(void) cpumask_pr_args(cpu_online_mask)); } -void smp_send_reschedule(int cpu) +#ifdef CONFIG_KEXEC_CORE +/* + * The number of CPUs online, not counting this CPU (which may not be + * fully online and so not counted in num_online_cpus()). + */ +static inline unsigned int num_other_online_cpus(void) +{ + unsigned int this_cpu_online = cpu_online(smp_processor_id()); + + return num_online_cpus() - this_cpu_online; +} + +void crash_smp_send_stop(void) +{ + static int cpus_stopped; + cpumask_t mask; + unsigned long timeout; + + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + cpus_stopped = 1; + + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. 
+ */ + if (num_other_online_cpus() == 0) + return; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); + + pr_crit("SMP: stopping secondary CPUs\n"); + send_ipi_mask(&mask, IPI_CPU_CRASH_STOP); + + /* Wait up to one second for other CPUs to stop */ + timeout = USEC_PER_SEC; + while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) + udelay(1); + + if (atomic_read(&waiting_for_crash_ipi) > 0) + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(&mask)); +} + +bool smp_crash_stop_failed(void) +{ + return (atomic_read(&waiting_for_crash_ipi) > 0); +} +#endif + +void arch_smp_send_reschedule(int cpu) { send_ipi_single(cpu, IPI_RESCHEDULE); } -EXPORT_SYMBOL_GPL(smp_send_reschedule); +EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); + +static void riscv_backtrace_ipi(cpumask_t *mask) +{ + send_ipi_mask(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, riscv_backtrace_ipi); +} + +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + int this_cpu = raw_smp_processor_id(); + int cpu; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + + send_ipi_single(cpu, IPI_KGDB_ROUNDUP); + } +} +#endif diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index f1e4948a4b52..601a321e0f17 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -8,6 +8,7 @@ * Copyright (C) 2017 SiFive */ +#include <linux/acpi.h> #include <linux/arch_topology.h> #include <linux/module.h> #include <linux/init.h> @@ -24,31 +25,31 @@ #include <linux/of.h> #include <linux/sched/task_stack.h> #include <linux/sched/mm.h> + +#include <asm/cacheflush.h> #include <asm/cpu_ops.h> #include <asm/irq.h> #include <asm/mmu_context.h> #include <asm/numa.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/sbi.h> #include <asm/smp.h> +#include <uapi/asm/hwcap.h> +#include <asm/vector.h> #include "head.h" static DECLARE_COMPLETION(cpu_running); -void __init smp_prepare_boot_cpu(void) -{ - init_cpu_topology(); -} - void __init smp_prepare_cpus(unsigned int max_cpus) { int cpuid; - int ret; unsigned int curr_cpuid; + init_cpu_topology(); + curr_cpuid = smp_processor_id(); + store_cpu_topology(curr_cpuid); numa_store_cpu_info(curr_cpuid); numa_add_cpu(curr_cpuid); @@ -59,28 +60,75 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpuid) { if (cpuid == curr_cpuid) continue; - if (cpu_ops[cpuid]->cpu_prepare) { - ret = cpu_ops[cpuid]->cpu_prepare(cpuid); - if (ret) - continue; - } set_cpu_present(cpuid, true); numa_store_cpu_info(cpuid); } } -void __init setup_smp(void) +#ifdef CONFIG_ACPI +static unsigned int cpu_count = 1; + +static int __init acpi_parse_rintc(union acpi_subtable_headers *header, const unsigned long end) +{ + unsigned long hart; + static bool found_boot_cpu; + struct acpi_madt_rintc *processor = (struct acpi_madt_rintc *)header; + + /* + * Each RINTC structure in MADT will have a flag. If ACPI_MADT_ENABLED + * bit in the flag is not enabled, it means OS should not try to enable + * the cpu to which RINTC belongs. 
+ */ + if (!(processor->flags & ACPI_MADT_ENABLED)) + return 0; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + hart = processor->hart_id; + if (hart == INVALID_HARTID) { + pr_warn("Invalid hartid\n"); + return 0; + } + + if (hart == cpuid_to_hartid_map(0)) { + BUG_ON(found_boot_cpu); + found_boot_cpu = true; + return 0; + } + + if (cpu_count >= NR_CPUS) { + pr_warn("NR_CPUS is too small for the number of ACPI tables.\n"); + return 0; + } + + cpuid_to_hartid_map(cpu_count) = hart; + cpu_count++; + + return 0; +} + +static void __init acpi_parse_and_init_cpus(void) +{ + acpi_table_parse_madt(ACPI_MADT_TYPE_RINTC, acpi_parse_rintc, 0); +} +#else +#define acpi_parse_and_init_cpus(...) do { } while (0) +#endif + +static void __init of_parse_and_init_cpus(void) { struct device_node *dn; - int hart; + unsigned long hart; bool found_boot_cpu = false; int cpuid = 1; - - cpu_set_ops(0); + int rc; for_each_of_cpu_node(dn) { - hart = riscv_of_processor_hartid(dn); - if (hart < 0) + rc = riscv_early_of_processor_hartid(dn, &hart); + if (rc < 0) continue; if (hart == cpuid_to_hartid_map(0)) { @@ -90,7 +138,7 @@ void __init setup_smp(void) continue; } if (cpuid >= NR_CPUS) { - pr_warn("Invalid cpuid [%d] for hartid [%d]\n", + pr_warn("Invalid cpuid [%d] for hartid [%lu]\n", cpuid, hart); continue; } @@ -105,19 +153,28 @@ void __init setup_smp(void) if (cpuid > nr_cpu_ids) pr_warn("Total number of cpus [%d] is greater than nr_cpus option value [%d]\n", cpuid, nr_cpu_ids); +} + +void __init setup_smp(void) +{ + int cpuid; + + cpu_set_ops(); - for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) { - if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) { - cpu_set_ops(cpuid); + if (acpi_disabled) + of_parse_and_init_cpus(); + else + acpi_parse_and_init_cpus(); + + for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) + if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) set_cpu_possible(cpuid, true); - } - } } static int start_secondary_cpu(int cpu, struct task_struct *tidle) { - if (cpu_ops[cpu]->cpu_start) - return cpu_ops[cpu]->cpu_start(cpu, tidle); + if (cpu_ops->cpu_start) + return cpu_ops->cpu_start(cpu, tidle); return -EOPNOTSUPP; } @@ -155,21 +212,36 @@ asmlinkage __visible void smp_callin(void) struct mm_struct *mm = &init_mm; unsigned int curr_cpuid = smp_processor_id(); - riscv_clear_ipi(); + if (has_vector()) { + /* + * Return as early as possible so the hart with a mismatching + * vlen won't boot. + */ + if (riscv_v_setup_vsize()) + return; + } /* All kernel threads share the same mm context. */ mmgrab(mm); current->active_mm = mm; + store_cpu_topology(curr_cpuid); notify_cpu_starting(curr_cpuid); + + riscv_ipi_enable(); + numa_add_cpu(curr_cpuid); - update_siblings_masks(curr_cpuid); - set_cpu_online(curr_cpuid, 1); + + pr_debug("CPU%u: Booted secondary hartid %lu\n", curr_cpuid, + cpuid_to_hartid_map(curr_cpuid)); + + set_cpu_online(curr_cpuid, true); /* - * Remote TLB flushes are ignored while the CPU is offline, so emit - * a local TLB flush right now just in case. + * Remote cache and TLB flushes are ignored while the CPU is offline, + * so flush them both right now just in case. 
*/ + local_flush_icache_all(); local_flush_tlb_all(); complete(&cpu_running); /* diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 08d11a53f39e..3fe9e6edef8f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -16,10 +16,24 @@ #ifdef CONFIG_FRAME_POINTER +extern asmlinkage void handle_exception(void); +extern unsigned long ret_from_exception_end; + +static inline int fp_is_valid(unsigned long fp, unsigned long sp) +{ + unsigned long low, high; + + low = sp + sizeof(struct stackframe); + high = ALIGN(sp, THREAD_SIZE); + + return !(fp < low || fp > high || fp & 0x07); +} + void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg) { unsigned long fp, sp, pc; + int graph_idx = 0; int level = 0; if (regs) { @@ -30,6 +44,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = (unsigned long)__builtin_frame_address(0); sp = current_stack_pointer; pc = (unsigned long)walk_stackframe; + level = -1; } else { /* task blocked in __switch_to */ fp = task->thread.s[0]; @@ -38,27 +53,33 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, } for (;;) { - unsigned long low, high; struct stackframe *frame; - if (unlikely(!__kernel_text_address(pc) || (level++ >= 1 && !fn(arg, pc)))) + if (unlikely(!__kernel_text_address(pc) || (level++ >= 0 && !fn(arg, pc)))) break; - /* Validate frame pointer */ - low = sp + sizeof(struct stackframe); - high = ALIGN(sp, THREAD_SIZE); - if (unlikely(fp < low || fp > high || fp & 0x7)) + if (unlikely(!fp_is_valid(fp, sp))) break; + /* Unwind stack frame */ frame = (struct stackframe *)fp - 1; sp = fp; - if (regs && (regs->epc == pc) && (frame->fp & 0x7)) { + if (regs && (regs->epc == pc) && fp_is_valid(frame->ra, sp)) { + /* We hit function where ra is not saved on the stack */ fp = frame->ra; pc = regs->ra; } else { fp = frame->fp; - pc = ftrace_graph_ret_addr(current, NULL, frame->ra, - (unsigned long *)(fp - 8)); + pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra, + &frame->ra); + if (pc >= (unsigned long)handle_exception && + pc < (unsigned long)&ret_from_exception_end) { + if (unlikely(!fn(arg, pc))) + break; + + pc = ((struct pt_regs *)sp)->epc; + fp = ((struct pt_regs *)sp)->s0; + } } } @@ -91,7 +112,7 @@ void notrace walk_stackframe(struct task_struct *task, while (!kstack_end(ksp)) { if (__kernel_text_address(pc) && unlikely(!fn(arg, pc))) break; - pc = (*ksp++) - 0x4; + pc = READ_ONCE_NOCHECK(*ksp++) - 0x4; } } @@ -138,8 +159,51 @@ unsigned long __get_wchan(struct task_struct *task) return pc; } -noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { walk_stackframe(task, regs, consume_entry, cookie); } + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. 
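The user-stack unwinder added just below consumes the standard RISC-V frame record: the 16 bytes below the frame pointer hold the caller's fp and the return address, in that order (struct stackframe). A minimal self-unwinding sketch of the same layout, assuming everything involved is built with -fno-omit-frame-pointer; it is illustrative only, not a robust unwinder:

#include <stdint.h>
#include <stdio.h>

// Mirrors struct stackframe: { previous fp, return address } stored just
// below the current frame pointer.
struct frame_record {
        uintptr_t fp;
        uintptr_t ra;
};

static void __attribute__((noinline)) dump_backtrace(void)
{
        uintptr_t fp = (uintptr_t)__builtin_frame_address(0);
        int depth = 0;

        // Stop on a NULL or misaligned fp, similar in spirit to the
        // kernel's fp_is_valid() checks, and cap the depth as a safety net.
        while (fp && !(fp & 0x7) && depth < 16) {
                struct frame_record *rec = (struct frame_record *)fp - 1;

                if (!rec->ra)
                        break;
                printf("#%d pc=%#lx\n", depth++, (unsigned long)rec->ra);
                fp = rec->fp;
        }
}

static void __attribute__((noinline)) level2(void) { dump_backtrace(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
        level1();
        return 0;
}

Build with -fno-omit-frame-pointer; without frame pointers the records are simply not there and the walk terminates (or misreads the stack) immediately, which is also why perf user callchains on RISC-V need frame-pointer builds.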
+ */ +static unsigned long unwind_user_frame(stack_trace_consume_fn consume_entry, + void *cookie, unsigned long fp, + unsigned long reg_ra) +{ + struct stackframe buftail; + unsigned long ra = 0; + unsigned long __user *user_frame_tail = + (unsigned long __user *)(fp - sizeof(struct stackframe)); + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic(&buftail, user_frame_tail, + sizeof(buftail))) + return 0; + + ra = reg_ra ? : buftail.ra; + + fp = buftail.fp; + if (!ra || !consume_entry(cookie, ra)) + return 0; + + return fp; +} + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + unsigned long fp = 0; + + fp = regs->s0; + if (!consume_entry(cookie, regs->epc)) + return; + + fp = unwind_user_frame(consume_entry, cookie, fp, regs->ra); + while (fp && !(fp & 0x7)) + fp = unwind_user_frame(consume_entry, cookie, fp, 0); +} diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index 9ba24fb8cc93..24b3f57d467f 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -4,13 +4,18 @@ * Copyright (c) 2022 Ventana Micro Systems Inc. */ +#define pr_fmt(fmt) "suspend: " fmt + #include <linux/ftrace.h> +#include <linux/suspend.h> #include <asm/csr.h> +#include <asm/sbi.h> #include <asm/suspend.h> -static void suspend_save_csrs(struct suspend_context *context) +void suspend_save_csrs(struct suspend_context *context) { - context->scratch = csr_read(CSR_SCRATCH); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) + context->envcfg = csr_read(CSR_ENVCFG); context->tvec = csr_read(CSR_TVEC); context->ie = csr_read(CSR_IE); @@ -25,17 +30,33 @@ static void suspend_save_csrs(struct suspend_context *context) */ #ifdef CONFIG_MMU + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SSTC)) { + context->stimecmp = csr_read(CSR_STIMECMP); +#if __riscv_xlen < 64 + context->stimecmph = csr_read(CSR_STIMECMPH); +#endif + } + context->satp = csr_read(CSR_SATP); #endif } -static void suspend_restore_csrs(struct suspend_context *context) +void suspend_restore_csrs(struct suspend_context *context) { - csr_write(CSR_SCRATCH, context->scratch); + csr_write(CSR_SCRATCH, 0); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) + csr_write(CSR_ENVCFG, context->envcfg); csr_write(CSR_TVEC, context->tvec); csr_write(CSR_IE, context->ie); #ifdef CONFIG_MMU + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SSTC)) { + csr_write(CSR_STIMECMP, context->stimecmp); +#if __riscv_xlen < 64 + csr_write(CSR_STIMECMPH, context->stimecmph); +#endif + } + csr_write(CSR_SATP, context->satp); #endif } @@ -85,3 +106,92 @@ int cpu_suspend(unsigned long arg, return rc; } + +#ifdef CONFIG_RISCV_SBI +static int sbi_system_suspend(unsigned long sleep_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_SUSP, SBI_EXT_SUSP_SYSTEM_SUSPEND, + sleep_type, resume_addr, opaque, 0, 0, 0); + if (ret.error) + return sbi_err_map_linux_errno(ret.error); + + return ret.value; +} + +static int sbi_system_suspend_enter(suspend_state_t state) +{ + return cpu_suspend(SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM, sbi_system_suspend); +} + +static const struct platform_suspend_ops sbi_system_suspend_ops = { + .valid = suspend_valid_only_mem, + .enter = sbi_system_suspend_enter, +}; + +static int __init sbi_system_suspend_init(void) +{ + if (sbi_spec_version >= sbi_mk_version(2, 0) && + 
sbi_probe_extension(SBI_EXT_SUSP) > 0) { + pr_info("SBI SUSP extension detected\n"); + if (IS_ENABLED(CONFIG_SUSPEND)) + suspend_set_ops(&sbi_system_suspend_ops); + } + + return 0; +} + +arch_initcall(sbi_system_suspend_init); + +static int sbi_suspend_finisher(unsigned long suspend_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, + suspend_type, resume_addr, opaque, 0, 0, 0); + + return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0; +} + +int riscv_sbi_hart_suspend(u32 state) +{ + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return cpu_suspend(state, sbi_suspend_finisher); + else + return sbi_suspend_finisher(state, 0, 0); +} + +bool riscv_sbi_suspend_state_is_valid(u32 state) +{ + if (state > SBI_HSM_SUSPEND_RET_DEFAULT && + state < SBI_HSM_SUSPEND_RET_PLATFORM) + return false; + + if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && + state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) + return false; + + return true; +} + +bool riscv_sbi_hsm_is_supported(void) +{ + /* + * The SBI HSM suspend function is only available when: + * 1) SBI version is 0.3 or higher + * 2) SBI HSM extension is available + */ + if (sbi_spec_version < sbi_mk_version(0, 3) || + !sbi_probe_extension(SBI_EXT_HSM)) { + pr_info("HSM suspend not available\n"); + return false; + } + + return true; +} +#endif /* CONFIG_RISCV_SBI */ diff --git a/arch/riscv/kernel/suspend_entry.S b/arch/riscv/kernel/suspend_entry.S index aafcca58c19d..2d54f309c140 100644 --- a/arch/riscv/kernel/suspend_entry.S +++ b/arch/riscv/kernel/suspend_entry.S @@ -5,8 +5,10 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm.h> #include <asm/asm-offsets.h> +#include <asm/assembler.h> #include <asm/csr.h> #include <asm/xip_fixup.h> @@ -14,7 +16,7 @@ .altmacro .option norelax -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) /* Save registers (except A0 and T0-T6) */ REG_S ra, (SUSPEND_CONTEXT_REGS + PT_RA)(a0) REG_S sp, (SUSPEND_CONTEXT_REGS + PT_SP)(a0) @@ -55,14 +57,11 @@ ENTRY(__cpu_suspend_enter) /* Return to C code */ ret -END(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) -ENTRY(__cpu_resume_enter) +SYM_TYPED_FUNC_START(__cpu_resume_enter) /* Load the global pointer */ - .option push - .option norelax - la gp, __global_pointer$ - .option pop + load_global_pointer #ifdef CONFIG_MMU /* Save A0 and A1 */ @@ -83,43 +82,14 @@ ENTRY(__cpu_resume_enter) add a0, a1, zero /* Restore CSRs */ - REG_L t0, (SUSPEND_CONTEXT_REGS + PT_EPC)(a0) - csrw CSR_EPC, t0 - REG_L t0, (SUSPEND_CONTEXT_REGS + PT_STATUS)(a0) - csrw CSR_STATUS, t0 - REG_L t0, (SUSPEND_CONTEXT_REGS + PT_BADADDR)(a0) - csrw CSR_TVAL, t0 - REG_L t0, (SUSPEND_CONTEXT_REGS + PT_CAUSE)(a0) - csrw CSR_CAUSE, t0 + suspend_restore_csrs /* Restore registers (except A0 and T0-T6) */ - REG_L ra, (SUSPEND_CONTEXT_REGS + PT_RA)(a0) - REG_L sp, (SUSPEND_CONTEXT_REGS + PT_SP)(a0) - REG_L gp, (SUSPEND_CONTEXT_REGS + PT_GP)(a0) - REG_L tp, (SUSPEND_CONTEXT_REGS + PT_TP)(a0) - REG_L s0, (SUSPEND_CONTEXT_REGS + PT_S0)(a0) - REG_L s1, (SUSPEND_CONTEXT_REGS + PT_S1)(a0) - REG_L a1, (SUSPEND_CONTEXT_REGS + PT_A1)(a0) - REG_L a2, (SUSPEND_CONTEXT_REGS + PT_A2)(a0) - REG_L a3, (SUSPEND_CONTEXT_REGS + PT_A3)(a0) - REG_L a4, (SUSPEND_CONTEXT_REGS + PT_A4)(a0) - REG_L a5, (SUSPEND_CONTEXT_REGS + PT_A5)(a0) - REG_L a6, (SUSPEND_CONTEXT_REGS + PT_A6)(a0) - REG_L a7, (SUSPEND_CONTEXT_REGS + PT_A7)(a0) - REG_L s2, (SUSPEND_CONTEXT_REGS + PT_S2)(a0) - REG_L s3, (SUSPEND_CONTEXT_REGS + 
PT_S3)(a0) - REG_L s4, (SUSPEND_CONTEXT_REGS + PT_S4)(a0) - REG_L s5, (SUSPEND_CONTEXT_REGS + PT_S5)(a0) - REG_L s6, (SUSPEND_CONTEXT_REGS + PT_S6)(a0) - REG_L s7, (SUSPEND_CONTEXT_REGS + PT_S7)(a0) - REG_L s8, (SUSPEND_CONTEXT_REGS + PT_S8)(a0) - REG_L s9, (SUSPEND_CONTEXT_REGS + PT_S9)(a0) - REG_L s10, (SUSPEND_CONTEXT_REGS + PT_S10)(a0) - REG_L s11, (SUSPEND_CONTEXT_REGS + PT_S11)(a0) + suspend_restore_regs /* Return zero value */ add a0, zero, zero /* Return to C code */ ret -END(__cpu_resume_enter) +SYM_FUNC_END(__cpu_resume_enter) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c new file mode 100644 index 000000000000..0b170e18a2be --- /dev/null +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The hwprobe interface, for allowing userspace to probe to see which features + * are supported by the hardware. See Documentation/arch/riscv/hwprobe.rst for + * more details. + */ +#include <linux/syscalls.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/hwprobe.h> +#include <asm/processor.h> +#include <asm/delay.h> +#include <asm/sbi.h> +#include <asm/switch_to.h> +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/vector.h> +#include <asm/vendor_extensions/sifive_hwprobe.h> +#include <asm/vendor_extensions/thead_hwprobe.h> +#include <vdso/vsyscall.h> + + +static void hwprobe_arch_id(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + u64 id = -1ULL; + bool first = true; + int cpu; + + for_each_cpu(cpu, cpus) { + u64 cpu_id; + + switch (pair->key) { + case RISCV_HWPROBE_KEY_MVENDORID: + cpu_id = riscv_cached_mvendorid(cpu); + break; + case RISCV_HWPROBE_KEY_MIMPID: + cpu_id = riscv_cached_mimpid(cpu); + break; + case RISCV_HWPROBE_KEY_MARCHID: + cpu_id = riscv_cached_marchid(cpu); + break; + } + + if (first) { + id = cpu_id; + first = false; + } + + /* + * If there's a mismatch for the given set, return -1 in the + * value. + */ + if (id != cpu_id) { + id = -1ULL; + break; + } + } + + pair->value = id; +} + +static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + int cpu; + u64 missing = 0; + + pair->value = 0; + if (has_fpu()) + pair->value |= RISCV_HWPROBE_IMA_FD; + + if (riscv_isa_extension_available(NULL, c)) + pair->value |= RISCV_HWPROBE_IMA_C; + + if (has_vector() && riscv_isa_extension_available(NULL, v)) + pair->value |= RISCV_HWPROBE_IMA_V; + + /* + * Loop through and record extensions that 1) anyone has, and 2) anyone + * doesn't have. + */ + for_each_cpu(cpu, cpus) { + struct riscv_isainfo *isainfo = &hart_isa[cpu]; + +#define EXT_KEY(ext) \ + do { \ + if (__riscv_isa_extension_available(isainfo->isa, RISCV_ISA_EXT_##ext)) \ + pair->value |= RISCV_HWPROBE_EXT_##ext; \ + else \ + missing |= RISCV_HWPROBE_EXT_##ext; \ + } while (false) + + /* + * Only use EXT_KEY() for extensions which can be exposed to userspace, + * regardless of the kernel's configuration, as no other checks, besides + * presence in the hart_isa bitmap, are made. 
+ */ + EXT_KEY(ZAAMO); + EXT_KEY(ZABHA); + EXT_KEY(ZACAS); + EXT_KEY(ZALRSC); + EXT_KEY(ZAWRS); + EXT_KEY(ZBA); + EXT_KEY(ZBB); + EXT_KEY(ZBC); + EXT_KEY(ZBKB); + EXT_KEY(ZBKC); + EXT_KEY(ZBKX); + EXT_KEY(ZBS); + EXT_KEY(ZCA); + EXT_KEY(ZCB); + EXT_KEY(ZCMOP); + EXT_KEY(ZICBOM); + EXT_KEY(ZICBOZ); + EXT_KEY(ZICNTR); + EXT_KEY(ZICOND); + EXT_KEY(ZIHINTNTL); + EXT_KEY(ZIHINTPAUSE); + EXT_KEY(ZIHPM); + EXT_KEY(ZIMOP); + EXT_KEY(ZKND); + EXT_KEY(ZKNE); + EXT_KEY(ZKNH); + EXT_KEY(ZKSED); + EXT_KEY(ZKSH); + EXT_KEY(ZKT); + EXT_KEY(ZTSO); + + /* + * All the following extensions must depend on the kernel + * support of V. + */ + if (has_vector()) { + EXT_KEY(ZVBB); + EXT_KEY(ZVBC); + EXT_KEY(ZVE32F); + EXT_KEY(ZVE32X); + EXT_KEY(ZVE64D); + EXT_KEY(ZVE64F); + EXT_KEY(ZVE64X); + EXT_KEY(ZVFBFMIN); + EXT_KEY(ZVFBFWMA); + EXT_KEY(ZVFH); + EXT_KEY(ZVFHMIN); + EXT_KEY(ZVKB); + EXT_KEY(ZVKG); + EXT_KEY(ZVKNED); + EXT_KEY(ZVKNHA); + EXT_KEY(ZVKNHB); + EXT_KEY(ZVKSED); + EXT_KEY(ZVKSH); + EXT_KEY(ZVKT); + } + + if (has_fpu()) { + EXT_KEY(ZCD); + EXT_KEY(ZCF); + EXT_KEY(ZFA); + EXT_KEY(ZFBFMIN); + EXT_KEY(ZFH); + EXT_KEY(ZFHMIN); + } + + if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) + EXT_KEY(SUPM); +#undef EXT_KEY + } + + /* Now turn off reporting features if any CPU is missing it. */ + pair->value &= ~missing; +} + +static bool hwprobe_ext0_has(const struct cpumask *cpus, u64 ext) +{ + struct riscv_hwprobe pair; + + hwprobe_isa_ext0(&pair, cpus); + return (pair.value & ext); +} + +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) +static u64 hwprobe_misaligned(const struct cpumask *cpus) +{ + int cpu; + u64 perf = -1ULL; + + for_each_cpu(cpu, cpus) { + int this_perf = per_cpu(misaligned_access_speed, cpu); + + if (perf == -1ULL) + perf = this_perf; + + if (perf != this_perf) { + perf = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; + break; + } + } + + if (perf == -1ULL) + return RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; + + return perf; +} +#else +static u64 hwprobe_misaligned(const struct cpumask *cpus) +{ + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; + + if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available()) + return RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; + + return RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; +} +#endif + +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + int cpu; + u64 perf = -1ULL; + + /* Return if supported or not even if speed wasn't probed */ + for_each_cpu(cpu, cpus) { + int this_perf = per_cpu(vector_misaligned_access, cpu); + + if (perf == -1ULL) + perf = this_perf; + + if (perf != this_perf) { + perf = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + break; + } + } + + if (perf == -1ULL) + return RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + + return perf; +} +#else +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; + + if (IS_ENABLED(CONFIG_RISCV_SLOW_VECTOR_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; + + return RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; +} +#endif + +static void hwprobe_one_pair(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + switch (pair->key) { + case RISCV_HWPROBE_KEY_MVENDORID: + case RISCV_HWPROBE_KEY_MARCHID: + case RISCV_HWPROBE_KEY_MIMPID: + hwprobe_arch_id(pair, cpus); + break; + /* + * The kernel already assumes that the base single-letter ISA + * extensions are supported 
on all harts, and only supports the + * IMA base, so just cheat a bit here and tell that to + * userspace. + */ + case RISCV_HWPROBE_KEY_BASE_BEHAVIOR: + pair->value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA; + break; + + case RISCV_HWPROBE_KEY_IMA_EXT_0: + hwprobe_isa_ext0(pair, cpus); + break; + + case RISCV_HWPROBE_KEY_CPUPERF_0: + case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: + pair->value = hwprobe_misaligned(cpus); + break; + + case RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF: + pair->value = hwprobe_vec_misaligned(cpus); + break; + + case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: + pair->value = 0; + if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) + pair->value = riscv_cboz_block_size; + break; + case RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE: + pair->value = 0; + if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOM)) + pair->value = riscv_cbom_block_size; + break; + case RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS: + pair->value = user_max_virt_addr(); + break; + + case RISCV_HWPROBE_KEY_TIME_CSR_FREQ: + pair->value = riscv_timebase; + break; + + case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: + hwprobe_isa_vendor_ext_sifive_0(pair, cpus); + break; + + case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: + hwprobe_isa_vendor_ext_thead_0(pair, cpus); + break; + + /* + * For forward compatibility, unknown keys don't fail the whole + * call, but get their element key set to -1 and value set to 0 + * indicating they're unrecognized. + */ + default: + pair->key = -1; + pair->value = 0; + break; + } +} + +static int hwprobe_get_values(struct riscv_hwprobe __user *pairs, + size_t pair_count, size_t cpusetsize, + unsigned long __user *cpus_user, + unsigned int flags) +{ + size_t out; + int ret; + cpumask_t cpus; + + /* Check the reserved flags. */ + if (flags != 0) + return -EINVAL; + + /* + * The interface supports taking in a CPU mask, and returns values that + * are consistent across that mask. Allow userspace to specify NULL and + * 0 as a shortcut to all online CPUs. + */ + cpumask_clear(&cpus); + if (!cpusetsize && !cpus_user) { + cpumask_copy(&cpus, cpu_online_mask); + } else { + if (cpusetsize > cpumask_size()) + cpusetsize = cpumask_size(); + + ret = copy_from_user(&cpus, cpus_user, cpusetsize); + if (ret) + return -EFAULT; + + /* + * Userspace must provide at least one online CPU, without that + * there's no way to define what is supported. 
+ */ + cpumask_and(&cpus, &cpus, cpu_online_mask); + if (cpumask_empty(&cpus)) + return -EINVAL; + } + + for (out = 0; out < pair_count; out++, pairs++) { + struct riscv_hwprobe pair; + + if (get_user(pair.key, &pairs->key)) + return -EFAULT; + + pair.value = 0; + hwprobe_one_pair(&pair, &cpus); + ret = put_user(pair.key, &pairs->key); + if (ret == 0) + ret = put_user(pair.value, &pairs->value); + + if (ret) + return -EFAULT; + } + + return 0; +} + +static int hwprobe_get_cpus(struct riscv_hwprobe __user *pairs, + size_t pair_count, size_t cpusetsize, + unsigned long __user *cpus_user, + unsigned int flags) +{ + cpumask_t cpus, one_cpu; + bool clear_all = false; + size_t i; + int ret; + + if (flags != RISCV_HWPROBE_WHICH_CPUS) + return -EINVAL; + + if (!cpusetsize || !cpus_user) + return -EINVAL; + + if (cpusetsize > cpumask_size()) + cpusetsize = cpumask_size(); + + ret = copy_from_user(&cpus, cpus_user, cpusetsize); + if (ret) + return -EFAULT; + + if (cpumask_empty(&cpus)) + cpumask_copy(&cpus, cpu_online_mask); + + cpumask_and(&cpus, &cpus, cpu_online_mask); + + cpumask_clear(&one_cpu); + + for (i = 0; i < pair_count; i++) { + struct riscv_hwprobe pair, tmp; + int cpu; + + ret = copy_from_user(&pair, &pairs[i], sizeof(pair)); + if (ret) + return -EFAULT; + + if (!riscv_hwprobe_key_is_valid(pair.key)) { + clear_all = true; + pair = (struct riscv_hwprobe){ .key = -1, }; + ret = copy_to_user(&pairs[i], &pair, sizeof(pair)); + if (ret) + return -EFAULT; + } + + if (clear_all) + continue; + + tmp = (struct riscv_hwprobe){ .key = pair.key, }; + + for_each_cpu(cpu, &cpus) { + cpumask_set_cpu(cpu, &one_cpu); + + hwprobe_one_pair(&tmp, &one_cpu); + + if (!riscv_hwprobe_pair_cmp(&tmp, &pair)) + cpumask_clear_cpu(cpu, &cpus); + + cpumask_clear_cpu(cpu, &one_cpu); + } + } + + if (clear_all) + cpumask_clear(&cpus); + + ret = copy_to_user(cpus_user, &cpus, cpusetsize); + if (ret) + return -EFAULT; + + return 0; +} + +static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs, + size_t pair_count, size_t cpusetsize, + unsigned long __user *cpus_user, + unsigned int flags) +{ + if (flags & RISCV_HWPROBE_WHICH_CPUS) + return hwprobe_get_cpus(pairs, pair_count, cpusetsize, + cpus_user, flags); + + return hwprobe_get_values(pairs, pair_count, cpusetsize, + cpus_user, flags); +} + +#ifdef CONFIG_MMU + +static int __init init_hwprobe_vdso_data(void) +{ + struct vdso_arch_data *avd = vdso_k_arch_data; + u64 id_bitsmash = 0; + struct riscv_hwprobe pair; + int key; + + /* + * Initialize vDSO data with the answers for the "all CPUs" case, to + * save a syscall in the common case. + */ + for (key = 0; key <= RISCV_HWPROBE_MAX_KEY; key++) { + pair.key = key; + hwprobe_one_pair(&pair, cpu_online_mask); + + WARN_ON_ONCE(pair.key < 0); + + avd->all_cpu_hwprobe_values[key] = pair.value; + /* + * Smash together the vendor, arch, and impl IDs to see if + * they're all 0 or any negative. + */ + if (key <= RISCV_HWPROBE_KEY_MIMPID) + id_bitsmash |= pair.value; + } + + /* + * If the arch, vendor, and implementation ID are all the same across + * all harts, then assume all CPUs are the same, and allow the vDSO to + * answer queries for arbitrary masks. However if all values are 0 (not + * populated) or any value returns -1 (varies across CPUs), then the + * vDSO should defer to the kernel for exotic cpu masks. 
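A small userspace example of the interface implemented above, using the raw syscall wired up just below. The key and flag names come straight from this file and the UAPI header <asm/hwprobe.h>, and __NR_riscv_hwprobe is provided by <asm/unistd.h> on kernels that carry this series:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>
#include <asm/unistd.h>

int main(void)
{
        struct riscv_hwprobe pairs[] = {
                { .key = RISCV_HWPROBE_KEY_MVENDORID },
                { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
        };
        long rc;

        // cpusetsize == 0 and cpus == NULL means "all online CPUs"; the
        // returned values are consistent across that whole set.
        rc = syscall(__NR_riscv_hwprobe, pairs, 2, 0, NULL, 0);
        if (rc) {
                perror("riscv_hwprobe");
                return 1;
        }

        printf("mvendorid: 0x%llx\n", (unsigned long long)pairs[0].value);
        if (pairs[1].value & RISCV_HWPROBE_IMA_V)
                printf("vector is usable from userspace\n");
        return 0;
}

When init_hwprobe_vdso_data() above has marked the CPUs as homogeneous, common queries can be answered from the vDSO without entering the kernel; the raw syscall shown here is the portable fallback.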
+ */ + avd->homogeneous_cpus = id_bitsmash != 0 && id_bitsmash != -1; + return 0; +} + +arch_initcall_sync(init_hwprobe_vdso_data); + +#endif /* CONFIG_MMU */ + +SYSCALL_DEFINE5(riscv_hwprobe, struct riscv_hwprobe __user *, pairs, + size_t, pair_count, size_t, cpusetsize, unsigned long __user *, + cpus, unsigned int, flags) +{ + return do_riscv_hwprobe(pairs, pair_count, cpusetsize, + cpus, flags); +} diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 9c0194f176fc..d77afe05578f 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -6,9 +6,7 @@ */ #include <linux/syscalls.h> -#include <asm/unistd.h> #include <asm/cacheflush.h> -#include <asm-generic/mman-common.h> static long riscv_sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -18,10 +16,6 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) return -EINVAL; - if ((prot & PROT_WRITE) && (prot & PROT_EXEC)) - if (unlikely(!(prot & PROT_READ))) - return -EINVAL; - return ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> (PAGE_SHIFT - page_shift_offset)); } @@ -29,7 +23,7 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, #ifdef CONFIG_64BIT SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, offset) + unsigned long, fd, unsigned long, offset) { return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 0); } @@ -38,7 +32,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, offset) + unsigned long, fd, unsigned long, offset) { /* * Note that the shift for mmap2 is constant (12), @@ -73,3 +67,9 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, return 0; } + +/* Not defined using SYSCALL_DEFINE0 to avoid error injection */ +asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *__unused) +{ + return -ENOSYS; +} diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index 44b1420a2270..6f1a36cb0f3f 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -9,10 +9,16 @@ #include <asm-generic/syscalls.h> #include <asm/syscall.h> +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + +#undef __SYSCALL +#define __SYSCALL(nr, call) asmlinkage long __riscv_##call(const struct pt_regs *); +#include <asm/syscall_table.h> + #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), +#define __SYSCALL(nr, call) [nr] = __riscv_##call, void * const sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls - 1] = sys_ni_syscall, -#include <asm/unistd.h> + [0 ... 
__NR_syscalls - 1] = __riscv_sys_ni_syscall, +#include <asm/syscall_table.h> }; diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug new file mode 100644 index 000000000000..78cea5d2c270 --- /dev/null +++ b/arch/riscv/kernel/tests/Kconfig.debug @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-only +menu "arch/riscv/kernel Testing and Coverage" + +config AS_HAS_ULEB128 + def_bool $(as-instr,.reloc label$(comma) R_RISCV_SET_ULEB128$(comma) 127\n.reloc label$(comma) R_RISCV_SUB_ULEB128$(comma) 127\nlabel:\n.word 0) + +menuconfig RUNTIME_KERNEL_TESTING_MENU + bool "arch/riscv/kernel runtime Testing" + default y + help + Enable riscv kernel runtime testing. + +if RUNTIME_KERNEL_TESTING_MENU + +config RISCV_MODULE_LINKING_KUNIT + bool "KUnit test riscv module linking at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test riscv module linking at boot. This will + enable a module called "test_module_linking". + + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + +endif # RUNTIME_TESTING_MENU + +endmenu # "arch/riscv/kernel runtime Testing" diff --git a/arch/riscv/kernel/tests/Makefile b/arch/riscv/kernel/tests/Makefile new file mode 100644 index 000000000000..7d6c76cffe20 --- /dev/null +++ b/arch/riscv/kernel/tests/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_RISCV_MODULE_LINKING_KUNIT) += module_test/ diff --git a/arch/riscv/kernel/tests/module_test/Makefile b/arch/riscv/kernel/tests/module_test/Makefile new file mode 100644 index 000000000000..d7a6fd8943de --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/Makefile @@ -0,0 +1,15 @@ +obj-m += test_module_linking.o + +test_sub := test_sub6.o test_sub8.o test_sub16.o test_sub32.o test_sub64.o + +test_set := test_set6.o test_set8.o test_set16.o test_set32.o + +test_module_linking-objs += $(test_sub) + +test_module_linking-objs += $(test_set) + +ifeq ($(CONFIG_AS_HAS_ULEB128),y) +test_module_linking-objs += test_uleb128.o +endif + +test_module_linking-objs += test_module_linking_main.o diff --git a/arch/riscv/kernel/tests/module_test/test_module_linking_main.c b/arch/riscv/kernel/tests/module_test/test_module_linking_main.c new file mode 100644 index 000000000000..8df5fa5b834e --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_module_linking_main.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Rivos Inc. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <kunit/test.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Test module linking"); + +extern int test_set32(void); +extern int test_set16(void); +extern int test_set8(void); +extern int test_set6(void); +extern long test_sub64(void); +extern int test_sub32(void); +extern int test_sub16(void); +extern int test_sub8(void); +extern int test_sub6(void); + +#ifdef CONFIG_AS_HAS_ULEB128 +extern int test_uleb_basic(void); +extern int test_uleb_large(void); +#endif + +#define CHECK_EQ(lhs, rhs) KUNIT_ASSERT_EQ(test, lhs, rhs) + +void run_test_set(struct kunit *test); +void run_test_sub(struct kunit *test); +void run_test_uleb(struct kunit *test); + +void run_test_set(struct kunit *test) +{ + int val32 = test_set32(); + int val16 = test_set16(); + int val8 = test_set8(); + int val6 = test_set6(); + + CHECK_EQ(val32, 0); + CHECK_EQ(val16, 0); + CHECK_EQ(val8, 0); + CHECK_EQ(val6, 0); +} + +void run_test_sub(struct kunit *test) +{ + int val64 = test_sub64(); + int val32 = test_sub32(); + int val16 = test_sub16(); + int val8 = test_sub8(); + int val6 = test_sub6(); + + CHECK_EQ(val64, 0); + CHECK_EQ(val32, 0); + CHECK_EQ(val16, 0); + CHECK_EQ(val8, 0); + CHECK_EQ(val6, 0); +} + +#ifdef CONFIG_AS_HAS_ULEB128 +void run_test_uleb(struct kunit *test) +{ + int val_uleb = test_uleb_basic(); + int val_uleb2 = test_uleb_large(); + + CHECK_EQ(val_uleb, 0); + CHECK_EQ(val_uleb2, 0); +} +#endif + +static struct kunit_case __refdata riscv_module_linking_test_cases[] = { + KUNIT_CASE(run_test_set), + KUNIT_CASE(run_test_sub), +#ifdef CONFIG_AS_HAS_ULEB128 + KUNIT_CASE(run_test_uleb), +#endif + {} +}; + +static struct kunit_suite riscv_module_linking_test_suite = { + .name = "riscv_checksum", + .test_cases = riscv_module_linking_test_cases, +}; + +kunit_test_suites(&riscv_module_linking_test_suite); diff --git a/arch/riscv/kernel/tests/module_test/test_set16.S b/arch/riscv/kernel/tests/module_test/test_set16.S new file mode 100644 index 000000000000..2be0e441a12e --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_set16.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_set16 +test_set16: + lw a0, set16 + la t0, set16 +#ifdef CONFIG_32BIT + slli t0, t0, 16 + srli t0, t0, 16 +#else + slli t0, t0, 48 + srli t0, t0, 48 +#endif + sub a0, a0, t0 + ret +.data +set16: + .reloc set16, R_RISCV_SET16, set16 + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_set32.S b/arch/riscv/kernel/tests/module_test/test_set32.S new file mode 100644 index 000000000000..de0444537e67 --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_set32.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_set32 +test_set32: + lw a0, set32 + la t0, set32 +#ifndef CONFIG_32BIT + slli t0, t0, 32 + srli t0, t0, 32 +#endif + sub a0, a0, t0 + ret +.data +set32: + .reloc set32, R_RISCV_SET32, set32 + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_set6.S b/arch/riscv/kernel/tests/module_test/test_set6.S new file mode 100644 index 000000000000..c39ce4c219eb --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_set6.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. 
+ */ + +.text +.global test_set6 +test_set6: + lw a0, set6 + la t0, set6 +#ifdef CONFIG_32BIT + slli t0, t0, 26 + srli t0, t0, 26 +#else + slli t0, t0, 58 + srli t0, t0, 58 +#endif + sub a0, a0, t0 + ret +.data +set6: + .reloc set6, R_RISCV_SET6, set6 + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_set8.S b/arch/riscv/kernel/tests/module_test/test_set8.S new file mode 100644 index 000000000000..a656173f6f99 --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_set8.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_set8 +test_set8: + lw a0, set8 + la t0, set8 +#ifdef CONFIG_32BIT + slli t0, t0, 24 + srli t0, t0, 24 +#else + slli t0, t0, 56 + srli t0, t0, 56 +#endif + sub a0, a0, t0 + ret +.data +set8: + .reloc set8, R_RISCV_SET8, set8 + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_sub16.S b/arch/riscv/kernel/tests/module_test/test_sub16.S new file mode 100644 index 000000000000..80f731d599ba --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_sub16.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_sub16 +test_sub16: + lh a0, sub16 + addi a0, a0, -32 + ret +first: + .space 32 +second: + +.data +sub16: + .reloc sub16, R_RISCV_ADD16, second + .reloc sub16, R_RISCV_SUB16, first + .half 0 diff --git a/arch/riscv/kernel/tests/module_test/test_sub32.S b/arch/riscv/kernel/tests/module_test/test_sub32.S new file mode 100644 index 000000000000..a341686e12df --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_sub32.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_sub32 +test_sub32: + lw a0, sub32 + addi a0, a0, -32 + ret +first: + .space 32 +second: + +.data +sub32: + .reloc sub32, R_RISCV_ADD32, second + .reloc sub32, R_RISCV_SUB32, first + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_sub6.S b/arch/riscv/kernel/tests/module_test/test_sub6.S new file mode 100644 index 000000000000..e8b61c1ec527 --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_sub6.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_sub6 +test_sub6: + lb a0, sub6 + addi a0, a0, -32 + ret +first: + .space 32 +second: + +.data +sub6: + .reloc sub6, R_RISCV_SET6, second + .reloc sub6, R_RISCV_SUB6, first + .byte 0 diff --git a/arch/riscv/kernel/tests/module_test/test_sub64.S b/arch/riscv/kernel/tests/module_test/test_sub64.S new file mode 100644 index 000000000000..a59e8afa88fd --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_sub64.S @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_sub64 +test_sub64: +#ifdef CONFIG_32BIT + lw a0, sub64 +#else + ld a0, sub64 +#endif + addi a0, a0, -32 + ret +first: + .space 32 +second: + +.data +sub64: + .reloc sub64, R_RISCV_ADD64, second + .reloc sub64, R_RISCV_SUB64, first + .word 0 + .word 0 diff --git a/arch/riscv/kernel/tests/module_test/test_sub8.S b/arch/riscv/kernel/tests/module_test/test_sub8.S new file mode 100644 index 000000000000..ac5d0ec98de3 --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_sub8.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. 
+ */ + +.text +.global test_sub8 +test_sub8: + lb a0, sub8 + addi a0, a0, -32 + ret +first: + .space 32 +second: + +.data +sub8: + .reloc sub8, R_RISCV_ADD8, second + .reloc sub8, R_RISCV_SUB8, first + .byte 0 diff --git a/arch/riscv/kernel/tests/module_test/test_uleb128.S b/arch/riscv/kernel/tests/module_test/test_uleb128.S new file mode 100644 index 000000000000..8515ed7cd8c1 --- /dev/null +++ b/arch/riscv/kernel/tests/module_test/test_uleb128.S @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +.text +.global test_uleb_basic +test_uleb_basic: + lw a0, second + addi a0, a0, -127 + ret + +.global test_uleb_large +test_uleb_large: + lw a0, fourth + addi a0, a0, -0x07e8 + ret + +.data +first: + .space 127 +second: + .reloc second, R_RISCV_SET_ULEB128, second + .reloc second, R_RISCV_SUB_ULEB128, first + .word 0 +third: + .space 1000 +fourth: + .reloc fourth, R_RISCV_SET_ULEB128, fourth + .reloc fourth, R_RISCV_SUB_ULEB128, third + .word 0 diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 8217b0f67c6c..ba3477197789 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -4,12 +4,15 @@ * Copyright (C) 2017 SiFive */ +#include <linux/acpi.h> #include <linux/of_clk.h> +#include <linux/clockchips.h> #include <linux/clocksource.h> #include <linux/delay.h> #include <asm/sbi.h> #include <asm/processor.h> #include <asm/timex.h> +#include <asm/paravirt.h> unsigned long riscv_timebase __ro_after_init; EXPORT_SYMBOL_GPL(riscv_timebase); @@ -17,25 +20,32 @@ EXPORT_SYMBOL_GPL(riscv_timebase); void __init time_init(void) { struct device_node *cpu; + struct acpi_table_rhct *rhct; + acpi_status status; u32 prop; - cpu = of_find_node_by_path("/cpus"); - if (!cpu || of_property_read_u32(cpu, "timebase-frequency", &prop)) - panic(KERN_WARNING "RISC-V system with no 'timebase-frequency' in DTS\n"); - of_node_put(cpu); - riscv_timebase = prop; + if (acpi_disabled) { + cpu = of_find_node_by_path("/cpus"); + if (!cpu || of_property_read_u32(cpu, "timebase-frequency", &prop)) + panic("RISC-V system with no 'timebase-frequency' in DTS\n"); + + of_node_put(cpu); + riscv_timebase = prop; + of_clk_init(NULL); + } else { + status = acpi_get_table(ACPI_SIG_RHCT, 0, (struct acpi_table_header **)&rhct); + if (ACPI_FAILURE(status)) + panic("RISC-V ACPI system with no RHCT table\n"); + + riscv_timebase = rhct->time_base_freq; + acpi_put_table((struct acpi_table_header *)rhct); + } lpj_fine = riscv_timebase / HZ; - of_clk_init(NULL); timer_probe(); -} -void clocksource_arch_init(struct clocksource *cs) -{ -#ifdef CONFIG_GENERIC_GETTIMEOFDAY - cs->vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER; -#else - cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; -#endif + tick_setup_hrtimer_broadcast(); + + pv_time_init(); } diff --git a/arch/riscv/kernel/trace_irq.c b/arch/riscv/kernel/trace_irq.c deleted file mode 100644 index 095ac976d7da..000000000000 --- a/arch/riscv/kernel/trace_irq.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2022 Changbin Du <changbin.du@gmail.com> - */ - -#include <linux/irqflags.h> -#include <linux/kprobes.h> -#include "trace_irq.h" - -/* - * trace_hardirqs_on/off require the caller to setup frame pointer properly. - * Otherwise, CALLER_ADDR1 might trigger an pagging exception in kernel. - * Here we add one extra level so they can be safely called by low - * level entry code which $fp is used for other purpose. 
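
For the ULEB128 variants above, the relocations patch a ULEB128-encoded field with the same kind of label difference. ULEB128 packs 7 bits per byte, low bits first, with the top bit marking continuation; encoding the 1000-byte distance gives the bytes e8 07, which read back with lw as a little-endian word are 0x000007e8, matching the constant test_uleb_large() subtracts. A small stand-alone encoder, illustrative only and not the kernel's implementation:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Encode "value" as ULEB128: 7 bits per byte, low bits first, top bit
 * set while more bytes follow. Returns the number of bytes written. */
static size_t uleb128_encode(uint64_t value, uint8_t *out)
{
	size_t n = 0;

	do {
		uint8_t byte = value & 0x7f;

		value >>= 7;
		if (value)
			byte |= 0x80;
		out[n++] = byte;
	} while (value);

	return n;
}

int main(void)
{
	uint8_t buf[10] = { 0 };
	size_t i, n;

	/* 1000 is the ".space 1000" distance used by test_uleb_large(). */
	n = uleb128_encode(1000, buf);
	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);	/* prints "e8 07" */
	printf("\n");

	/* As a little-endian 32-bit word that is 0x000007e8, the value
	 * the test expects to see after relocation. */
	return 0;
}
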
- */ - -void __trace_hardirqs_on(void) -{ - trace_hardirqs_on(); -} -NOKPROBE_SYMBOL(__trace_hardirqs_on); - -void __trace_hardirqs_off(void) -{ - trace_hardirqs_off(); -} -NOKPROBE_SYMBOL(__trace_hardirqs_off); diff --git a/arch/riscv/kernel/trace_irq.h b/arch/riscv/kernel/trace_irq.h deleted file mode 100644 index 99fe67377e5e..000000000000 --- a/arch/riscv/kernel/trace_irq.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2022 Changbin Du <changbin.du@gmail.com> - */ -#ifndef __TRACE_IRQ_H -#define __TRACE_IRQ_H - -void __trace_hardirqs_on(void); -void __trace_hardirqs_off(void); - -#endif /* __TRACE_IRQ_H */ diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index b40426509244..9c83848797a7 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -6,6 +6,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/randomize_kstack.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/signal.h> @@ -13,40 +14,93 @@ #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/kprobes.h> +#include <linux/uprobes.h> +#include <asm/uprobes.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/irq.h> +#include <linux/kexec.h> +#include <linux/entry-common.h> #include <asm/asm-prototypes.h> #include <asm/bug.h> +#include <asm/cfi.h> +#include <asm/csr.h> #include <asm/processor.h> #include <asm/ptrace.h> -#include <asm/csr.h> +#include <asm/syscall.h> +#include <asm/thread_info.h> +#include <asm/vector.h> +#include <asm/irq_stack.h> int show_unhandled_signals = 1; -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); + +static int copy_code(struct pt_regs *regs, u16 *val, const u16 *insns) +{ + const void __user *uaddr = (__force const void __user *)insns; + + if (!user_mode(regs)) + return get_kernel_nofault(*val, insns); + + /* The user space code from other tasks cannot be accessed. */ + if (regs != task_pt_regs(current)) + return -EPERM; + + return copy_from_user_nofault(val, uaddr, sizeof(*val)); +} + +static void dump_instr(const char *loglvl, struct pt_regs *regs) +{ + char str[sizeof("0000 ") * 12 + 2 + 1], *p = str; + const u16 *insns = (u16 *)instruction_pointer(regs); + long bad; + u16 val; + int i; + + for (i = -10; i < 2; i++) { + bad = copy_code(regs, &val, &insns[i]); + if (!bad) { + p += sprintf(p, i == 0 ? "(%04hx) " : "%04hx ", val); + } else { + printk("%sCode: Unable to access instruction at 0x%px.\n", + loglvl, &insns[i]); + return; + } + } + printk("%sCode: %s\n", loglvl, str); +} void die(struct pt_regs *regs, const char *str) { static int die_counter; int ret; + long cause; + unsigned long flags; oops_enter(); - spin_lock_irq(&die_lock); + raw_spin_lock_irqsave(&die_lock, flags); console_verbose(); bust_spinlocks(1); pr_emerg("%s [#%d]\n", str, ++die_counter); print_modules(); - show_regs(regs); + if (regs) { + show_regs(regs); + dump_instr(KERN_EMERG, regs); + } + + cause = regs ? 
regs->cause : -1; + ret = notify_die(DIE_OOPS, str, regs, 0, cause, SIGSEGV); - ret = notify_die(DIE_OOPS, str, regs, 0, regs->cause, SIGSEGV); + if (kexec_should_crash(current)) + crash_kexec(regs); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irq(&die_lock); + raw_spin_unlock_irqrestore(&die_lock, flags); oops_exit(); if (in_interrupt()) @@ -68,6 +122,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); __show_regs(regs); + dump_instr(KERN_INFO, regs); } force_sig_fault(signo, code, (void __user *)addr); @@ -87,14 +142,22 @@ static void do_trap_error(struct pt_regs *regs, int signo, int code, } #if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_RISCV_ALTERNATIVE) -#define __trap_section __section(".xip.traps") +#define __trap_section __noinstr_section(".xip.traps") #else -#define __trap_section +#define __trap_section noinstr #endif -#define DO_ERROR_INFO(name, signo, code, str) \ -asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ -{ \ - do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ +#define DO_ERROR_INFO(name, signo, code, str) \ +asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ +{ \ + if (user_mode(regs)) { \ + irqentry_enter_from_user_mode(regs); \ + do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ + irqentry_exit_to_user_mode(regs); \ + } else { \ + irqentry_state_t state = irqentry_nmi_enter(regs); \ + do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ + irqentry_nmi_exit(regs, state); \ + } \ } DO_ERROR_INFO(do_trap_unknown, @@ -103,39 +166,91 @@ DO_ERROR_INFO(do_trap_insn_misaligned, SIGBUS, BUS_ADRALN, "instruction address misaligned"); DO_ERROR_INFO(do_trap_insn_fault, SIGSEGV, SEGV_ACCERR, "instruction access fault"); -DO_ERROR_INFO(do_trap_insn_illegal, - SIGILL, ILL_ILLOPC, "illegal instruction"); + +asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *regs) +{ + bool handled; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + local_irq_enable(); + + handled = riscv_v_first_use_handler(regs); + + local_irq_disable(); + + if (!handled) + do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->epc, + "Oops - illegal instruction"); + + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->epc, + "Oops - illegal instruction"); + + irqentry_nmi_exit(regs, state); + } +} + DO_ERROR_INFO(do_trap_load_fault, SIGSEGV, SEGV_ACCERR, "load access fault"); -#ifndef CONFIG_RISCV_M_MODE -DO_ERROR_INFO(do_trap_load_misaligned, - SIGBUS, BUS_ADRALN, "Oops - load address misaligned"); -DO_ERROR_INFO(do_trap_store_misaligned, - SIGBUS, BUS_ADRALN, "Oops - store (or AMO) address misaligned"); -#else -int handle_misaligned_load(struct pt_regs *regs); -int handle_misaligned_store(struct pt_regs *regs); -asmlinkage void __trap_section do_trap_load_misaligned(struct pt_regs *regs) +enum misaligned_access_type { + MISALIGNED_STORE, + MISALIGNED_LOAD, +}; +static const struct { + const char *type_str; + int (*handler)(struct pt_regs *regs); +} misaligned_handler[] = { + [MISALIGNED_STORE] = { + .type_str = "Oops - store (or AMO) address misaligned", + .handler = handle_misaligned_store, + }, + [MISALIGNED_LOAD] = { + .type_str = "Oops - load address misaligned", + .handler = handle_misaligned_load, + }, +}; + +static void do_trap_misaligned(struct pt_regs *regs, 
enum misaligned_access_type type) { - if (!handle_misaligned_load(regs)) - return; - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); + irqentry_state_t state; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + local_irq_enable(); + } else { + state = irqentry_nmi_enter(regs); + } + + if (misaligned_handler[type].handler(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + misaligned_handler[type].type_str); + + if (user_mode(regs)) { + local_irq_disable(); + irqentry_exit_to_user_mode(regs); + } else { + irqentry_nmi_exit(regs, state); + } } -asmlinkage void __trap_section do_trap_store_misaligned(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) { - if (!handle_misaligned_store(regs)) - return; - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); + do_trap_misaligned(regs, MISALIGNED_LOAD); } -#endif + +asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) +{ + do_trap_misaligned(regs, MISALIGNED_STORE); +} + DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); -DO_ERROR_INFO(do_trap_ecall_u, - SIGILL, ILL_ILLTRP, "environment call from U-mode"); DO_ERROR_INFO(do_trap_ecall_s, SIGILL, ILL_ILLTRP, "environment call from S-mode"); DO_ERROR_INFO(do_trap_ecall_m, @@ -151,22 +266,28 @@ static inline unsigned long get_break_insn_length(unsigned long pc) return GET_INSN_LENGTH(insn); } -asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) +static bool probe_single_step_handler(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs)) - return; + bool user = user_mode(regs); - if (kprobe_breakpoint_handler(regs)) - return; -#endif -#ifdef CONFIG_UPROBES - if (uprobe_single_step_handler(regs)) + return user ? uprobe_single_step_handler(regs) : kprobe_single_step_handler(regs); +} + +static bool probe_breakpoint_handler(struct pt_regs *regs) +{ + bool user = user_mode(regs); + + return user ? 
uprobe_breakpoint_handler(regs) : kprobe_breakpoint_handler(regs); +} + +void handle_break(struct pt_regs *regs) +{ + if (probe_single_step_handler(regs)) return; - if (uprobe_breakpoint_handler(regs)) + if (probe_breakpoint_handler(regs)) return; -#endif + current->thread.bad_cause = regs->cause; if (user_mode(regs)) @@ -176,12 +297,108 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) == NOTIFY_STOP) return; #endif - else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN) + else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) regs->epc += get_break_insn_length(regs->epc); else die(regs, "Kernel BUG"); } -NOKPROBE_SYMBOL(do_trap_break); + +asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) +{ + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + handle_break(regs); + + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + handle_break(regs); + + irqentry_nmi_exit(regs, state); + } +} + +asmlinkage __visible __trap_section __no_stack_protector +void do_trap_ecall_u(struct pt_regs *regs) +{ + if (user_mode(regs)) { + long syscall = regs->a7; + + regs->epc += 4; + regs->orig_a0 = regs->a0; + regs->a0 = -ENOSYS; + + riscv_v_vstate_discard(regs); + + syscall = syscall_enter_from_user_mode(regs, syscall); + + add_random_kstack_offset(); + + if (syscall >= 0 && syscall < NR_syscalls) + syscall_handler(regs, syscall); + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * so the maximum stack offset is 1k bytes (10 bits). + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: 16-byte (i.e. 4-bit) aligned + * for RV32I or RV64I. + * + * The resulting 6 bits of entropy is seen in SP[9:4]. + */ + choose_random_kstack_offset(get_random_u16()); + + syscall_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + do_trap_error(regs, SIGILL, ILL_ILLTRP, regs->epc, + "Oops - environment call from U-mode"); + + irqentry_nmi_exit(regs, state); + } + +} + +#ifdef CONFIG_MMU +asmlinkage __visible noinstr void do_page_fault(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + + handle_page_fault(regs); + + local_irq_disable(); + + irqentry_exit(regs, state); +} +#endif + +static void noinstr handle_riscv_irq(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + + irq_enter_rcu(); + old_regs = set_irq_regs(regs); + handle_arch_irq(regs); + set_irq_regs(old_regs); + irq_exit_rcu(); +} + +asmlinkage void noinstr do_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + + if (IS_ENABLED(CONFIG_IRQ_STACKS) && on_thread_stack()) + call_on_irq_stack(regs, handle_riscv_irq); + else + handle_riscv_irq(regs); + + irqentry_exit(regs, state); +} #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long pc) @@ -200,18 +417,8 @@ int is_valid_bugaddr(unsigned long pc) #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_VMAP_STACK -static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)__aligned(16); -/* - * shadow stack, handled_ kernel_ stack_ overflow(in kernel/entry.S) is used - * to get per-cpu overflow stack(get_overflow_stack). 
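
The entropy comment in do_trap_ecall_u() above is easier to see with concrete numbers: the chosen offset is capped at 10 bits and the stack pointer is then 16-byte aligned, so only bits [9:4] of the random value survive. A quick user-space check of that masking; the 0x3ff cap mirrors what KSTACK_OFFSET_MAX() is expected to do and should be treated as an assumption rather than something taken from this diff:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t rnd = 0xbeef;			/* stand-in for get_random_u16() */
	uint32_t capped = rnd & 0x3ff;		/* assumed 10-bit cap, <= 1023 bytes */
	uint32_t applied = capped & ~0xfU;	/* 16-byte alignment keeps bits [9:4] */

	printf("random 0x%04x -> capped 0x%03x -> applied 0x%03x\n",
	       rnd, capped, applied);
	printf("distinct stack offsets: %u (6 bits, SP[9:4])\n", 1U << 6);
	return 0;
}
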
- */ -long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)]; -asmlinkage unsigned long get_overflow_stack(void) -{ - return (unsigned long)this_cpu_ptr(overflow_stack) + - OVERFLOW_STACK_SIZE; -} asmlinkage void handle_bad_stack(struct pt_regs *regs) { diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 46c4dafe3ba0..dd8e4af6583f 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -6,11 +6,18 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/perf_event.h> #include <linux/irq.h> +#include <linux/stringify.h> #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/csr.h> +#include <asm/entry-common.h> +#include <asm/hwprobe.h> +#include <asm/cpufeature.h> +#include <asm/sbi.h> +#include <asm/vector.h> #define INSN_MATCH_LB 0x3 #define INSN_MASK_LB 0x707f @@ -82,6 +89,13 @@ #define INSN_MATCH_C_FSWSP 0xe002 #define INSN_MASK_C_FSWSP 0xe003 +#define INSN_MATCH_C_LHU 0x8400 +#define INSN_MASK_C_LHU 0xfc43 +#define INSN_MATCH_C_LH 0x8440 +#define INSN_MASK_C_LH 0xfc43 +#define INSN_MATCH_C_SH 0x8c00 +#define INSN_MASK_C_SH 0xfc43 + #define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) #if defined(CONFIG_64BIT) @@ -131,8 +145,6 @@ #define REG_PTR(insn, pos, regs) \ (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)) -#define GET_RM(insn) (((insn) >> 12) & 7) - #define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) #define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) #define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) @@ -150,100 +162,223 @@ #define PRECISION_S 0 #define PRECISION_D 1 -#define STR(x) XSTR(x) -#define XSTR(x) #x +#ifdef CONFIG_FPU -#define DECLARE_UNPRIVILEGED_LOAD_FUNCTION(type, insn) \ -static inline type load_##type(const type *addr) \ -{ \ - type val; \ - asm (#insn " %0, %1" \ - : "=&r" (val) : "m" (*addr)); \ - return val; \ -} +#define FP_GET_RD(insn) (insn >> 7 & 0x1F) + +extern void put_f32_reg(unsigned long fp_reg, unsigned long value); + +static int set_f32_rd(unsigned long insn, struct pt_regs *regs, + unsigned long val) +{ + unsigned long fp_reg = FP_GET_RD(insn); -#define DECLARE_UNPRIVILEGED_STORE_FUNCTION(type, insn) \ -static inline void store_##type(type *addr, type val) \ -{ \ - asm volatile (#insn " %0, %1\n" \ - : : "r" (val), "m" (*addr)); \ + put_f32_reg(fp_reg, val); + regs->status |= SR_FS_DIRTY; + + return 0; } -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u8, lbu) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u16, lhu) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s8, lb) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s16, lh) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s32, lw) -DECLARE_UNPRIVILEGED_STORE_FUNCTION(u8, sb) -DECLARE_UNPRIVILEGED_STORE_FUNCTION(u16, sh) -DECLARE_UNPRIVILEGED_STORE_FUNCTION(u32, sw) -#if defined(CONFIG_64BIT) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lwu) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u64, ld) -DECLARE_UNPRIVILEGED_STORE_FUNCTION(u64, sd) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, ld) +extern void put_f64_reg(unsigned long fp_reg, unsigned long value); + +static int set_f64_rd(unsigned long insn, struct pt_regs *regs, u64 val) +{ + unsigned long fp_reg = FP_GET_RD(insn); + unsigned long value; + +#if __riscv_xlen == 32 + value = (unsigned long) &val; #else -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lw) -DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, lw) + value = val; +#endif + put_f64_reg(fp_reg, value); + regs->status |= SR_FS_DIRTY; + + return 0; +} + +#if __riscv_xlen == 32 +extern void get_f64_reg(unsigned 
long fp_reg, u64 *value); -static inline u64 load_u64(const u64 *addr) +static u64 get_f64_rs(unsigned long insn, u8 fp_reg_offset, + struct pt_regs *regs) { - return load_u32((u32 *)addr) - + ((u64)load_u32((u32 *)addr + 1) << 32); + unsigned long fp_reg = (insn >> fp_reg_offset) & 0x1F; + u64 val; + + get_f64_reg(fp_reg, &val); + regs->status |= SR_FS_DIRTY; + + return val; } +#else -static inline void store_u64(u64 *addr, u64 val) +extern unsigned long get_f64_reg(unsigned long fp_reg); + +static unsigned long get_f64_rs(unsigned long insn, u8 fp_reg_offset, + struct pt_regs *regs) { - store_u32((u32 *)addr, val); - store_u32((u32 *)addr + 1, val >> 32); + unsigned long fp_reg = (insn >> fp_reg_offset) & 0x1F; + unsigned long val; + + val = get_f64_reg(fp_reg); + regs->status |= SR_FS_DIRTY; + + return val; } + #endif -static inline ulong get_insn(ulong mepc) +extern unsigned long get_f32_reg(unsigned long fp_reg); + +static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset, + struct pt_regs *regs) { - register ulong __mepc asm ("a2") = mepc; - ulong val, rvc_mask = 3, tmp; + unsigned long fp_reg = (insn >> fp_reg_offset) & 0x1F; + unsigned long val; - asm ("and %[tmp], %[addr], 2\n" - "bnez %[tmp], 1f\n" -#if defined(CONFIG_64BIT) - STR(LWU) " %[insn], (%[addr])\n" -#else - STR(LW) " %[insn], (%[addr])\n" -#endif - "and %[tmp], %[insn], %[rvc_mask]\n" - "beq %[tmp], %[rvc_mask], 2f\n" - "sll %[insn], %[insn], %[xlen_minus_16]\n" - "srl %[insn], %[insn], %[xlen_minus_16]\n" - "j 2f\n" - "1:\n" - "lhu %[insn], (%[addr])\n" - "and %[tmp], %[insn], %[rvc_mask]\n" - "bne %[tmp], %[rvc_mask], 2f\n" - "lhu %[tmp], 2(%[addr])\n" - "sll %[tmp], %[tmp], 16\n" - "add %[insn], %[insn], %[tmp]\n" - "2:" - : [insn] "=&r" (val), [tmp] "=&r" (tmp) - : [addr] "r" (__mepc), [rvc_mask] "r" (rvc_mask), - [xlen_minus_16] "i" (XLEN_MINUS_16)); + val = get_f32_reg(fp_reg); + regs->status |= SR_FS_DIRTY; return val; } +#else /* CONFIG_FPU */ +static void set_f32_rd(unsigned long insn, struct pt_regs *regs, + unsigned long val) {} + +static void set_f64_rd(unsigned long insn, struct pt_regs *regs, u64 val) {} + +static unsigned long get_f64_rs(unsigned long insn, u8 fp_reg_offset, + struct pt_regs *regs) +{ + return 0; +} + +static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset, + struct pt_regs *regs) +{ + return 0; +} + +#endif + +#define GET_F64_RS2(insn, regs) (get_f64_rs(insn, 20, regs)) +#define GET_F64_RS2C(insn, regs) (get_f64_rs(insn, 2, regs)) +#define GET_F64_RS2S(insn, regs) (get_f64_rs(RVC_RS2S(insn), 0, regs)) + +#define GET_F32_RS2(insn, regs) (get_f32_rs(insn, 20, regs)) +#define GET_F32_RS2C(insn, regs) (get_f32_rs(insn, 2, regs)) +#define GET_F32_RS2S(insn, regs) (get_f32_rs(RVC_RS2S(insn), 0, regs)) + +#define __read_insn(regs, insn, insn_addr, type) \ +({ \ + int __ret; \ + \ + if (user_mode(regs)) { \ + __ret = get_user(insn, (type __user *) insn_addr); \ + } else { \ + insn = *(type *)insn_addr; \ + __ret = 0; \ + } \ + \ + __ret; \ +}) + +static inline int get_insn(struct pt_regs *regs, ulong epc, ulong *r_insn) +{ + ulong insn = 0; + + if (epc & 0x2) { + ulong tmp = 0; + + if (__read_insn(regs, insn, epc, u16)) + return -EFAULT; + /* __get_user() uses regular "lw" which sign extend the loaded + * value make sure to clear higher order bits in case we "or" it + * below with the upper 16 bits half. 
+ */ + insn &= GENMASK(15, 0); + if ((insn & __INSN_LENGTH_MASK) != __INSN_LENGTH_32) { + *r_insn = insn; + return 0; + } + epc += sizeof(u16); + if (__read_insn(regs, tmp, epc, u16)) + return -EFAULT; + *r_insn = (tmp << 16) | insn; + + return 0; + } else { + if (__read_insn(regs, insn, epc, u32)) + return -EFAULT; + if ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) { + *r_insn = insn; + return 0; + } + insn &= GENMASK(15, 0); + *r_insn = insn; + + return 0; + } +} + union reg_data { u8 data_bytes[8]; ulong data_ulong; u64 data_u64; }; -int handle_misaligned_load(struct pt_regs *regs) +/* sysctl hooks */ +int unaligned_enabled __read_mostly = 1; /* Enabled by default */ + +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +static int handle_vector_misaligned_load(struct pt_regs *regs) +{ + unsigned long epc = regs->epc; + unsigned long insn; + + if (get_insn(regs, epc, &insn)) + return -1; + + /* Only return 0 when in check_vector_unaligned_access_emulated */ + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + regs->epc = epc + INSN_LEN(insn); + return 0; + } + + /* If vector instruction we don't emulate it yet */ + regs->epc = epc; + return -1; +} +#else +static int handle_vector_misaligned_load(struct pt_regs *regs) +{ + return -1; +} +#endif + +static int handle_scalar_misaligned_load(struct pt_regs *regs) { union reg_data val; unsigned long epc = regs->epc; - unsigned long insn = get_insn(epc); - unsigned long addr = csr_read(mtval); - int i, fp = 0, shift = 0, len = 0; + unsigned long insn; + unsigned long addr = regs->badaddr; + int fp = 0, shift = 0, len = 0; + + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); + + *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; + + if (!unaligned_enabled) + return -1; + + if (user_mode(regs) && (current->thread.align_ctl & PR_UNALIGN_SIGBUS)) + return -1; + + if (get_insn(regs, epc, &insn)) + return -1; regs->epc = 0; @@ -302,31 +437,59 @@ int handle_misaligned_load(struct pt_regs *regs) fp = 1; len = 4; #endif + } else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) { + len = 2; + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) { + len = 2; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; } else { regs->epc = epc; return -1; } + if (!IS_ENABLED(CONFIG_FPU) && fp) + return -EOPNOTSUPP; + val.data_u64 = 0; - for (i = 0; i < len; i++) - val.data_bytes[i] = load_u8((void *)(addr + i)); + if (user_mode(regs)) { + if (copy_from_user_nofault(&val, (u8 __user *)addr, len)) + return -1; + } else { + memcpy(&val, (u8 *)addr, len); + } - if (fp) - return -1; - SET_RD(insn, regs, val.data_ulong << shift >> shift); + if (!fp) + SET_RD(insn, regs, val.data_ulong << shift >> shift); + else if (len == 8) + set_f64_rd(insn, regs, val.data_u64); + else + set_f32_rd(insn, regs, val.data_ulong); regs->epc = epc + INSN_LEN(insn); return 0; } -int handle_misaligned_store(struct pt_regs *regs) +static int handle_scalar_misaligned_store(struct pt_regs *regs) { union reg_data val; unsigned long epc = regs->epc; - unsigned long insn = get_insn(epc); - unsigned long addr = csr_read(mtval); - int i, len = 0; + unsigned long insn; + unsigned long addr = regs->badaddr; + int len = 0, fp = 0; + + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); + + if (!unaligned_enabled) + return -1; + + if (user_mode(regs) && 
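
get_insn() above fetches the faulting instruction in 16-bit parcels because the trapping epc may only be 2-byte aligned, and it tells compressed from full-size encodings by the low two opcode bits (0b11 means a 32-bit instruction). The same decode, sketched as plain user-space C over a byte buffer instead of user or kernel memory, assuming a little-endian host as RISC-V is:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INSN_LENGTH_MASK	0x3u
#define INSN_LENGTH_32		0x3u	/* low opcode bits 0b11 => not compressed */

/*
 * Fetch one instruction at byte offset "epc" using 16-bit parcels, as
 * the trap handler must when epc is only 2-byte aligned. Handles the
 * 16- and 32-bit encodings, the same cases get_insn() deals with.
 */
static uint32_t fetch_insn(const uint8_t *text, size_t epc)
{
	uint16_t lo, hi;

	memcpy(&lo, text + epc, sizeof(lo));
	if ((lo & INSN_LENGTH_MASK) != INSN_LENGTH_32)
		return lo;			/* compressed instruction */

	memcpy(&hi, text + epc + 2, sizeof(hi));
	return (uint32_t)hi << 16 | lo;		/* full 32-bit instruction */
}

int main(void)
{
	/* c.nop (0x0001) followed by the canonical nop addi x0,x0,0
	 * (0x00000013), laid out little-endian. */
	const uint8_t text[] = { 0x01, 0x00, 0x13, 0x00, 0x00, 0x00 };

	printf("insn at 0: 0x%08x\n", fetch_insn(text, 0));	/* 0x00000001 */
	printf("insn at 2: 0x%08x\n", fetch_insn(text, 2));	/* 0x00000013 */
	return 0;
}
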
(current->thread.align_ctl & PR_UNALIGN_SIGBUS)) + return -1; + + if (get_insn(regs, epc, &insn)) + return -1; regs->epc = 0; @@ -338,33 +501,290 @@ int handle_misaligned_store(struct pt_regs *regs) } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) { len = 8; #endif + } else if ((insn & INSN_MASK_FSD) == INSN_MATCH_FSD) { + fp = 1; + len = 8; + val.data_u64 = GET_F64_RS2(insn, regs); + } else if ((insn & INSN_MASK_FSW) == INSN_MATCH_FSW) { + fp = 1; + len = 4; + val.data_ulong = GET_F32_RS2(insn, regs); } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) { len = 2; #if defined(CONFIG_64BIT) } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) { len = 8; val.data_ulong = GET_RS2S(insn, regs); - } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP && - ((insn >> SH_RD) & 0x1f)) { + } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP) { len = 8; val.data_ulong = GET_RS2C(insn, regs); #endif } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) { len = 4; val.data_ulong = GET_RS2S(insn, regs); - } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP && - ((insn >> SH_RD) & 0x1f)) { + } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP) { len = 4; val.data_ulong = GET_RS2C(insn, regs); + } else if ((insn & INSN_MASK_C_FSD) == INSN_MATCH_C_FSD) { + fp = 1; + len = 8; + val.data_u64 = GET_F64_RS2S(insn, regs); + } else if ((insn & INSN_MASK_C_FSDSP) == INSN_MATCH_C_FSDSP) { + fp = 1; + len = 8; + val.data_u64 = GET_F64_RS2C(insn, regs); +#if !defined(CONFIG_64BIT) + } else if ((insn & INSN_MASK_C_FSW) == INSN_MATCH_C_FSW) { + fp = 1; + len = 4; + val.data_ulong = GET_F32_RS2S(insn, regs); + } else if ((insn & INSN_MASK_C_FSWSP) == INSN_MATCH_C_FSWSP) { + fp = 1; + len = 4; + val.data_ulong = GET_F32_RS2C(insn, regs); +#endif + } else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) { + len = 2; + val.data_ulong = GET_RS2S(insn, regs); } else { regs->epc = epc; return -1; } - for (i = 0; i < len; i++) - store_u8((void *)(addr + i), val.data_bytes[i]); + if (!IS_ENABLED(CONFIG_FPU) && fp) + return -EOPNOTSUPP; + + if (user_mode(regs)) { + if (copy_to_user_nofault((u8 __user *)addr, &val, len)) + return -1; + } else { + memcpy((u8 *)addr, &val, len); + } regs->epc = epc + INSN_LEN(insn); return 0; } + +int handle_misaligned_load(struct pt_regs *regs) +{ + unsigned long epc = regs->epc; + unsigned long insn; + + if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) { + if (get_insn(regs, epc, &insn)) + return -1; + + if (insn_is_vector(insn)) + return handle_vector_misaligned_load(regs); + } + + if (IS_ENABLED(CONFIG_RISCV_SCALAR_MISALIGNED)) + return handle_scalar_misaligned_load(regs); + + return -1; +} + +int handle_misaligned_store(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_RISCV_SCALAR_MISALIGNED)) + return handle_scalar_misaligned_store(regs); + + return -1; +} + +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused) +{ + long *mas_ptr = this_cpu_ptr(&vector_misaligned_access); + unsigned long tmp_var; + + *mas_ptr = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + + kernel_vector_begin(); + /* + * In pre-13.0.0 versions of GCC, vector registers cannot appear in + * the clobber list. This inline asm clobbers v0, but since we do not + * currently build the kernel with V enabled, the v0 clobber arg is not + * needed (as the compiler will not emit vector code itself). If the kernel + * is changed to build with V enabled, the clobber arg will need to be + * added here. 
+ */ + __asm__ __volatile__ ( + ".balign 4\n\t" + ".option push\n\t" + ".option arch, +zve32x\n\t" + " vsetivli zero, 1, e16, m1, ta, ma\n\t" // Vectors of 16b + " vle16.v v0, (%[ptr])\n\t" // Load bytes + ".option pop\n\t" + : : [ptr] "r" ((u8 *)&tmp_var + 1)); + kernel_vector_end(); +} + +bool __init check_vector_unaligned_access_emulated_all_cpus(void) +{ + int cpu; + + /* + * While being documented as very slow, schedule_on_each_cpu() is used since + * kernel_vector_begin() expects irqs to be enabled or it will panic() + */ + schedule_on_each_cpu(check_vector_unaligned_access_emulated); + + for_each_online_cpu(cpu) + if (per_cpu(vector_misaligned_access, cpu) + == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) + return false; + + return true; +} +#else +bool __init check_vector_unaligned_access_emulated_all_cpus(void) +{ + return false; +} +#endif + +static bool all_cpus_unaligned_scalar_access_emulated(void) +{ + int cpu; + + for_each_online_cpu(cpu) + if (per_cpu(misaligned_access_speed, cpu) != + RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) + return false; + + return true; +} + +#ifdef CONFIG_RISCV_SCALAR_MISALIGNED + +static bool unaligned_ctl __read_mostly; + +static void check_unaligned_access_emulated(void *arg __always_unused) +{ + int cpu = smp_processor_id(); + long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); + unsigned long tmp_var, tmp_val; + + *mas_ptr = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; + + __asm__ __volatile__ ( + " "REG_L" %[tmp], 1(%[ptr])\n" + : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); +} + +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); + + check_unaligned_access_emulated(NULL); + + /* + * If unaligned_ctl is already set, this means that we detected that all + * CPUS uses emulated misaligned access at boot time. If that changed + * when hotplugging the new cpu, this is something we don't handle. + */ + if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED))) { + pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n"); + return -EINVAL; + } + + return 0; +} + +bool __init check_unaligned_access_emulated_all_cpus(void) +{ + /* + * We can only support PR_UNALIGN controls if all CPUs have misaligned + * accesses emulated since tasks requesting such control can run on any + * CPU. 
+ */ + on_each_cpu(check_unaligned_access_emulated, NULL, 1); + + if (!all_cpus_unaligned_scalar_access_emulated()) + return false; + + unaligned_ctl = true; + return true; +} + +bool unaligned_ctl_available(void) +{ + return unaligned_ctl; +} +#else +bool __init check_unaligned_access_emulated_all_cpus(void) +{ + return false; +} +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + return 0; +} +#endif + +static bool misaligned_traps_delegated; + +#ifdef CONFIG_RISCV_SBI + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu) +{ + if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) && + misaligned_traps_delegated) { + pr_crit("Misaligned trap delegation non homogeneous (expected delegated)"); + return -EINVAL; + } + + return 0; +} + +void __init unaligned_access_init(void) +{ + int ret; + + ret = sbi_fwft_set_online_cpus(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0); + if (ret) + return; + + misaligned_traps_delegated = true; + pr_info("SBI misaligned access exception delegation ok\n"); + /* + * Note that we don't have to take any specific action here, if + * the delegation is successful, then + * check_unaligned_access_emulated() will verify that indeed the + * platform traps on misaligned accesses. + */ +} +#else +void __init unaligned_access_init(void) {} + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused) +{ + return 0; +} + +#endif + +int cpu_online_unaligned_access_init(unsigned int cpu) +{ + int ret; + + ret = cpu_online_sbi_unaligned_setup(cpu); + if (ret) + return ret; + + return cpu_online_check_unaligned_access_emulated(cpu); +} + +bool misaligned_traps_can_delegate(void) +{ + /* + * Either we successfully requested misaligned traps delegation for all + * CPUs, or the SBI does not implement the FWFT extension but delegated + * the exception by default. + */ + return misaligned_traps_delegated || + all_cpus_unaligned_scalar_access_emulated(); +} +EXPORT_SYMBOL_GPL(misaligned_traps_can_delegate); diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c new file mode 100644 index 000000000000..ae2068425fbc --- /dev/null +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Rivos Inc. 
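
The PR_UNALIGN plumbing above (unaligned_ctl and check_unaligned_access_emulated_all_cpus()) is what makes prctl(PR_SET_UNALIGN, ...) meaningful on RISC-V: per the comment, the control is only offered when every CPU was seen to emulate misaligned scalar accesses. From user space that looks roughly like the following sketch; whether the final load actually raises SIGBUS depends on the hardware trapping misaligned accesses in the first place:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
	char buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	volatile int *misaligned = (volatile int *)(buf + 1);	/* deliberately unaligned */

	/*
	 * Expected to be accepted only when the kernel found that all
	 * CPUs emulate misaligned scalar accesses (unaligned_ctl above).
	 */
	if (prctl(PR_SET_UNALIGN, PR_UNALIGN_SIGBUS, 0, 0, 0)) {
		fprintf(stderr, "PR_SET_UNALIGN: %s\n", strerror(errno));
		return 1;
	}

	/*
	 * On hardware where this load traps, the kernel now delivers
	 * SIGBUS instead of quietly emulating the access; on hardware
	 * with native misaligned support it simply prints a value.
	 */
	printf("misaligned load: 0x%x\n", *misaligned);
	return 0;
}
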
+ */ + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/jump_label.h> +#include <linux/kthread.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/types.h> +#include <asm/cpufeature.h> +#include <asm/hwprobe.h> +#include <asm/vector.h> + +#include "copy-unaligned.h" + +#define MISALIGNED_ACCESS_JIFFIES_LG2 1 +#define MISALIGNED_BUFFER_SIZE 0x4000 +#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) + +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + +static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; +static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + +static cpumask_t fast_misaligned_access; + +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS +static int check_unaligned_access(void *param) +{ + int cpu = smp_processor_id(); + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page = param; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; + + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) + return 0; + + /* Make an unaligned destination buffer. */ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + /* Do a warmup. */ + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + preempt_disable(); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. + */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + mb(); + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + preempt_enable(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", + cpu); + + return 0; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? 
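
check_unaligned_access() above times a word-oriented and a byte-oriented unaligned copy, keeps the best (lowest) cycle count for each, and reports byte/word as a two-decimal ratio. A user-space sketch of that best-of measurement pattern, with clock_gettime() standing in for get_cycles64() and memcpy/byte loops standing in for the __riscv_copy_*_unaligned helpers; buffer size and trial count are arbitrary choices, not values from this patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE	4096
#define COPY_SIZE	(BUF_SIZE / 2 - 16)
#define TRIALS		200

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static void copy_words(uint8_t *dst, const uint8_t *src, size_t n)
{
	memcpy(dst, src, n);		/* wide-access copy stand-in */
}

static void copy_bytes(uint8_t *dst, const uint8_t *src, size_t n)
{
	while (n--)
		*dst++ = *src++;	/* byte-at-a-time copy stand-in */
}

/* Run the copy repeatedly and keep the best (lowest) time seen. */
static uint64_t best_of(void (*copy)(uint8_t *, const uint8_t *, size_t),
			uint8_t *dst, const uint8_t *src)
{
	uint64_t best = UINT64_MAX;

	for (int i = 0; i < TRIALS; i++) {
		uint64_t t0 = now_ns(), dt;

		copy(dst, src, COPY_SIZE);
		dt = now_ns() - t0;
		if (dt < best)
			best = dt;
	}
	return best;
}

int main(void)
{
	static uint8_t buf[BUF_SIZE];
	uint8_t *dst = buf + 1;				/* deliberately misaligned */
	uint8_t *src = buf + BUF_SIZE / 2 + 3;		/* misaligned differently */
	uint64_t word_ns = best_of(copy_words, dst, src);
	uint64_t byte_ns = best_of(copy_bytes, dst, src);
	uint64_t ratio;

	if (!word_ns || !byte_ns) {
		fprintf(stderr, "clock granularity too coarse\n");
		return 1;
	}

	ratio = byte_ns * 100 / word_ns;
	printf("byte/word ratio %llu.%02llu, unaligned copies look %s\n",
	       (unsigned long long)(ratio / 100),
	       (unsigned long long)(ratio % 100),
	       word_ns < byte_ns ? "fast" : "slow");
	return 0;
}
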
"fast" : "slow"); + + per_cpu(misaligned_access_speed, cpu) = speed; + + /* + * Set the value of fast_misaligned_access of a CPU. These operations + * are atomic to avoid race conditions. + */ + if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) + cpumask_set_cpu(cpu, &fast_misaligned_access); + else + cpumask_clear_cpu(cpu, &fast_misaligned_access); + + return 0; +} + +static void __init check_unaligned_access_nonboot_cpu(void *param) +{ + unsigned int cpu = smp_processor_id(); + struct page **pages = param; + + if (smp_processor_id() != 0) + check_unaligned_access(pages[cpu]); +} + +/* Measure unaligned access speed on all CPUs present at boot in parallel. */ +static void __init check_unaligned_access_speed_all_cpus(void) +{ + unsigned int cpu; + unsigned int cpu_count = num_possible_cpus(); + struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL); + + if (!bufs) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + return; + } + + /* + * Allocate separate buffers for each CPU so there's no fighting over + * cache lines. + */ + for_each_cpu(cpu, cpu_online_mask) { + bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!bufs[cpu]) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + goto out; + } + } + + /* Check everybody except 0, who stays behind to tend jiffies. */ + on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1); + + /* Check core 0. */ + smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); + +out: + for_each_cpu(cpu, cpu_online_mask) { + if (bufs[cpu]) + __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); + } + + kfree(bufs); +} +#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ +static void __init check_unaligned_access_speed_all_cpus(void) +{ +} +#endif + +DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); + +static void modify_unaligned_access_branches(cpumask_t *mask, int weight) +{ + if (cpumask_weight(mask) == weight) + static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); + else + static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); +} + +static void set_unaligned_access_static_branches_except_cpu(int cpu) +{ + /* + * Same as set_unaligned_access_static_branches, except excludes the + * given CPU from the result. When a CPU is hotplugged into an offline + * state, this function is called before the CPU is set to offline in + * the cpumask, and thus the CPU needs to be explicitly excluded. + */ + + cpumask_t fast_except_me; + + cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); + cpumask_clear_cpu(cpu, &fast_except_me); + + modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); +} + +static void set_unaligned_access_static_branches(void) +{ + /* + * This will be called after check_unaligned_access_all_cpus so the + * result of unaligned access speed for all CPUs will be available. + * + * To avoid the number of online cpus changing between reading + * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be + * held before calling this function. 
+ */ + + cpumask_t fast_and_online; + + cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); + + modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); +} + +static int __init lock_and_set_unaligned_access_static_branch(void) +{ + cpus_read_lock(); + set_unaligned_access_static_branches(); + cpus_read_unlock(); + + return 0; +} + +arch_initcall_sync(lock_and_set_unaligned_access_static_branch); + +static int riscv_online_cpu(unsigned int cpu) +{ + int ret = cpu_online_unaligned_access_init(cpu); + + if (ret) + return ret; + + /* We are already set since the last check */ + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { + goto exit; + } else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { + per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; + goto exit; + } + +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS + { + static struct page *buf; + + buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!buf) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + return -ENOMEM; + } + + check_unaligned_access(buf); + __free_pages(buf, MISALIGNED_BUFFER_ORDER); + } +#endif + +exit: + set_unaligned_access_static_branches(); + + return 0; +} + +static int riscv_offline_cpu(unsigned int cpu) +{ + set_unaligned_access_static_branches_except_cpu(cpu); + + return 0; +} + +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS +static void check_vector_unaligned_access(struct work_struct *work __always_unused) +{ + int cpu = smp_processor_id(); + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; + + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) + return; + + page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!page) { + pr_warn("Allocation failure, not measuring vector misaligned performance\n"); + return; + } + + /* Make an unaligned destination buffer. */ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + + /* Do a warmup. */ + kernel_vector_begin(); + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. + */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. 
*/ + mb(); + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + kernel_vector_end(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n", + cpu); + + goto free; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow"); + + per_cpu(vector_misaligned_access, cpu) = speed; + +free: + __free_pages(page, MISALIGNED_BUFFER_ORDER); +} + +/* Measure unaligned access speed on all CPUs present at boot in parallel. */ +static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) +{ + schedule_on_each_cpu(check_vector_unaligned_access); + + return 0; +} +#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */ +static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) +{ + return 0; +} +#endif + +static int riscv_online_cpu_vec(unsigned int cpu) +{ + if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { + per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; + return 0; + } + +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) + return 0; + + check_vector_unaligned_access_emulated(NULL); + check_vector_unaligned_access(NULL); +#endif + + return 0; +} + +static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" }; + +static int __init set_unaligned_scalar_speed_param(char *str) +{ + if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW])) + unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; + else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST])) + unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; + else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED])) + unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED; + else + return -EINVAL; + + return 1; +} +__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param); + +static int __init set_unaligned_vector_speed_param(char *str) +{ + if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW])) + unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; + else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST])) + unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; + else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED])) + unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + else + return -EINVAL; + + return 1; +} +__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param); + +static int __init check_unaligned_access_all_cpus(void) +{ + int cpu; + + unaligned_access_init(); + + if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { + pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n", + speed_str[unaligned_scalar_speed_param], 
unaligned_scalar_speed_param); + for_each_online_cpu(cpu) + per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; + } else if (!check_unaligned_access_emulated_all_cpus()) { + check_unaligned_access_speed_all_cpus(); + } + + if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { + if (!has_vector() && + unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) { + pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n", + speed_str[unaligned_vector_speed_param]); + } else { + pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n", + speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param); + } + } + + if (!has_vector()) + unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + + if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { + for_each_online_cpu(cpu) + per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; + } else if (!check_vector_unaligned_access_emulated_all_cpus() && + IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) { + kthread_run(vec_check_unaligned_access_speed_all_cpus, + NULL, "vec_check_unaligned_access_speed_all_cpus"); + } + + /* + * Setup hotplug callbacks for any new CPUs that come online or go + * offline. + */ + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", + riscv_online_cpu, riscv_offline_cpu); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", + riscv_online_cpu_vec, NULL); + + return 0; +} + +arch_initcall(check_unaligned_access_all_cpus); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 69b05b6c181b..3a8e038b10a2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -13,53 +13,26 @@ #include <linux/err.h> #include <asm/page.h> #include <asm/vdso.h> -#include <linux/time_namespace.h> - -#ifdef CONFIG_GENERIC_TIME_VSYSCALL +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> -#else -struct vdso_data { -}; -#endif - -extern char vdso_start[], vdso_end[]; -#ifdef CONFIG_COMPAT -extern char compat_vdso_start[], compat_vdso_end[]; -#endif - -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - -enum rv_vdso_map { - RV_VDSO_MAP_VVAR, - RV_VDSO_MAP_VDSO, -}; - -#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) +#include <vdso/vsyscall.h> -/* - * The vDSO data page. - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) struct __vdso_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; unsigned long vdso_pages; - /* Data Mapping */ - struct vm_special_mapping *dm; /* Code Mapping */ struct vm_special_mapping *cm; }; +static struct __vdso_info vdso_info; +#ifdef CONFIG_COMPAT +static struct __vdso_info compat_vdso_info; +#endif + static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -97,132 +70,29 @@ static void __init __vdso_init(struct __vdso_info *vdso_info) vdso_info->cm->pages = vdso_pagelist; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. 
- * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - struct __vdso_info *vdso_info = mm->context.vdso_info; - - mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long size = vma->vm_end - vma->vm_start; - - if (vma_is_special_mapping(vma, vdso_info->dm)) - zap_page_range(vma, vma->vm_start, size); - } - - mmap_read_unlock(mm); - return 0; -} - -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops. - * For more details check_vma_flags() and __access_remote_vm() - */ - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} -#else -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). 
- */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static struct vm_special_mapping rv_vdso_maps[] __ro_after_init = { - [RV_VDSO_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, - [RV_VDSO_MAP_VDSO] = { - .name = "[vdso]", - .mremap = vdso_mremap, - }, +static struct vm_special_mapping rv_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, }; static struct __vdso_info vdso_info __ro_after_init = { .name = "vdso", .vdso_code_start = vdso_start, .vdso_code_end = vdso_end, - .dm = &rv_vdso_maps[RV_VDSO_MAP_VVAR], - .cm = &rv_vdso_maps[RV_VDSO_MAP_VDSO], + .cm = &rv_vdso_map, }; #ifdef CONFIG_COMPAT -static struct vm_special_mapping rv_compat_vdso_maps[] __ro_after_init = { - [RV_VDSO_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, - [RV_VDSO_MAP_VDSO] = { - .name = "[vdso]", - .mremap = vdso_mremap, - }, +static struct vm_special_mapping rv_compat_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, }; static struct __vdso_info compat_vdso_info __ro_after_init = { .name = "compat_vdso", .vdso_code_start = compat_vdso_start, .vdso_code_end = compat_vdso_end, - .dm = &rv_compat_vdso_maps[RV_VDSO_MAP_VVAR], - .cm = &rv_compat_vdso_maps[RV_VDSO_MAP_VDSO], + .cm = &rv_compat_vdso_map, }; #endif @@ -245,7 +115,7 @@ static int __setup_additional_pages(struct mm_struct *mm, unsigned long vdso_base, vdso_text_len, vdso_mapping_len; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info->vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ @@ -257,18 +127,16 @@ static int __setup_additional_pages(struct mm_struct *mm, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE, - (VM_READ | VM_MAYREAD | VM_PFNMAP), vdso_info->dm); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; vdso_base += VVAR_SIZE; mm->context.vdso = (void *)vdso_base; - mm->context.vdso_info = (void *)vdso_info; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_SEALED_SYSMAP), vdso_info->cm); if (IS_ERR(ret)) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index f2e065671e4d..9ebb5e590f93 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -1,10 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only # Copied from arch/tile/kernel/vdso/Makefile -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_RISCV_32|R_RISCV_64|R_RISCV_JUMP_SLOT -include $(srctree)/lib/vdso/Makefile +# Include the generic Makefile to check the built vdso. 
+include $(srctree)/lib/vdso/Makefile.include # Symbols present in the vdso vdso-syms = rt_sigreturn ifdef CONFIG_64BIT @@ -12,40 +10,57 @@ vdso-syms += vgettimeofday endif vdso-syms += getcpu vdso-syms += flush_icache +vdso-syms += hwprobe +vdso-syms += sys_hwprobe + +ifdef CONFIG_VDSO_GETRANDOM +vdso-syms += getrandom +endif # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o +ifdef CONFIG_VDSO_GETRANDOM +obj-vdso += vgetrandom-chacha.o +endif + ccflags-y := -fno-stack-protector +ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -fno-builtin ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +ifneq ($(c-getrandom-y),) + CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) +endif + +CFLAGS_hwprobe.o += -fPIC + # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) obj-y += vdso.o CPPFLAGS_vdso.lds += -P -C -U$(ARCH) +ifneq ($(filter vgettimeofday, $(vdso-syms)),) +CPPFLAGS_vdso.lds += -DHAS_VGETTIMEOFDAY +endif # Disable -pg to prevent insert call site -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os - -# Disable profiling and instrumentation for VDSO code -GCOV_PROFILE := n -KCOV_INSTRUMENT := n -KASAN_SANITIZE := n -UBSAN_SANITIZE := n +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_getrandom.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # Force dependency $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE - $(call if_changed,vdsold) -LDFLAGS_vdso.so.dbg = -shared -S -soname=linux-vdso.so.1 \ - --build-id=sha1 --hash-style=both --eh-frame-hdr + $(call if_changed,vdsold_and_check) +LDFLAGS_vdso.so.dbg = -shared -soname=linux-vdso.so.1 \ + --build-id=sha1 --eh-frame-hdr # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -53,7 +68,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ @@ -63,17 +78,8 @@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE # actual build commands # The DSO images are built using a special linker script # Make sure only to export the intended __vdso_xxx symbol offsets. 
-quiet_cmd_vdsold = VDSOLD $@ - cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ +quiet_cmd_vdsold_and_check = VDSOLD $@ + cmd_vdsold_and_check = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ - rm $@.tmp - -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so + rm $@.tmp && \ + $(cmd_vdso_check) diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S index 82f97d67c23e..8f884227e8bc 100644 --- a/arch/riscv/kernel/vdso/flush_icache.S +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -8,7 +8,7 @@ .text /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ -ENTRY(__vdso_flush_icache) +SYM_FUNC_START(__vdso_flush_icache) .cfi_startproc #ifdef CONFIG_SMP li a7, __NR_riscv_flush_icache @@ -19,4 +19,4 @@ ENTRY(__vdso_flush_icache) #endif ret .cfi_endproc -ENDPROC(__vdso_flush_icache) +SYM_FUNC_END(__vdso_flush_icache) diff --git a/arch/riscv/kernel/vdso/getcpu.S b/arch/riscv/kernel/vdso/getcpu.S index bb0c05e2ffba..9c1bd531907f 100644 --- a/arch/riscv/kernel/vdso/getcpu.S +++ b/arch/riscv/kernel/vdso/getcpu.S @@ -8,11 +8,11 @@ .text /* int __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); */ -ENTRY(__vdso_getcpu) +SYM_FUNC_START(__vdso_getcpu) .cfi_startproc /* For now, just do the syscall. */ li a7, __NR_getcpu ecall ret .cfi_endproc -ENDPROC(__vdso_getcpu) +SYM_FUNC_END(__vdso_getcpu) diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c new file mode 100644 index 000000000000..f21922e8cebd --- /dev/null +++ b/arch/riscv/kernel/vdso/getrandom.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + */ +#include <linux/types.h> + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} diff --git a/arch/riscv/kernel/vdso/hwprobe.c b/arch/riscv/kernel/vdso/hwprobe.c new file mode 100644 index 000000000000..2ddeba6c68dd --- /dev/null +++ b/arch/riscv/kernel/vdso/hwprobe.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023 Rivos, Inc + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> + +extern int riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpusetsize, unsigned long *cpus, + unsigned int flags); + +static int riscv_vdso_get_values(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpusetsize, unsigned long *cpus, + unsigned int flags) +{ + const struct vdso_arch_data *avd = &vdso_u_arch_data; + bool all_cpus = !cpusetsize && !cpus; + struct riscv_hwprobe *p = pairs; + struct riscv_hwprobe *end = pairs + pair_count; + + /* + * Defer to the syscall for exotic requests. The vdso has answers + * stashed away only for the "all cpus" case. If all CPUs are + * homogeneous, then this function can handle requests for arbitrary + * masks. + */ + if ((flags != 0) || (!all_cpus && !avd->homogeneous_cpus)) + return riscv_hwprobe(pairs, pair_count, cpusetsize, cpus, flags); + + /* This is something we can handle, fill out the pairs. 
*/ + while (p < end) { + if (riscv_hwprobe_key_is_valid(p->key)) { + p->value = avd->all_cpu_hwprobe_values[p->key]; + + } else { + p->key = -1; + p->value = 0; + } + + p++; + } + + return 0; +} + +static int riscv_vdso_get_cpus(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpusetsize, unsigned long *cpus, + unsigned int flags) +{ + const struct vdso_arch_data *avd = &vdso_u_arch_data; + struct riscv_hwprobe *p = pairs; + struct riscv_hwprobe *end = pairs + pair_count; + unsigned char *c = (unsigned char *)cpus; + bool empty_cpus = true; + bool clear_all = false; + int i; + + if (!cpusetsize || !cpus) + return -EINVAL; + + for (i = 0; i < cpusetsize; i++) { + if (c[i]) { + empty_cpus = false; + break; + } + } + + if (empty_cpus || flags != RISCV_HWPROBE_WHICH_CPUS || !avd->homogeneous_cpus) + return riscv_hwprobe(pairs, pair_count, cpusetsize, cpus, flags); + + while (p < end) { + if (riscv_hwprobe_key_is_valid(p->key)) { + struct riscv_hwprobe t = { + .key = p->key, + .value = avd->all_cpu_hwprobe_values[p->key], + }; + + if (!riscv_hwprobe_pair_cmp(&t, p)) + clear_all = true; + } else { + clear_all = true; + p->key = -1; + p->value = 0; + } + p++; + } + + if (clear_all) { + for (i = 0; i < cpusetsize; i++) + c[i] = 0; + } + + return 0; +} + +/* Add a prototype to avoid -Wmissing-prototypes warning. */ +int __vdso_riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpusetsize, unsigned long *cpus, + unsigned int flags); + +int __vdso_riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpusetsize, unsigned long *cpus, + unsigned int flags) +{ + if (flags & RISCV_HWPROBE_WHICH_CPUS) + return riscv_vdso_get_cpus(pairs, pair_count, cpusetsize, + cpus, flags); + + return riscv_vdso_get_values(pairs, pair_count, cpusetsize, + cpus, flags); +} diff --git a/arch/riscv/kernel/vdso/rt_sigreturn.S b/arch/riscv/kernel/vdso/rt_sigreturn.S index 0573705eac76..3dc022aa8931 100644 --- a/arch/riscv/kernel/vdso/rt_sigreturn.S +++ b/arch/riscv/kernel/vdso/rt_sigreturn.S @@ -7,10 +7,10 @@ #include <asm/unistd.h> .text -ENTRY(__vdso_rt_sigreturn) +SYM_FUNC_START(__vdso_rt_sigreturn) .cfi_startproc .cfi_signal_frame li a7, __NR_rt_sigreturn - scall + ecall .cfi_endproc -ENDPROC(__vdso_rt_sigreturn) +SYM_FUNC_END(__vdso_rt_sigreturn) diff --git a/arch/riscv/kernel/vdso/sys_hwprobe.S b/arch/riscv/kernel/vdso/sys_hwprobe.S new file mode 100644 index 000000000000..77e57f830521 --- /dev/null +++ b/arch/riscv/kernel/vdso/sys_hwprobe.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos, Inc */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +.text +SYM_FUNC_START(riscv_hwprobe) + .cfi_startproc + li a7, __NR_riscv_hwprobe + ecall + ret + + .cfi_endproc +SYM_FUNC_END(riscv_hwprobe) diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 01d94aae5bf5..7c15b0f4ee3b 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -4,15 +4,14 @@ */ #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_ARCH(riscv) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . 
= SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -23,28 +22,31 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } - .note : { *(.note.*) } :text :note .dynamic : { *(.dynamic) } :text :dynamic + .rodata : { + *(.rodata .rodata.* .gnu.linkonce.r.*) + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + } + + .note : { *(.note.*) } :text :note + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } - /* - * This linker script is used both with -r and with -shared. - * For the layouts to match, we need to skip more than enough - * space for the dynamic symbol table, etc. If this amount is - * insufficient, ld -shared will error; simply increase it here. + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. */ - . = 0x800; + . = ALIGN(16); .text : { *(.text .text.*) } :text - .data : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) + . = ALIGN(4); + .alternative : { + *(.alternative) } } @@ -68,11 +70,19 @@ VERSION LINUX_4.15 { global: __vdso_rt_sigreturn; +#ifdef HAS_VGETTIMEOFDAY __vdso_gettimeofday; __vdso_clock_gettime; __vdso_clock_getres; +#endif __vdso_getcpu; __vdso_flush_icache; +#ifndef COMPAT_VDSO + __vdso_riscv_hwprobe; +#endif +#if defined(CONFIG_VDSO_GETRANDOM) && !defined(COMPAT_VDSO) + __vdso_getrandom; +#endif local: *; }; } diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..5f0dad8f2373 --- /dev/null +++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + * + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. + */ + +#include <asm/asm.h> +#include <linux/linkage.h> + +.text + +.macro ROTRI rd rs imm + slliw t0, \rs, 32 - \imm + srliw \rd, \rs, \imm + or \rd, \rd, t0 +.endm + +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 +.endm + +/* + * a0: output bytes + * a1: 32-byte key input + * a2: 8-byte counter input/output + * a3: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +#define output a0 +#define key a1 +#define counter a2 +#define nblocks a3 +#define i a4 +#define state0 s0 +#define state1 s1 +#define state2 s2 +#define state3 s3 +#define state4 s4 +#define state5 s5 +#define state6 s6 +#define state7 s7 +#define state8 s8 +#define state9 s9 +#define state10 s10 +#define state11 s11 +#define state12 a5 +#define state13 a6 +#define state14 a7 +#define state15 t1 +#define cnt t2 +#define copy0 t3 +#define copy1 t4 +#define copy2 t5 +#define copy3 t6 + +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + + /* + * The ABI requires s0-s9 saved. 
+ * This does not violate the stack-less requirement: no sensitive data + * is spilled onto the stack. + */ + addi sp, sp, -12*SZREG + REG_S s0, (sp) + REG_S s1, SZREG(sp) + REG_S s2, 2*SZREG(sp) + REG_S s3, 3*SZREG(sp) + REG_S s4, 4*SZREG(sp) + REG_S s5, 5*SZREG(sp) + REG_S s6, 6*SZREG(sp) + REG_S s7, 7*SZREG(sp) + REG_S s8, 8*SZREG(sp) + REG_S s9, 9*SZREG(sp) + REG_S s10, 10*SZREG(sp) + REG_S s11, 11*SZREG(sp) + + ld cnt, (counter) + + li copy0, 0x61707865 + li copy1, 0x3320646e + li copy2, 0x79622d32 + li copy3, 0x6b206574 + +.Lblock: + /* state[0,1,2,3] = "expand 32-byte k" */ + mv state0, copy0 + mv state1, copy1 + mv state2, copy2 + mv state3, copy3 + + /* state[4,5,..,11] = key */ + lw state4, (key) + lw state5, 4(key) + lw state6, 8(key) + lw state7, 12(key) + lw state8, 16(key) + lw state9, 20(key) + lw state10, 24(key) + lw state11, 28(key) + + /* state[12,13] = counter */ + mv state12, cnt + srli state13, cnt, 32 + + /* state[14,15] = 0 */ + mv state14, zero + mv state15, zero + + li i, 10 +.Lpermute: + /* odd round */ + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _16 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _20 + + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _24 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _25 + + /* even round */ + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _16 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _20 + + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _24 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _25 + + addi i, i, -1 + bnez i, .Lpermute + + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ + OP_4REG addw line0, copy + sw state0, (output) + sw state1, 4(output) + sw state2, 8(output) + sw state3, 12(output) + + /* from now on state[0,1,2,3] are scratch registers */ + + /* state[0,1,2,3] = lo(key) */ + lw state0, (key) + lw state1, 4(key) + lw state2, 8(key) + lw state3, 12(key) + + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ + OP_4REG addw line1, line0 + sw state4, 16(output) + sw state5, 20(output) + sw state6, 24(output) + sw state7, 28(output) + + /* state[0,1,2,3] = hi(key) */ + lw state0, 16(key) + lw state1, 20(key) + lw state2, 24(key) + lw state3, 28(key) + + /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ + OP_4REG addw line2, line0 + sw state8, 32(output) + sw state9, 36(output) + sw state10, 40(output) + sw state11, 44(output) + + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ + addw state12, state12, cnt + srli state0, cnt, 32 + addw state13, state13, state0 + sw state12, 48(output) + sw state13, 52(output) + sw state14, 56(output) + sw state15, 60(output) + + /* ++counter */ + addi cnt, cnt, 1 + + /* output += 64 */ + addi output, output, 64 + /* --nblocks */ + addi nblocks, nblocks, -1 + bnez nblocks, .Lblock + + /* counter = [cnt_lo, cnt_hi] */ + sd cnt, (counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these + * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and + * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we + * only need to zero state[12,...,15]. 
+ */ + mv state12, zero + mv state13, zero + mv state14, zero + mv state15, zero + + REG_L s0, (sp) + REG_L s1, SZREG(sp) + REG_L s2, 2*SZREG(sp) + REG_L s3, 3*SZREG(sp) + REG_L s4, 4*SZREG(sp) + REG_L s5, 5*SZREG(sp) + REG_L s6, 6*SZREG(sp) + REG_L s7, 7*SZREG(sp) + REG_L s8, 8*SZREG(sp) + REG_L s9, 9*SZREG(sp) + REG_L s10, 10*SZREG(sp) + REG_L s11, 11*SZREG(sp) + addi sp, sp, 12*SZREG + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/riscv/kernel/vdso/vgettimeofday.c b/arch/riscv/kernel/vdso/vgettimeofday.c index cc0d80699c31..b35057802584 100644 --- a/arch/riscv/kernel/vdso/vgettimeofday.c +++ b/arch/riscv/kernel/vdso/vgettimeofday.c @@ -8,23 +8,18 @@ #include <linux/time.h> #include <linux/types.h> +#include <vdso/gettime.h> -extern -int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { return __cvdso_clock_gettime(clock, ts); } -extern -int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); } -extern -int __vdso_clock_getres(clockid_t clock_id, struct __kernel_timespec *res); int __vdso_clock_getres(clockid_t clock_id, struct __kernel_timespec *res) { return __cvdso_clock_getres(clock_id, res); diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S new file mode 100644 index 000000000000..7ce4de6f6e69 --- /dev/null +++ b/arch/riscv/kernel/vec-copy-unaligned.S @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2024 Rivos Inc. */ + +#include <linux/args.h> +#include <linux/linkage.h> +#include <asm/asm.h> + + .text + +#define WORD_EEW 32 + +#define WORD_SEW CONCATENATE(e, WORD_EEW) +#define VEC_L CONCATENATE(vle, WORD_EEW).v +#define VEC_S CONCATENATE(vse, WORD_EEW).v + +/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using word loads and stores. */ +/* Note: The size is truncated to a multiple of WORD_EEW */ +SYM_FUNC_START(__riscv_copy_vec_words_unaligned) + andi a4, a2, ~(WORD_EEW-1) + beqz a4, 2f + add a3, a1, a4 + .option push + .option arch, +zve32x +1: + vsetivli t0, 8, WORD_SEW, m8, ta, ma + VEC_L v0, (a1) + VEC_S v0, (a0) + addi a0, a0, WORD_EEW + addi a1, a1, WORD_EEW + bltu a1, a3, 1b + +2: + .option pop + ret +SYM_FUNC_END(__riscv_copy_vec_words_unaligned) + +/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using only byte accesses. 
*/ +/* Note: The size is truncated to a multiple of 8 */ +SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned) + andi a4, a2, ~(8-1) + beqz a4, 2f + add a3, a1, a4 + .option push + .option arch, +zve32x +1: + vsetivli t0, 8, e8, m8, ta, ma + vle8.v v0, (a1) + vse8.v v0, (a0) + addi a0, a0, 8 + addi a1, a1, 8 + bltu a1, a3, 1b + +2: + .option pop + ret +SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned) diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c new file mode 100644 index 000000000000..184f780c932d --- /dev/null +++ b/arch/riscv/kernel/vector.c @@ -0,0 +1,326 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 SiFive + * Author: Andy Chiu <andy.chiu@sifive.com> + */ +#include <linux/export.h> +#include <linux/sched/signal.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/prctl.h> + +#include <asm/thread_info.h> +#include <asm/processor.h> +#include <asm/insn.h> +#include <asm/vector.h> +#include <asm/csr.h> +#include <asm/elf.h> +#include <asm/ptrace.h> +#include <asm/bug.h> + +static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); +static struct kmem_cache *riscv_v_user_cachep; +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static struct kmem_cache *riscv_v_kernel_cachep; +#endif + +unsigned long riscv_v_vsize __read_mostly; +EXPORT_SYMBOL_GPL(riscv_v_vsize); + +int riscv_v_setup_vsize(void) +{ + unsigned long this_vsize; + + /* + * There are 32 vector registers with vlenb length. + * + * If the thead,vlenb property was provided by the firmware, use that + * instead of probing the CSRs. + */ + if (thead_vlenb_of) { + riscv_v_vsize = thead_vlenb_of * 32; + return 0; + } + + riscv_v_enable(); + this_vsize = csr_read(CSR_VLENB) * 32; + riscv_v_disable(); + + if (!riscv_v_vsize) { + riscv_v_vsize = this_vsize; + return 0; + } + + if (riscv_v_vsize != this_vsize) { + WARN(1, "RISCV_ISA_V only supports one vlenb on SMP systems"); + return -EOPNOTSUPP; + } + + return 0; +} + +void __init riscv_v_setup_ctx_cache(void) +{ + if (!(has_vector() || has_xtheadvector())) + return; + + riscv_v_user_cachep = kmem_cache_create_usercopy("riscv_vector_ctx", + riscv_v_vsize, 16, SLAB_PANIC, + 0, riscv_v_vsize, NULL); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_kernel_cachep = kmem_cache_create("riscv_vector_kctx", + riscv_v_vsize, 16, + SLAB_PANIC, NULL); +#endif +} + +bool insn_is_vector(u32 insn_buf) +{ + u32 opcode = insn_buf & __INSN_OPCODE_MASK; + u32 width, csr; + + /* + * All V-related instructions, including CSR operations are 4-Byte. So, + * do not handle if the instruction length is not 4-Byte. 
+ */ + if (unlikely(GET_INSN_LENGTH(insn_buf) != 4)) + return false; + + switch (opcode) { + case RVV_OPCODE_VECTOR: + return true; + case RVV_OPCODE_VL: + case RVV_OPCODE_VS: + width = RVV_EXRACT_VL_VS_WIDTH(insn_buf); + if (width == RVV_VL_VS_WIDTH_8 || width == RVV_VL_VS_WIDTH_16 || + width == RVV_VL_VS_WIDTH_32 || width == RVV_VL_VS_WIDTH_64) + return true; + + break; + case RVG_OPCODE_SYSTEM: + csr = RVG_EXTRACT_SYSTEM_CSR(insn_buf); + if ((csr >= CSR_VSTART && csr <= CSR_VCSR) || + (csr >= CSR_VL && csr <= CSR_VLENB)) + return true; + } + + return false; +} + +static int riscv_v_thread_zalloc(struct kmem_cache *cache, + struct __riscv_v_ext_state *ctx) +{ + void *datap; + + datap = kmem_cache_zalloc(cache, GFP_KERNEL); + if (!datap) + return -ENOMEM; + + ctx->datap = datap; + memset(ctx, 0, offsetof(struct __riscv_v_ext_state, datap)); + return 0; +} + +void riscv_v_thread_alloc(struct task_struct *tsk) +{ +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_thread_zalloc(riscv_v_kernel_cachep, &tsk->thread.kernel_vstate); +#endif +} + +void riscv_v_thread_free(struct task_struct *tsk) +{ + if (tsk->thread.vstate.datap) + kmem_cache_free(riscv_v_user_cachep, tsk->thread.vstate.datap); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + if (tsk->thread.kernel_vstate.datap) + kmem_cache_free(riscv_v_kernel_cachep, tsk->thread.kernel_vstate.datap); +#endif +} + +#define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) +#define VSTATE_CTRL_GET_NEXT(x) (((x) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) >> 2) +#define VSTATE_CTRL_MAKE_NEXT(x) (((x) << 2) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) +#define VSTATE_CTRL_GET_INHERIT(x) (!!((x) & PR_RISCV_V_VSTATE_CTRL_INHERIT)) +static inline int riscv_v_ctrl_get_cur(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_CUR(tsk->thread.vstate_ctrl); +} + +static inline int riscv_v_ctrl_get_next(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_NEXT(tsk->thread.vstate_ctrl); +} + +static inline bool riscv_v_ctrl_test_inherit(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_INHERIT(tsk->thread.vstate_ctrl); +} + +static inline void riscv_v_ctrl_set(struct task_struct *tsk, int cur, int nxt, + bool inherit) +{ + unsigned long ctrl; + + ctrl = cur & PR_RISCV_V_VSTATE_CTRL_CUR_MASK; + ctrl |= VSTATE_CTRL_MAKE_NEXT(nxt); + if (inherit) + ctrl |= PR_RISCV_V_VSTATE_CTRL_INHERIT; + tsk->thread.vstate_ctrl &= ~PR_RISCV_V_VSTATE_CTRL_MASK; + tsk->thread.vstate_ctrl |= ctrl; +} + +bool riscv_v_vstate_ctrl_user_allowed(void) +{ + return riscv_v_ctrl_get_cur(current) == PR_RISCV_V_VSTATE_CTRL_ON; +} +EXPORT_SYMBOL_GPL(riscv_v_vstate_ctrl_user_allowed); + +bool riscv_v_first_use_handler(struct pt_regs *regs) +{ + u32 __user *epc = (u32 __user *)regs->epc; + u32 insn = (u32)regs->badaddr; + + if (!(has_vector() || has_xtheadvector())) + return false; + + /* Do not handle if V is not supported, or disabled */ + if (!riscv_v_vstate_ctrl_user_allowed()) + return false; + + /* If V has been enabled then it is not the first-use trap */ + if (riscv_v_vstate_query(regs)) + return false; + + /* Get the instruction */ + if (!insn) { + if (__get_user(insn, epc)) + return false; + } + + /* Filter out non-V instructions */ + if (!insn_is_vector(insn)) + return false; + + /* Sanity check. datap should be null by the time of the first-use trap */ + WARN_ON(current->thread.vstate.datap); + + /* + * Now we sure that this is a V instruction. And it executes in the + * context where VS has been off. So, try to allocate the user's V + * context and resume execution. 
+ */ + if (riscv_v_thread_zalloc(riscv_v_user_cachep, ¤t->thread.vstate)) { + force_sig(SIGBUS); + return true; + } + riscv_v_vstate_on(regs); + riscv_v_vstate_set_restore(current, regs); + return true; +} + +void riscv_v_vstate_ctrl_init(struct task_struct *tsk) +{ + bool inherit; + int cur, next; + + if (!(has_vector() || has_xtheadvector())) + return; + + next = riscv_v_ctrl_get_next(tsk); + if (!next) { + if (READ_ONCE(riscv_v_implicit_uacc)) + cur = PR_RISCV_V_VSTATE_CTRL_ON; + else + cur = PR_RISCV_V_VSTATE_CTRL_OFF; + } else { + cur = next; + } + /* Clear next mask if inherit-bit is not set */ + inherit = riscv_v_ctrl_test_inherit(tsk); + if (!inherit) + next = PR_RISCV_V_VSTATE_CTRL_DEFAULT; + + riscv_v_ctrl_set(tsk, cur, next, inherit); +} + +long riscv_v_vstate_ctrl_get_current(void) +{ + if (!(has_vector() || has_xtheadvector())) + return -EINVAL; + + return current->thread.vstate_ctrl & PR_RISCV_V_VSTATE_CTRL_MASK; +} + +long riscv_v_vstate_ctrl_set_current(unsigned long arg) +{ + bool inherit; + int cur, next; + + if (!(has_vector() || has_xtheadvector())) + return -EINVAL; + + if (arg & ~PR_RISCV_V_VSTATE_CTRL_MASK) + return -EINVAL; + + cur = VSTATE_CTRL_GET_CUR(arg); + switch (cur) { + case PR_RISCV_V_VSTATE_CTRL_OFF: + /* Do not allow user to turn off V if current is not off */ + if (riscv_v_ctrl_get_cur(current) != PR_RISCV_V_VSTATE_CTRL_OFF) + return -EPERM; + + break; + case PR_RISCV_V_VSTATE_CTRL_ON: + break; + case PR_RISCV_V_VSTATE_CTRL_DEFAULT: + cur = riscv_v_ctrl_get_cur(current); + break; + default: + return -EINVAL; + } + + next = VSTATE_CTRL_GET_NEXT(arg); + inherit = VSTATE_CTRL_GET_INHERIT(arg); + switch (next) { + case PR_RISCV_V_VSTATE_CTRL_DEFAULT: + case PR_RISCV_V_VSTATE_CTRL_OFF: + case PR_RISCV_V_VSTATE_CTRL_ON: + riscv_v_ctrl_set(current, cur, next, inherit); + return 0; + } + + return -EINVAL; +} + +#ifdef CONFIG_SYSCTL + +static const struct ctl_table riscv_v_default_vstate_table[] = { + { + .procname = "riscv_v_default_allow", + .data = &riscv_v_implicit_uacc, + .maxlen = sizeof(riscv_v_implicit_uacc), + .mode = 0644, + .proc_handler = proc_dobool, + }, +}; + +static int __init riscv_v_sysctl_init(void) +{ + if (has_vector() || has_xtheadvector()) + if (!register_sysctl("abi", riscv_v_default_vstate_table)) + return -EINVAL; + return 0; +} + +#else /* ! CONFIG_SYSCTL */ +static int __init riscv_v_sysctl_init(void) { return 0; } +#endif /* ! 
CONFIG_SYSCTL */ + +static int __init riscv_v_init(void) +{ + return riscv_v_sysctl_init(); +} +core_initcall(riscv_v_init); diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c new file mode 100644 index 000000000000..92d8ff81f42c --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Rivos, Inc + */ + +#include <asm/vendorid_list.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/andes.h> +#include <asm/vendor_extensions/sifive.h> +#include <asm/vendor_extensions/thead.h> + +#include <linux/array_size.h> +#include <linux/types.h> + +struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES + &riscv_isa_vendor_ext_list_andes, +#endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + &riscv_isa_vendor_ext_list_sifive, +#endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD + &riscv_isa_vendor_ext_list_thead, +#endif +}; + +const size_t riscv_isa_vendor_ext_list_size = ARRAY_SIZE(riscv_isa_vendor_ext_list); + +/** + * __riscv_isa_vendor_extension_available() - Check whether given vendor + * extension is available or not. + * + * @cpu: check if extension is available on this cpu + * @vendor: vendor that the extension is a member of + * @bit: bit position of the desired extension + * Return: true or false + * + * NOTE: When cpu is -1, will check if extension is available on all cpus + */ +bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit) +{ + struct riscv_isavendorinfo *bmap; + struct riscv_isavendorinfo *cpu_bmap; + + switch (vendor) { + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES + case ANDES_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_andes.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; + break; + #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + case SIFIVE_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap; + break; + #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD + case THEAD_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_thead.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_thead.per_hart_isa_bitmap; + break; + #endif + default: + return false; + } + + if (cpu != -1) + bmap = &cpu_bmap[cpu]; + + if (bit >= RISCV_ISA_VENDOR_EXT_MAX) + return false; + + return test_bit(bit, bmap->isa); +} +EXPORT_SYMBOL_GPL(__riscv_isa_vendor_extension_available); diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile new file mode 100644 index 000000000000..a4eca96d1c8a --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead_hwprobe.o diff --git a/arch/riscv/kernel/vendor_extensions/andes.c b/arch/riscv/kernel/vendor_extensions/andes.c new file mode 100644 index 000000000000..51f302b6d503 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/andes.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/cpufeature.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/andes.h> + +#include 
<linux/array_size.h> +#include <linux/types.h> + +/* All Andes vendor extensions supported in Linux */ +static const struct riscv_isa_ext_data riscv_isa_vendor_ext_andes[] = { + __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_VENDOR_EXT_XANDESPMU), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_andes = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_andes), + .ext_data = riscv_isa_vendor_ext_andes, +}; diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c new file mode 100644 index 000000000000..1411337dc1e6 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/cpufeature.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/sifive.h> + +#include <linux/array_size.h> +#include <linux/types.h> + +/* All SiFive vendor extensions supported in Linux */ +const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { + __RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF), + __RISCV_ISA_EXT_DATA(xsfvfwmaccqqq, RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ), + __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), + __RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_sifive), + .ext_data = riscv_isa_vendor_ext_sifive, +}; diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c new file mode 100644 index 000000000000..1f77f6309763 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/vendor_extensions/sifive.h> +#include <asm/vendor_extensions/sifive_hwprobe.h> +#include <asm/vendor_extensions/vendor_hwprobe.h> + +#include <linux/cpumask.h> +#include <linux/types.h> + +#include <uapi/asm/hwprobe.h> +#include <uapi/asm/vendor/sifive.h> + +void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, { + VENDOR_EXT_KEY(XSFVQMACCDOD); + VENDOR_EXT_KEY(XSFVQMACCQOQ); + VENDOR_EXT_KEY(XSFVFNRCLIPXFQF); + VENDOR_EXT_KEY(XSFVFWMACCQQQ); + }); +} diff --git a/arch/riscv/kernel/vendor_extensions/thead.c b/arch/riscv/kernel/vendor_extensions/thead.c new file mode 100644 index 000000000000..519dbf70710a --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/thead.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/cpufeature.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/thead.h> + +#include <linux/array_size.h> +#include <linux/cpumask.h> +#include <linux/types.h> + +/* All T-Head vendor extensions supported in Linux */ +static const struct riscv_isa_ext_data riscv_isa_vendor_ext_thead[] = { + __RISCV_ISA_EXT_DATA(xtheadvector, RISCV_ISA_VENDOR_EXT_XTHEADVECTOR), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_thead = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_thead), + .ext_data = riscv_isa_vendor_ext_thead, +}; + +void disable_xtheadvector(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + clear_bit(RISCV_ISA_VENDOR_EXT_XTHEADVECTOR, riscv_isa_vendor_ext_list_thead.per_hart_isa_bitmap[cpu].isa); + + clear_bit(RISCV_ISA_VENDOR_EXT_XTHEADVECTOR, 
riscv_isa_vendor_ext_list_thead.all_harts_isa_bitmap.isa); +} diff --git a/arch/riscv/kernel/vendor_extensions/thead_hwprobe.c b/arch/riscv/kernel/vendor_extensions/thead_hwprobe.c new file mode 100644 index 000000000000..2eba34011786 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/thead_hwprobe.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/vendor_extensions/thead.h> +#include <asm/vendor_extensions/thead_hwprobe.h> +#include <asm/vendor_extensions/vendor_hwprobe.h> + +#include <linux/cpumask.h> +#include <linux/types.h> + +#include <uapi/asm/hwprobe.h> +#include <uapi/asm/vendor/thead.h> + +void hwprobe_isa_vendor_ext_thead_0(struct riscv_hwprobe *pair, const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_thead.per_hart_isa_bitmap, { + VENDOR_EXT_KEY(XTHEADVECTOR); + }); +} diff --git a/arch/riscv/kernel/vmcore_info.c b/arch/riscv/kernel/vmcore_info.c new file mode 100644 index 000000000000..d5e448aa90e7 --- /dev/null +++ b/arch/riscv/kernel/vmcore_info.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/vmcore_info.h> +#include <linux/pagemap.h> + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_NUMBER(phys_ram_base); + + vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); + vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); +#ifdef CONFIG_MMU + VMCOREINFO_NUMBER(VA_BITS); + vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); + vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); +#ifdef CONFIG_64BIT + vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); + vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); +#endif +#endif + vmcoreinfo_append_str("NUMBER(KERNEL_LINK_ADDR)=0x%lx\n", KERNEL_LINK_ADDR); +#ifdef CONFIG_XIP_KERNEL + /* TODO: Communicate with crash-utility developers on the information to + * export. The XIP case is more complicated, because the virtual-physical + * address offset depends on whether the address is in ROM or in RAM. + */ +#else + vmcoreinfo_append_str("NUMBER(va_kernel_pa_offset)=0x%lx\n", + kernel_map.va_kernel_pa_offset); +#endif +} diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S index 75e0fa8a700a..a7611789bad5 100644 --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -14,6 +14,7 @@ #include <asm/page.h> #include <asm/cache.h> #include <asm/thread_info.h> +#include <asm/set_memory.h> OUTPUT_ARCH(riscv) ENTRY(_start) @@ -29,17 +30,18 @@ SECTIONS HEAD_TEXT_SECTION INIT_TEXT_SECTION(PAGE_SIZE) /* we have to discard exit text and such at runtime, not link time */ + __exittext_begin = .; .exit.text : { EXIT_TEXT } + __exittext_end = .; .text : { _text = .; _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT @@ -64,10 +66,10 @@ SECTIONS * From this point, stuff is considered writable and will be copied to RAM */ __data_loc = ALIGN(PAGE_SIZE); /* location in file */ - . = KERNEL_LINK_ADDR + XIP_OFFSET; /* location in memory */ + . = ALIGN(SECTION_ALIGN); /* location in memory */ #undef LOAD_OFFSET -#define LOAD_OFFSET (KERNEL_LINK_ADDR + XIP_OFFSET - (__data_loc & XIP_OFFSET_MASK)) +#define LOAD_OFFSET (KERNEL_LINK_ADDR + _sdata - __data_loc) _sdata = .; /* Start of data section */ _data = .; @@ -99,12 +101,6 @@ SECTIONS __soc_builtin_dtb_table_end = .; } - . 
= ALIGN(8); - .alternative : { - __alt_start = .; - *(.alternative) - __alt_end = .; - } __init_end = .; . = ALIGN(16); diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 4e6c88aa4d87..61bd5ba6680a 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -5,6 +5,7 @@ */ #define RO_EXCEPTION_TABLE_ALIGN 4 +#define RUNTIME_DISCARD_EXIT #ifdef CONFIG_XIP_KERNEL #include "vmlinux-xip.lds.S" @@ -26,9 +27,6 @@ ENTRY(_start) jiffies = jiffies_64; -PECOFF_SECTION_ALIGNMENT = 0x1000; -PECOFF_FILE_ALIGNMENT = 0x200; - SECTIONS { /* Beginning of code and text segment */ @@ -42,7 +40,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT @@ -72,10 +69,12 @@ SECTIONS __soc_builtin_dtb_table_end = .; } /* we have to discard exit text and such at runtime, not link time */ + __exittext_begin = .; .exit.text : { EXIT_TEXT } + __exittext_end = .; __init_text_end = .; . = ALIGN(SECTION_ALIGN); @@ -86,22 +85,39 @@ SECTIONS /* Start of init data section */ __init_data_begin = .; INIT_DATA_SECTION(16) + + .init.pi : { + KEEP(*(.init.pi*)) + } + + .init.bss : { + KEEP(*(.init.bss*)) /* from the EFI stub */ + } .exit.data : { EXIT_DATA } + + RUNTIME_CONST_VARIABLES + PERCPU_SECTION(L1_CACHE_BYTES) .rel.dyn : { *(.rel.dyn*) } + .rela.dyn : ALIGN(8) { + __rela_dyn_start = .; + *(.rela .rela*) + __rela_dyn_end = .; + } + __init_data_end = .; . = ALIGN(8); .alternative : { __alt_start = .; - *(.alternative) + KEEP(*(.alternative)) __alt_end = .; } __init_end = .; @@ -122,9 +138,22 @@ SECTIONS *(.sdata*) } + .got : { *(.got*) } + +#ifdef CONFIG_RELOCATABLE + .data.rel : { *(.data.rel*) } + .plt : { *(.plt) } + .dynamic : { *(.dynamic) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .hash : { *(.hash) } + .gnu.hash : { *(.gnu.hash) } +#endif + #ifdef CONFIG_EFI .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); + __pecoff_data_raw_end = ABSOLUTE(.); #endif /* End of data section */ @@ -135,12 +164,14 @@ SECTIONS #ifdef CONFIG_EFI . = ALIGN(PECOFF_SECTION_ALIGNMENT); __pecoff_data_virt_size = ABSOLUTE(. - __pecoff_text_end); + __pecoff_data_virt_end = ABSOLUTE(.); #endif _end = .; STABS_DEBUG DWARF_DEBUG ELF_DETAILS + .riscv.attributes 0 : { *(.riscv.attributes) } DISCARDS } |
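
Note on the hwprobe additions above: the new __vdso_riscv_hwprobe entry answers queries from values cached in the vDSO data page when the flags are plain and all CPUs are homogeneous, and falls back to the real syscall otherwise. As an illustration only (not part of the patch), a minimal userspace sketch of the underlying interface it accelerates, assuming kernel headers recent enough to provide <asm/hwprobe.h> and __NR_riscv_hwprobe; in real code a libc wrapper or the vDSO symbol itself would normally be used instead:

#include <asm/hwprobe.h>	/* struct riscv_hwprobe, RISCV_HWPROBE_KEY_* */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct riscv_hwprobe pairs[] = {
		{ .key = RISCV_HWPROBE_KEY_MVENDORID },
		{ .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
	};

	/* cpusetsize == 0 and cpus == NULL asks about all CPUs. */
	if (syscall(__NR_riscv_hwprobe, pairs, 2, 0, NULL, 0))
		return 1;

	printf("mvendorid:      0x%llx\n", (unsigned long long)pairs[0].value);
	printf("IMA ext bitmap: 0x%llx\n", (unsigned long long)pairs[1].value);
	return 0;
}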
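
Note on vgetrandom-chacha.S above: each OP_4REG/ROTRI sequence runs four ChaCha20 quarter rounds in parallel, and the right-rotate amounts 16/20/24/25 are simply the reference left-rotations 16/12/8/7 written as 32 - n. A plain C sketch of one quarter round, illustrative only and not part of the patch:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, unsigned int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter round; the vDSO assembly applies this to four
 * column or diagonal (a, b, c, d) groups at a time via OP_4REG. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rotl32(*d, 16);	/* ROTRI ... 16 */
	*c += *d; *b ^= *c; *b = rotl32(*b, 12);	/* ROTRI ... 20 */
	*a += *b; *d ^= *a; *d = rotl32(*d, 8);		/* ROTRI ... 24 */
	*c += *d; *b ^= *c; *b = rotl32(*b, 7);		/* ROTRI ... 25 */
}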
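
Note on vector.c above: the control word parsed by riscv_v_vstate_ctrl_set_current() packs the "current" state in bits [1:0], the state to apply at the next execve() in bits [3:2], and an inherit flag at bit 4. A hedged userspace sketch of how that word might be built, assuming a linux/prctl.h new enough to define PR_RISCV_V_SET_CONTROL; illustrative only, not part of the patch:

#include <linux/prctl.h>
#include <sys/prctl.h>

/* Allow V for the current process and for its next execve() image,
 * and keep that "next" setting across further execve() calls. */
static int riscv_vector_enable(void)
{
	unsigned long ctrl = PR_RISCV_V_VSTATE_CTRL_ON |	/* cur,  bits [1:0] */
			     (PR_RISCV_V_VSTATE_CTRL_ON << 2) |	/* next, bits [3:2] */
			     PR_RISCV_V_VSTATE_CTRL_INHERIT;	/* retain "next"    */

	return prctl(PR_RISCV_V_SET_CONTROL, ctrl);
}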