From 8d56e5c5a99ce1d17d39ce5a8260e42c2a2d7682 Mon Sep 17 00:00:00 2001
From: Alexandru Elisei
Date: Mon, 25 Apr 2022 12:44:42 +0100
Subject: arm64: Treat ESR_ELx as a 64-bit register

In the initial release of the ARM Architecture Reference Manual for
ARMv8-A, the ESR_ELx registers were defined as 32-bit registers. This
changed in 2018 with version D.a (ARM DDI 0487D.a) of the architecture,
when they became 64-bit registers, with bits [63:32] defined as RES0. In
version G.a, a new field was added to ESR_ELx, ISS2, which covers bits
[36:32]. This field is used when the Armv8.7 extension FEAT_LS64 is
implemented.

As a result of the evolution of the register width, Linux stores it as
both a 64-bit value and a 32-bit value, which hasn't affected
correctness so far as Linux only uses the lower 32 bits of the register.

Make the register type consistent and always treat it as 64-bit wide.
The register is redefined as an "unsigned long", which is an unsigned
double-word (64-bit quantity) for the LP64 machine (aapcs64 [1], Table
1, page 14). The type was chosen because "unsigned int" is the most
frequent type for ESR_ELx and because FAR_ELx, which is used together
with ESR_ELx in exception handling, is also declared as "unsigned
long". The 64-bit type also makes adding support for architectural
features that use fields above bit 31 easier in the future. The KVM
hypervisor will receive a similar update in a subsequent patch.

[1] https://github.com/ARM-software/abi-aa/releases/download/2021Q3/aapcs64.pdf

Signed-off-by: Alexandru Elisei
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20220425114444.368693-4-alexandru.elisei@arm.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/fault.c | 70 +++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 77341b160aca..24f9b43bc18e 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,7 +43,7 @@
 #include

 struct fault_info {
-	int	(*fn)(unsigned long far, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned long esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -53,17 +53,17 @@ struct fault_info {
 static const struct fault_info fault_info[];
 static struct fault_info debug_fault_info[];

-static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
+static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
 {
 	return fault_info + (esr & ESR_ELx_FSC);
 }

-static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
+static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
 {
 	return debug_fault_info + DBG_ESR_EVT(esr);
 }

-static void data_abort_decode(unsigned int esr)
+static void data_abort_decode(unsigned long esr)
 {
 	pr_alert("Data abort info:\n");

@@ -85,11 +85,11 @@ static void data_abort_decode(unsigned int esr)
 		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT);
 }

-static void mem_abort_decode(unsigned int esr)
+static void mem_abort_decode(unsigned long esr)
 {
 	pr_alert("Mem abort info:\n");

-	pr_alert("  ESR = 0x%08x\n", esr);
+	pr_alert("  ESR = 0x%016lx\n", esr);
 	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
 		 ESR_ELx_EC(esr), esr_get_class_string(esr),
 		 (esr & ESR_ELx_IL) ? 32 : 16);
@@ -99,7 +99,7 @@ static void mem_abort_decode(unsigned int esr)
 	pr_alert("  EA = %lu, S1PTW = %lu\n",
 		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
 		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
-	pr_alert("  FSC = 0x%02x: %s\n", (esr & ESR_ELx_FSC),
+	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

 	if (esr_is_data_abort(esr))
@@ -229,20 +229,20 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	return 1;
 }

-static bool is_el1_instruction_abort(unsigned int esr)
+static bool is_el1_instruction_abort(unsigned long esr)
 {
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }

-static bool is_el1_data_abort(unsigned int esr)
+static bool is_el1_data_abort(unsigned long esr)
 {
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
 }

-static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
+static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
 					   struct pt_regs *regs)
 {
-	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
+	unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE;

 	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
 		return false;
@@ -258,7 +258,7 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
 }

 static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
-							unsigned int esr,
+							unsigned long esr,
 							struct pt_regs *regs)
 {
 	unsigned long flags;
@@ -290,7 +290,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
 }

 static void die_kernel_fault(const char *msg, unsigned long addr,
-			     unsigned int esr, struct pt_regs *regs)
+			     unsigned long esr, struct pt_regs *regs)
 {
 	bust_spinlocks(1);
@@ -308,7 +308,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 }

 #ifdef CONFIG_KASAN_HW_TAGS
-static void report_tag_fault(unsigned long addr, unsigned int esr,
+static void report_tag_fault(unsigned long addr, unsigned long esr,
 			     struct pt_regs *regs)
 {
 	/*
@@ -320,11 +320,11 @@ static void report_tag_fault(unsigned long addr, unsigned int esr,
 }
 #else
 /* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
-static inline void report_tag_fault(unsigned long addr, unsigned int esr,
+static inline void report_tag_fault(unsigned long addr, unsigned long esr,
 				    struct pt_regs *regs) { }
 #endif

-static void do_tag_recovery(unsigned long addr, unsigned int esr,
+static void do_tag_recovery(unsigned long addr, unsigned long esr,
 			   struct pt_regs *regs)
 {
@@ -339,9 +339,9 @@ static void do_tag_recovery(unsigned long addr, unsigned int esr,
 	isb();
 }

-static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
+static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
 {
-	unsigned int fsc = esr & ESR_ELx_FSC;
+	unsigned long fsc = esr & ESR_ELx_FSC;

 	if (!is_el1_data_abort(esr))
 		return false;
@@ -352,7 +352,7 @@
 	return false;
 }

-static void __do_kernel_fault(unsigned long addr, unsigned int esr,
+static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 			      struct pt_regs *regs)
 {
 	const char *msg;
@@ -393,7 +393,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }

-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_esr(unsigned long address, unsigned long esr)
 {
 	current->thread.fault_address = address;
@@ -441,7 +441,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 			 * exception level). Fail safe by not providing an ESR
 			 * context record at all.
 			 */
-			WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
+			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
 			esr = 0;
 			break;
 		}
@@ -450,7 +450,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }

-static void do_bad_area(unsigned long far, unsigned int esr,
+static void do_bad_area(unsigned long far, unsigned long esr,
 			struct pt_regs *regs)
 {
 	unsigned long addr = untagged_addr(far);
@@ -501,7 +501,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
 	return handle_mm_fault(vma, addr, mm_flags, regs);
 }

-static bool is_el0_instruction_abort(unsigned int esr)
+static bool is_el0_instruction_abort(unsigned long esr)
 {
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
 }
@@ -510,12 +510,12 @@
  * Note: not valid for EL1 DC IVAC, but we never use that such that it
  * should fault. EL0 cannot issue DC IVAC (undef).
  */
-static bool is_write_abort(unsigned int esr)
+static bool is_write_abort(unsigned long esr)
 {
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }

-static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -671,7 +671,7 @@
 no_context:
 }

 static int __kprobes do_translation_fault(unsigned long far,
-					  unsigned int esr,
+					  unsigned long esr,
 					  struct pt_regs *regs)
 {
 	unsigned long addr = untagged_addr(far);
@@ -683,19 +683,19 @@ static int __kprobes do_translation_fault(unsigned long far,
 	return 0;
 }

-static int do_alignment_fault(unsigned long far, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned long esr,
 			      struct pt_regs *regs)
 {
 	do_bad_area(far, esr, regs);
 	return 0;
 }

-static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }

-static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	unsigned long siaddr;
@@ -725,7 +725,7 @@ static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 	return 0;
 }

-static int do_tag_check_fault(unsigned long far, unsigned int esr,
+static int do_tag_check_fault(unsigned long far, unsigned long esr,
 			      struct pt_regs *regs)
 {
 	/*
@@ -805,7 +805,7 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,	SIGKILL, SI_KERNEL,	"unknown 63" },
 };

-void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
 	unsigned long addr = untagged_addr(far);
@@ -825,14 +825,14 @@
 }
 NOKPROBE_SYMBOL(do_mem_abort);

-void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
 {
 	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
 			 addr, esr);
 }
 NOKPROBE_SYMBOL(do_sp_pc_abort);

-int __init early_brk64(unsigned long addr, unsigned int esr,
+int __init early_brk64(unsigned long addr, unsigned long esr,
 		       struct pt_regs *regs);

 /*
@@ -852,7 +852,7 @@ static struct fault_info __refdata debug_fault_info[] = {
 };

 void __init hook_debug_fault_code(int nr,
-				  int (*fn)(unsigned long, unsigned int, struct pt_regs *),
+				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
 				  int sig, int code, const char *name)
 {
 	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
@@ -885,7 +885,7 @@ static void debug_exception_exit(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(debug_exception_exit);

-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
 			struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_debug_fault_info(esr);
--
cgit v1.2.3-59-g8ed1b
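To make the register layout discussed above concrete, here is a small standalone sketch (not kernel code; the field positions are the architectural ones — EC in bits [31:26], IL in bit 25, ISS in bits [24:0], and the newer ISS2 in bits [36:32]). Note that ISS2 is only reachable when the value is held in a 64-bit type, which is the point of the patch:

#include <stdio.h>
#include <stdint.h>

/* Architectural field positions; ISS2 sits above bit 31 */
#define ESR_EC(esr)   (((esr) >> 26) & 0x3f)
#define ESR_IL(esr)   (((esr) >> 25) & 0x1)
#define ESR_ISS(esr)  ((esr) & 0x1ffffff)
#define ESR_ISS2(esr) (((esr) >> 32) & 0x1f)

int main(void)
{
        uint64_t esr = 0x0000000596000045ULL;   /* made-up example value */

        printf("ESR  = 0x%016llx\n", (unsigned long long)esr);
        printf("EC   = 0x%02llx\n", (unsigned long long)ESR_EC(esr));
        printf("IL   = %u bits\n", ESR_IL(esr) ? 32 : 16);
        printf("ISS  = 0x%07llx\n", (unsigned long long)ESR_ISS(esr));
        printf("ISS2 = 0x%02llx\n", (unsigned long long)ESR_ISS2(esr));
        return 0;
}

Had the value been stored in a 32-bit type, the ISS2 bits in the example above would simply have been truncated away.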
From bc249e37b9334826de29111c5350c3e7a08a3969 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Tue, 3 May 2022 18:02:24 +0100
Subject: arm64/mte: Make TCF field values and naming more standard

In preparation for automatic generation of the defines for system
registers, make the values used for the enumeration in SCTLR_ELx.TCF
suitable for use with the newly defined SYS_FIELD_PREP_ENUM helper,
removing the shift from the define and using the helper to generate it
on use instead.

Since we only ever interact with this field in EL1, and in preparation
for generation of the defines, also rename from SCTLR_ELx to SCTLR_EL1.
SCTLR_EL2 is not quite the same as SCTLR_EL1, so the conversion does
not share the field definitions.

There should be no functional change from this patch.

Signed-off-by: Mark Brown
Acked-by: Mark Rutland
Link: https://lore.kernel.org/r/20220503170233.507788-4-broonie@kernel.org
Signed-off-by: Catalin Marinas
---
 arch/arm64/include/asm/sysreg.h | 14 +++++++-------
 arch/arm64/kernel/mte.c         |  9 +++++----
 arch/arm64/mm/fault.c           |  3 ++-
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 6dc840be0268..732d84111d9f 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -631,13 +631,6 @@
 #define SCTLR_ELx_DSSBS	(BIT(44))
 #define SCTLR_ELx_ATA	(BIT(43))

-#define SCTLR_ELx_TCF_SHIFT	40
-#define SCTLR_ELx_TCF_NONE	(UL(0x0) << SCTLR_ELx_TCF_SHIFT)
-#define SCTLR_ELx_TCF_SYNC	(UL(0x1) << SCTLR_ELx_TCF_SHIFT)
-#define SCTLR_ELx_TCF_ASYNC	(UL(0x2) << SCTLR_ELx_TCF_SHIFT)
-#define SCTLR_ELx_TCF_ASYMM	(UL(0x3) << SCTLR_ELx_TCF_SHIFT)
-#define SCTLR_ELx_TCF_MASK	(UL(0x3) << SCTLR_ELx_TCF_SHIFT)
-
 #define SCTLR_ELx_ENIA_SHIFT	31

 #define SCTLR_ELx_ITFSB	(BIT(37))
@@ -677,6 +670,13 @@
 #define SCTLR_EL1_EPAN		(BIT(57))
 #define SCTLR_EL1_ATA0		(BIT(42))

+#define SCTLR_EL1_TCF_SHIFT	40
+#define SCTLR_EL1_TCF_NONE	(UL(0x0))
+#define SCTLR_EL1_TCF_SYNC	(UL(0x1))
+#define SCTLR_EL1_TCF_ASYNC	(UL(0x2))
+#define SCTLR_EL1_TCF_ASYMM	(UL(0x3))
+#define SCTLR_EL1_TCF_MASK	(UL(0x3) << SCTLR_EL1_TCF_SHIFT)
+
 #define SCTLR_EL1_TCF0_SHIFT	38
 #define SCTLR_EL1_TCF0_NONE	(UL(0x0))
 #define SCTLR_EL1_TCF0_SYNC	(UL(0x1))

diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 41469b69a48e..98f5e1e13c36 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -106,7 +106,8 @@ int memcmp_pages(struct page *page1, struct page *page2)
 static inline void __mte_enable_kernel(const char *mode, unsigned long tcf)
 {
 	/* Enable MTE Sync Mode for EL1. */
-	sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, tcf);
+	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
+			 SYS_FIELD_PREP(SCTLR_EL1, TCF, tcf));
 	isb();

 	pr_info_once("MTE: enabled in %s mode at EL1\n", mode);
@@ -122,12 +123,12 @@ void mte_enable_kernel_sync(void)
 	WARN_ONCE(system_uses_mte_async_or_asymm_mode(),
 			"MTE async mode enabled system wide!");

-	__mte_enable_kernel("synchronous", SCTLR_ELx_TCF_SYNC);
+	__mte_enable_kernel("synchronous", SCTLR_EL1_TCF_SYNC);
 }

 void mte_enable_kernel_async(void)
 {
-	__mte_enable_kernel("asynchronous", SCTLR_ELx_TCF_ASYNC);
+	__mte_enable_kernel("asynchronous", SCTLR_EL1_TCF_ASYNC);

 	/*
 	 * MTE async mode is set system wide by the first PE that
@@ -144,7 +145,7 @@ void mte_enable_kernel_async(void)
 void mte_enable_kernel_asymm(void)
 {
 	if (cpus_have_cap(ARM64_MTE_ASYMM)) {
-		__mte_enable_kernel("asymmetric", SCTLR_ELx_TCF_ASYMM);
+		__mte_enable_kernel("asymmetric", SCTLR_EL1_TCF_ASYMM);

 		/*
 		 * MTE asymm mode behaves as async mode for store

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 77341b160aca..5e280cc566ca 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -335,7 +335,8 @@ static void do_tag_recovery(unsigned long addr, unsigned int esr,
 	 * It will be done lazily on the other CPUs when they will hit a
 	 * tag fault.
 	 */
-	sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
+	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
+			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
 	isb();
 }
--
cgit v1.2.3-59-g8ed1b
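The pattern this patch moves to — unshifted enumeration values plus a helper that applies the shift at the point of use — can be illustrated with a standalone sketch. The FIELD_PREP-style macro below is a simplification written for this example, not the kernel's actual SYS_FIELD_PREP/SYS_FIELD_PREP_ENUM implementation:

#include <stdio.h>
#include <stdint.h>

/* Unshifted enumeration values, as in the new SCTLR_EL1_TCF_* defines */
#define TCF_SHIFT       40
#define TCF_NONE        0x0ULL
#define TCF_SYNC        0x1ULL
#define TCF_ASYNC       0x2ULL
#define TCF_ASYMM       0x3ULL
#define TCF_MASK        (0x3ULL << TCF_SHIFT)

/* Simplified stand-in for the kernel's field-prep helper */
#define FIELD_PREP_TCF(val)     (((uint64_t)(val) << TCF_SHIFT) & TCF_MASK)

int main(void)
{
        uint64_t sctlr = 0;

        /* clear the field, then set it, as sysreg_clear_set() would */
        sctlr = (sctlr & ~TCF_MASK) | FIELD_PREP_TCF(TCF_SYNC);
        printf("SCTLR_EL1 = 0x%016llx\n", (unsigned long long)sctlr);
        return 0;
}

Keeping the enumeration values unshifted is what lets one generic helper place any field, which is the property the automatic sysreg generation relies on.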
From 921d161f15d6b090599f6a8c23f131969edbd1fa Mon Sep 17 00:00:00 2001
From: Tong Tiangen
Date: Wed, 20 Apr 2022 03:04:13 +0000
Subject: arm64: fix types in copy_highpage()

In copy_highpage() the `kto` and `kfrom` local variables are pointers to
struct page, but these are used to hold arbitrary pointers to kernel
memory. Each call to page_address() returns a void pointer to memory
associated with the relevant page, and copy_page() expects void pointers
to this memory.

This inconsistency was introduced in commit 2563776b41c3 ("arm64: mte:
Tags-aware copy_{user_,}highpage() implementations") and while this
doesn't appear to be harmful in practice, it is clearly wrong.

Correct this by making `kto` and `kfrom` void pointers.

Fixes: 2563776b41c3 ("arm64: mte: Tags-aware copy_{user_,}highpage() implementations")
Signed-off-by: Tong Tiangen
Acked-by: Mark Rutland
Reviewed-by: Kefeng Wang
Link: https://lore.kernel.org/r/20220420030418.3189040-3-tongtiangen@huawei.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/copypage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index b5447e53cd73..0dea80bf6de4 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -16,8 +16,8 @@

 void copy_highpage(struct page *to, struct page *from)
 {
-	struct page *kto = page_address(to);
-	struct page *kfrom = page_address(from);
+	void *kto = page_address(to);
+	void *kfrom = page_address(from);

 	copy_page(kto, kfrom);
--
cgit v1.2.3-59-g8ed1b
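As a standalone illustration of why void pointers are the natural type here (a toy model made up for this example, not the kernel implementation): an address-returning helper hands back untyped memory, and the copy routine consumes untyped memory, so a `struct page *` local adds nothing but a misleading type.

#include <string.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096

struct page { unsigned char data[TOY_PAGE_SIZE]; };

/* toy stand-in for page_address(): returns untyped memory */
static void *toy_page_address(struct page *page)
{
        return page->data;
}

/* toy stand-in for copy_page(): consumes untyped memory */
static void toy_copy_page(void *to, const void *from)
{
        memcpy(to, from, TOY_PAGE_SIZE);
}

int main(void)
{
        static struct page src, dst;

        void *kto = toy_page_address(&dst);     /* void *, as in the fix */
        void *kfrom = toy_page_address(&src);

        toy_copy_page(kto, kfrom);
        printf("copied %d bytes\n", TOY_PAGE_SIZE);
        return 0;
}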
From f41ef4c2ee99d255c82d8ac6f720f28116340869 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Mon, 11 Apr 2022 17:24:55 +0800
Subject: arm64: mm: Cleanup useless parameters in zone_sizes_init()

Use max_pfn directly for max, and since nothing uses min, drop both
parameters.

Reviewed-by: Vijay Balakrishna
Signed-off-by: Kefeng Wang
Link: https://lore.kernel.org/r/20220411092455.1461-4-wangkefeng.wang@huawei.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/init.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1e7b1550e2fc..fb07e94242bb 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -157,7 +157,7 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
 	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
 }

-static void __init zone_sizes_init(unsigned long min, unsigned long max)
+static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
 	unsigned int __maybe_unused acpi_zone_dma_bits;
@@ -176,7 +176,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 	if (!arm64_dma_phys_limit)
 		arm64_dma_phys_limit = dma32_phys_limit;
 #endif
-	max_zone_pfns[ZONE_NORMAL] = max;
+	max_zone_pfns[ZONE_NORMAL] = max_pfn;

 	free_area_init(max_zone_pfns);
 }
@@ -374,7 +374,7 @@ void __init bootmem_init(void)
 	 * done after the fixed reservations
 	 */
 	sparse_init();
-	zone_sizes_init(min, max);
+	zone_sizes_init();

 	/*
 	 * Reserve the CMA area after arm64_dma_phys_limit was initialised.
--
cgit v1.2.3-59-g8ed1b

From e6b394425c615d1596ce7d9de23a3a34ee2e612b Mon Sep 17 00:00:00 2001
From: Zhen Lei
Date: Fri, 6 May 2022 19:43:58 +0800
Subject: arm64: Use insert_resource() to simplify code

insert_resource() traverses the subtree layer by layer from the root
node until a proper location is found. Compared with request_resource(),
the parent node does not need to be determined in advance.

In addition, move the insertion of node 'crashk_res' into function
reserve_crashkernel() to keep the associated code close together.

Signed-off-by: Zhen Lei
Acked-by: John Donnelly
Acked-by: Baoquan He
Link: https://lore.kernel.org/r/20220506114402.365-3-thunder.leizhen@huawei.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/kernel/setup.c | 17 +++--------------
 arch/arm64/mm/init.c      |  1 +
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 3505789cf4bd..fea3223704b6 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -225,6 +225,8 @@ static void __init request_standard_resources(void)
 	kernel_code.end     = __pa_symbol(__init_begin - 1);
 	kernel_data.start   = __pa_symbol(_sdata);
 	kernel_data.end     = __pa_symbol(_end - 1);
+	insert_resource(&iomem_resource, &kernel_code);
+	insert_resource(&iomem_resource, &kernel_data);

 	num_standard_resources = memblock.memory.cnt;
 	res_size = num_standard_resources * sizeof(*standard_resources);
@@ -246,20 +248,7 @@ static void __init request_standard_resources(void)
 			res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
 		}

-		request_resource(&iomem_resource, res);
-
-		if (kernel_code.start >= res->start &&
-		    kernel_code.end <= res->end)
-			request_resource(res, &kernel_code);
-		if (kernel_data.start >= res->start &&
-		    kernel_data.end <= res->end)
-			request_resource(res, &kernel_data);
-#ifdef CONFIG_KEXEC_CORE
-		/* Userspace will find "Crash kernel" region in /proc/iomem. */
-		if (crashk_res.end && crashk_res.start >= res->start &&
-		    crashk_res.end <= res->end)
-			request_resource(res, &crashk_res);
-#endif
+		insert_resource(&iomem_resource, res);
 	}
 }

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1e7b1550e2fc..51863f1448c6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -137,6 +137,7 @@ static void __init reserve_crashkernel(void)
 	kmemleak_ignore_phys(crash_base);
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
+	insert_resource(&iomem_resource, &crashk_res);
 }

 /*
--
cgit v1.2.3-59-g8ed1b
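To illustrate the API difference, a minimal kernel-style sketch (illustrative only — it assumes kernel context, and `example_res` is a hypothetical region made up for this example): request_resource() needs the parent chosen in advance, while insert_resource() starts from the iomem root and locates the proper parent itself.

#include <linux/ioport.h>

static struct resource example_res = {
	.name  = "example region",              /* hypothetical region */
	.start = 0x80000000,
	.end   = 0x8001ffff,
	.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
};

/* request_resource(): the caller must already know the parent node */
static int __init example_request(struct resource *parent)
{
	return request_resource(parent, &example_res);
}

/* insert_resource(): walk from the root; the parent is found for us */
static int __init example_insert(void)
{
	return insert_resource(&iomem_resource, &example_res);
}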
From 944a45abfabc171fd121315ff0d5e62b11cb5d6f Mon Sep 17 00:00:00 2001
From: Chen Zhou
Date: Fri, 6 May 2022 19:43:59 +0800
Subject: arm64: kdump: Reimplement crashkernel=X

arm64 kdump currently has the following issues:
1. We use crashkernel=X to reserve the crash kernel in the DMA zone,
   which will fail when there is not enough low memory.
2. If the crash kernel is instead reserved above the DMA zone, the
   crash dump kernel will fail to boot because there is no low memory
   available for allocation.

To solve these issues, introduce crashkernel=X,[high,low].
The "crashkernel=X,high" is used to select a region above the DMA zone,
and the "crashkernel=Y,low" is used to allocate the specified amount of
low memory.

Signed-off-by: Chen Zhou
Co-developed-by: Zhen Lei
Signed-off-by: Zhen Lei
Link: https://lore.kernel.org/r/20220506114402.365-4-thunder.leizhen@huawei.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/kernel/machine_kexec.c      |  9 +++--
 arch/arm64/kernel/machine_kexec_file.c | 12 +++++--
 arch/arm64/mm/init.c                   | 63 ++++++++++++++++++++++++++++++----
 3 files changed, 74 insertions(+), 10 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index e16b248699d5..19c2d487cb08 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -329,8 +329,13 @@ bool crash_is_nosave(unsigned long pfn)

 	/* in reserved memory? */
 	addr = __pfn_to_phys(pfn);
-	if ((addr < crashk_res.start) || (crashk_res.end < addr))
-		return false;
+	if ((addr < crashk_res.start) || (crashk_res.end < addr)) {
+		if (!crashk_low_res.end)
+			return false;
+
+		if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr))
+			return false;
+	}

 	if (!kexec_crash_image)
 		return true;

diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 59c648d51848..889951291cc0 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -65,10 +65,18 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)

 	/* Exclude crashkernel region */
 	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	if (ret)
+		goto out;
+
+	if (crashk_low_res.end) {
+		ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+		if (ret)
+			goto out;
+	}

-	if (!ret)
-		ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
+	ret = crash_prepare_elf64_headers(cmem, true, addr, sz);

+out:
 	kfree(cmem);
 	return ret;
 }

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 51863f1448c6..18ba66c90991 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -90,6 +90,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit;
 phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 #endif

+/* Current arm64 boot protocol requires 2MB alignment */
+#define CRASH_ALIGN			SZ_2M
+
+#define CRASH_ADDR_LOW_MAX		arm64_dma_phys_limit
+#define CRASH_ADDR_HIGH_MAX		(PHYS_MASK + 1)
+
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+
+	return 0;
+}
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -100,17 +126,35 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_base, crash_size;
-	unsigned long long crash_max = arm64_dma_phys_limit;
+	unsigned long long crash_low_size = 0;
+	unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+	char *cmdline = boot_command_line;
 	int ret;

 	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
 		return;

-	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+	/* crashkernel=X[@offset] */
+	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
 				&crash_size, &crash_base);
-	/* no crashkernel= or invalid value specified */
-	if (ret || !crash_size)
+	if (ret == -ENOENT) {
+		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
+		if (ret || !crash_size)
+			return;
+
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
+		if (ret && (ret != -ENOENT))
+			return;
+
+		crash_max = CRASH_ADDR_HIGH_MAX;
+	} else if (ret || !crash_size) {
+		/* The specified value is invalid */
 		return;
+	}

 	crash_size = PAGE_ALIGN(crash_size);

@@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
 	if (crash_base)
 		crash_max = crash_base + crash_size;

-	/* Current arm64 boot protocol requires 2MB alignment */
-	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
 					       crash_base, crash_max);
 	if (!crash_base) {
 		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
@@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
 		return;
 	}

+	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
 	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
 		crash_base, crash_base + crash_size, crash_size >> 20);

@@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
 	 * map. Inform kmemleak so that it won't try to access it.
 	 */
 	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
 	insert_resource(&iomem_resource, &crashk_res);
--
cgit v1.2.3-59-g8ed1b
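The resulting option precedence can be sketched as a standalone toy (the option spellings use underscores purely so the toy string matching stays unambiguous; the kernel's real parsers are parse_crashkernel(), parse_crashkernel_high() and parse_crashkernel_low(), and they accept the "crashkernel=X,high"/"crashkernel=Y,low" forms):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy lookup: bytes following "opt" in cmdline, or -1 when absent */
static long long find_option(const char *cmdline, const char *opt)
{
        const char *p = strstr(cmdline, opt);

        return p ? strtoll(p + strlen(opt), NULL, 0) : -1;
}

int main(void)
{
        const char *cmdline = "crashkernel_high=536870912 crashkernel_low=134217728";
        long long size, low;

        size = find_option(cmdline, "crashkernel=");   /* plain form first */
        if (size < 0) {                                /* absent: try ,high */
                size = find_option(cmdline, "crashkernel_high=");
                if (size < 0)
                        return 0;                      /* nothing specified */
                low = find_option(cmdline, "crashkernel_low=");
                if (low < 0)
                        low = 0;                       /* ,low is optional */
                printf("high: %lld bytes, low: %lld bytes\n", size, low);
        } else {
                printf("low (DMA zone) reservation: %lld bytes\n", size);
        }
        return 0;
}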
From fb396bb459c1fa3920dd8a9d84680398c65fed75 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Tue, 10 May 2022 10:09:30 +0530
Subject: arm64/hugetlb: Drop TLB flush from get_clear_flush()

This drops the now redundant TLB flush in get_clear_flush(), which is
no longer required after the recent commit 697a1d44af8b ("tlb: hugetlb:
Add more sizes to tlb_remove_huge_tlb_entry"). It also renames the
function, dropping the '_flush' suffix and replacing it with '_contig'
as appropriate.

Cc: Will Deacon
Cc: Mike Kravetz
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual
Link: https://lore.kernel.org/r/20220510043930.2410985-1-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/hugetlbpage.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index cbace1c9e137..749435b01a89 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -166,15 +166,14 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
  *
  * This helper performs the break step.
  */
-static pte_t get_clear_flush(struct mm_struct *mm,
+static pte_t get_clear_contig(struct mm_struct *mm,
 			     unsigned long addr,
 			     pte_t *ptep,
 			     unsigned long pgsize,
 			     unsigned long ncontig)
 {
 	pte_t orig_pte = huge_ptep_get(ptep);
-	bool valid = pte_valid(orig_pte);
-	unsigned long i, saddr = addr;
+	unsigned long i;

 	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
 		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
@@ -190,11 +189,6 @@ static pte_t get_clear_flush(struct mm_struct *mm,
 		if (pte_young(pte))
 			orig_pte = pte_mkyoung(orig_pte);
 	}
-
-	if (valid) {
-		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
-		flush_tlb_range(&vma, saddr, addr);
-	}
 	return orig_pte;
 }

@@ -392,7 +386,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,

 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);

-	return get_clear_flush(mm, addr, ptep, pgsize, ncontig);
+	return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
 }

 /*
@@ -443,7 +437,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	if (!__cont_access_flags_changed(ptep, pte, ncontig))
 		return 0;

-	orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
+	orig_pte = get_clear_contig(vma->vm_mm, addr, ptep, pgsize, ncontig);

 	/* Make sure we don't lose the dirty or young state */
 	if (pte_dirty(orig_pte))
@@ -476,7 +470,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
 	dpfn = pgsize >> PAGE_SHIFT;

-	pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig);
+	pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
 	pte = pte_wrprotect(pte);

 	hugeprot = pte_pgprot(pte);
--
cgit v1.2.3-59-g8ed1b
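For context on the break-before-make sequence this helper participates in, a standalone toy model (written for this illustration only; real page-table updates involve the MMU and TLBI instructions, which a user program cannot issue):

#include <stdio.h>

#define NCONTIG 16      /* e.g. a CONT-PTE group of 16 entries */

static unsigned long long ptes[NCONTIG];

/* stand-in for TLB invalidation; the kernel issues TLBI here, and after
 * commit 697a1d44af8b the callers take care of it, not this helper */
static void toy_flush_tlb_range(void)
{
        puts("invalidate translations for the whole contiguous range");
}

int main(void)
{
        unsigned long long new_base = 0xf53;
        int i;

        for (i = 0; i < NCONTIG; i++)   /* 1. break: clear every entry */
                ptes[i] = 0;

        toy_flush_tlb_range();          /* 2. invalidate cached entries */

        for (i = 0; i < NCONTIG; i++)   /* 3. make: write updated entries */
                ptes[i] = new_base + i;

        printf("pte[0] = 0x%llx\n", ptes[0]);
        return 0;
}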
It will be ignored when crashkernel=X,high is not used
@@ -832,7 +832,8 @@
 			[KNL, ARM64] range in low memory.
 			This one lets the user specify a low range in the
 			DMA zone for the crash dump kernel.
-			It will be ignored when crashkernel=X,high is not used.
+			It will be ignored when crashkernel=X,high is not used
+			or memory reserved is located in the DMA zones.

 	cryptomgr.notests
 			[KNL] Disable crypto self-tests

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 18ba66c90991..ac510fb6a2c0 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -170,7 +170,8 @@ static void __init reserve_crashkernel(void)
 		return;
 	}

-	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+	if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
 		memblock_phys_free(crash_base, crash_size);
 		return;
 	}
--
cgit v1.2.3-59-g8ed1b

From f0d9d79ec793ec66271e80ff2f9bf7a10458a584 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Mon, 16 May 2022 08:55:57 +0800
Subject: arm64/hugetlb: Use ptep_get() to get the pte value of a huge page

The original huge_ptep_get() on ARM64 is just a wrapper of ptep_get(),
which does not take into account any contig-PTE dirty and access bits.

Meanwhile we will implement a new ARM64-specific huge_ptep_get()
interface in a following patch, which will take into account any
contig-PTE dirty and access bits. To keep the same efficient logic for
getting the pte value, change to use ptep_get() as a preparation.

Signed-off-by: Baolin Wang
Reviewed-by: Muchun Song
Reviewed-by: Anshuman Khandual
Link: https://lore.kernel.org/r/5113ed6e103f995e1d0f0c9fda0373b761bbcad2.1652496622.git.baolin.wang@linux.alibaba.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/hugetlbpage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 749435b01a89..2aa1b0e176a2 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -172,7 +172,7 @@ static pte_t get_clear_contig(struct mm_struct *mm,
 			     unsigned long pgsize,
 			     unsigned long ncontig)
 {
-	pte_t orig_pte = huge_ptep_get(ptep);
+	pte_t orig_pte = ptep_get(ptep);
 	unsigned long i;

 	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
@@ -379,7 +379,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 {
 	int ncontig;
 	size_t pgsize;
-	pte_t orig_pte = huge_ptep_get(ptep);
+	pte_t orig_pte = ptep_get(ptep);

 	if (!pte_cont(orig_pte))
 		return ptep_get_and_clear(mm, addr, ptep);
@@ -402,11 +402,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
 {
 	int i;

-	if (pte_write(pte) != pte_write(huge_ptep_get(ptep)))
+	if (pte_write(pte) != pte_write(ptep_get(ptep)))
 		return 1;

 	for (i = 0; i < ncontig; i++) {
-		pte_t orig_pte = huge_ptep_get(ptep + i);
+		pte_t orig_pte = ptep_get(ptep + i);

 		if (pte_dirty(pte) != pte_dirty(orig_pte))
 			return 1;
--
cgit v1.2.3-59-g8ed1b
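Why ptep_get() rather than an open-coded dereference? In the generic implementation it reduces to READ_ONCE() on the entry, guaranteeing a single, untorn read of a live page-table entry. A standalone sketch of that idea (simplified; uses the GNU __typeof__ extension, and the kernel's ptep_get() is the real interface):

#include <stdio.h>

/* simplified model of the kernel's READ_ONCE(): one load through a
 * volatile-qualified pointer, which the compiler may not merge,
 * refetch, or tear */
#define TOY_READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long toy_ptep_get(unsigned long *ptep)
{
        /* the generic ptep_get() boils down to READ_ONCE(*ptep) */
        return TOY_READ_ONCE(*ptep);
}

int main(void)
{
        unsigned long pte = 0xf53;

        printf("pte = 0x%lx\n", toy_ptep_get(&pte));
        return 0;
}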
So huge_ptep_get() is inconsistent with huge_ptep_get_and_clear(), which
already takes into account the dirty or young bits of any subpages in
this CONT-PTE/PMD size hugetlb [1]. Meanwhile we can miss dirty or young
flag statistics for hugetlb pages with the current huge_ptep_get(), in
places such as the gather_hugetlb_stats() function, and CONT-PTE/PMD
hugetlb monitoring with DAMON.

Thus define an ARM64-specific huge_ptep_get() implementation, as well as
enabling __HAVE_ARCH_HUGE_PTEP_GET, that will take into account any
subpages' dirty or young bits for a CONT-PTE/PMD size hugetlb page, for
those functions that want to check the dirty and young flags of a
hugetlb page.

[1] https://lore.kernel.org/linux-mm/85bd80b4-b4fd-0d3f-a2e5-149559f2f387@oracle.com/

Suggested-by: Muchun Song
Signed-off-by: Baolin Wang
Reviewed-by: Muchun Song
Reviewed-by: Anshuman Khandual
Link: https://lore.kernel.org/r/624109a80ac4bbdf1e462dfa0b49e9f7c31a7c0d.1652496622.git.baolin.wang@linux.alibaba.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/include/asm/hugetlb.h |  2 ++
 arch/arm64/mm/hugetlbpage.c      | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 1242f71937f8..d656822b13f1 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -44,6 +44,8 @@ extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, unsigned long sz);
+#define __HAVE_ARCH_HUGE_PTEP_GET
+extern pte_t huge_ptep_get(pte_t *ptep);
 extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 				 pte_t *ptep, pte_t pte, unsigned long sz);
 #define set_huge_swap_pte_at set_huge_swap_pte_at

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2aa1b0e176a2..64bb078e2e7b 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -158,6 +158,28 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
 	return contig_ptes;
 }

+pte_t huge_ptep_get(pte_t *ptep)
+{
+	int ncontig, i;
+	size_t pgsize;
+	pte_t orig_pte = ptep_get(ptep);
+
+	if (!pte_present(orig_pte) || !pte_cont(orig_pte))
+		return orig_pte;
+
+	ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
+	for (i = 0; i < ncontig; i++, ptep++) {
+		pte_t pte = ptep_get(ptep);
+
+		if (pte_dirty(pte))
+			orig_pte = pte_mkdirty(orig_pte);
+
+		if (pte_young(pte))
+			orig_pte = pte_mkyoung(orig_pte);
+	}
+	return orig_pte;
+}
+
 /*
  * Changing some bits of contiguous entries requires us to follow a
  * Break-Before-Make approach, breaking the whole contiguous set
--
cgit v1.2.3-59-g8ed1b
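The aggregation the new huge_ptep_get() performs can be modelled in a standalone sketch (toy bit positions invented for this example; arm64's real PTE layout differs): whichever subpage is dirty or young, the aggregated result reports the whole huge page as such.

#include <stdio.h>

#define TOY_DIRTY (1UL << 0)
#define TOY_YOUNG (1UL << 1)
#define NCONTIG   4

int main(void)
{
        /* one CONT-PTE group; only subpage 2 is dirty, only 3 is young */
        unsigned long ptes[NCONTIG] = { 0, 0, TOY_DIRTY, TOY_YOUNG };
        unsigned long orig = ptes[0];
        int i;

        /* OR the per-subpage bits into the representative entry, as the
         * new huge_ptep_get() does with pte_mkdirty()/pte_mkyoung() */
        for (i = 0; i < NCONTIG; i++)
                orig |= ptes[i] & (TOY_DIRTY | TOY_YOUNG);

        printf("dirty=%d young=%d\n",
               !!(orig & TOY_DIRTY), !!(orig & TOY_YOUNG));
        return 0;
}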
Signed-off-by: Ard Biesheuvel
Acked-by: Mark Rutland
Link: https://lore.kernel.org/r/20220429131347.3621090-3-ardb@kernel.org
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/trans_pgd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm64/mm')

diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index d7da8ca40d2e..4ea2eefbc053 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -238,7 +238,7 @@ int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
 	int this_level, index, level_lsb, level_msb;

 	dst_addr &= PAGE_MASK;
-	prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC));
+	prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX));

 	for (this_level = 3; this_level >= 0; this_level--) {
 		levels[this_level] = trans_alloc(info);
--
cgit v1.2.3-59-g8ed1b
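The W^X idea being applied here can be demonstrated from userspace as an analogy (the patch itself concerns the temporary kernel page tables used at kexec/hibernate time, not user mappings): populate memory while it is writable, then drop write permission before it is ever treated as executable, so the mapping is never writable and executable at once.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 4096;
        /* start writable but not executable */
        unsigned char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0, len);    /* populate while writable */

        /* flip to read+execute: RX, never RWX */
        if (mprotect(buf, len, PROT_READ | PROT_EXEC) != 0)
                return 1;

        puts("mapping is now RX, not RWX");
        munmap(buf, len);
        return 0;
}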