From df9d7a22dd21c926e8175ccc6e176cb45fc7cb09 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 6 Sep 2019 10:52:39 +0100 Subject: arm64: mte: Add Memory Tagging Extension documentation Memory Tagging Extension (part of the ARMv8.5 Extensions) provides a mechanism to detect the sources of memory related errors which may be vulnerable to exploitation, including bounds violations, use-after-free, use-after-return, use-out-of-scope and use before initialization errors. Add Memory Tagging Extension documentation for the arm64 linux kernel support. Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Acked-by: Szabolcs Nagy Cc: Will Deacon --- Documentation/arm64/cpu-feature-registers.rst | 2 + Documentation/arm64/elf_hwcaps.rst | 4 + Documentation/arm64/index.rst | 1 + Documentation/arm64/memory-tagging-extension.rst | 305 +++++++++++++++++++++++ 4 files changed, 312 insertions(+) create mode 100644 Documentation/arm64/memory-tagging-extension.rst (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index f28853f80089..328e0c454fbd 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -175,6 +175,8 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | MTE | [11-8] | y | + +------------------------------+---------+---------+ | SSBS | [7-4] | y | +------------------------------+---------+---------+ | BT | [3-0] | y | diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 84a9fd2d41b4..bbd9cf54db6c 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -240,6 +240,10 @@ HWCAP2_BTI Functionality implied by ID_AA64PFR0_EL1.BT == 0b0001. +HWCAP2_MTE + + Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0010, as described + by Documentation/arm64/memory-tagging-extension.rst. 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst index d9665d83c53a..43b0939d384e 100644 --- a/Documentation/arm64/index.rst +++ b/Documentation/arm64/index.rst @@ -14,6 +14,7 @@ ARM64 Architecture hugetlbpage legacy_instructions memory + memory-tagging-extension perf pointer-authentication silicon-errata diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst new file mode 100644 index 000000000000..e3709b536b89 --- /dev/null +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -0,0 +1,305 @@ +=============================================== +Memory Tagging Extension (MTE) in AArch64 Linux +=============================================== + +Authors: Vincenzo Frascino + Catalin Marinas + +Date: 2020-02-25 + +This document describes the provision of the Memory Tagging Extension +functionality in AArch64 Linux. + +Introduction +============ + +ARMv8.5 based processors introduce the Memory Tagging Extension (MTE) +feature. MTE is built on top of the ARMv8.0 virtual address tagging TBI +(Top Byte Ignore) feature and allows software to access a 4-bit +allocation tag for each 16-byte granule in the physical address space. +Such memory range must be mapped with the Normal-Tagged memory +attribute. A logical tag is derived from bits 59-56 of the virtual +address used for the memory access. 
A CPU with MTE enabled will compare +the logical tag against the allocation tag and potentially raise an +exception on mismatch, subject to system registers configuration. + +Userspace Support +================= + +When ``CONFIG_ARM64_MTE`` is selected and Memory Tagging Extension is +supported by the hardware, the kernel advertises the feature to +userspace via ``HWCAP2_MTE``. + +PROT_MTE +-------- + +To access the allocation tags, a user process must enable the Tagged +memory attribute on an address range using a new ``prot`` flag for +``mmap()`` and ``mprotect()``: + +``PROT_MTE`` - Pages allow access to the MTE allocation tags. + +The allocation tag is set to 0 when such pages are first mapped in the +user address space and preserved on copy-on-write. ``MAP_SHARED`` is +supported and the allocation tags can be shared between processes. + +**Note**: ``PROT_MTE`` is only supported on ``MAP_ANONYMOUS`` and +RAM-based file mappings (``tmpfs``, ``memfd``). Passing it to other +types of mapping will result in ``-EINVAL`` returned by these system +calls. + +**Note**: The ``PROT_MTE`` flag (and corresponding memory type) cannot +be cleared by ``mprotect()``. + +**Note**: ``madvise()`` memory ranges with ``MADV_DONTNEED`` and +``MADV_FREE`` may have the allocation tags cleared (set to 0) at any +point after the system call. + +Tag Check Faults +---------------- + +When ``PROT_MTE`` is enabled on an address range and a mismatch between +the logical and allocation tags occurs on access, there are three +configurable behaviours: + +- *Ignore* - This is the default mode. The CPU (and kernel) ignores the + tag check fault. + +- *Synchronous* - The kernel raises a ``SIGSEGV`` synchronously, with + ``.si_code = SEGV_MTESERR`` and ``.si_addr = ``. The + memory access is not performed. If ``SIGSEGV`` is ignored or blocked + by the offending thread, the containing process is terminated with a + ``coredump``. + +- *Asynchronous* - The kernel raises a ``SIGSEGV``, in the offending + thread, asynchronously following one or multiple tag check faults, + with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting + address is unknown). + +The user can select the above modes, per thread, using the +``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where +``flags`` contain one of the following values in the ``PR_MTE_TCF_MASK`` +bit-field: + +- ``PR_MTE_TCF_NONE`` - *Ignore* tag check faults +- ``PR_MTE_TCF_SYNC`` - *Synchronous* tag check fault mode +- ``PR_MTE_TCF_ASYNC`` - *Asynchronous* tag check fault mode + +The current tag check fault mode can be read using the +``prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)`` system call. + +Tag checking can also be disabled for a user thread by setting the +``PSTATE.TCO`` bit with ``MSR TCO, #1``. + +**Note**: Signal handlers are always invoked with ``PSTATE.TCO = 0``, +irrespective of the interrupted context. ``PSTATE.TCO`` is restored on +``sigreturn()``. + +**Note**: There are no *match-all* logical tags available for user +applications. + +**Note**: Kernel accesses to the user address space (e.g. ``read()`` +system call) are not checked if the user thread tag checking mode is +``PR_MTE_TCF_NONE`` or ``PR_MTE_TCF_ASYNC``. If the tag checking mode is +``PR_MTE_TCF_SYNC``, the kernel makes a best effort to check its user +address accesses, however it cannot always guarantee it. 
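+
+As an illustrative sketch (relying only on the ``prctl()`` interface
+described above and the constants listed in the example at the end of
+this document), a thread can query its current tag check fault mode::
+
+	int ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
+
+	if (ctrl < 0) {
+		perror("prctl() failed");
+	} else {
+		switch (ctrl & PR_MTE_TCF_MASK) {
+		case PR_MTE_TCF_SYNC:
+			printf("synchronous tag check fault mode\n");
+			break;
+		case PR_MTE_TCF_ASYNC:
+			printf("asynchronous tag check fault mode\n");
+			break;
+		default:
+			printf("tag check faults ignored\n");
+		}
+	}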
+ +Excluding Tags in the ``IRG``, ``ADDG`` and ``SUBG`` instructions +----------------------------------------------------------------- + +The architecture allows excluding certain tags to be randomly generated +via the ``GCR_EL1.Exclude`` register bit-field. By default, Linux +excludes all tags other than 0. A user thread can enable specific tags +in the randomly generated set using the ``prctl(PR_SET_TAGGED_ADDR_CTRL, +flags, 0, 0, 0)`` system call where ``flags`` contains the tags bitmap +in the ``PR_MTE_TAG_MASK`` bit-field. + +**Note**: The hardware uses an exclude mask but the ``prctl()`` +interface provides an include mask. An include mask of ``0`` (exclusion +mask ``0xffff``) results in the CPU always generating tag ``0``. + +Initial process state +--------------------- + +On ``execve()``, the new process has the following configuration: + +- ``PR_TAGGED_ADDR_ENABLE`` set to 0 (disabled) +- Tag checking mode set to ``PR_MTE_TCF_NONE`` +- ``PR_MTE_TAG_MASK`` set to 0 (all tags excluded) +- ``PSTATE.TCO`` set to 0 +- ``PROT_MTE`` not set on any of the initial memory maps + +On ``fork()``, the new process inherits the parent's configuration and +memory map attributes with the exception of the ``madvise()`` ranges +with ``MADV_WIPEONFORK`` which will have the data and tags cleared (set +to 0). + +The ``ptrace()`` interface +-------------------------- + +``PTRACE_PEEKMTETAGS`` and ``PTRACE_POKEMTETAGS`` allow a tracer to read +the tags from or set the tags to a tracee's address space. The +``ptrace()`` system call is invoked as ``ptrace(request, pid, addr, +data)`` where: + +- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_PEEKMTETAGS``. +- ``pid`` - the tracee's PID. +- ``addr`` - address in the tracee's address space. +- ``data`` - pointer to a ``struct iovec`` where ``iov_base`` points to + a buffer of ``iov_len`` length in the tracer's address space. + +The tags in the tracer's ``iov_base`` buffer are represented as one +4-bit tag per byte and correspond to a 16-byte MTE tag granule in the +tracee's address space. + +**Note**: If ``addr`` is not aligned to a 16-byte granule, the kernel +will use the corresponding aligned address. + +``ptrace()`` return value: + +- 0 - tags were copied, the tracer's ``iov_len`` was updated to the + number of tags transferred. This may be smaller than the requested + ``iov_len`` if the requested address range in the tracee's or the + tracer's space cannot be accessed or does not have valid tags. +- ``-EPERM`` - the specified process cannot be traced. +- ``-EIO`` - the tracee's address range cannot be accessed (e.g. invalid + address) and no tags copied. ``iov_len`` not updated. +- ``-EFAULT`` - fault on accessing the tracer's memory (``struct iovec`` + or ``iov_base`` buffer) and no tags copied. ``iov_len`` not updated. +- ``-EOPNOTSUPP`` - the tracee's address does not have valid tags (never + mapped with the ``PROT_MTE`` flag). ``iov_len`` not updated. + +**Note**: There are no transient errors for the requests above, so user +programs should not retry in case of a non-zero system call return. + +``PTRACE_GETREGSET`` and ``PTRACE_SETREGSET`` with ``addr == +``NT_ARM_TAGGED_ADDR_CTRL`` allow ``ptrace()`` access to the tagged +address ABI control and MTE configuration of a process as per the +``prctl()`` options described in +Documentation/arm64/tagged-address-abi.rst and above. The corresponding +``regset`` is 1 element of 8 bytes (``sizeof(long))``). + +Example of correct usage +======================== + +*MTE Example code* + +.. 
code-block:: c + + /* + * To be compiled with -march=armv8.5-a+memtag + */ + #include + #include + #include + #include + #include + #include + #include + #include + + /* + * From arch/arm64/include/uapi/asm/hwcap.h + */ + #define HWCAP2_MTE (1 << 18) + + /* + * From arch/arm64/include/uapi/asm/mman.h + */ + #define PROT_MTE 0x20 + + /* + * From include/uapi/linux/prctl.h + */ + #define PR_SET_TAGGED_ADDR_CTRL 55 + #define PR_GET_TAGGED_ADDR_CTRL 56 + # define PR_TAGGED_ADDR_ENABLE (1UL << 0) + # define PR_MTE_TCF_SHIFT 1 + # define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TAG_SHIFT 3 + # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) + + /* + * Insert a random logical tag into the given pointer. + */ + #define insert_random_tag(ptr) ({ \ + uint64_t __val; \ + asm("irg %0, %1" : "=r" (__val) : "r" (ptr)); \ + __val; \ + }) + + /* + * Set the allocation tag on the destination address. + */ + #define set_tag(tagged_addr) do { \ + asm volatile("stg %0, [%0]" : : "r" (tagged_addr) : "memory"); \ + } while (0) + + int main() + { + unsigned char *a; + unsigned long page_sz = sysconf(_SC_PAGESIZE); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + /* check if MTE is present */ + if (!(hwcap2 & HWCAP2_MTE)) + return EXIT_FAILURE; + + /* + * Enable the tagged address ABI, synchronous MTE tag check faults and + * allow all non-zero tags in the randomly generated set. + */ + if (prctl(PR_SET_TAGGED_ADDR_CTRL, + PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC | (0xfffe << PR_MTE_TAG_SHIFT), + 0, 0, 0)) { + perror("prctl() failed"); + return EXIT_FAILURE; + } + + a = mmap(0, page_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (a == MAP_FAILED) { + perror("mmap() failed"); + return EXIT_FAILURE; + } + + /* + * Enable MTE on the above anonymous mmap. The flag could be passed + * directly to mmap() and skip this step. + */ + if (mprotect(a, page_sz, PROT_READ | PROT_WRITE | PROT_MTE)) { + perror("mprotect() failed"); + return EXIT_FAILURE; + } + + /* access with the default tag (0) */ + a[0] = 1; + a[1] = 2; + + printf("a[0] = %hhu a[1] = %hhu\n", a[0], a[1]); + + /* set the logical and allocation tags */ + a = (unsigned char *)insert_random_tag(a); + set_tag(a); + + printf("%p\n", a); + + /* non-zero tag access */ + a[0] = 3; + printf("a[0] = %hhu a[1] = %hhu\n", a[0], a[1]); + + /* + * If MTE is enabled correctly the next instruction will generate an + * exception. + */ + printf("Expecting SIGSEGV...\n"); + a[16] = 0xdd; + + /* this should not be printed in the PR_MTE_TCF_SYNC mode */ + printf("...haven't got one\n"); + + return EXIT_FAILURE; + } -- cgit v1.2.3-59-g8ed1b From 7a5d265b68e10bd99909434eedf08ca79ab2e640 Mon Sep 17 00:00:00 2001 From: Bailu Lin Date: Fri, 25 Sep 2020 19:25:58 -0700 Subject: doc: zh_CN: index files in arm64 subdirectory Add arm64 subdirectory into the table of Contents for zh_CN, then add other translations in arm64 conveniently. 
Signed-off-by: Bailu Lin Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20200926022558.46232-1-bailu.lin@vivo.com Signed-off-by: Jonathan Corbet --- Documentation/arm64/index.rst | 2 ++ Documentation/translations/zh_CN/arm64/index.rst | 14 ++++++++++++++ Documentation/translations/zh_CN/index.rst | 1 + 3 files changed, 17 insertions(+) create mode 100644 Documentation/translations/zh_CN/arm64/index.rst (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst index d9665d83c53a..3ebe0fa31948 100644 --- a/Documentation/arm64/index.rst +++ b/Documentation/arm64/index.rst @@ -1,3 +1,5 @@ +.. _arm64_index: + ================== ARM64 Architecture ================== diff --git a/Documentation/translations/zh_CN/arm64/index.rst b/Documentation/translations/zh_CN/arm64/index.rst new file mode 100644 index 000000000000..57545f19ab2d --- /dev/null +++ b/Documentation/translations/zh_CN/arm64/index.rst @@ -0,0 +1,14 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: :ref:`Documentation/arm64/index.rst ` +:Translator: Bailu Lin + +.. _cn_arm64_index: + + +========== +ARM64 架构 +========== + +.. toctree:: + :maxdepth: 2 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 85643e46e308..be6f11176200 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -19,6 +19,7 @@ admin-guide/index process/index filesystems/index + arm64/index 目录和表格 ---------- -- cgit v1.2.3-59-g8ed1b From a0eef4a8acbb57f3ca307dcd127dcc0c2e806981 Mon Sep 17 00:00:00 2001 From: Bailu Lin Date: Fri, 25 Sep 2020 19:52:33 -0700 Subject: Documentation: Chinese translation of Documentation/arm64/amu.rst This is a Chinese translated version of Documentation/arm64/amu.rst Signed-off-by: Bailu Lin Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20200926025233.47214-1-bailu.lin@vivo.com Signed-off-by: Jonathan Corbet --- Documentation/arm64/amu.rst | 2 + Documentation/translations/zh_CN/arm64/amu.rst | 100 +++++++++++++++++++++++ Documentation/translations/zh_CN/arm64/index.rst | 2 + 3 files changed, 104 insertions(+) create mode 100644 Documentation/translations/zh_CN/arm64/amu.rst (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/amu.rst b/Documentation/arm64/amu.rst index 452ec8b115c2..01f2de2b0450 100644 --- a/Documentation/arm64/amu.rst +++ b/Documentation/arm64/amu.rst @@ -1,3 +1,5 @@ +.. _amu_index: + ======================================================= Activity Monitors Unit (AMU) extension in AArch64 Linux ======================================================= diff --git a/Documentation/translations/zh_CN/arm64/amu.rst b/Documentation/translations/zh_CN/arm64/amu.rst new file mode 100644 index 000000000000..bd875f221330 --- /dev/null +++ b/Documentation/translations/zh_CN/arm64/amu.rst @@ -0,0 +1,100 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: :ref:`Documentation/arm64/amu.rst ` + +Translator: Bailu Lin + +================================= +AArch64 Linux 中扩展的活动监控单元 +================================= + +作者: Ionela Voinescu + +日期: 2019-09-10 + +本文档简要描述了 AArch64 Linux 支持的活动监控单元的规范。 + + +架构总述 +-------- + +活动监控是 ARMv8.4 CPU 架构引入的一个可选扩展特性。 + +活动监控单元(在每个 CPU 中实现)为系统管理提供了性能计数器。既可以通 +过系统寄存器的方式访问计数器,同时也支持外部内存映射的方式访问计数器。 + +AMUv1 架构实现了一个由4个固定的64位事件计数器组成的计数器组。 + + - CPU 周期计数器:同 CPU 的频率增长 + - 常量计数器:同固定的系统时钟频率增长 + - 淘汰指令计数器: 同每次架构指令执行增长 + - 内存停顿周期计数器:计算由在时钟域内的最后一级缓存中未命中而引起 + 的指令调度停顿周期数 + +当处于 WFI 或者 WFE 状态时,计数器不会增长。 + +AMU 架构提供了一个高达16位的事件计数器空间,未来新的 AMU 版本中可能 +用它来实现新增的事件计数器。 + +另外,AMUv1 实现了一个多达16个64位辅助事件计数器的计数器组。 + +冷复位时所有的计数器会清零。 + + +基本支持 +-------- + +内核可以安全地运行在支持 AMU 和不支持 AMU 的 CPU 组合中。 +因此,当配置 CONFIG_ARM64_AMU_EXTN 后我们无条件使能后续 +(secondary or hotplugged) CPU 检测和使用这个特性。 + +当在 CPU 上检测到该特性时,我们会标记为特性可用但是不能保证计数器的功能, +仅表明有扩展属性。 + +固件(代码运行在高异常级别,例如 arm-tf )需支持以下功能: + + - 提供低异常级别(EL2 和 EL1)访问 AMU 寄存器的能力。 + - 使能计数器。如果未使能,它的值应为 0。 + - 在从电源关闭状态启动 CPU 前或后保存或者恢复计数器。 + +当使用使能了该特性的内核启动但固件损坏时,访问计数器寄存器可能会遭遇 +panic 或者死锁。即使未发现这些症状,计数器寄存器返回的数据结果并不一 +定能反映真实情况。通常,计数器会返回 0,表明他们未被使能。 + +如果固件没有提供适当的支持最好关闭 CONFIG_ARM64_AMU_EXTN。 +值得注意的是,出于安全原因,不要绕过 AMUSERRENR_EL0 设置而捕获从 +EL0(用户空间) 访问 EL1(内核空间)。 因此,固件应该确保访问 AMU寄存器 +不会困在 EL2或EL3。 + +AMUv1 的固定计数器可以通过如下系统寄存器访问: + + - SYS_AMEVCNTR0_CORE_EL0 + - SYS_AMEVCNTR0_CONST_EL0 + - SYS_AMEVCNTR0_INST_RET_EL0 + - SYS_AMEVCNTR0_MEM_STALL_EL0 + +特定辅助计数器可以通过 SYS_AMEVCNTR1_EL0(n) 访问,其中n介于0到15。 + +详细信息定义在目录:arch/arm64/include/asm/sysreg.h。 + + +用户空间访问 +------------ + +由于以下原因,当前禁止从用户空间访问 AMU 的寄存器: + + - 安全因数:可能会暴露处于安全模式执行的代码信息。 + - 意愿:AMU 是用于系统管理的。 + +同样,该功能对用户空间不可见。 + + +虚拟化 +------ + +由于以下原因,当前禁止从 KVM 客户端的用户空间(EL0)和内核空间(EL1) +访问 AMU 的寄存器: + + - 安全因数:可能会暴露给其他客户端或主机端执行的代码信息。 + +任何试图访问 AMU 寄存器的行为都会触发一个注册在客户端的未定义异常。 diff --git a/Documentation/translations/zh_CN/arm64/index.rst b/Documentation/translations/zh_CN/arm64/index.rst index 57545f19ab2d..646ed1f7aea3 100644 --- a/Documentation/translations/zh_CN/arm64/index.rst +++ b/Documentation/translations/zh_CN/arm64/index.rst @@ -12,3 +12,5 @@ ARM64 架构 .. toctree:: :maxdepth: 2 + + amu -- cgit v1.2.3-59-g8ed1b From b5756146db3ad57a9c0e841ea01ce915db27b7de Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 28 Sep 2020 23:02:19 +0100 Subject: arm64: mte: Fix typo in memory tagging ABI documentation We offer both PTRACE_PEEKMTETAGS and PTRACE_POKEMTETAGS requests via ptrace(). Reported-by: Andrey Konovalov Signed-off-by: Will Deacon --- Documentation/arm64/memory-tagging-extension.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst index e3709b536b89..034d37c605e8 100644 --- a/Documentation/arm64/memory-tagging-extension.rst +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -142,7 +142,7 @@ the tags from or set the tags to a tracee's address space. The ``ptrace()`` system call is invoked as ``ptrace(request, pid, addr, data)`` where: -- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_PEEKMTETAGS``. +- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_POKEMTETAGS``. - ``pid`` - the tracee's PID. - ``addr`` - address in the tracee's address space. 
- ``data`` - pointer to a ``struct iovec`` where ``iov_base`` points to -- cgit v1.2.3-59-g8ed1b From e0533dee522593c25a88b63bf730b2096f6d4122 Mon Sep 17 00:00:00 2001 From: Bailu Lin Date: Tue, 13 Oct 2020 19:20:03 -0700 Subject: Documentation: Chinese translation of Documentation/arm64/hugetlbpage.rst This is a Chinese translated version of Documentation/arm64/hugetlbpage.rst Signed-off-by: Bailu Lin Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20201014022003.43862-1-bailu.lin@vivo.com Signed-off-by: Jonathan Corbet --- Documentation/arm64/hugetlbpage.rst | 2 + .../translations/zh_CN/arm64/hugetlbpage.rst | 45 ++++++++++++++++++++++ Documentation/translations/zh_CN/arm64/index.rst | 1 + 3 files changed, 48 insertions(+) create mode 100644 Documentation/translations/zh_CN/arm64/hugetlbpage.rst (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/hugetlbpage.rst b/Documentation/arm64/hugetlbpage.rst index b44f939e5210..a110124c11e3 100644 --- a/Documentation/arm64/hugetlbpage.rst +++ b/Documentation/arm64/hugetlbpage.rst @@ -1,3 +1,5 @@ +.. _hugetlbpage_index: + ==================== HugeTLBpage on ARM64 ==================== diff --git a/Documentation/translations/zh_CN/arm64/hugetlbpage.rst b/Documentation/translations/zh_CN/arm64/hugetlbpage.rst new file mode 100644 index 000000000000..13304d269d0b --- /dev/null +++ b/Documentation/translations/zh_CN/arm64/hugetlbpage.rst @@ -0,0 +1,45 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: :ref:`Documentation/arm64/hugetlbpage.rst ` + +Translator: Bailu Lin + +===================== +ARM64中的 HugeTLBpage +===================== + +大页依靠有效利用 TLBs 来提高地址翻译的性能。这取决于以下 +两点 - + + - 大页的大小 + - TLBs 支持的条目大小 + +ARM64 接口支持2种大页方式。 + +1) pud/pmd 级别的块映射 +----------------------- + +这是常规大页,他们的 pmd 或 pud 页面表条目指向一个内存块。 +不管 TLB 中支持的条目大小如何,块映射可以减少翻译大页地址 +所需遍历的页表深度。 + +2) 使用连续位 +------------- + +架构中转换页表条目(D4.5.3, ARM DDI 0487C.a)中提供一个连续 +位告诉 MMU 这个条目是一个连续条目集的一员,它可以被缓存在单 +个 TLB 条目中。 + +在 Linux 中连续位用来增加 pmd 和 pte(最后一级)级别映射的大 +小。受支持的连续页表条目数量因页面大小和页表级别而异。 + + +支持以下大页尺寸配置 - + + ====== ======== ==== ======== === + - CONT PTE PMD CONT PMD PUD + ====== ======== ==== ======== === + 4K: 64K 2M 32M 1G + 16K: 2M 32M 1G + 64K: 2M 512M 16G + ====== ======== ==== ======== === diff --git a/Documentation/translations/zh_CN/arm64/index.rst b/Documentation/translations/zh_CN/arm64/index.rst index 646ed1f7aea3..e31a6090384d 100644 --- a/Documentation/translations/zh_CN/arm64/index.rst +++ b/Documentation/translations/zh_CN/arm64/index.rst @@ -14,3 +14,4 @@ ARM64 架构 :maxdepth: 2 amu + hugetlbpage -- cgit v1.2.3-59-g8ed1b From ef5dd6a0c828b6fbd9d595e5772fcb51ff86697e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2020 14:55:24 +0000 Subject: arm64: mte: Document that user PSTATE.TCO is ignored by kernel uaccess On exception entry, the kernel explicitly resets the PSTATE.TCO (tag check override) so that any kernel memory accesses will be checked (the bit is restored on exception return). This has the side-effect that the uaccess routines will not honour the PSTATE.TCO that may have been set by the user prior to a syscall. There is no issue in practice since PSTATE.TCO is expected to be used only for brief periods in specific routines (e.g. garbage collection). To control the tag checking mode of the uaccess routines, the user will have to invoke a corresponding prctl() call. Document the kernel behaviour w.r.t. PSTATE.TCO accordingly. 
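
For illustration only (not part of this patch), the user-visible
effect in the PR_MTE_TCF_SYNC mode is the following, assuming a build
with -march=armv8.5-a+memtag as in the example in
Documentation/arm64/memory-tagging-extension.rst, with fd, tagged_buf
and size taken from surrounding context:

  asm volatile("msr tco, #1");      /* loads/stores issued by this
                                       thread are no longer checked */
  ret = read(fd, tagged_buf, size); /* ...but the kernel's uaccess to
                                       tagged_buf is still checked and
                                       the syscall may fail with
                                       -EFAULT on a tag mismatch */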
Signed-off-by: Catalin Marinas Fixes: df9d7a22dd21 ("arm64: mte: Add Memory Tagging Extension documentation") Reviewed-by: Vincenzo Frascino Cc: Will Deacon Cc: Szabolcs Nagy Signed-off-by: Will Deacon --- Documentation/arm64/memory-tagging-extension.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst index 034d37c605e8..b540178a93f8 100644 --- a/Documentation/arm64/memory-tagging-extension.rst +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -102,7 +102,9 @@ applications. system call) are not checked if the user thread tag checking mode is ``PR_MTE_TCF_NONE`` or ``PR_MTE_TCF_ASYNC``. If the tag checking mode is ``PR_MTE_TCF_SYNC``, the kernel makes a best effort to check its user -address accesses, however it cannot always guarantee it. +address accesses, however it cannot always guarantee it. Kernel accesses +to user addresses are always performed with an effective ``PSTATE.TCO`` +value of zero, regardless of the user configuration. Excluding Tags in the ``IRG``, ``ADDG`` and ``SUBG`` instructions ----------------------------------------------------------------- -- cgit v1.2.3-59-g8ed1b From 96d389ca10110d7eefb46feb6af9a0c6832f78f5 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 28 Oct 2020 13:28:39 -0500 Subject: arm64: Add workaround for Arm Cortex-A77 erratum 1508412 On Cortex-A77 r0p0 and r1p0, a sequence of a non-cacheable or device load and a store exclusive or PAR_EL1 read can cause a deadlock. The workaround requires a DMB SY before and after a PAR_EL1 register read. In addition, it's possible an interrupt (doing a device read) or KVM guest exit could be taken between the DMB and PAR read, so we also need a DMB before returning from interrupt and before returning to a guest. A deadlock is still possible with the workaround as KVM guests must also have the workaround. IOW, a malicious guest can deadlock an affected systems. This workaround also depends on a firmware counterpart to enable the h/w to insert DMB SY after load and store exclusive instructions. See the errata document SDEN-1152370 v10 [1] for more information. [1] https://static.docs.arm.com/101992/0010/Arm_Cortex_A77_MP074_Software_Developer_Errata_Notice_v10.pdf Signed-off-by: Rob Herring Reviewed-by: Catalin Marinas Acked-by: Marc Zyngier Cc: Catalin Marinas Cc: James Morse Cc: Suzuki K Poulose Cc: Will Deacon Cc: Julien Thierry Cc: kvmarm@lists.cs.columbia.edu Link: https://lore.kernel.org/r/20201028182839.166037-2-robh@kernel.org Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 20 ++++++++++++++++++++ arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 9 +++++++++ arch/arm64/kernel/cpu_errata.c | 10 ++++++++++ arch/arm64/kernel/entry.S | 3 +++ arch/arm64/kvm/arm.c | 3 ++- arch/arm64/kvm/hyp/include/hyp/switch.h | 21 +++++++++++++-------- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- arch/arm64/kvm/sys_regs.c | 2 +- arch/arm64/mm/fault.c | 2 +- 13 files changed, 66 insertions(+), 15 deletions(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index d3587805de64..719510247292 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -90,6 +90,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1349291 | N/A | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f858c352f72a..1d466addb078 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -636,6 +636,26 @@ config ARM64_ERRATUM_1542419 If unsure, say Y. +config ARM64_ERRATUM_1508412 + bool "Cortex-A77: 1508412: workaround deadlock on sequence of NC/Device load and store exclusive or PAR read" + default y + help + This option adds a workaround for Arm Cortex-A77 erratum 1508412. + + Affected Cortex-A77 cores (r0p0, r1p0) could deadlock on a sequence + of a store-exclusive or read of PAR_EL1 and a load with device or + non-cacheable memory attributes. The workaround depends on a firmware + counterpart. + + KVM guests must also have the workaround implemented or they can + deadlock the system. + + Work around the issue by inserting DMB SY barriers around PAR_EL1 + register reads and warning KVM users. The DMB barrier is sufficient + to prevent a speculative PAR_EL1 read. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 42868dbd29fd..e7d98997c09c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -65,7 +65,8 @@ #define ARM64_HAS_ARMv8_4_TTL 55 #define ARM64_HAS_TLB_RANGE 56 #define ARM64_MTE 57 +#define ARM64_WORKAROUND_1508412 58 -#define ARM64_NCAPS 58 +#define ARM64_NCAPS 59 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d52c1b3ce589..174817ba119c 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1007,6 +1007,7 @@ #include #include +#include #define __DEFINE_MRS_MSR_S_REGNUM \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ @@ -1095,6 +1096,14 @@ write_sysreg_s(__scs_new, sysreg); \ } while (0) +#define read_sysreg_par() ({ \ + u64 par; \ + asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); \ + par = read_sysreg(par_el1); \ + asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); \ + par; \ +}) + #endif #endif /* __ASM_SYSREG_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 24d75af344b1..61314fd70f13 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -522,6 +522,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .matches = has_neoverse_n1_erratum_1542419, .cpu_enable = cpu_enable_trap_ctr_access, }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1508412 + { + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1508412 (kernel portion)", + .capability = ARM64_WORKAROUND_1508412, + ERRATA_MIDR_RANGE(MIDR_CORTEX_A77, + 0, 0, + 1, 0), + }, #endif { } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f30007dff35f..b295fb912b12 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -365,6 +365,9 @@ alternative_insn 
eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 br x30 #endif .else + /* Ensure any device/NC reads complete */ + alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 + eret .endif sb diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f56122eedffc..8f8fca47abfc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1719,7 +1719,8 @@ int kvm_arch_init(void *opaque) return -ENODEV; } - if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)) + if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || + cpus_have_final_cap(ARM64_WORKAROUND_1508412)) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 313a8fa3c721..1f875a8f20c4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -140,9 +140,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * We do need to save/restore PAR_EL1 though, as we haven't * saved the guest context yet, and we may return early... */ - par = read_sysreg(par_el1); + par = read_sysreg_par(); if (!__kvm_at("s1e1r", far)) - tmp = read_sysreg(par_el1); + tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ write_sysreg(par, par_el1); @@ -421,7 +421,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 && handle_tx2_tvm(vcpu)) - return true; + goto guest; /* * We trap the first access to the FP/SIMD to save the host context @@ -431,13 +431,13 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * Similarly for trapped SVE accesses. 
*/ if (__hyp_handle_fpsimd(vcpu)) - return true; + goto guest; if (__hyp_handle_ptrauth(vcpu)) - return true; + goto guest; if (!__populate_fault_info(vcpu)) - return true; + goto guest; if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; @@ -452,7 +452,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) int ret = __vgic_v2_perform_cpuif_access(vcpu); if (ret == 1) - return true; + goto guest; /* Promote an illegal access to an SError.*/ if (ret == -1) @@ -468,12 +468,17 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) int ret = __vgic_v3_perform_cpuif_access(vcpu); if (ret == 1) - return true; + goto guest; } exit: /* Return to the host kernel and handle the exit */ return false; + +guest: + /* Re-enter the guest */ + asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); + return true; } static inline void __kvm_unexpected_el2_exception(void) diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 7a986030145f..cce43bfe158f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -43,7 +43,7 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); - ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg(par_el1); + ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par(); ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index a457a0306e03..8ae8160bc93a 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -250,7 +250,7 @@ void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); - u64 par = read_sysreg(par_el1); + u64 par = read_sysreg_par(); bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index fe69de16dadc..62546e20b251 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -215,7 +215,7 @@ void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); - u64 par = read_sysreg(par_el1); + u64 par = read_sysreg_par(); __hyp_call_panic(spsr, elr, par); unreachable(); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d9117bc56237..41348a7781d9 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -95,7 +95,7 @@ static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; - case PAR_EL1: *val = read_sysreg_s(SYS_PAR_EL1); break; + case PAR_EL1: *val = read_sysreg_par(); break; case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 94c99c1c19e3..1ee94002801f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -262,7 +262,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long 
addr, local_irq_save(flags); asm volatile("at s1e1r, %0" :: "r" (addr)); isb(); - par = read_sysreg(par_el1); + par = read_sysreg_par(); local_irq_restore(flags); /* -- cgit v1.2.3-59-g8ed1b From f4693c2716b35d0846fd45a4ad7db78bfb25efc8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Oct 2020 17:36:00 +0200 Subject: arm64: mm: extend linear region for 52-bit VA configurations For historical reasons, the arm64 kernel VA space is configured as two equally sized halves, i.e., on a 48-bit VA build, the VA space is split into a 47-bit vmalloc region and a 47-bit linear region. When support for 52-bit virtual addressing was added, this equal split was kept, resulting in a substantial waste of virtual address space in the linear region: 48-bit VA 52-bit VA 0xffff_ffff_ffff_ffff +-------------+ +-------------+ | vmalloc | | vmalloc | 0xffff_8000_0000_0000 +-------------+ _PAGE_END(48) +-------------+ | linear | : : 0xffff_0000_0000_0000 +-------------+ : : : : : : : : : : : : : : : : : currently : : unusable : : : : : : unused : : by : : : : : : : : hardware : : : : : : : 0xfff8_0000_0000_0000 : : _PAGE_END(52) +-------------+ : : | | : : | | : : | | : : | | : : | | : unusable : | | : : | linear | : by : | | : : | region | : hardware : | | : : | | : : | | : : | | : : | | : : | | : : | | 0xfff0_0000_0000_0000 +-------------+ PAGE_OFFSET +-------------+ As illustrated above, the 52-bit VA kernel uses 47 bits for the vmalloc space (as before), to ensure that a single 64k granule kernel image can support any 64k granule capable system, regardless of whether it supports the 52-bit virtual addressing extension. However, due to the fact that the VA space is still split in equal halves, the linear region is only 2^51 bytes in size, wasting almost half of the 52-bit VA space. Let's fix this, by abandoning the equal split, and simply assigning all VA space outside of the vmalloc region to the linear region. The KASAN shadow region is reconfigured so that it ends at the start of the vmalloc region, and grows downwards. That way, the arrangement of the vmalloc space (which contains kernel mappings, modules, BPF region, the vmemmap array etc) is identical between non-KASAN and KASAN builds, which aids debugging. 
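
As a back-of-the-envelope check (editor's sketch, not part of the
patch), the new linear region size for a 52-bit VA configuration
follows directly from _PAGE_OFFSET() and _PAGE_END():

  unsigned long long linear_start = -(1ULL << 52); /* _PAGE_OFFSET(52) */
  unsigned long long linear_end   = -(1ULL << 47); /* _PAGE_END(48)    */

  /* old split: half of the VA space, (1ULL << 51) >> 40 = 2048 TiB */
  /* new split: (linear_end - linear_start) >> 40 = 3968 TiB (~4 PB) */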
Signed-off-by: Ard Biesheuvel Reviewed-by: Steve Capper Link: https://lore.kernel.org/r/20201008153602.9467-3-ardb@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/kasan-offsets.sh | 3 +-- Documentation/arm64/memory.rst | 19 +++++++++---------- arch/arm64/Kconfig | 20 ++++++++++---------- arch/arm64/include/asm/memory.h | 12 +++++------- arch/arm64/mm/init.c | 2 +- 5 files changed, 26 insertions(+), 30 deletions(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/kasan-offsets.sh b/Documentation/arm64/kasan-offsets.sh index 2b7a021db363..2dc5f9e18039 100644 --- a/Documentation/arm64/kasan-offsets.sh +++ b/Documentation/arm64/kasan-offsets.sh @@ -1,12 +1,11 @@ #!/bin/sh # Print out the KASAN_SHADOW_OFFSETS required to place the KASAN SHADOW -# start address at the mid-point of the kernel VA space +# start address at the top of the linear region print_kasan_offset () { printf "%02d\t" $1 printf "0x%08x00000000\n" $(( (0xffffffff & (-1 << ($1 - 1 - 32))) \ - + (1 << ($1 - 32 - $2)) \ - (1 << (64 - 32 - $2)) )) } diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index cf03b3290800..ee51eb66a578 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -32,10 +32,10 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit):: ----------------------------------------------------------------------- 0000000000000000 0000ffffffffffff 256TB user ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map - ffff800000000000 ffff9fffffffffff 32TB kasan shadow region - ffffa00000000000 ffffa00007ffffff 128MB bpf jit region - ffffa00008000000 ffffa0000fffffff 128MB modules - ffffa00010000000 fffffdffbffeffff ~93TB vmalloc +[ ffff600000000000 ffff7fffffffffff ] 32TB [ kasan shadow region ] + ffff800000000000 ffff800007ffffff 128MB bpf jit region + ffff800008000000 ffff80000fffffff 128MB modules + ffff800010000000 fffffdffbffeffff 125TB vmalloc fffffdffbfff0000 fffffdfffe5f8fff ~998MB [guard region] fffffdfffe5f9000 fffffdfffe9fffff 4124KB fixed mappings fffffdfffea00000 fffffdfffebfffff 2MB [guard region] @@ -50,12 +50,11 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support): Start End Size Use ----------------------------------------------------------------------- 0000000000000000 000fffffffffffff 4PB user - fff0000000000000 fff7ffffffffffff 2PB kernel logical memory map - fff8000000000000 fffd9fffffffffff 1440TB [gap] - fffda00000000000 ffff9fffffffffff 512TB kasan shadow region - ffffa00000000000 ffffa00007ffffff 128MB bpf jit region - ffffa00008000000 ffffa0000fffffff 128MB modules - ffffa00010000000 fffff81ffffeffff ~88TB vmalloc + fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map +[ fffd800000000000 ffff7fffffffffff ] 512TB [ kasan shadow region ] + ffff800000000000 ffff800007ffffff 128MB bpf jit region + ffff800008000000 ffff80000fffffff 128MB modules + ffff800010000000 fffff81ffffeffff 120TB vmalloc fffff81fffff0000 fffffc1ffe58ffff ~3TB [guard region] fffffc1ffe590000 fffffc1ffe9fffff 4544KB fixed mappings fffffc1ffea00000 fffffc1ffebfffff 2MB [guard region] diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1515f6f153a0..c6092cbb39af 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -331,16 +331,16 @@ config BROKEN_GAS_INST config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdfffa00000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS - default 0xdfffd00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS - default 
0xdffffe8000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS - default 0xdfffffd000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS - default 0xdffffffa00000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS - default 0xefff900000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS - default 0xefffc80000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS - default 0xeffffe4000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS - default 0xefffffc800000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS - default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS + default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS + default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS + default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS + default 0xdfffffc000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS + default 0xdffffff800000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS + default 0xefff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS + default 0xefffc00000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS + default 0xeffffe0000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS + default 0xefffffc000000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS + default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS default 0xffffffffffffffff source "arch/arm64/Kconfig.platforms" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd61239bae8c..8e89f9b9091e 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -44,7 +44,7 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) -#define BPF_JIT_REGION_START (KASAN_SHADOW_END) +#define BPF_JIT_REGION_START (_PAGE_END(VA_BITS_MIN)) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) @@ -76,10 +76,11 @@ #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \ + KASAN_SHADOW_OFFSET) +#define PAGE_END (KASAN_SHADOW_END - (1UL << (vabits_actual - KASAN_SHADOW_SCALE_SHIFT))) #define KASAN_THREAD_SHIFT 1 #else #define KASAN_THREAD_SHIFT 0 -#define KASAN_SHADOW_END (_PAGE_END(VA_BITS_MIN)) +#define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) @@ -167,7 +168,6 @@ #include extern u64 vabits_actual; -#define PAGE_END (_PAGE_END(vabits_actual)) extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ @@ -238,11 +238,9 @@ static inline const void *__tag_set(const void *addr, u8 tag) /* - * The linear kernel range starts at the bottom of the virtual address - * space. Testing the top bit for the start of the region is a - * sufficient check and avoids having to worry about the tag. + * The linear kernel range starts at the bottom of the virtual address space. 
*/ -#define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 1))) +#define __is_lm_address(addr) (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET)) #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET) #define __kimg_to_phys(addr) ((addr) - kimage_voffset) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 095540667f0f..7e15d92836d8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -269,7 +269,7 @@ static void __init fdt_enforce_memory_region(void) void __init arm64_memblock_init(void) { - const s64 linear_region_size = BIT(vabits_actual - 1); + const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* Handle linux,usable-memory-range property */ fdt_enforce_memory_region(); -- cgit v1.2.3-59-g8ed1b From 8c96400d6a39be763130a5c493647c57726f7013 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Oct 2020 17:36:01 +0200 Subject: arm64: mm: make vmemmap region a projection of the linear region Now that we have reverted the introduction of the vmemmap struct page pointer and the separate physvirt_offset, we can simplify things further, and place the vmemmap region in the VA space in such a way that virtual to page translations and vice versa can be implemented using a single arithmetic shift. One happy coincidence resulting from this is that the 48-bit/4k and 52-bit/64k configurations (which are assumed to be the two most prevalent) end up with the same placement of the vmemmap region. In a subsequent patch, we will take advantage of this, and unify the memory maps even more. Signed-off-by: Ard Biesheuvel Reviewed-by: Steve Capper Link: https://lore.kernel.org/r/20201008153602.9467-4-ardb@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/memory.rst | 30 +++++++++++++++--------------- arch/arm64/include/asm/memory.h | 14 ++++++-------- arch/arm64/mm/init.c | 2 ++ 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index ee51eb66a578..476edb6015b2 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -35,14 +35,14 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit):: [ ffff600000000000 ffff7fffffffffff ] 32TB [ kasan shadow region ] ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 ffff80000fffffff 128MB modules - ffff800010000000 fffffdffbffeffff 125TB vmalloc - fffffdffbfff0000 fffffdfffe5f8fff ~998MB [guard region] - fffffdfffe5f9000 fffffdfffe9fffff 4124KB fixed mappings - fffffdfffea00000 fffffdfffebfffff 2MB [guard region] - fffffdfffec00000 fffffdffffbfffff 16MB PCI I/O space - fffffdffffc00000 fffffdffffdfffff 2MB [guard region] - fffffdffffe00000 ffffffffffdfffff 2TB vmemmap - ffffffffffe00000 ffffffffffffffff 2MB [guard region] + ffff800010000000 fffffbffbffeffff 123TB vmalloc + fffffbffbfff0000 fffffbfffe7f8fff ~998MB [guard region] + fffffbfffe7f9000 fffffbfffebfffff 4124KB fixed mappings + fffffbfffec00000 fffffbfffedfffff 2MB [guard region] + fffffbfffee00000 fffffbffffdfffff 16MB PCI I/O space + fffffbffffe00000 fffffbffffffffff 2MB [guard region] + fffffc0000000000 fffffdffffffffff 2TB vmemmap + fffffe0000000000 ffffffffffffffff 2TB [guard region] AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):: @@ -55,13 +55,13 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support): ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 
ffff80000fffffff 128MB modules ffff800010000000 fffff81ffffeffff 120TB vmalloc - fffff81fffff0000 fffffc1ffe58ffff ~3TB [guard region] - fffffc1ffe590000 fffffc1ffe9fffff 4544KB fixed mappings - fffffc1ffea00000 fffffc1ffebfffff 2MB [guard region] - fffffc1ffec00000 fffffc1fffbfffff 16MB PCI I/O space - fffffc1fffc00000 fffffc1fffdfffff 2MB [guard region] - fffffc1fffe00000 ffffffffffdfffff 3968GB vmemmap - ffffffffffe00000 ffffffffffffffff 2MB [guard region] + fffff81fffff0000 fffffbfffe38ffff ~3TB [guard region] + fffffbfffe390000 fffffbfffebfffff 4544KB fixed mappings + fffffbfffec00000 fffffbfffedfffff 2MB [guard region] + fffffbfffee00000 fffffbffffdfffff 16MB PCI I/O space + fffffbffffe00000 fffffbffffffffff 2MB [guard region] + fffffc0000000000 ffffffdfffffffff ~4TB vmemmap + ffffffe000000000 ffffffffffffffff 128GB [guard region] Translation table lookup with 4KB pages:: diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 8e89f9b9091e..ecd6342e27ca 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -30,8 +30,8 @@ * keep a constant PAGE_OFFSET and "fallback" to using the higher end * of the VMEMMAP where 52-bit support is not available in hardware. */ -#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) \ - >> (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)) +#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT) /* * PAGE_OFFSET - the virtual address of the start of the linear map, at the @@ -50,7 +50,7 @@ #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (BPF_JIT_REGION_END) #define MODULES_VSIZE (SZ_128M) -#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) +#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) #define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) @@ -308,15 +308,13 @@ static inline void *phys_to_virt(phys_addr_t x) #else #define page_to_virt(x) ({ \ __typeof__(x) __page = x; \ - u64 __idx = ((u64)__page - VMEMMAP_START) / sizeof(struct page);\ - u64 __addr = PAGE_OFFSET + (__idx * PAGE_SIZE); \ + u64 __addr = (u64)__page << VMEMMAP_SHIFT; \ (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ }) #define virt_to_page(x) ({ \ - u64 __idx = (__tag_reset((u64)x) - PAGE_OFFSET) / PAGE_SIZE; \ - u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page)); \ - (struct page *)__addr; \ + u64 __addr = __tag_reset((u64)(x)) & PAGE_MASK; \ + (struct page *)((s64)__addr >> VMEMMAP_SHIFT); \ }) #endif /* !CONFIG_SPARSEMEM_VMEMMAP || CONFIG_DEBUG_VIRTUAL */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7e15d92836d8..3a5e9f9298e9 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -502,6 +502,8 @@ static void __init free_unused_memmap(void) */ void __init mem_init(void) { + BUILD_BUG_ON(!is_power_of_2(sizeof(struct page))); + if (swiotlb_force == SWIOTLB_FORCE || max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit)) swiotlb_init(1); -- cgit v1.2.3-59-g8ed1b From 9ad7c6d5e75b160c9ce5775db610d964af45b83f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Oct 2020 17:36:02 +0200 Subject: arm64: mm: tidy up top of kernel VA space Tidy up the way the top of the kernel VA space is organized, by mirroring the 256 MB region we have below the vmalloc space, and populating it top down with the PCI I/O space, some guard regions, and the fixmap region. 
The latter region is itself populated top down, and today only covers about 4 MB, and so 224 MB is ample, and no guard region is therefore required. The resulting layout is identical between 48-bit/4k and 52-bit/64k configurations. Signed-off-by: Ard Biesheuvel Reviewed-by: Steve Capper Link: https://lore.kernel.org/r/20201008153602.9467-5-ardb@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/memory.rst | 22 ++++++++++------------ arch/arm64/include/asm/memory.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index 476edb6015b2..3d62604fa7bd 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -35,12 +35,11 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit):: [ ffff600000000000 ffff7fffffffffff ] 32TB [ kasan shadow region ] ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 ffff80000fffffff 128MB modules - ffff800010000000 fffffbffbffeffff 123TB vmalloc - fffffbffbfff0000 fffffbfffe7f8fff ~998MB [guard region] - fffffbfffe7f9000 fffffbfffebfffff 4124KB fixed mappings - fffffbfffec00000 fffffbfffedfffff 2MB [guard region] - fffffbfffee00000 fffffbffffdfffff 16MB PCI I/O space - fffffbffffe00000 fffffbffffffffff 2MB [guard region] + ffff800010000000 fffffbffefffffff 124TB vmalloc + fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) + fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] + fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space + fffffbffff800000 fffffbffffffffff 8MB [guard region] fffffc0000000000 fffffdffffffffff 2TB vmemmap fffffe0000000000 ffffffffffffffff 2TB [guard region] @@ -54,12 +53,11 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support): [ fffd800000000000 ffff7fffffffffff ] 512TB [ kasan shadow region ] ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 ffff80000fffffff 128MB modules - ffff800010000000 fffff81ffffeffff 120TB vmalloc - fffff81fffff0000 fffffbfffe38ffff ~3TB [guard region] - fffffbfffe390000 fffffbfffebfffff 4544KB fixed mappings - fffffbfffec00000 fffffbfffedfffff 2MB [guard region] - fffffbfffee00000 fffffbffffdfffff 16MB PCI I/O space - fffffbffffe00000 fffffbffffffffff 2MB [guard region] + ffff800010000000 fffffbffefffffff 124TB vmalloc + fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) + fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] + fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space + fffffbffff800000 fffffbffffffffff 8MB [guard region] fffffc0000000000 ffffffdfffffffff ~4TB vmemmap ffffffe000000000 ffffffffffffffff 128GB [guard region] diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index ecd6342e27ca..03e9b112bd94 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -52,9 +52,9 @@ #define MODULES_VSIZE (SZ_128M) #define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) -#define PCI_IO_END (VMEMMAP_START - SZ_2M) +#define PCI_IO_END (VMEMMAP_START - SZ_8M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) -#define FIXADDR_TOP (PCI_IO_START - SZ_2M) +#define FIXADDR_TOP (VMEMMAP_START - SZ_32M) #if VA_BITS > 48 #define VA_BITS_MIN (48) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4ff12a7adcfd..ec307b8bcb15 100644 --- a/arch/arm64/include/asm/pgtable.h +++ 
b/arch/arm64/include/asm/pgtable.h @@ -22,7 +22,7 @@ * and fixed mappings */ #define VMALLOC_START (MODULES_END) -#define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) +#define VMALLOC_END (VMEMMAP_START - SZ_256M) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) -- cgit v1.2.3-59-g8ed1b From 68af6d2483dbd0385317bc87a338b155be75eeb6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 10 Nov 2020 14:08:51 +0100 Subject: Documentation/arm64: fix RST layout of memory.rst Stephen reports that commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit VA configurations") triggers the following warnings when building the htmldocs make target of today's linux-next: Documentation/arm64/memory.rst:35: WARNING: Literal block ends without a blank line; unexpected unindent. Documentation/arm64/memory.rst:53: WARNING: Literal block ends without a blank line; unexpected unindent. Let's tweak the memory layout table to work around this. Reported-by: Stephen Rothwell Signed-off-by: Ard Biesheuvel Fixes: f4693c2716b3 ("arm64: mm: extend linear region for 52-bit VA configurations") Link: https://lore.kernel.org/r/20201110130851.15751-1-ardb@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/memory.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation/arm64') diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index 3d62604fa7bd..e7522e5c8322 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -32,7 +32,7 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit):: ----------------------------------------------------------------------- 0000000000000000 0000ffffffffffff 256TB user ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map -[ ffff600000000000 ffff7fffffffffff ] 32TB [ kasan shadow region ] + [ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region] ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 ffff80000fffffff 128MB modules ffff800010000000 fffffbffefffffff 124TB vmalloc @@ -50,7 +50,7 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support): ----------------------------------------------------------------------- 0000000000000000 000fffffffffffff 4PB user fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map -[ fffd800000000000 ffff7fffffffffff ] 512TB [ kasan shadow region ] + [fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region] ffff800000000000 ffff800007ffffff 128MB bpf jit region ffff800008000000 ffff80000fffffff 128MB modules ffff800010000000 fffffbffefffffff 124TB vmalloc -- cgit v1.2.3-59-g8ed1b From dceec3ff78076757311d92a388d50d0251fb7dbb Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 20 Nov 2020 12:33:46 -0800 Subject: arm64: expose FAR_EL1 tag bits in siginfo The kernel currently clears the tag bits (i.e. bits 56-63) in the fault address exposed via siginfo.si_addr and sigcontext.fault_address. However, the tag bits may be needed by tools in order to accurately diagnose memory errors, such as HWASan [1] or future tools based on the Memory Tagging Extension (MTE). Expose these bits via the arch_untagged_si_addr mechanism, so that they are only exposed to signal handlers with the SA_EXPOSE_TAGBITS flag set. 
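
For illustration only (not part of this patch), a signal handler
would opt in as follows, with SA_EXPOSE_TAGBITS assumed to come from
the updated uapi signal headers:

  static volatile unsigned long fault_tag;

  static void handler(int sig, siginfo_t *info, void *ucontext)
  {
          /* bits 56-63 of si_addr now hold the tag; bits 63:60 are
             undefined for SEGV_MTESERR */
          fault_tag = (unsigned long)info->si_addr >> 56;
  }

  struct sigaction sa = { 0 };

  sa.sa_sigaction = handler;
  sa.sa_flags = SA_SIGINFO | SA_EXPOSE_TAGBITS;
  sigaction(SIGSEGV, &sa, NULL);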
From dceec3ff78076757311d92a388d50d0251fb7dbb Mon Sep 17 00:00:00 2001
From: Peter Collingbourne
Date: Fri, 20 Nov 2020 12:33:46 -0800
Subject: arm64: expose FAR_EL1 tag bits in siginfo

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address.
However, the tag bits may be needed by tools in order to accurately
diagnose memory errors, such as HWASan [1] or future tools based on the
Memory Tagging Extension (MTE).

Expose these bits via the arch_untagged_si_addr mechanism, so that they
are only exposed to signal handlers with the SA_EXPOSE_TAGBITS flag set.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne
Reviewed-by: Catalin Marinas
Link: https://linux-review.googlesource.com/id/Ia8876bad8c798e0a32df7c2ce1256c4771c81446
Link: https://lore.kernel.org/r/0010296597784267472fa13b39f8238d87a72cf8.1605904350.git.pcc@google.com
Signed-off-by: Catalin Marinas
---
 Documentation/arm64/tagged-pointers.rst | 25 +++++++---
 arch/arm64/include/asm/exception.h      |  2 +-
 arch/arm64/include/asm/signal.h         | 25 ++++++++++
 arch/arm64/include/asm/system_misc.h    |  2 +-
 arch/arm64/include/asm/traps.h          |  6 +--
 arch/arm64/kernel/debug-monitors.c      |  5 +-
 arch/arm64/kernel/entry-common.c        |  2 -
 arch/arm64/kernel/ptrace.c              |  7 +--
 arch/arm64/kernel/sys_compat.c          |  5 +-
 arch/arm64/kernel/traps.c               | 29 ++++++------
 arch/arm64/mm/fault.c                   | 83 ++++++++++++++++++++-------------
 11 files changed, 120 insertions(+), 71 deletions(-)
 create mode 100644 arch/arm64/include/asm/signal.h

(limited to 'Documentation/arm64')

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..19d284b70384 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,25 @@ visibility.
 Preserving tags
 ---------------

-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+When delivering signals, non-zero tags are not preserved in
+siginfo.si_addr unless the flag SA_EXPOSE_TAGBITS was set in
+sigaction.sa_flags when the signal handler was installed. This means
+that signal handlers in applications making use of tags cannot rely
+on the tag information for user virtual addresses being maintained
+in these fields unless the flag was set.
+
+Due to architecture limitations, bits 63:60 of the fault address
+are not preserved in response to synchronous tag check faults
+(SEGV_MTESERR) even if SA_EXPOSE_TAGBITS was set. Applications should
+treat the values of these bits as undefined in order to accommodate
+future architecture revisions which may preserve the bits.
+
+For signals raised in response to watchpoint debug exceptions, the
+tag information will be preserved regardless of the SA_EXPOSE_TAGBITS
+flag setting.
+
+Non-zero tags are never preserved in sigcontext.fault_address
+regardless of the SA_EXPOSE_TAGBITS flag setting.

 The architecture prevents the use of a tagged PC, so the upper
 byte will be set to a sign-extension of bit 55 on exception return.
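From the user side, opting in looks roughly like the sketch below: a SIGSEGV
handler installed with SA_EXPOSE_TAGBITS in sa_flags can read the tag byte
straight out of si_addr. This is a minimal sketch, assuming headers that
define SA_EXPOSE_TAGBITS; the faulting pointer and its 0x2a tag are made up
for the example, and printf is not async-signal-safe and is used only to
keep the demo short:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void segv_handler(int sig, siginfo_t *info, void *ucontext)
    {
        uint64_t addr = (uint64_t)(uintptr_t)info->si_addr;

        /* With SA_EXPOSE_TAGBITS the tag byte (bits 63:56) is preserved;
         * for SEGV_MTESERR only bits 59:56 are meaningful, as documented
         * above. */
        printf("si_addr = %p, top byte = 0x%02x\n",
               info->si_addr, (unsigned int)(addr >> 56));
        _Exit(EXIT_FAILURE);
    }

    int main(void)
    {
        struct sigaction sa = { 0 };

        sa.sa_sigaction = segv_handler;
        sa.sa_flags = SA_SIGINFO | SA_EXPOSE_TAGBITS;
        sigaction(SIGSEGV, &sa, NULL);

        /* Fault through an unmapped pointer carrying a made-up tag of 0x2a. */
        *(volatile int *)((0x2aUL << 56) | 0x1000) = 1;
        return 0;
    }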
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 99b9383cd036..2a8aa1884d8a 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }

 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 void do_bti(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h
new file mode 100644
index 000000000000..ef449f5f4ba8
--- /dev/null
+++ b/arch/arm64/include/asm/signal.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARM64_ASM_SIGNAL_H
+#define __ARM64_ASM_SIGNAL_H
+
+#include <asm/memory.h>
+#include <uapi/asm/signal.h>
+#include <uapi/asm/siginfo.h>
+
+static inline void __user *arch_untagged_si_addr(void __user *addr,
+						 unsigned long sig,
+						 unsigned long si_code)
+{
+	/*
+	 * For historical reasons, all bits of the fault address are exposed as
+	 * address bits for watchpoint exceptions. New architectures should
+	 * handle the tag bits consistently.
+	 */
+	if (sig == SIGTRAP && si_code == TRAP_BRKPT)
+		return addr;
+
+	return untagged_addr(addr);
+}
+#define arch_untagged_si_addr arch_untagged_si_addr
+
+#endif
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index 1ab63cfbbaf1..673be2d1263c 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -22,7 +22,7 @@ void die(const char *msg, struct pt_regs *regs, int err);
 struct siginfo;
 void arm64_notify_die(const char *str, struct pt_regs *regs,
-		      int signo, int sicode, void __user *addr,
+		      int signo, int sicode, unsigned long far,
 		      int err);

 void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index d96dc2c7c09d..54f32a0675df 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -26,9 +26,9 @@ void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 void force_signal_inject(int signal, int code, unsigned long address, unsigned int err);
 void arm64_notify_segfault(unsigned long addr);
-void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str);
-void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str);
-void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str);
+void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str);
+void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str);
+void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str);

 /*
  * Move regs->pc to next instruction and do necessary setup before it
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index fa76151de6ff..4f3661eeb7ec 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -234,9 +234,8 @@ static void send_user_sigtrap(int si_code)
 	if (interrupts_enabled(regs))
 		local_irq_enable();

-	arm64_force_sig_fault(SIGTRAP, si_code,
-			      (void __user *)instruction_pointer(regs),
-			      "User debug trap");
+	arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs),
+			      "User debug trap");
 }

 static int single_step_handler(unsigned long unused, unsigned int esr,
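The fallback path in arch_untagged_si_addr() above is untagged_addr(), which
on arm64 sign-extends the address from bit 55: user (TTBR0) addresses get
their top byte cleared, kernel (TTBR1) addresses get it refilled with ones.
A user-space model of that arithmetic (a sketch of the semantics only, not
the kernel macro itself):

    #include <assert.h>
    #include <stdint.h>

    /* Model of untagged_addr(): shift the tag byte out, then sign-extend
     * from what was bit 55. Relies on arithmetic right shift of a signed
     * value, which is implementation-defined in C but universal in practice. */
    static uint64_t untag(uint64_t addr)
    {
        return (uint64_t)((int64_t)(addr << 8) >> 8);
    }

    int main(void)
    {
        /* A tagged user (TTBR0) address: the tag byte is cleared. */
        assert(untag(0x5a00001234567890ULL) == 0x0000001234567890ULL);
        /* A tagged kernel (TTBR1) address: the top byte becomes 0xff. */
        assert(untag(0x12ffff8000abcdefULL) == 0xffffff8000abcdefULL);
        return 0;
    }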
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 43d4c329775f..dbbddfbf4a72 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);

 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -114,7 +113,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)

 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index f49b349e16a3..8ac487c84e37 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -192,14 +192,11 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 			break;
 		}
 	}
-	arm64_force_sig_ptrace_errno_trap(si_errno,
-					  (void __user *)bkpt->trigger,
+	arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger,
 					  desc);
 	}
 #endif
-	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-			      (void __user *)(bkpt->trigger),
-			      desc);
+	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc);
 }

 /*
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 3c18c2454089..265fe3eb1069 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -68,7 +68,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags)
  */
 long compat_arm_syscall(struct pt_regs *regs, int scno)
 {
-	void __user *addr;
+	unsigned long addr;

 	switch (scno) {
 	/*
@@ -111,8 +111,7 @@ long compat_arm_syscall(struct pt_regs *regs, int scno)
 		break;
 	}

-	addr = (void __user *)instruction_pointer(regs) -
-	       (compat_thumb_mode(regs) ? 2 : 4);
+	addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4);
 	arm64_notify_die("Oops - bad compat syscall(2)", regs,
 			 SIGILL, ILL_ILLTRP, addr, scno);
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 8af4e0e85736..f4ddbe9ed3f1 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -170,32 +170,32 @@ static void arm64_show_signal(int signo, const char *str)
 	__show_regs(regs);
 }

-void arm64_force_sig_fault(int signo, int code, void __user *addr,
+void arm64_force_sig_fault(int signo, int code, unsigned long far,
 			   const char *str)
 {
 	arm64_show_signal(signo, str);
 	if (signo == SIGKILL)
 		force_sig(SIGKILL);
 	else
-		force_sig_fault(signo, code, addr);
+		force_sig_fault(signo, code, (void __user *)far);
 }

-void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
+void arm64_force_sig_mceerr(int code, unsigned long far, short lsb,
 			    const char *str)
 {
 	arm64_show_signal(SIGBUS, str);
-	force_sig_mceerr(code, addr, lsb);
+	force_sig_mceerr(code, (void __user *)far, lsb);
 }

-void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
+void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far,
 				       const char *str)
 {
 	arm64_show_signal(SIGTRAP, str);
-	force_sig_ptrace_errno_trap(errno, addr);
+	force_sig_ptrace_errno_trap(errno, (void __user *)far);
 }

 void arm64_notify_die(const char *str, struct pt_regs *regs,
-		      int signo, int sicode, void __user *addr,
+		      int signo, int sicode, unsigned long far,
 		      int err)
 {
 	if (user_mode(regs)) {
@@ -203,7 +203,7 @@ void arm64_notify_die(const char *str, struct pt_regs *regs,
 		current->thread.fault_address = 0;
 		current->thread.fault_code = err;

-		arm64_force_sig_fault(signo, sicode, addr, str);
+		arm64_force_sig_fault(signo, sicode, far, str);
 	} else {
 		die(str, regs, err);
 	}
@@ -374,7 +374,7 @@ void force_signal_inject(int signal, int code, unsigned long address, unsigned i
 		signal = SIGKILL;
 	}

-	arm64_notify_die(desc, regs, signal, code, (void __user *)address, err);
+	arm64_notify_die(desc, regs, signal, code, address, err);
 }

 /*
@@ -385,7 +385,7 @@ void arm64_notify_segfault(unsigned long addr)
 	int code;

 	mmap_read_lock(current->mm);
-	if (find_vma(current->mm, addr) == NULL)
+	if (find_vma(current->mm, untagged_addr(addr)) == NULL)
 		code = SEGV_MAPERR;
 	else
 		code = SEGV_ACCERR;
@@ -448,12 +448,13 @@ NOKPROBE_SYMBOL(do_ptrauth_fault);

 static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 {
-	unsigned long address;
+	unsigned long tagged_address, address;
 	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 	int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
 	int ret = 0;

-	address = untagged_addr(pt_regs_read_reg(regs, rt));
+	tagged_address = pt_regs_read_reg(regs, rt);
+	address = untagged_addr(tagged_address);

 	switch (crm) {
 	case ESR_ELx_SYS64_ISS_CRM_DC_CVAU:	/* DC CVAU, gets promoted */
@@ -480,7 +481,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 	}

 	if (ret)
-		arm64_notify_segfault(address);
+		arm64_notify_segfault(tagged_address);
 	else
 		arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
 }
@@ -772,7 +773,7 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
  */
 void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 {
-	void __user *pc = (void __user *)instruction_pointer(regs);
+	unsigned long pc = instruction_pointer(regs);

 	current->thread.fault_address = 0;
 	current->thread.fault_code = esr;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 1ee94002801f..29a6b8c9e830 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -40,7 +40,7 @@
 #include

 struct fault_info {
-	int (*fn)(unsigned long addr, unsigned int esr,
+	int (*fn)(unsigned long far, unsigned int esr,
 		  struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -385,8 +385,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }

-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -395,8 +398,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 		const struct fault_info *inf = esr_to_fault_info(esr);

 		set_thread_esr(addr, esr);
-		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
-				      inf->name);
+		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -448,7 +450,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }

-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -456,6 +458,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);

 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -567,8 +570,7 @@ retry:
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
-				      inf->name);
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
 	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
 		unsigned int lsb;

@@ -576,8 +578,7 @@ retry:
 		if (fault & VM_FAULT_HWPOISON_LARGE)
 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

-		arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
-				       inf->name);
+		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
@@ -585,8 +586,7 @@ retry:
 		arm64_force_sig_fault(SIGSEGV,
 				      fault == VM_FAULT_BADACCESS ?
 				      SEGV_ACCERR : SEGV_MAPERR,
-				      (void __user *)addr,
-				      inf->name);
+				      far, inf->name);
 	}

 	return 0;

@@ -596,33 +596,35 @@ no_context:
 	return 0;
 }

-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);

-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }

-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }

-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }

-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
-	void __user *siaddr;
+	unsigned long siaddr;

 	inf = esr_to_fault_info(esr);
@@ -634,19 +636,30 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		return 0;
 	}

-	if (esr & ESR_ELx_FnV)
-		siaddr = NULL;
-	else
-		siaddr = (void __user *)addr;
+	if (esr & ESR_ELx_FnV) {
+		siaddr = 0;
+	} else {
+		/*
+		 * The architecture specifies that the tag bits of FAR_EL1 are
+		 * UNKNOWN for synchronous external aborts. Mask them out now
+		 * so that userspace doesn't see them.
+		 */
+		siaddr = untagged_addr(far);
+	}
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

 	return 0;
 }

-static int do_tag_check_fault(unsigned long addr, unsigned int esr,
+static int do_tag_check_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	/*
+	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag
+	 * check faults. Mask them out now so that userspace doesn't see them.
+	 */
+	far &= (1UL << 60) - 1;
+	do_bad_area(far, esr, regs);
 	return 0;
 }
@@ -717,11 +730,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"	},
 };

-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);

-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;

 	if (!user_mode(regs)) {
@@ -730,8 +744,12 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		show_pte(addr);
 	}

-	arm64_notify_die(inf->name, regs,
-			 inf->sig, inf->code, (void __user *)addr, esr);
+	/*
+	 * At this point we have an unrecognized fault type whose tag bits may
+	 * have been defined as UNKNOWN. Therefore we only expose the untagged
+	 * address to the signal handler.
+	 */
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
 }
 NOKPROBE_SYMBOL(do_mem_abort);

@@ -744,8 +762,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);

 void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	arm64_notify_die("SP/PC alignment exception", regs,
-			 SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
+	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+			 addr, esr);
 }
 NOKPROBE_SYMBOL(do_sp_pc_abort);

@@ -871,8 +889,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 		arm64_apply_bp_hardening();

 	if (inf->fn(addr_if_watchpoint, esr, regs)) {
-		arm64_notify_die(inf->name, regs,
-				 inf->sig, inf->code, (void __user *)pc, esr);
+		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
 	}

 	debug_exception_exit(regs);
--
cgit v1.2.3-59-g8ed1b
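Tying the pieces together: after this change, a handler installed with
SA_EXPOSE_TAGBITS sees a SEGV_MTESERR fault address with bits 63:60 cleared
by do_tag_check_fault() while the MTE logical tag in bits 59:56 survives,
exactly as the tagged-pointers.rst text above describes. A quick user-space
check of that masking arithmetic, with made-up values:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* A fault address whose top nibble (bits 63:60, architecturally
         * UNKNOWN for tag check faults) is 0xa and whose MTE logical tag
         * (bits 59:56) is 0x7; the low bits are made up. */
        uint64_t far = 0xa70000aaaabbb000ULL;

        /* The mask applied by do_tag_check_fault() above. */
        uint64_t siaddr = far & ((1ULL << 60) - 1);

        assert(siaddr == 0x070000aaaabbb000ULL); /* bits 63:60 cleared   */
        assert(siaddr >> 56 == 0x7);             /* logical tag retained */
        return 0;
    }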