From 6c84f8c5cbfb4bf728f88296bc035c4a401c3423 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Mar 2019 09:47:50 +0000 Subject: powerpc/highmem: Change BUG_ON() to WARN_ON() In arch/powerpc/mm/highmem.c, BUG_ON() is called only when CONFIG_DEBUG_HIGHMEM is selected, which means the BUG_ON() is not vital and can be replaced by a WARN_ON(). At the same time, use IS_ENABLED() instead of #ifdef to clean up a bit. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/highmem.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 82a0e37557a5..320c1672b2ae 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -43,9 +43,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif + WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); local_flush_tlb_page(NULL, vaddr); @@ -56,7 +54,6 @@ EXPORT_SYMBOL(kmap_atomic_prot); void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type __maybe_unused; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); @@ -64,14 +61,12 @@ void __kunmap_atomic(void *kvaddr) return; } - type = kmap_atomic_idx(); - -#ifdef CONFIG_DEBUG_HIGHMEM - { + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { + int type = kmap_atomic_idx(); unsigned int idx; idx = type + KM_TYPE_NR * smp_processor_id(); - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); /* * force other mappings to Oops if they'll try to access @@ -80,7 +75,6 @@ pte_clear(&init_mm, vaddr, kmap_pte-idx); local_flush_tlb_page(NULL, vaddr); } -#endif kmap_atomic_idx_pop(); pagefault_enable(); -- cgit v1.2.3-59-g8ed1b From f172acbfae1a78b1a3c775f78e8d0dcd15b9d768 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 13 Mar 2019 11:25:28 +0100 Subject: powerpc/mm: move warning from resize_hpt_for_hotplug() resize_hpt_for_hotplug() reports a warning when it cannot resize the hash page table ("Unable to resize hash page table to target order") but in some cases it's not a problem and can make the user think something has not worked properly. This patch moves the warning to arch_remove_memory() to only report the problem when it is needed. 
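As a minimal standalone sketch of the pattern applied here, under the assumption that the kernel helpers are reduced to plain C (do_resize() and resize_for_hotplug() are illustrative stand-ins, not the kernel's API): the resize helper returns an error code instead of printing, and the caller warns only about the error it considers meaningful.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for mmu_hash_ops.resize_hpt(). */
static int do_resize(unsigned int target_order)
{
	return -ENOSPC; /* pretend the resize hit a hash collision */
}

/* Like resize_hpt_for_hotplug() after this patch: report nothing, return the rc. */
static int resize_for_hotplug(unsigned int target_order, unsigned int current_order)
{
	if (target_order == current_order)
		return 0; /* nothing to do is not an error */
	return do_resize(target_order);
}

int main(void)
{
	/* Like arch_remove_memory(): only the interesting failure is reported. */
	if (resize_for_hotplug(27, 28) == -ENOSPC)
		printf("Hash collision while resizing HPT\n");
	return 0;
}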
Reviewed-by: David Gibson Signed-off-by: Laurent Vivier Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/sparsemem.h | 4 ++-- arch/powerpc/mm/hash_utils_64.c | 19 +++++++------------ arch/powerpc/mm/mem.c | 3 ++- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 68da49320592..3192d454a733 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -17,9 +17,9 @@ extern int create_section_mapping(unsigned long start, unsigned long end, int ni extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_PPC_BOOK3S_64 -extern void resize_hpt_for_hotplug(unsigned long new_mem_size); +extern int resize_hpt_for_hotplug(unsigned long new_mem_size); #else -static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { } +static inline int resize_hpt_for_hotplug(unsigned long new_mem_size) { return 0; } #endif #ifdef CONFIG_NUMA diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0a4f939a8161..c4c9610ce6e3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -755,12 +755,12 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void resize_hpt_for_hotplug(unsigned long new_mem_size) +int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; if (!mmu_hash_ops.resize_hpt) - return; + return 0; target_hpt_shift = htab_shift_for_mem_size(new_mem_size); @@ -772,16 +772,11 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) * reduce unless the target shift is at least 2 below the * current shift */ - if ((target_hpt_shift > ppc64_pft_size) - || (target_hpt_shift < (ppc64_pft_size - 1))) { - int rc; - - rc = mmu_hash_ops.resize_hpt(target_hpt_shift); - if (rc && (rc != -ENODEV)) - printk(KERN_WARNING - "Unable to resize hash page table to target order %d: %d\n", - target_hpt_shift, rc); - } + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; } int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f6787f90e158..3665602a9dfa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -161,7 +161,8 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, */ vm_unmap_aliases(); - resize_hpt_for_hotplug(memblock_phys_mem_size()); + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); return ret; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f2a9f0adc2d3..1034ef1fe2b4 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -901,8 +901,10 @@ static int pseries_lpar_resize_hpt(unsigned long shift) break; case H_PARAMETER: + pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n"); return -EINVAL; case H_RESOURCE: + pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n"); return -EPERM; default: pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc); @@ -918,7 +920,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift) if (rc != 0) { switch (state.commit_rc) { case H_PTEG_FULL: - pr_warn("Hash collision while resizing 
HPT\n"); return -ENOSPC; default: -- cgit v1.2.3-59-g8ed1b From 6917735e8f905da1f62ccdf62830b185524835c7 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sat, 23 Mar 2019 18:20:55 +0530 Subject: powerpc: Remove duplicate headers Remove duplicate header inclusions. Signed-off-by: Jagadeesh Pagadala Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/numa.c | 1 - 3 files changed, 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc0503ef9c9c..6ef32472ee1d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 506413a2c25e..587ff9788ab0 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f976676004ad..f3ee0e18edd6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 2d4d9b308f8f8dec68f6dbbff18c68ec7c6bd26f Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:57 -0500 Subject: powerpc/numa: improve control of topology updates When booted with "topology_updates=no", or when "off" is written to /proc/powerpc/topology_updates, NUMA reassignments are inhibited for PRRN and VPHN events. However, migration and suspend unconditionally re-enable reassignments via start_topology_update(). This is incoherent. Check the topology_updates_enabled flag in start/stop_topology_update() so that callers of those APIs need not be aware of whether reassignments are enabled. This allows the administrative decision on reassignments to remain in force across migrations and suspensions. 
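A self-contained sketch of the resulting control flow (only the flag name and the start/stop split mirror the patch; everything else is illustrative): the policy check lives inside start/stop, so a migration or resume path can call them unconditionally without re-enabling anything.

#include <stdbool.h>
#include <stdio.h>

static bool topology_updates_enabled; /* administrative policy, set at boot or via /proc */
static bool prrn_enabled;             /* whether we are actually polling for events */

static int start_topology_update_sketch(void)
{
	if (!topology_updates_enabled)
		return 0; /* the administrative decision stays in force */
	prrn_enabled = true;
	return 1;
}

static int stop_topology_update_sketch(void)
{
	if (!topology_updates_enabled)
		return 0;
	prrn_enabled = false;
	return 1;
}

int main(void)
{
	/* A migration/suspend path may call this unconditionally... */
	start_topology_update_sketch();
	/* ...yet with updates disabled nothing gets re-enabled. */
	printf("prrn_enabled = %d\n", prrn_enabled);
	return 0;
}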
Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f3ee0e18edd6..952ada44df62 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1497,6 +1497,9 @@ int start_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; @@ -1530,6 +1533,9 @@ int stop_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (prrn_enabled) { prrn_enabled = 0; #ifdef CONFIG_SMP @@ -1587,11 +1593,13 @@ static ssize_t topology_write(struct file *file, const char __user *buf, kbuf[read_len] = '\0'; - if (!strncmp(kbuf, "on", 2)) + if (!strncmp(kbuf, "on", 2)) { + topology_updates_enabled = true; start_topology_update(); - else if (!strncmp(kbuf, "off", 3)) + } else if (!strncmp(kbuf, "off", 3)) { stop_topology_update(); - else + topology_updates_enabled = false; + } else return -EINVAL; return count; @@ -1606,9 +1614,7 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - /* Do not poll for changes if disabled at boot */ - if (topology_updates_enabled) - start_topology_update(); + start_topology_update(); if (vphn_enabled) topology_schedule_update(); -- cgit v1.2.3-59-g8ed1b From 558f86493df09f68f79fe056d9028d317a3ce8ab Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:58 -0500 Subject: powerpc/numa: document topology_updates_enabled, disable by default Changing the NUMA associations for CPUs and memory at runtime is basically unsupported by the core mm, scheduler etc. We see all manner of crashes, warnings and instability when the pseries code tries to do this. Disable this behavior by default, and document the switch a bit. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 952ada44df62..6ef36d553cde 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -907,16 +907,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); -static bool topology_updates_enabled = true; +/* + * The platform can inform us through one of several mechanisms + * (post-migration device tree updates, PRRN or VPHN) that the NUMA + * assignment of a resource has changed. This controls whether we act + * on that. Disabled by default. + */ +static bool topology_updates_enabled; static int __init early_topology_updates(char *p) { if (!p) return 0; - if (!strcmp(p, "off")) { - pr_info("Disabling topology updates\n"); - topology_updates_enabled = false; + if (!strcmp(p, "on")) { + pr_warn("Caution: enabling topology updates\n"); + topology_updates_enabled = true; } return 0; -- cgit v1.2.3-59-g8ed1b From 69795cabe4cfe5122438d50010ad5310c113a013 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:18 +1000 Subject: powerpc: Add framework for Kernel Userspace Protection This patch adds a skeleton for Kernel Userspace Protection functionalities like Kernel Userspace Access Protection and Kernel Userspace Execution Prevention. The subsequent implementation of KUAP for radix makes use of an MMU feature in order to patch out assembly when KUAP is disabled or unsupported. 
This won't work unless there's an entry point for KUP support before the feature magic happens, so for PPC64 setup_kup() is called early in setup. On PPC32, feature_fixup() is done too early to allow the same. Suggested-by: Russell Currey Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 11 +++++++++++ arch/powerpc/kernel/setup_64.c | 7 +++++++ arch/powerpc/mm/init-common.c | 5 +++++ arch/powerpc/mm/init_32.c | 3 +++ 4 files changed, 26 insertions(+) create mode 100644 arch/powerpc/include/asm/kup.h (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h new file mode 100644 index 000000000000..7a88b8b9b54d --- /dev/null +++ b/arch/powerpc/include/asm/kup.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_H_ +#define _ASM_POWERPC_KUP_H_ + +#ifndef __ASSEMBLY__ + +void setup_kup(void); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d..6179c4200339 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "setup.h" @@ -331,6 +332,12 @@ void __init early_setup(unsigned long dt_ptr) */ configure_exceptions(); + /* + * Configure Kernel Userspace Protection. This needs to happen before + * feature fixups for platforms that implement this using features. + */ + setup_kup(); + /* Apply all the dynamic patching */ apply_feature_fixups(); setup_feature_keys(); diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 1e6910eb70ed..36d28e872289 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -24,6 +24,11 @@ #include #include #include +#include + +void __init setup_kup(void) +{ +} #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 41a3513cadc9..80cc97cd8878 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -178,6 +179,8 @@ void __init MMU_init(void) btext_unmap(); #endif + setup_kup(); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } -- cgit v1.2.3-59-g8ed1b From 0fb1c25ab523614b056ace11be67aac8f8ccabb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:19 +1000 Subject: powerpc: Add skeleton for Kernel Userspace Execution Prevention This patch adds a skeleton for Kernel Userspace Execution Prevention. Then subarches implementing it have to define CONFIG_PPC_HAVE_KUEP and provide a setup_kuep() function. 
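The header idiom the skeleton depends on, restated as a hedged sketch (it mirrors the kup.h hunk above; setup_kup_example() is an invented caller): when the feature is not configured, an empty static inline stands in, so call sites need no #ifdefs.

#include <stdbool.h>

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled);                  /* provided by the subarch */
#else
static inline void setup_kuep(bool disabled) { } /* compiles away entirely */
#endif

/* An invented call site: no conditional compilation needed here. */
static inline void setup_kup_example(bool disable_kuep)
{
	setup_kuep(disable_kuep);
}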
Signed-off-by: Christophe Leroy [mpe: Don't split strings, use pr_crit_ratelimited()] Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/kup.h | 6 ++++++ arch/powerpc/mm/fault.c | 9 ++++----- arch/powerpc/mm/init-common.c | 11 +++++++++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb644..a53df74589e5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2843,7 +2843,7 @@ Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [X86] + nosmep [X86,PPC] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7a88b8b9b54d..a2a959cb4e36 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,6 +6,12 @@ void setup_kup(void); +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled); +#else +static inline void setup_kuep(bool disabled) { } +#endif /* CONFIG_PPC_KUEP */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 887f11bcf330..3384354abc1d 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -229,11 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { - printk_ratelimited(KERN_CRIT "kernel tried to execute" - " exec-protected page (%lx) -" - "exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, - current_uid())); + pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n", + address >= TASK_SIZE ? "exec-protected" : "user", + address, + from_kuid(&init_user_ns, current_uid())); } return is_exec || (address >= TASK_SIZE); } diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 36d28e872289..83f95a5565d6 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -26,8 +26,19 @@ #include #include +static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); + +static int __init parse_nosmep(char *p) +{ + disable_kuep = true; + pr_warn("Disabling Kernel Userspace Execution Prevention\n"); + return 0; +} +early_param("nosmep", parse_nosmep); + void __init setup_kup(void) { + setup_kuep(disable_kuep); } #define CTOR(shift) static void ctor_##shift(void *addr) \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156a..7d30bbbaa3c1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -345,6 +345,18 @@ config PPC_RADIX_MMU_DEFAULT If you're unsure, say Y. +config PPC_HAVE_KUEP + bool + +config PPC_KUEP + bool "Kernel Userspace Execution Prevention" + depends on PPC_HAVE_KUEP + default y + help + Enable support for Kernel Userspace Execution Prevention (KUEP) + + If you're unsure, say Y. 
+ config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From de78a9c42a790011f179bc94a7da3f5d8721f4cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:20 +1000 Subject: powerpc: Add a framework for Kernel Userspace Access Protection This patch implements a framework for Kernel Userspace Access Protection. Then subarches will have the possibility to provide their own implementation by providing setup_kuap() and allow/prevent_user_access(). Some platforms will need to know the area accessed and whether it is accessed from read, write or both. Therefore source, destination and size are handed over to the two functions. mpe: Rename to allow/prevent rather than unlock/lock, and add read/write wrappers. Drop the 32-bit code for now until we have an implementation for it. Add kuap to pt_regs for 64-bit as well as 32-bit. Don't split strings, use pr_crit_ratelimited(). Signed-off-by: Christophe Leroy Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/futex.h | 4 +++ arch/powerpc/include/asm/kup.h | 32 +++++++++++++++++++++ arch/powerpc/include/asm/ptrace.h | 11 +++++-- arch/powerpc/include/asm/uaccess.h | 38 +++++++++++++++++++------ arch/powerpc/kernel/asm-offsets.c | 4 +++ arch/powerpc/lib/checksum_wrappers.c | 4 +++ arch/powerpc/mm/fault.c | 19 ++++++++++--- arch/powerpc/mm/init-common.c | 10 +++++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++++ 10 files changed, 121 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a53df74589e5..c45a19d654f3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2839,7 +2839,7 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86] + nosmap [X86,PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. 
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 88b38b37c21b..3a6aa57b9d90 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -35,6 +35,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; + allow_write_to_user(uaddr, sizeof(*uaddr)); pagefault_disable(); switch (op) { @@ -62,6 +63,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, if (!ret) *oval = oldval; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } @@ -75,6 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; + allow_write_to_user(uaddr, sizeof(*uaddr)); __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -95,6 +98,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = prev; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index a2a959cb4e36..4d78b9d8c99c 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + void setup_kup(void); #ifdef CONFIG_PPC_KUEP @@ -12,6 +14,36 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled); +#else +static inline void setup_kuap(bool disabled) { } +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) { } +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) { } +#endif /* CONFIG_PPC_KUAP */ + +static inline void allow_read_from_user(const void __user *from, unsigned long size) +{ + allow_user_access(NULL, from, size); +} + +static inline void allow_write_to_user(void __user *to, unsigned long size) +{ + allow_user_access(to, NULL, size); +} + +static inline void prevent_read_from_user(const void __user *from, unsigned long size) +{ + prevent_user_access(NULL, from, size); +} + +static inline void prevent_write_to_user(void __user *to, unsigned long size) +{ + prevent_user_access(to, NULL, size); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 64271e562fed..6f047730e642 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -52,10 +52,17 @@ struct pt_regs }; }; + union { + struct { #ifdef CONFIG_PPC64 - unsigned long ppr; - unsigned long __pad; /* Maintain 16 byte interrupt stack alignment */ + unsigned long ppr; +#endif +#ifdef CONFIG_PPC_KUAP + unsigned long kuap; #endif + }; + unsigned long __pad[2]; /* Maintain 16 byte interrupt stack alignment */ + }; }; #endif diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4d6d905e9138..76f34346b642 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * The fs value determines whether argument validity checking should be @@ -140,6 +141,7 @@ extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ do { \ retval = 0; \ + allow_write_to_user(ptr, size); \ switch (size) { \ case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ case 2: __put_user_asm(x, ptr, retval, "sth"); 
break; \ @@ -147,6 +149,7 @@ do { \ case 8: __put_user_asm2(x, ptr, retval); break; \ default: __put_user_bad(); \ } \ + prevent_write_to_user(ptr, size); \ } while (0) #define __put_user_nocheck(x, ptr, size) \ @@ -239,6 +242,7 @@ do { \ __chk_user_ptr(ptr); \ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ + allow_read_from_user(ptr, size); \ switch (size) { \ case 1: __get_user_asm(x, ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, ptr, retval, "lhz"); break; \ @@ -246,6 +250,7 @@ do { \ case 8: __get_user_asm2(x, ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ + prevent_read_from_user(ptr, size); \ } while (0) /* @@ -305,15 +310,21 @@ extern unsigned long __copy_tofrom_user(void __user *to, static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - return __copy_tofrom_user(to, from, n); + unsigned long ret; + + allow_user_access(to, from, n); + ret = __copy_tofrom_user(to, from, n); + prevent_user_access(to, from, n); + return ret; } #endif /* __powerpc64__ */ static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -338,14 +349,18 @@ static inline unsigned long raw_copy_from_user(void *to, } barrier_nospec(); - return __copy_tofrom_user((__force void __user *)to, from, n); + allow_read_from_user(from, n); + ret = __copy_tofrom_user((__force void __user *)to, from, n); + prevent_read_from_user(from, n); + return ret; } static inline unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -365,17 +380,24 @@ static inline unsigned long raw_copy_to_user(void __user *to, return 0; } - return __copy_tofrom_user(to, (__force const void __user *)from, n); + allow_write_to_user(to, n); + ret = __copy_tofrom_user(to, (__force const void __user *)from, n); + prevent_write_to_user(to, n); + return ret; } extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) { + unsigned long ret = size; might_fault(); - if (likely(access_ok(addr, size))) - return __clear_user(addr, size); - return size; + if (likely(access_ok(addr, size))) { + allow_write_to_user(addr, size); + ret = __clear_user(addr, size); + prevent_write_to_user(addr, size); + } + return ret; } extern long strncpy_from_user(char *dst, const char __user *src, long count); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 86a61e5f8285..66202e02fee2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -332,6 +332,10 @@ int main(void) STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_KUAP + STACK_PT_REGS_OFFSET(STACK_REGS_KUAP, kuap); +#endif + #if defined(CONFIG_PPC32) #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index 890d4ddd91d6..bb9307ce2440 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -29,6 +29,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, unsigned int csum; might_sleep(); + allow_read_from_user(src, len); *err_ptr = 0; @@ -60,6 +61,7 @@ 
__wsum csum_and_copy_from_user(const void __user *src, void *dst, } out: + prevent_read_from_user(src, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_from_user); @@ -70,6 +72,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, unsigned int csum; might_sleep(); + allow_write_to_user(dst, len); *err_ptr = 0; @@ -97,6 +100,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, } out: + prevent_write_to_user(dst, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3384354abc1d..463d1e9d026e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -223,9 +223,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, } /* Is this a bad kernel fault ? */ -static bool bad_kernel_fault(bool is_exec, unsigned long error_code, +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + int is_exec = TRAP(regs) == 0x400; + /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { @@ -234,7 +236,15 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, address, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE); + + if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && + !search_exception_tables(regs->nip)) { + pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n", + address, + from_kuid(&init_user_ns, current_uid())); + } + + return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -454,9 +464,10 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it - * take a page fault to a kernel address. + * take a page fault to a kernel address or a page fault to a user + * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address))) return SIGSEGV; /* diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 83f95a5565d6..ecaedfff9992 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -27,6 +27,7 @@ #include static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); +static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); static int __init parse_nosmep(char *p) { @@ -36,9 +37,18 @@ static int __init parse_nosmep(char *p) } early_param("nosmep", parse_nosmep); +static int __init parse_nosmap(char *p) +{ + disable_kuap = true; + pr_warn("Disabling Kernel Userspace Access Protection\n"); + return 0; +} +early_param("nosmap", parse_nosmap); + void __init setup_kup(void) { setup_kuep(disable_kuep); + setup_kuap(disable_kuap); } #define CTOR(shift) static void ctor_##shift(void *addr) \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7d30bbbaa3c1..457fc3a5ed93 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -357,6 +357,18 @@ config PPC_KUEP If you're unsure, say Y. 
+config PPC_HAVE_KUAP + bool + +config PPC_KUAP + bool "Kernel Userspace Access Protection" + depends on PPC_HAVE_KUAP + default y + help + Enable support for Kernel Userspace Access Protection (KUAP) + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From b28c97505eb1a5265e367c398c3406be6ce5e313 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:21 +1000 Subject: powerpc/64: Setup KUP on secondary CPUs Some platforms (i.e. Radix MMU) need per-CPU initialisation for KUP. Any platforms that only want to do KUP initialisation once globally can just check to see if they're running on the boot CPU, or check if whatever setup they need has already been performed. Note that this is only for 64-bit. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 3 +++ arch/powerpc/mm/init-common.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6179c4200339..684e34493bf5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -390,6 +390,9 @@ void early_setup_secondary(void) /* Initialize the hash table or TLB handling */ early_init_mmu_secondary(); + /* Perform any KUP setup that is per-cpu */ + setup_kup(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index ecaedfff9992..6ea5607fc564 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); -void __init setup_kup(void) +void setup_kup(void) { setup_kuep(disable_kuep); setup_kuap(disable_kuap); -- cgit v1.2.3-59-g8ed1b From 1bb2bae2e6c7d94e3bc1fdce06baf31b8d811ed6 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:22 +1000 Subject: powerpc/mm/radix: Use KUEP API for Radix MMU Execution protection already exists on radix, this just refactors the radix init to provide the KUEP setup function instead. Thus, the only functional change is that it can now be disabled. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 12 +++++++++--- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 154472a28c77..8616b291bcec 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -531,8 +531,15 @@ static void radix_init_amor(void) mtspr(SPRN_AMOR, (3ul << 62)); } -static void radix_init_iamr(void) +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) { + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. 
We set bit 0 (IBM bit 1) of key0, to prevent instruction @@ -540,6 +547,7 @@ static void radix_init_iamr(void) */ mtspr(SPRN_IAMR, (1ul << 62)); } +#endif void __init radix__early_init_mmu(void) { @@ -601,7 +609,6 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_iamr(); radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); @@ -623,7 +630,6 @@ void radix__early_init_mmu_secondary(void) __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); radix_init_amor(); } - radix_init_iamr(); radix__switch_mmu_context(NULL, &init_mm); if (cpu_has_feature(CPU_FTR_HVMODE)) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 457fc3a5ed93..60371784c9f1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -326,6 +326,7 @@ config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select PPC_HAVE_KUEP default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this -- cgit v1.2.3-59-g8ed1b From 890274c2dc4c0a57ae5a12d6a76fa6d05b599d98 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:24 +1000 Subject: powerpc/64s: Implement KUAP for Radix MMU Kernel Userspace Access Prevention utilises a feature of the Radix MMU which disallows read and write access to userspace addresses. By utilising this, the kernel is prevented from accessing user data from outside of trusted paths that perform proper safety checks, such as copy_{to/from}_user() and friends. Userspace access is disabled from early boot and is only enabled when performing an operation like copy_{to/from}_user(). The register that controls this (AMR) does not prevent userspace from accessing itself, so there is no need to save and restore when entering and exiting userspace. When entering the kernel from the kernel we save AMR and if it is not blocking user access (because eg. we faulted doing a user access) we reblock user access for the duration of the exception (ie. the page fault) and then restore the AMR when returning back to the kernel. This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y) and performing the following: # (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT If enabled, this should send SIGSEGV to the thread. We also add paranoid checking of AMR in switch and syscall return under CONFIG_PPC_KUAP_DEBUG. 
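A runnable toy model of the access-window discipline described above (the AMR constants and the open/close shape follow the patch; set_kuap() here is a printing stand-in for the real isync/mtspr sequence, and copy_from_user_sketch() is invented for illustration):

#include <stdio.h>
#include <string.h>

#define AMR_KUAP_BLOCK_READ  0x4000000000000000UL
#define AMR_KUAP_BLOCK_WRITE 0x8000000000000000UL
#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE)

/* Stand-in for set_kuap(): really isync; mtspr(SPRN_AMR, value); isync. */
static void set_kuap(unsigned long value)
{
	printf("AMR <- 0x%016lx\n", value);
}

/* Invented: a guarded user read, shaped like raw_copy_from_user() in this patch. */
static unsigned long copy_from_user_sketch(void *to, const void *from, unsigned long n)
{
	set_kuap(AMR_KUAP_BLOCK_WRITE); /* open a read-only window: writes stay blocked */
	memcpy(to, from, n);            /* the only place user memory may be touched */
	set_kuap(AMR_KUAP_BLOCKED);     /* close the window: back to fully blocked */
	return 0;
}

int main(void)
{
	char src[4] = "abc", dst[4];
	return (int)copy_from_user_sketch(dst, src, sizeof(src));
}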
Co-authored-by: Michael Ellerman Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/kup-radix.h | 102 +++++++++++++++++++++++++ arch/powerpc/include/asm/exception-64s.h | 2 + arch/powerpc/include/asm/feature-fixups.h | 3 + arch/powerpc/include/asm/kup.h | 4 + arch/powerpc/include/asm/mmu.h | 10 ++- arch/powerpc/kernel/entry_64.S | 27 ++++++- arch/powerpc/kernel/exceptions-64s.S | 3 + arch/powerpc/mm/pgtable-radix.c | 19 +++++ arch/powerpc/mm/pkeys.c | 1 + arch/powerpc/platforms/Kconfig.cputype | 8 ++ 10 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/include/asm/book3s/64/kup-radix.h (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h new file mode 100644 index 000000000000..6d6628424134 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H + +#include + +#define AMR_KUAP_BLOCK_READ UL(0x4000000000000000) +#define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000) +#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) +#define AMR_KUAP_SHIFT 62 + +#ifdef __ASSEMBLY__ + +.macro kuap_restore_amr gpr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + ld \gpr, STACK_REGS_KUAP(r1) + mtspr SPRN_AMR, \gpr + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_check_amr gpr1, gpr2 +#ifdef CONFIG_PPC_KUAP_DEBUG + BEGIN_MMU_FTR_SECTION_NESTED(67) + mfspr \gpr1, SPRN_AMR + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT +999: tdne \gpr1, \gpr2 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + .ifnb \msr_pr_cr + bne \msr_pr_cr, 99f + .endif + mfspr \gpr1, SPRN_AMR + std \gpr1, STACK_REGS_KUAP(r1) + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT + cmpd \use_cr, \gpr1, \gpr2 + beq \use_cr, 99f + // We don't isync here because we very recently entered via rfid + mtspr SPRN_AMR, \gpr2 + isync +99: + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +/* + * We support individually allowing read or write, but we don't support nesting + * because that would require an expensive read/modify write of the AMR. + */ + +static inline void set_kuap(unsigned long value) +{ + if (!mmu_has_feature(MMU_FTR_RADIX_KUAP)) + return; + + /* + * ISA v3.0B says we need a CSI (Context Synchronising Instruction) both + * before and after the move to AMR. See table 6 on page 1134. 
+ */ + isync(); + mtspr(SPRN_AMR, value); + isync(); +} + +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + // This is written so we can resolve to a single case at build time + if (__builtin_constant_p(to) && to == NULL) + set_kuap(AMR_KUAP_BLOCK_WRITE); + else if (__builtin_constant_p(from) && from == NULL) + set_kuap(AMR_KUAP_BLOCK_READ); + else + set_kuap(0); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + set_kuap(AMR_KUAP_BLOCKED); +} + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 937bb630093f..bef4e05a6823 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -497,6 +497,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) RESTORE_CTR(r1, area); \ b bad_stack; \ 3: EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1, cr0; \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ SAVE_PPR(area, r9); \ @@ -691,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) */ #define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \ EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1; \ EXCEPTION_PROLOG_COMMON_2(area); \ EXCEPTION_PROLOG_COMMON_3(trap); \ /* Volatile regs are potentially clobbered here */ \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 40a6c9261a6b..f6fc31f8baff 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -100,6 +100,9 @@ label##5: \ #define END_MMU_FTR_SECTION(msk, val) \ END_MMU_FTR_SECTION_NESTED(msk, val, 97) +#define END_MMU_FTR_SECTION_NESTED_IFSET(msk, label) \ + END_MMU_FTR_SECTION_NESTED((msk), (msk), label) + #define END_MMU_FTR_SECTION_IFSET(msk) END_MMU_FTR_SECTION((msk), (msk)) #define END_MMU_FTR_SECTION_IFCLR(msk) END_MMU_FTR_SECTION((msk), 0) diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 4d78b9d8c99c..d7312defbe1c 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -2,6 +2,10 @@ #ifndef _ASM_POWERPC_KUP_H_ #define _ASM_POWERPC_KUP_H_ +#ifdef CONFIG_PPC64 +#include +#endif + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ddd4a91bdc1..38d21adfde40 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -107,6 +107,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Supports KUAP (key 0 controlling userspace addresses) on radix + */ +#define MMU_FTR_RADIX_KUAP ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -164,7 +169,10 @@ enum { #endif #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | -#endif +#ifdef CONFIG_PPC_KUAP + MMU_FTR_RADIX_KUAP | +#endif /* CONFIG_PPC_KUAP */ +#endif /* CONFIG_PPC_RADIX_MMU */ 0, }; diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15c67d2c0534..7cc25389c6bd 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -46,6 +46,7 @@ #include #endif #include +#include /* * System calls. 
@@ -120,6 +121,9 @@ END_BTB_FLUSH_SECTION addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ + + kuap_check_amr r10, r11 + #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION beq 33f @@ -275,6 +279,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) + kuap_check_amr r10, r11 + #ifdef CONFIG_PPC_BOOK3S /* * Clear MSR_RI, MSR_EE is already and remains disabled. We could do @@ -296,6 +302,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r8, PACATMSCRATCH(r13) #endif + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ ld r2,GPR2(r1) ld r1,GPR1(r1) @@ -306,8 +316,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI_TO_USER b . /* prevent speculative execution */ - /* exit to kernel */ -1: ld r2,GPR2(r1) +1: /* exit to kernel */ + kuap_restore_amr r2 + + ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 @@ -594,6 +606,8 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ + kuap_check_amr r9, r10 + FLUSH_COUNT_CACHE /* @@ -942,6 +956,8 @@ fast_exception_return: ld r4,_XER(r1) mtspr SPRN_XER,r4 + kuap_check_amr r5, r6 + REST_8GPRS(5, r1) andi. r0,r3,MSR_RI @@ -974,6 +990,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -1006,6 +1026,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) + + kuap_restore_amr r4 + ld r4,GPR4(r1) ld r1,GPR1(r1) RFI_TO_KERNEL diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9481a117e242..bedd89438827 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -19,6 +19,7 @@ #include #include #include +#include /* * There are a few constraints to be concerned with. @@ -309,6 +310,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early) mfspr r11,SPRN_DSISR /* Save DSISR */ std r11,_DSISR(r1) std r9,_CCR(r1) /* Save CR in stackframe */ + kuap_save_amr_and_lock r9, r10, cr1 /* Save r9 through r13 from EXMC save area to stack frame. 
*/ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ @@ -1109,6 +1111,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8616b291bcec..45869fd698a0 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -549,6 +550,24 @@ void setup_kuep(bool disabled) } #endif +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + void __init radix__early_init_mmu(void) { unsigned long lpcr; diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 587807763737..ae7fca40e5b3 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 60371784c9f1..5e53b9fd62aa 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -327,6 +327,7 @@ config PPC_RADIX_MMU depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select PPC_HAVE_KUEP + select PPC_HAVE_KUAP default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this @@ -370,6 +371,13 @@ config PPC_KUAP If you're unsure, say Y. +config PPC_KUAP_DEBUG + bool "Extra debugging for Kernel Userspace Access Protection" + depends on PPC_HAVE_KUAP && PPC_RADIX_MMU + help + Add extra debugging for Kernel Userspace Access Protection (KUAP) + If you're unsure, say N. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From 5e5be3aed23032d40d5ab7407f344f1a74f2765b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:25 +1000 Subject: powerpc/mm: Detect bad KUAP faults When KUAP is enabled we have logic to detect page faults that occur outside of a valid user access region and are blocked by the AMR. What we don't have at the moment is logic to detect a fault *within* a valid user access region, that has been incorrectly blocked by AMR. This is not meant to ever happen, but it can if we incorrectly save/restore the AMR, or if the AMR was overwritten for some other reason. Currently if that happens we assume it's just a regular fault that will be corrected by handling the fault normally, so we just return. But there is nothing the fault handling code can do to fix it, so the fault just happens again and we spin forever, leading to soft lockups. So add some logic to detect that case and WARN() if we ever see it. Arguably it should be a BUG(), but it's more polite to fail the access and let the kernel continue, rather than taking down the box. 
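In outline, the warn-and-fail behaviour just described looks like the following standalone sketch (names are illustrative; the real check is the bad_kuap_fault() added in the diff below):

#include <stdbool.h>
#include <stdio.h>

/* Pretend the saved AMR shows the access was blocked when it should not be. */
static bool amr_blocked_access(bool is_write)
{
	return true;
}

/* Warn once, then fail the access, instead of BUG()ing or retrying forever. */
static bool bad_kuap_fault_sketch(bool is_write)
{
	if (amr_blocked_access(is_write)) {
		fprintf(stderr, "Bug: %s fault blocked by AMR!\n",
			is_write ? "Write" : "Read");
		return true; /* treated as a bad kernel fault: SIGSEGV, no spin */
	}
	return false;
}

int main(void)
{
	return bad_kuap_fault_sketch(true) ? 1 : 0;
}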
There should be no data integrity issue with failing the fault rather than BUG'ing, as we're just going to disallow an access that should have been allowed. To make the code a little easier to follow, unroll the condition at the end of bad_kernel_fault() and comment each case, before adding the call to bad_kuap_fault(). Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/kup-radix.h | 6 ++++++ arch/powerpc/include/asm/kup.h | 1 + arch/powerpc/mm/fault.c | 25 ++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 6d6628424134..7679bd0c5af0 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -95,6 +95,12 @@ static inline void prevent_user_access(void __user *to, const void __user *from, set_kuap(AMR_KUAP_BLOCKED); } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) && + (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), + "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read"); +} #endif /* CONFIG_PPC_KUAP */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index d7312defbe1c..28ad4654eed2 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -26,6 +26,7 @@ static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size) { } static inline void prevent_user_access(void __user *to, const void __user *from, unsigned long size) { } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) { return false; } #endif /* CONFIG_PPC_KUAP */ static inline void allow_read_from_user(const void __user *from, unsigned long size) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 463d1e9d026e..b5d3578d9f65 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -44,6 +44,7 @@ #include #include #include +#include static inline bool notify_page_fault(struct pt_regs *regs) { @@ -224,7 +225,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, /* Is this a bad kernel fault ? */ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, bool is_write) { int is_exec = TRAP(regs) == 0x400; @@ -235,6 +236,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, address >= TASK_SIZE ? "exec-protected" : "user", address, from_kuid(&init_user_ns, current_uid())); + + // Kernel exec fault is always bad + return true; } if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && @@ -244,7 +248,22 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); + // Kernel fault on kernel address is bad + if (address >= TASK_SIZE) + return true; + + // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad + if (!search_exception_tables(regs->nip)) + return true; + + // Read/write fault in a valid region (the exception table search passed + // above), but blocked by KUAP is bad, it can never succeed. + if (bad_kuap_fault(regs, is_write)) + return true; + + // What's left? 
Kernel fault on user in well defined regions (extable + // matched), and allowed by KUAP in the faulting context. + return false; } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -467,7 +486,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, * take a page fault to a kernel address or a page fault to a user * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) return SIGSEGV; /* -- cgit v1.2.3-59-g8ed1b From 06fbe81b5909847aa13f9c86c2b6f9bbc5c2795b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:33 +0000 Subject: powerpc/8xx: Add Kernel Userspace Execution Prevention This patch adds Kernel Userspace Execution Prevention on the 8xx. When a page is Executable, it is set Executable for Key 0 and NX for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to execute user code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++++++ arch/powerpc/mm/8xx_mmu.c | 12 ++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index fc5a653d5dd2..3cb743284e09 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -41,6 +41,13 @@ */ #define MI_APG_INIT 0x4fffffff +/* + * 0 => Kernel => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MI_APG_KUEP 0x6fffffff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in * this register are used to create the TLB entry. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index fe1f6443d57f..e257a0c9bd08 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -213,3 +213,15 @@ void flush_instruction_cache(void) mtspr(SPRN_IC_CST, IDC_INVALL); isync(); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 2e45a6e2bc99..00fa0d110dcb 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -34,6 +34,7 @@ config PPC_8xx bool "Freescale 8xx" select FSL_SOC select SYS_SUPPORTS_HUGETLBFS + select PPC_HAVE_KUEP config 40x bool "AMCC 40x" -- cgit v1.2.3-59-g8ed1b From 2679f9bd0abafb3044bcbaac0600b32159ac8bf2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:34 +0000 Subject: powerpc/8xx: Add Kernel Userspace Access Protection This patch adds Kernel Userspace Access Protection on the 8xx. When a page is RO or RW, it is set RO or RW for Key 0 and NA for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. 
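Conceptually, the key swap described in the next sentence has the following effect (a plain-C illustration under the two-key model above; effective_rights() is invented, not MMU code):

#include <stdio.h>

enum rights { NA, RO, RW }; /* simplified page access rights */

/* Key 0 (user) follows the page definition; Key 1 (supervisor) gets no access. */
static enum rights effective_rights(enum rights pte, int key)
{
	return key ? NA : pte;
}

int main(void)
{
	printf("user (key 0) on RW user page: %s\n",
	       effective_rights(RW, 0) == RW ? "RW" : "blocked");
	printf("kernel (key 1) on RW user page: %s\n",
	       effective_rights(RW, 1) == NA ? "blocked" : "allowed");
	return 0;
}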
By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to access user data. At exception entry, the kernel saves SPRN_MD_AP in the regs struct, and reapplies the protection. At exception exit it restores SPRN_MD_AP with the value saved on exception entry. Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 58 ++++++++++++++++++++++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 ++++ arch/powerpc/mm/8xx_mmu.c | 12 ++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 81 insertions(+) create mode 100644 arch/powerpc/include/asm/nohash/32/kup-8xx.h (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7d8ad3d6729d..043c800ec5fb 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -5,6 +5,9 @@ #ifdef CONFIG_PPC64 #include #endif +#ifdef CONFIG_PPC_8xx +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h new file mode 100644 index 000000000000..1c3133b5f86a --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_8XX_H_ +#define _ASM_POWERPC_KUP_8XX_H_ + +#include + +#ifdef CONFIG_PPC_KUAP + +#ifdef __ASSEMBLY__ + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lis \gpr2, MD_APG_KUAP@h /* only APG0 and APG1 are used */ + mfspr \gpr1, SPRN_MD_AP + mtspr SPRN_MD_AP, \gpr2 + stw \gpr1, STACK_REGS_KUAP(\sp) +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr1, STACK_REGS_KUAP(\sp) + mtspr SPRN_MD_AP, \gpr1 +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + mfspr \gpr, SPRN_MD_AP + rlwinm \gpr, \gpr, 16, 0xffff +999: twnei \gpr, MD_APG_KUAP@h + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#include + +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_INIT); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), + "Bug: fault blocked by AP register !"); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* _ASM_POWERPC_KUP_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 3cb743284e09..f620adef54fc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -121,6 +121,13 @@ */ #define MD_APG_INIT 0x4fffffff +/* + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MD_APG_KUAP 0x6fffffff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in * this register are used to create the TLB entry. 
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index e257a0c9bd08..87648b58d295 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -225,3 +225,15 @@ void __init setup_kuep(bool disabled) mtspr(SPRN_MI_AP, MI_APG_KUEP); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 00fa0d110dcb..ab586963893a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -35,6 +35,7 @@ config PPC_8xx select FSL_SOC select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config 40x bool "AMCC 40x" -- cgit v1.2.3-59-g8ed1b From 31ed2b13c48d779efc838ad54e30121e088a62af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:35 +0000 Subject: powerpc/32s: Implement Kernel Userspace Execution Prevention. To implement Kernel Userspace Execution Prevention, this patch sets NX bit on all user segments on kernel entry and clears NX bit on all user segments on kernel exit. Note that powerpc 601 doesn't have the NX bit, so KUEP will not work on it. A warning is displayed at startup. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 42 +++++++++++++++++++++++++++ arch/powerpc/include/asm/book3s/32/mmu-hash.h | 3 ++ arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/kernel/entry_32.S | 9 ++++++ arch/powerpc/kernel/head_32.S | 15 +++++++++- arch/powerpc/mm/ppc_mmu_32.c | 13 +++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 7 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/book3s/32/kup.h (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h new file mode 100644 index 000000000000..5f97c742ca71 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_32_KUP_H +#define _ASM_POWERPC_BOOK3S_32_KUP_H + +#include + +#ifdef __ASSEMBLY__ + +.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + bdnz 101b + isync +.endm + +.macro kuep_lock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_NX@h /* set Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +.macro kuep_unlock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 5cb588395fdc..8c5727a322b1 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -63,6 +63,9 @@ typedef pte_t *pgtable_t; #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor 
read, User read */ +/* Values for Segment Registers */ +#define SR_NX 0x10000000 /* No Execute */ + #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 043c800ec5fb..5b5e39643a27 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -8,6 +8,9 @@ #ifdef CONFIG_PPC_8xx #include #endif +#ifdef CONFIG_PPC_BOOK3S_32 +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1182bf603d3c..2f3d159c11d7 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -162,6 +162,9 @@ transfer_to_handler: andis. r12,r12,DBCR0_IDM@h #endif ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_lock r11, r12 +#endif #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ @@ -427,6 +430,9 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r5, r7 +#endif kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) @@ -821,6 +827,9 @@ restore_user: bnel- load_dbcr0 #endif ACCOUNT_CPU_USER_EXIT(r2, r10, r11) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r10, r11 +#endif b restore diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e25b615e9f9e..19b46cb9f623 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -896,14 +896,24 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0,16 /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up segment register values */ mtctr r0 /* for context 0 */ lis r3,0x2000 /* Ku = 1, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r4,0 3: mtsrin r3,r4 addi r3,r3,0x111 /* increment VSID */ addis r4,r4,0x1000 /* address of next segment */ bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. 
*/ @@ -1007,6 +1017,9 @@ _ENTRY(switch_mmu_context) mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ addis r3,r3,0x6000 /* Set Ks, Ku bits */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b44..baa75673b1f5 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -394,3 +394,16 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (cpu_has_feature(CPU_FTR_601)) + pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index ab586963893a..6bc0a4c08c1c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -25,6 +25,7 @@ config PPC_BOOK3S_32 bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" select PPC_FPU select PPC_HAVE_PMU_SUPPORT + select PPC_HAVE_KUEP config PPC_85xx bool "Freescale 85xx" -- cgit v1.2.3-59-g8ed1b From f342adca3afc84c4ef648352440ed6331518d72d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:36 +0000 Subject: powerpc/32s: Prepare Kernel Userspace Access Protection This patch prepares Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. Read protection cannot be achieved using page protection. book3s/32 provides the following values for PP bits:
PP00 provides RW for Key 0 and NA for Key 1
PP01 provides RW for Key 0 and RO for Key 1
PP10 provides RW for all
PP11 provides RO for all
Today PP10 is used for RW pages and PP11 for RO pages, and user segment register's Kp and Ks are set to 1. This patch modifies page protection to use PP01 for RW pages and sets user segment registers to Kp 0 and Ks 0. This will allow setting up userspace write access protection by setting Ks to 1 in the following patch. Kernel space segment registers remain unchanged. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 ++ arch/powerpc/kernel/head_32.S | 22 +++++++++++----------- arch/powerpc/mm/hash_low_32.S | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 8c5727a322b1..f9eae105a9f4 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -65,6 +65,8 @@ typedef pte_t *pgtable_t; /* Values for Segment Registers */ #define SR_NX 0x10000000 /* No Execute */ +#define SR_KP 0x20000000 /* User key */ +#define SR_KS 0x40000000 /* Supervisor key */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 19b46cb9f623..69b97cc7079f 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,9 +522,9 @@ InstructionTLBMiss: andc.
r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - ori r1, r1, 0xe05 /* clear out reserved bits */ - andc r1, r0, r1 /* PP = user? 2 : 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r1, r1, 0xe06 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -590,11 +590,11 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r1,r0,32-9,30,30 /* _PAGE_RW -> PP msb */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ + andc r1,r0,r1 /* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -670,9 +670,9 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - li r1,0xe05 /* clear out reserved bits & PP lsb */ - andc r1,r0,r1 /* PP = user? 2: 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + li r1,0xe06 /* clear out reserved bits & PP msb */ + andc r1,r0,r1 /* PP = user? 1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -896,9 +896,9 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ - lis r3,0x2000 /* Ku = 1, VSID = 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif @@ -910,6 +910,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ addis r4, r4, 0x1000 /* address of next segment */ @@ -1016,7 +1017,6 @@ _ENTRY(switch_mmu_context) blt- 4f mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ - addis r3,r3,0x6000 /* Set Ks, Ku bits */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index a6c491f18a04..e27792d0b744 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -309,13 +309,13 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 
/* PP = user? (rw&dirty? 2: 3): 0 */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) -- cgit v1.2.3-59-g8ed1b From a68c31fc01ef7863acc0fc74694bf279456a58c4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:38 +0000 Subject: powerpc/32s: Implement Kernel Userspace Access Protection This patch implements Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. read protection cannot be achieved using page protection. The previous patch modifies the page protection so that RW user pages are RW for Key 0 and RO for Key 1, and it sets Key 0 for both user and kernel. This patch changes userspace segment registers are set to Ku 0 and Ks 1. When kernel needs to write to RW pages, the associated segment register is then changed to Ks 0 in order to allow write access to the kernel. In order to avoid having the read all segment registers when locking/unlocking the access, some data is kept in the thread_struct and saved on stack on exceptions. The field identifies both the first unlocked segment and the first segment following the last unlocked one. When no segment is unlocked, it contains value 0. As the hash_page() function is not able to easily determine if a protfault is due to a bad kernel access to userspace, protfaults need to be handled by handle_page_fault when KUAP is set. Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h, and adapt allow_user_access() to do nothing when to == NULL] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 103 +++++++++++++++++++++++++++++++ arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/asm-offsets.c | 3 + arch/powerpc/kernel/head_32.S | 11 ++++ arch/powerpc/mm/ppc_mmu_32.c | 10 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + 6 files changed, 131 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 5f97c742ca71..677e9babef80 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -37,6 +37,109 @@ #endif .endm +#ifdef CONFIG_PPC_KUAP + +.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + cmplw \gpr2, \gpr3 + blt- 101b + isync +.endm + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lwz \gpr2, KUAP(\thread) + rlwinm. \gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, STACK_REGS_KUAP(\sp) + beq+ 102f + li \gpr1, 0 + stw \gpr1, KUAP(\thread) + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_KS@h /* set Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr2, STACK_REGS_KUAP(\sp) + rlwinm. 
\gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, THREAD + KUAP(\current) + beq+ 102f + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + lwz \gpr2, KUAP(thread) +999: twnei \gpr, 0 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#endif /* CONFIG_PPC_KUAP */ + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) +{ + barrier(); /* make sure thread.kuap is updated before playing with SRs */ + while (addr < end) { + mtsrin(sr, addr); + sr += 0x111; /* next VSID */ + sr &= 0xf0ffffff; /* clear VSID overflow */ + addr += 0x10000000; /* address of next segment */ + } + isync(); /* Context sync required after mtsrin() */ +} + +static inline void allow_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr, end; + + if (__builtin_constant_p(to) && to == NULL) + return; + + addr = (__force u32)to; + + if (!addr || addr >= TASK_SIZE || !size) + return; + + end = min(addr + size, TASK_SIZE); + current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf); + kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ +} + +static inline void prevent_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr = (__force u32)to; + u32 end = min(addr + size, TASK_SIZE); + + if (!addr || addr >= TASK_SIZE || !size) + return; + + current->thread.kuap = 0; + kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + if (!is_write) + return false; + + return WARN(!regs->kuap, "Bug: write fault blocked by segment registers !"); +} + +#endif /* CONFIG_PPC_KUAP */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3351bcf42f2d..540949b397d4 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -163,6 +163,9 @@ struct thread_struct { #ifdef CONFIG_PPC_RTAS unsigned long rtas_sp; /* stack pointer for when in RTAS */ #endif +#endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + unsigned long kuap; /* opened segments for user access */ #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 66202e02fee2..60b82198de7c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -147,6 +147,9 @@ int main(void) #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) OFFSET(THREAD_KVM_VCPU, thread_struct, kvm_vcpu); #endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + OFFSET(KUAP, thread_struct, kuap); +#endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM OFFSET(PACATMSCRATCH, paca_struct, tm_scratch); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 69b97cc7079f..40aec3f00a05 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -387,7 +387,11 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) +#ifdef CONFIG_PPC_KUAP + andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h +#else andis. 
r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h +#endif bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -901,6 +905,9 @@ load_up_mmu: li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r4,0 3: mtsrin r3,r4 @@ -910,6 +917,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ @@ -1019,6 +1027,9 @@ _ENTRY(switch_mmu_context) rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index baa75673b1f5..bf1de3ca39bc 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -407,3 +407,13 @@ void __init setup_kuep(bool disabled) pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6bc0a4c08c1c..60a7c7095b05 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -26,6 +26,7 @@ config PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config PPC_85xx bool "Freescale 85xx" -- cgit v1.2.3-59-g8ed1b From 60458fba469a695a026334b364cf8adbcd5807e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:48 +0530 Subject: powerpc/mm: Add helpers for accessing hash translation related variables We want to switch to allocating them at runtime only when hash translation is enabled. Add helpers so that both book3s and nohash can be adapted to the upcoming change easily.
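The win is that call sites stop naming the storage location, so the backing field can later move without touching them again. A before/after sketch of one such helper (both versions appear in this series; shown side by side purely for illustration, they would not coexist in one build):

	/* Now: the value lives directly in mm_context_t. */
	static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
	{
		return ctx->user_psize;
	}

	/* After the follow-up patch: same interface, storage moved
	 * behind a pointer that radix tasks will not allocate. */
	static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
	{
		return ctx->hash_context->user_psize;
	}

Callers such as hash_preload() use mm_ctx_user_psize(&mm->context) either way.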
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +- arch/powerpc/include/asm/book3s/64/mmu.h | 63 ++++++++++++++++++++++++++- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 50 +++++++++++++++++++++ arch/powerpc/kernel/paca.c | 12 ++--- arch/powerpc/mm/hash_utils_64.c | 10 ++--- arch/powerpc/mm/slb.c | 2 +- arch/powerpc/mm/slice.c | 49 ++++++++++----------- arch/powerpc/mm/subpage-prot.c | 8 ++-- 8 files changed, 154 insertions(+), 44 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index a28a28079edb..eb36fbfe4ef5 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -657,8 +657,8 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41) - +#define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index afe10dd11c68..c9f317090620 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -139,7 +139,7 @@ typedef struct { struct npu_context *npu_context; /* SLB page size encodings*/ - unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; unsigned long slb_addr_limit; # ifdef CONFIG_PPC_64K_PAGES @@ -174,6 +174,67 @@ typedef struct { #endif } mm_context_t; +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +#ifdef CONFIG_PPC_64K_PAGES +static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) +{ + return &ctx->mask_64k; +} +#endif + +static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) +{ + return &ctx->mask_4k; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_16m(mm_context_t *ctx) +{ + return &ctx->mask_16m; +} + +static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) +{ + return &ctx->mask_16g; +} +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT +static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) +{ + return &ctx->spt; +} +#endif + /* * The current system page and segment sizes */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index f620adef54fc..c503e2f05e61 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -181,6 +181,7 @@ #ifdef CONFIG_PPC_MM_SLICES #include #define SLICE_ARRAY_SIZE (1 << (32 - SLICE_LOW_SHIFT - 1)) +#define 
LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif #ifndef __ASSEMBLY__ @@ -207,6 +208,55 @@ typedef struct { void *pte_frag; } mm_context_t; +#ifdef CONFIG_PPC_MM_SLICES +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +static inline struct slice_mask *mm_ctx_slice_mask_base(mm_context_t *ctx) +{ + return &ctx->mask_base_psize; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_512k(mm_context_t *ctx) +{ + return &ctx->mask_512k; +} + +static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) +{ + return &ctx->mask_8m; +} +#endif +#endif /* CONFIG_PPC_MM_SLICE */ + #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) #define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e7382abee868..9cc91d03ab62 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -267,12 +267,12 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.slb_addr_limit); - get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; - memcpy(&get_paca()->mm_ctx_low_slices_psize, - &context->low_slices_psize, sizeof(context->low_slices_psize)); - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); + VM_BUG_ON(!mm_ctx_slb_addr_limit(context)); + get_paca()->mm_ctx_slb_addr_limit = mm_ctx_slb_addr_limit(context); + memcpy(&get_paca()->mm_ctx_low_slices_psize, mm_ctx_low_slices(context), + LOW_SLICE_ARRAY_SZ); + memcpy(&get_paca()->mm_ctx_high_slices_psize, mm_ctx_high_slices(context), + TASK_SLICE_ARRAY_SZ(context)); #else /* CONFIG_PPC_MM_SLICES */ get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index c4c9610ce6e3..fee0270618ac 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1142,7 +1142,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 spp = 0; u32 **sbpm, *sbpp; @@ -1465,7 +1465,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) int psize = get_slice_psize(mm, ea); /* We only prefault standard pages for now */ - if (unlikely(psize != mm->context.user_psize)) + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) return false; /* @@ -1544,7 +1544,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Hash it in */ #ifdef CONFIG_PPC_64K_PAGES - if (mm->context.user_psize == MMU_PAGE_64K) + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, update_flags, ssize); else @@ -1557,8 +1557,8 @@ 
void hash_preload(struct mm_struct *mm, unsigned long ea, */ if (rc == -1) hash_failure_debug(ea, access, vsid, trap, ssize, - mm->context.user_psize, - mm->context.user_psize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), pte_val(*ptep)); out_exit: local_irq_restore(flags); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5986df48359b..78c0c0a0e355 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -739,7 +739,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= mm->context.slb_addr_limit) + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) return -EFAULT; context = get_user_context(&mm->context, ea); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index aec91dbcdc0b..35b278082391 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -101,7 +101,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->context.slb_addr_limit - len) < addr) + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -155,15 +155,15 @@ static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return &mm->context.mask_64k; + return mm_ctx_slice_mask_64k(&mm->context); #endif if (psize == MMU_PAGE_4K) - return &mm->context.mask_4k; + return mm_ctx_slice_mask_4k(&mm->context); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return &mm->context.mask_16m; + return mm_ctx_slice_mask_16m(&mm->context); if (psize == MMU_PAGE_16G) - return &mm->context.mask_16g; + return mm_ctx_slice_mask_16g(&mm->context); #endif BUG(); } @@ -253,7 +253,7 @@ static void slice_convert(struct mm_struct *mm, */ spin_lock_irqsave(&slice_convert_lock, flags); - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); for (i = 0; i < SLICE_NUM_LOW; i++) { if (!(mask->low_slices & (1u << i))) continue; @@ -272,8 +272,8 @@ static void slice_convert(struct mm_struct *mm, (((unsigned long)psize) << (mask_index * 4)); } - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { if (!test_bit(i, mask->high_slices)) continue; @@ -292,8 +292,8 @@ static void slice_convert(struct mm_struct *mm, } slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm->context.low_slices_psize, - (unsigned long)mm->context.high_slices_psize); + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -393,7 +393,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * DEFAULT_MAP_WINDOW we should apply this. 
*/ if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; while (addr > min_addr) { info.high_limit = addr; @@ -505,20 +505,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return -ENOMEM; } - if (high_limit > mm->context.slb_addr_limit) { + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { /* * Increasing the slb_addr_limit does not require * slice mask cache to be recalculated because it should * be already initialised beyond the old address limit. */ - mm->context.slb_addr_limit = high_limit; + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); on_each_cpu(slice_flush_segments, mm, 1); } /* Sanity checks */ BUG_ON(mm->task_size == 0); - BUG_ON(mm->context.slb_addr_limit == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); @@ -696,7 +696,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, 0); + mm_ctx_user_psize(¤t->mm->context), 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -706,7 +706,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, 1); + mm_ctx_user_psize(¤t->mm->context), 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -717,10 +717,10 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); if (slice_addr_is_low(addr)) { - psizes = mm->context.low_slices_psize; + psizes = mm_ctx_low_slices(&mm->context); index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = mm->context.high_slices_psize; + psizes = mm_ctx_high_slices(&mm->context); index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -742,20 +742,19 @@ void slice_init_new_context_exec(struct mm_struct *mm) * duplicated. */ #ifdef CONFIG_PPC64 - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); #else mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; #endif - - mm->context.user_psize = psize; + mm_ctx_set_user_psize(&mm->context, psize); /* * Set all slice psizes to the default. 
*/ - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); - hpsizes = mm->context.high_slices_psize; + hpsizes = mm_ctx_high_slices(&mm->context); memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); /* @@ -777,7 +776,7 @@ void slice_setup_new_exec(void) if (!is_32bit_task()) return; - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } #endif @@ -816,7 +815,7 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { const struct slice_mask *maskp; - unsigned int psize = mm->context.user_psize; + unsigned int psize = mm_ctx_user_psize(&mm->context); VM_BUG_ON(radix_enabled()); diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 5e4178790dee..c72252542210 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -25,7 +25,7 @@ */ void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); unsigned long i, j, addr; u32 **p; @@ -52,7 +52,7 @@ void subpage_prot_free(struct mm_struct *mm) void subpage_prot_init_new_context(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); memset(spt, 0, sizeof(*spt)); } @@ -93,7 +93,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; @@ -189,7 +189,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; -- cgit v1.2.3-59-g8ed1b From 67fda38f0d688580a916a8f829afd8edaffadfcf Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:49 +0530 Subject: powerpc/mm: Move slb_addr_limit init to early_init_mmu Avoid #ifdef in generic code. It also enables us to do this specifically for the MMU translation mode on book3s64. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 11 ----------- arch/powerpc/mm/hash_utils_64.c | 2 ++ arch/powerpc/mm/tlb_nohash.c | 6 ++++++ 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e5dfb6e0823..a07de8608484 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,17 +947,6 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_PPC_MM_SLICES -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; -#elif defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#else -#error "context.addr_limit not initialized."
-#endif -#endif - #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fee0270618ac..fe2fc43a8664 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1036,6 +1036,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ slb_initialize(); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ac23dc1c6535..088e0a6b5ade 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -800,5 +800,11 @@ void __init early_init_mmu(void) #ifdef CONFIG_PPC_47x early_init_mmu_47x(); #endif + +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif } #endif /* CONFIG_PPC64 */ -- cgit v1.2.3-59-g8ed1b From 701101865f5d3e268281ce7a254eb4a97d16cbdc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:50 +0530 Subject: powerpc/mm: Reduce memory usage for mm_context_t for radix Currently, our mm_context_t on book3s64 includes all hash-specific context details like the slice mask and subpage protection details. We can skip allocating these with radix translation, which will help us save 8K per mm_context. With the patch applied we have:
sizeof(mm_context_t) = 136
sizeof(struct hash_mm_context) = 8288
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 33 +++++++++++++++++- arch/powerpc/include/asm/book3s/64/mmu.h | 49 +++++++-------------------- arch/powerpc/kernel/setup-common.c | 6 ++++ arch/powerpc/mm/hash_utils_64.c | 4 ++- arch/powerpc/mm/mmu_context_book3s64.c | 16 ++++++++- 5 files changed, 68 insertions(+), 40 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eb36fbfe4ef5..4481bedbb5be 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -658,7 +658,7 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT @@ -693,6 +693,37 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ +/* + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size.
+ */ +struct slice_mask { + u64 low_slices; + DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); +}; + +struct hash_mm_context { + u16 user_psize; /* page size index */ + + /* SLB page size encodings*/ + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long slb_addr_limit; +#ifdef CONFIG_PPC_64K_PAGES + struct slice_mask mask_64k; +#endif + struct slice_mask mask_4k; +#ifdef CONFIG_HUGETLB_PAGE + struct slice_mask mask_16m; + struct slice_mask mask_16g; +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +}; + #if 0 /* * The code below is equivalent to this function for arguments diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c9f317090620..e510e46b07ce 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -104,16 +104,6 @@ struct spinlock; /* Maximum possible number of NPUs in a system. */ #define NV_MAX_NPUS 8 -/* - * One bit per slice. We have lower slices which cover 256MB segments - * upto 4G range. That gets us 16 low slices. For the rest we track slices - * in 1TB size. - */ -struct slice_mask { - u64 low_slices; - DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); -}; - typedef struct { union { /* @@ -127,7 +117,6 @@ typedef struct { mm_context_id_t id; mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE]; }; - u16 user_psize; /* page size index */ /* Number of bits in the mm_cpumask */ atomic_t active_cpus; @@ -137,23 +126,9 @@ typedef struct { /* NPU NMMU context */ struct npu_context *npu_context; + struct hash_mm_context *hash_context; - /* SLB page size encodings*/ - unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long slb_addr_limit; -# ifdef CONFIG_PPC_64K_PAGES - struct slice_mask mask_64k; -# endif - struct slice_mask mask_4k; -# ifdef CONFIG_HUGETLB_PAGE - struct slice_mask mask_16m; - struct slice_mask mask_16g; -# endif unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ /* * pagetable fragment support */ @@ -176,62 +151,62 @@ typedef struct { static inline u16 mm_ctx_user_psize(mm_context_t *ctx) { - return ctx->user_psize; + return ctx->hash_context->user_psize; } static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) { - ctx->user_psize = user_psize; + ctx->hash_context->user_psize = user_psize; } static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) { - return ctx->low_slices_psize; + return ctx->hash_context->low_slices_psize; } static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) { - return ctx->high_slices_psize; + return ctx->hash_context->high_slices_psize; } static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) { - return ctx->slb_addr_limit; + return ctx->hash_context->slb_addr_limit; } static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) { - ctx->slb_addr_limit = limit; + ctx->hash_context->slb_addr_limit = limit; } #ifdef CONFIG_PPC_64K_PAGES static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) { - return &ctx->mask_64k; + return &ctx->hash_context->mask_64k; } #endif static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) { - return &ctx->mask_4k; + return &ctx->hash_context->mask_4k; } #ifdef CONFIG_HUGETLB_PAGE static inline struct slice_mask 
*mm_ctx_slice_mask_16m(mm_context_t *ctx) { - return &ctx->mask_16m; + return &ctx->hash_context->mask_16m; } static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) { - return &ctx->mask_16g; + return &ctx->hash_context->mask_16g; } #endif #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->spt; + return &ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a07de8608484..21b1ce200b22 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,6 +947,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif + #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fe2fc43a8664..27239a076773 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -963,6 +963,7 @@ void __init hash__early_init_devtree(void) htab_scan_page_sizes(); } +struct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void) { #ifndef CONFIG_PPC_64K_PAGES @@ -1036,7 +1037,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f720c5cc0b5e..6eef5a36b2e9 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,6 +63,12 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0) return index; + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + if (!mm->context.hash_context) { + ida_free(&mmu_context_ida, index); + return -ENOMEM; + } + /* * The old code would re-promote on fork, we don't do that when using * slices as it could cause problem promoting slices that have been @@ -77,8 +83,14 @@ static int hash__init_new_context(struct mm_struct *mm) * We should not be calling init_new_context() on init_mm. Hence a * check against 0 is OK. */ - if (mm->context.id == 0) + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); slice_init_new_context_exec(mm); + } else { + /* This is fork. 
Copy hash_context details from current->mm */ + memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); + + } subpage_prot_init_new_context(mm); @@ -118,6 +130,7 @@ static int radix__init_new_context(struct mm_struct *mm) asm volatile("ptesync;isync" : : : "memory"); mm->context.npu_context = NULL; + mm->context.hash_context = NULL; return index; } @@ -162,6 +175,7 @@ static void destroy_contexts(mm_context_t *ctx) if (context_id) ida_free(&mmu_context_ida, context_id); } + kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) -- cgit v1.2.3-59-g8ed1b From ef629cc5bf0543eb57d6e344ba776880ac35fd00 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:51 +0530 Subject: powerpc/mm/hash: Reduce hash_mm_context size Allocate subpage protection related variables only if we use the feature. This helps reduce the hash-related mm context struct by around 4K.
Before the patch: sizeof(struct hash_mm_context) = 8288
After the patch: sizeof(struct hash_mm_context) = 4160
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +--- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 3 +++ arch/powerpc/mm/mmu_context_book3s64.c | 17 +++++++++++++--- arch/powerpc/mm/subpage-prot.c | 28 ++++++++++++++++++++------- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 4481bedbb5be..eeb40091b46b 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -687,10 +687,8 @@ struct subpage_prot_table { #define SBP_L3_SHIFT (SBP_L2_SHIFT + SBP_L2_BITS) extern void subpage_prot_free(struct mm_struct *mm); -extern void subpage_prot_init_new_context(struct mm_struct *mm); #else static inline void subpage_prot_free(struct mm_struct *mm) {} -static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ /* @@ -720,7 +718,7 @@ struct hash_mm_context { #endif #ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; + struct subpage_prot_table *spt; #endif /* CONFIG_PPC_SUBPAGE_PROT */ }; diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index e510e46b07ce..230a9dec7677 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -206,7 +206,7 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->hash_context->spt; + return ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 27239a076773..6a2d315495a3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1150,6 +1150,9 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) u32 spp = 0; u32 **sbpm, *sbpp; + if (!spt) + return 0; + if (ea >= spt->maxaddr) return 0; if (ea < 0x100000000UL) { diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 6eef5a36b2e9..cb2b08635508 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,7 +63,8 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0)
return index; - mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); if (!mm->context.hash_context) { ida_free(&mmu_context_ida, index); return -ENOMEM; @@ -89,11 +90,21 @@ static int hash__init_new_context(struct mm_struct *mm) } else { /* This is fork. Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot detalis if we have one. */ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif } - subpage_prot_init_new_context(mm); - pkey_mm_init(mm); return index; } diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c72252542210..c9dff4e1f295 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -29,6 +29,9 @@ void subpage_prot_free(struct mm_struct *mm) unsigned long i, j, addr; u32 **p; + if (!spt) + return; + for (i = 0; i < 4; ++i) { if (spt->low_prot[i]) { free_page((unsigned long)spt->low_prot[i]); @@ -48,13 +51,7 @@ void subpage_prot_free(struct mm_struct *mm) free_page((unsigned long)p); } spt->maxaddr = 0; -} - -void subpage_prot_init_new_context(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - - memset(spt, 0, sizeof(*spt)); + kfree(spt); } static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, @@ -99,6 +96,9 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) size_t nw; unsigned long next, limit; + if (!spt) + return ; + down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) @@ -218,6 +218,20 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, return -EFAULT; down_write(&mm->mmap_sem); + + if (!spt) { + /* + * Allocate subpage prot table if not already done. 
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); -- cgit v1.2.3-59-g8ed1b From a35a3c6f60657812366fca86a9ce71df1b8f7aff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:13 +0530 Subject: powerpc/mm/hash64: Add a variable to track the end of IO mapping This makes it easy to update the region mapping in the later patch Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 3 ++- arch/powerpc/include/asm/book3s/64/pgtable.h | 8 +++++--- arch/powerpc/include/asm/book3s/64/radix.h | 1 + arch/powerpc/mm/hash_utils_64.c | 1 + arch/powerpc/mm/pgtable-radix.c | 1 + arch/powerpc/mm/pgtable_64.c | 2 ++ 6 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 54b7af6cd27f..8cbc4106d449 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -69,7 +69,8 @@ #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) /* * Region IDs diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e3d18b3f6e5d..f8ab18f77d1b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -277,9 +277,12 @@ extern unsigned long __vmalloc_end; extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; +extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start +#define KERN_IO_END __kernel_io_end + extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -296,8 +299,7 @@ extern unsigned long pci_io_base; #include /* - * The second half of the kernel virtual space is used for IO mappings, - * it's itself carved into the PIO region (ISA and PHB IO space) and + * IO space itself carved into the PIO region (ISA and PHB IO space) and * the ioremap space * * ISA_IO_BASE = KERN_IO_START, 64K reserved area @@ -310,7 +312,7 @@ extern unsigned long pci_io_base; #define PHB_IO_BASE (ISA_IO_END) #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) -#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) +#define IOREMAP_END (KERN_IO_END) /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 5ab134eeed20..6d760a083d62 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -111,6 +111,7 @@ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) #define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) +#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git 
a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6a2d315495a3..f6ffe8545717 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1013,6 +1013,7 @@ void __init hash__early_init_mmu(void) __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 45869fd698a0..4c1a9843d0f2 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -605,6 +605,7 @@ void __init radix__early_init_mmu(void) __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index fb1375c07e8c..7cea39bdf05f 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -98,6 +98,8 @@ unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); +unsigned long __kernel_io_end; +EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; -- cgit v1.2.3-59-g8ed1b From 0034d395f89d9c092bb15adbabdca5283e258b41 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:14 +0530 Subject: powerpc/mm/hash64: Map all the kernel regions in the same 0xc range This patch maps the vmalloc, IO and vmemmap regions in the 0xc address range instead of the current 0xd and 0xf range. This brings the mapping closer to radix translation mode. With the hash 64K page size, each of these regions is 512TB, whereas with the 4K config we are limited by the max page table range of 64TB and hence these regions are of 16TB size.
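As a quick check of the constants introduced below (arithmetic only, not text from the patch):

	4K pages:  H_KERN_MAP_SIZE = 1 << (46 - 2) = 2^44 bytes = 16TB
	           (four 16TB map areas fit in the 64TB page table range)
	64K pages: H_KERN_MAP_SIZE = 1 << 49 = 2^49 bytes = 512TB
	           (one full context per map area)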
The kernel mapping is now: On 4K hash kernel_region_map_size = 16TB kernel vmalloc start = 0xc000100000000000 kernel IO start = 0xc000200000000000 kernel vmemmap start = 0xc000300000000000 64K hash, 64K radix and 4k radix: kernel_region_map_size = 512TB kernel vmalloc start = 0xc008000000000000 kernel IO start = 0xc00a000000000000 kernel vmemmap start = 0xc00c000000000000 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 13 ++++ arch/powerpc/include/asm/book3s/64/hash-64k.h | 11 ++++ arch/powerpc/include/asm/book3s/64/hash.h | 95 +++++++++++++++++---------- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 31 +++++---- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/radix.h | 41 ++++++------ arch/powerpc/include/asm/page.h | 3 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/mm/copro_fault.c | 14 ++-- arch/powerpc/mm/hash_utils_64.c | 26 ++++---- arch/powerpc/mm/pgtable-radix.c | 7 +- arch/powerpc/mm/pgtable_64.c | 2 - arch/powerpc/mm/ptdump/hashpagetable.c | 2 +- arch/powerpc/mm/ptdump/ptdump.c | 3 +- arch/powerpc/mm/slb.c | 22 ++++--- arch/powerpc/platforms/cell/spu_base.c | 4 +- drivers/misc/cxl/fault.c | 2 +- drivers/misc/ocxl/link.c | 2 +- 18 files changed, 172 insertions(+), 109 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 54fab723a8c7..4c9dfd625461 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,6 +13,19 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 +/* + * Our page table limit us to 64TB. Hence for the kernel mapping, + * each MAP area is limited to 16 TB. + * The four map areas are: linear mapping, vmap, IO and vmemmap + */ +#define H_KERN_MAP_SIZE (ASM_CONST(1) << (MAX_EA_BITS_PER_CONTEXT - 2)) + +/* + * Define the address range of the kernel non-linear virtual area + * 16TB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc000100000000000) + #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 81f4eb6e7da4..0d0191cda050 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -14,6 +14,17 @@ */ #define MAX_EA_BITS_PER_CONTEXT 49 +/* + * We use one context for each MAP area. + */ +#define H_KERN_MAP_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) + +/* + * Define the address range of the kernel non-linear virtual area + * 2PB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc008000000000000) + /* * 64k aligned address free up few of the lower bits of RPN for us * We steal that here. For more deatils look at pte_pfn/pfn_pte() diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 8cbc4106d449..76741a221910 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -29,6 +29,10 @@ #define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) #define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) +/* + * Top 2 bits are ignored in page table walk. + */ +#define EA_MASK (~(0xcUL << 60)) /* * We store the slot details in the second half of page table. 
@@ -42,53 +46,56 @@ #endif /* - * Define the address range of the kernel non-linear virtual area. In contrast - * to the linear mapping, this is managed using the kernel page tables and then - * inserted into the hash page table to actually take effect, similarly to user - * mappings. + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel virtual map end (0xc00e000000000000) + * | | + * | | + * | 512TB/16TB of vmemmap | + * | | + * | | + * +------------------------------+ Kernel vmemmap start + * | | + * | 512TB/16TB of IO map | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 512TB/16TB of vmap | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) */ -#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -/* - * Allow virtual mapping of one context size. - * 512TB for 64K page size - * 64TB for 4K page size - */ -#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE H_KERN_MAP_SIZE +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -/* - * 8TB IO mapping size - */ -#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ - -/* - * The vmalloc space starts at the beginning of the kernel non-linear virtual - * region, and occupies 504T (64K) or 56T (4K) - */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_SIZE H_KERN_MAP_SIZE +#define H_KERN_IO_END (H_KERN_IO_START + H_KERN_IO_SIZE) -#define H_KERN_IO_START H_VMALLOC_END -#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) +#define H_VMEMMAP_START H_KERN_IO_END +#define H_VMEMMAP_SIZE H_KERN_MAP_SIZE +#define H_VMEMMAP_END (H_VMEMMAP_START + H_VMEMMAP_SIZE) /* * Region IDs */ -#define REGION_SHIFT 60UL -#define REGION_MASK (0xfUL << REGION_SHIFT) -#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) - -#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) -#define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ -#define USER_REGION_ID (0UL) +#define USER_REGION_ID 1 +#define KERNEL_REGION_ID 2 +#define VMALLOC_REGION_ID 3 +#define IO_REGION_ID 4 +#define VMEMMAP_REGION_ID 5 /* * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ -#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA @@ -104,6 +111,26 @@ #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ +static inline int get_region_id(unsigned long ea) +{ + int id = (ea >> 60UL); + + if (id == 0) + return USER_REGION_ID; + + VM_BUG_ON(id != 0xc); + VM_BUG_ON(ea >= H_VMEMMAP_END); + + if (ea >= H_VMEMMAP_START) + return VMEMMAP_REGION_ID; + else if (ea >= H_KERN_IO_START) + return IO_REGION_ID; + else if (ea >= H_VMALLOC_START) + return VMALLOC_REGION_ID; + + return KERNEL_REGION_ID; +} + #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) static inline int hash__pgd_bad(pgd_t pgd) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eeb40091b46b..8a30bf189f10 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -588,7 +588,8 @@ extern void slb_set_size(u16 size); #endif #define MAX_VMALLOC_CTX_CNT 1 -#define MAX_MEMMAP_CTX_CNT 1 +#define MAX_IO_CTX_CNT 1 +#define MAX_VMEMMAP_CTX_CNT 1 /* * 256MB segment @@ -601,13 +602,10 @@ extern void slb_set_size(u16 size); * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 * because of the modulo operation in vsid scramble. * - * We add one extra context to MIN_USER_CONTEXT so that we can map kernel - * context easily. The +1 is to map the unused 0xe region mapping. */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) #define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \ - MAX_MEMMAP_CTX_CNT + 2) - + MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT) /* * For platforms that support on 65bit VA we limit the context bits */ @@ -776,7 +774,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) + if ((ea & EA_MASK) >= H_PGTABLE_RANGE) return 0; if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) @@ -803,28 +801,29 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, * 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff] * 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff] * 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff] - - * 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ] - * 0x00006 - Not used - Can map 0xe000000000000000 range. - * 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ] * - * So we can compute the context from the region (top nibble) by - * subtracting 11, or 0xc - 1. + * vmap, IO, vmemap + * + * 0x00005 - [ 0xc008000000000000 - 0xc009ffffffffffff] + * 0x00006 - [ 0xc00a000000000000 - 0xc00bffffffffffff] + * 0x00007 - [ 0xc00c000000000000 - 0xc00dffffffffffff] + * */ static inline unsigned long get_kernel_context(unsigned long ea) { - unsigned long region_id = REGION_ID(ea); + unsigned long region_id = get_region_id(ea); unsigned long ctx; /* - * For linear mapping we do support multiple context + * Depending on Kernel config, kernel region can have one context + * or more. */ if (region_id == KERNEL_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. 
*/ - ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT); + ctx = 1 + ((ea & EA_MASK) >> MAX_EA_BITS_PER_CONTEXT); } else - ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT; + ctx = region_id + MAX_KERNEL_CTX_CNT - 2; return ctx; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f8ab18f77d1b..7dede2e34b70 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -279,7 +279,6 @@ extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start -#define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start #define KERN_IO_END __kernel_io_end diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 6d760a083d62..574eca33f893 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -72,19 +72,17 @@ * | | * | | * | | - * +------------------------------+ Kernel IO map end (0xc010000000000000) + * +------------------------------+ Kernel vmemmap end (0xc010000000000000) * | | + * | 512TB | * | | - * | 1/2 of virtual map | + * +------------------------------+ Kernel IO map end/vmemap start * | | + * | 512TB | * | | - * +------------------------------+ Kernel IO map start + * +------------------------------+ Kernel vmap end/ IO map start * | | - * | 1/4 of virtual map | - * | | - * +------------------------------+ Kernel vmemap start - * | | - * | 1/4 of virtual map | + * | 512TB | * | | * +------------------------------+ Kernel virt start (0xc008000000000000) * | | @@ -93,25 +91,24 @@ * +------------------------------+ Kernel linear (0xc.....) */ -#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) -#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) - +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) /* - * The vmalloc space starts at the beginning of that region, and - * occupies a quarter of it on radix config. - * (we keep a quarter for the virtual memmap) + * 49 = MAX_EA_BITS_PER_CONTEXT (hash specific). To make sure we pick + * the same value as hash. */ +#define RADIX_KERN_MAP_SIZE (1UL << 49) + #define RADIX_VMALLOC_START RADIX_KERN_VIRT_START -#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_SIZE RADIX_KERN_MAP_SIZE #define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) -/* - * Defines the address of the vmemap area, in its own region on - * hash table CPUs. 
- */ -#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) -#define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) -#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) +#define RADIX_KERN_IO_START RADIX_VMALLOC_END +#define RADIX_KERN_IO_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_KERN_IO_END (RADIX_KERN_IO_START + RADIX_KERN_IO_SIZE) + +#define RADIX_VMEMMAP_START RADIX_KERN_IO_END +#define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ed870468ef6f..918228f2205b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -139,7 +139,8 @@ static inline bool pfn_valid(unsigned long pfn) * return true for some vmalloc addresses, which is incorrect. So explicitly * check that the address is in the kernel region. */ -#define virt_addr_valid(kaddr) (REGION_ID(kaddr) == KERNEL_REGION_ID && \ +/* may be can drop get_region_id */ +#define virt_addr_valid(kaddr) (get_region_id((unsigned long)kaddr) == KERNEL_REGION_ID && \ pfn_valid(virt_to_pfn(kaddr))) #else #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3b9662a4207e..085509148d95 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -822,7 +822,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr) raddr = per_cpu_ptr(addr, cpu); l = (unsigned long)raddr; - if (REGION_ID(l) == VMALLOC_REGION_ID) { + if (get_region_id(l) == VMALLOC_REGION_ID) { l = vmalloc_to_phys(raddr); raddr = (unsigned int *)l; } diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c8da352e8686..9b0321061bc8 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -105,7 +105,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) u64 vsid, vsidkey; int psize, ssize; - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); if (mm == NULL) @@ -117,10 +117,14 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case IO_REGION_ID: + pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f6ffe8545717..9c4ae4aa133e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1009,12 +1009,11 @@ void __init hash__early_init_mmu(void) __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; - __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; __kernel_io_end = H_KERN_IO_END; - vmemmap = (struct page *)H_VMEMMAP_BASE; + vmemmap = (struct page *)H_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; 
#ifdef CONFIG_PCI @@ -1241,7 +1240,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, trace_hash_fault(ea, access, trap); /* Get region & vsid */ - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: user_region = 1; if (! mm) { @@ -1255,10 +1254,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; break; default: @@ -1424,7 +1426,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, unsigned long flags = 0; struct mm_struct *mm = current->mm; - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1440,8 +1443,9 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1458,7 +1462,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, * 2) user space access kernel space. */ access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) @@ -1502,7 +1506,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0); - BUG_ON(REGION_ID(ea) != USER_REGION_ID); + BUG_ON(get_region_id(ea) != USER_REGION_ID); if (!should_hash_preload(mm, ea)) return; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4c1a9843d0f2..4d9fa9e900d5 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -136,6 +136,10 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + if (unlikely(!slab_is_available())) return early_map_kernel_page(ea, pa, flags, map_page_size, nid, region_start, region_end); @@ -601,12 +605,11 @@ void __init radix__early_init_mmu(void) __pgd_val_bits = RADIX_PGD_VAL_BITS; __kernel_virt_start = RADIX_KERN_VIRT_START; - __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + vmemmap = (struct page *)RADIX_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PCI diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 7cea39bdf05f..56068cac2a3c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -90,8 +90,6 @@ unsigned long __pgd_val_bits; EXPORT_SYMBOL(__pgd_val_bits); unsigned long __kernel_virt_start; EXPORT_SYMBOL(__kernel_virt_start); -unsigned long __kernel_virt_size; -EXPORT_SYMBOL(__kernel_virt_size); unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b430e4e08af6..b9bda0105841 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[9].start_address = H_VMEMMAP_BASE; + address_markers[9].start_address = H_VMEMMAP_START; #else address_markers[9].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 37138428ab55..63fc56feea15 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -303,8 +303,9 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; + /* What is the ifdef about? 
*/ #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[i++].start_address = H_VMEMMAP_BASE; + address_markers[i++].start_address = H_VMEMMAP_START; #else address_markers[i++].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 78c0c0a0e355..721cb09c9044 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -694,7 +694,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (id == KERNEL_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; @@ -702,20 +702,25 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) #ifdef CONFIG_SPARSEMEM_VMEMMAP } else if (id == VMEMMAP_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMEMMAP_END) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; #endif } else if (id == VMALLOC_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMALLOC_END) return -EFAULT; - if (ea < H_VMALLOC_END) - flags = local_paca->vmalloc_sllp; - else - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + flags = local_paca->vmalloc_sllp; + + } else if (id == IO_REGION_ID) { + + if (ea >= H_KERN_IO_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { return -EFAULT; } @@ -725,6 +730,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ssize = MMU_SEGSIZE_256M; context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); } @@ -761,7 +767,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) long do_slb_fault(struct pt_regs *regs, unsigned long ea) { - unsigned long id = REGION_ID(ea); + unsigned long id = get_region_id(ea); /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 7f12c7b78c0f..4770cce1bfe2 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -194,7 +194,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) * faults need to be deferred to process context. 
*/ if ((dsisr & MFC_DSISR_PTE_NOT_FOUND) && - (REGION_ID(ea) != USER_REGION_ID)) { + (get_region_id(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); ret = hash_page(ea, @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (REGION_ID(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == KERNEL_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index dc7b34174f85..a4d17a5a9763 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -168,7 +168,7 @@ int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; - if (!mm && (REGION_ID(dar) != USER_REGION_ID)) + if (!mm && (get_region_id(dar) != USER_REGION_ID)) access |= _PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index d50b861d7e57..04ec3d74f828 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -163,7 +163,7 @@ static void xsl_fault_handler_bh(struct work_struct *fault_work) if (fault->dsisr & SPA_XSL_S) access |= _PAGE_WRITE; - if (REGION_ID(fault->dar) != USER_REGION_ID) + if (get_region_id(fault->dar) != USER_REGION_ID) access |= _PAGE_PRIVILEGED; local_irq_save(flags); -- cgit v1.2.3-59-g8ed1b From e09093927e54323018dbb3bf6189c85a7f176bae Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:15 +0530 Subject: powerpc/mm: Validate address values against different region limits This adds an explicit check in various functions. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 18 +++++++++++++++--- arch/powerpc/mm/pgtable-hash64.c | 13 ++++++++++--- arch/powerpc/mm/pgtable-radix.c | 16 ++++++++++++++++ arch/powerpc/mm/pgtable_64.c | 5 +++++ 4 files changed, 46 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 9c4ae4aa133e..f727197de713 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -781,9 +781,16 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size) int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { - int rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize); + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, @@ -924,6 +931,11 @@ static void __init htab_initialize(void) DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + continue; + } + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index c08d49046a96..d934de4e2b3a 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -112,9 +112,16 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { - int rc = htab_bolt_mapping(start, start + page_size, 
phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outisde the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, start + page_size, mmu_vmemmap_psize, diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4d9fa9e900d5..e6d5065b0bc8 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -339,6 +339,12 @@ void __init radix_init_pgtable(void) * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + continue; + } + WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size, -1)); @@ -895,6 +901,11 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + return -1; + } + return create_physical_mapping(start, end, nid); } @@ -922,6 +933,11 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outisde the supported range\n"); + return -1; + } + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); BUG_ON(ret); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 56068cac2a3c..72f58c076e26 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -121,6 +121,11 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outisde the supported range\n"); + return NULL; + } + WARN_ON(pa & ~PAGE_MASK); WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); -- cgit v1.2.3-59-g8ed1b From 5f53d28608f600d9ee07378453bd2d49e132fff4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:19 +0530 Subject: powerpc/mm/hash: Rename KERNEL_REGION_ID to LINEAR_MAP_REGION_ID The region actually points to the linear map. Rename the #define to clarify that.
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 4 ++-- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/mm/copro_fault.c | 4 ++-- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/platforms/cell/spu_base.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 7faa3d7214c0..1d1183048cfd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -89,7 +89,7 @@ * Region IDs */ #define USER_REGION_ID 0 -#define KERNEL_REGION_ID 1 +#define LINEAR_MAP_REGION_ID 1 #define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) #define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) #define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) @@ -120,7 +120,7 @@ static inline int get_region_id(unsigned long ea) return USER_REGION_ID; if (ea < H_KERN_VIRT_START) - return KERNEL_REGION_ID; + return LINEAR_MAP_REGION_ID; VM_BUG_ON(id != 0xc); BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 9a9adbeef070..1e4705516a54 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -817,7 +817,7 @@ static inline unsigned long get_kernel_context(unsigned long ea) * Depending on Kernel config, kernel region can have one context * or more. */ - if (region_id == KERNEL_REGION_ID) { + if (region_id == LINEAR_MAP_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. */ diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 9b0321061bc8..f137286740cb 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -129,8 +129,8 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; break; - case KERNEL_REGION_ID: - pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + case LINEAR_MAP_REGION_ID: + pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 721cb09c9044..89e4531de64b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -691,7 +691,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) unsigned long flags; int ssize; - if (id == KERNEL_REGION_ID) { + if (id == LINEAR_MAP_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) @@ -790,7 +790,7 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) * first class kernel code. But for performance it's probably nicer * if they go via fast_exception_return too. */ - if (id >= KERNEL_REGION_ID) { + if (id >= LINEAR_MAP_REGION_ID) { long err; #ifdef CONFIG_DEBUG_VM /* Catch recursive kernel SLB faults. 
*/ diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 4770cce1bfe2..6646f152d57b 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (get_region_id(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == LINEAR_MAP_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; -- cgit v1.2.3-59-g8ed1b From 26ad26718dfaa7cf49d106d212ebf2370076c253 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 30 Mar 2019 11:13:45 +0530 Subject: powerpc/mm: Fix section mismatch warning This patch fix the below section mismatch warnings. WARNING: vmlinux.o(.text+0x2d1f44): Section mismatch in reference from the function devm_memremap_pages_release() to the function .meminit.text:arch_remove_memory() WARNING: vmlinux.o(.text+0x2d265c): Section mismatch in reference from the function devm_memremap_pages() to the function .meminit.text:arch_add_memory() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3665602a9dfa..e12bec98366f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -131,8 +131,8 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap * } #ifdef CONFIG_MEMORY_HOTREMOVE -int __meminit arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +int __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; -- cgit v1.2.3-59-g8ed1b From f341d89790b0b7f99ca7835e0cf7de1026ceae39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Apr 2019 16:10:17 +0100 Subject: powerpc/mm: fix spelling mistake "Outisde" -> "Outside" There are several identical spelling mistakes in warning messages, fix these. 
Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 4 ++-- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable-radix.c | 6 +++--- arch/powerpc/mm/pgtable_64.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f727197de713..6eb89643ce58 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -784,7 +784,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid int rc; if (end >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -932,7 +932,7 @@ static void __init htab_initialize(void) base, size, prot); if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index d934de4e2b3a..097a3b3538b1 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -115,7 +115,7 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, int rc; if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index e6d5065b0bc8..fcb0169e2d32 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -341,7 +341,7 @@ void __init radix_init_pgtable(void) */ if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } @@ -902,7 +902,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { if (end >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -934,7 +934,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int ret; if ((start + page_size) >= RADIX_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 72f58c076e26..95ad2a09501c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -122,7 +122,7 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ return NULL; if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return NULL; } -- cgit v1.2.3-59-g8ed1b From b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Tue, 2 Oct 2018 10:35:59 -0500 Subject: powerpc/pseries: Track LMB nid instead of using device tree When removing memory we need to remove the memory from the node it was added to, instead of looking up the node the device tree says it should be in. During testing we have seen scenarios where the affinity for an LMB changes due to a partition migration or PRRN event. In these cases the node the LMB exists in may not match the node the device tree indicates it belongs in. This can lead to a system crash when trying to DLPAR remove the LMB after a migration or PRRN event.
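A toy model of that mismatch (hypothetical names, not the kernel code; it only mimics the device-tree lookup going stale between add and remove):

#include <assert.h>

static int dt_nid; /* stand-in for the device-tree derived affinity */

struct lmb { unsigned long base; int nid; };

static int lookup_nid(unsigned long base) { (void)base; return dt_nid; }

int main(void)
{
        struct lmb lmb = { .base = 0x10000000UL };

        lmb.nid = lookup_nid(lmb.base); /* add time: memory onlined on node 0 */
        dt_nid = 1;                     /* migration/PRRN rewrites the affinity */

        /* a fresh lookup at remove time now names a node the memory was
         * never added to, so removal must use the cached lmb.nid instead */
        assert(lookup_nid(lmb.base) != lmb.nid);
        return 0;
}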
The current code looks up the node in the device tree to remove the LMB from, the crash occurs when we try to offline this node and it does not have any data, i.e. node_data[nid] == NULL. 36:mon> e cpu 0x36: Vector: 300 (Data Access) at [c0000001828b7810] pc: c00000000036d08c: try_offline_node+0x2c/0x1b0 lr: c0000000003a14ec: remove_memory+0xbc/0x110 sp: c0000001828b7a90 msr: 800000000280b033 dar: 9a28 dsisr: 40000000 current = 0xc0000006329c4c80 paca = 0xc000000007a55200 softe: 0 irq_happened: 0x01 pid = 76926, comm = kworker/u320:3 36:mon> t [link register ] c0000000003a14ec remove_memory+0xbc/0x110 [c0000001828b7a90] c00000000006a1cc arch_remove_memory+0x9c/0xd0 (unreliable) [c0000001828b7ad0] c0000000003a14e0 remove_memory+0xb0/0x110 [c0000001828b7b20] c0000000000c7db4 dlpar_remove_lmb+0x94/0x160 [c0000001828b7b60] c0000000000c8ef8 dlpar_memory+0x7e8/0xd10 [c0000001828b7bf0] c0000000000bf828 handle_dlpar_errorlog+0xf8/0x160 [c0000001828b7c60] c0000000000bf8cc pseries_hp_work_fn+0x3c/0xa0 [c0000001828b7c90] c000000000128cd8 process_one_work+0x298/0x5a0 [c0000001828b7d20] c000000000129068 worker_thread+0x88/0x620 [c0000001828b7dc0] c00000000013223c kthread+0x1ac/0x1c0 [c0000001828b7e30] c00000000000b45c ret_from_kernel_thread+0x5c/0x80 To resolve this we need to track the node a LMB belongs to when it is added to the system so we can remove it from that node instead of the node that the device tree indicates it should belong to. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/drmem.h | 21 +++++++++++++++++++++ arch/powerpc/mm/drmem.c | 6 +++++- arch/powerpc/platforms/pseries/hotplug-memory.c | 17 ++++++++--------- 3 files changed, 34 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 7c1d8e74b25d..7f3279b014db 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -17,6 +17,9 @@ struct drmem_lmb { u32 drc_index; u32 aa_index; u32 flags; +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; +#endif }; struct drmem_lmb_info { @@ -104,4 +107,22 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) lmb->aa_index = 0xffffffff; } +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ + lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr); +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ + lmb->nid = -1; +} +#else +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ +} +#endif + #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 3f1803672c9b..641891df2046 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -366,8 +366,10 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) + for_each_drmem_lmb(lmb) { read_drconf_v1_cell(lmb, &prop); + lmb_set_nid(lmb); + } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -412,6 +414,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; + + lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index d291b618a559..47087832f8b2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -379,7 +379,7 @@ static int 
dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (!lmb_is_removable(lmb)) return -EINVAL; @@ -389,14 +389,14 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) return rc; block_sz = pseries_memory_block_size(); - nid = memory_add_physaddr_to_nid(lmb->base_addr); - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -653,7 +653,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; @@ -664,13 +664,11 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) return rc; } + lmb_set_nid(lmb); block_sz = memory_block_size_bytes(); - /* Find the node id for this address */ - nid = memory_add_physaddr_to_nid(lmb->base_addr); - /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz); + rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; @@ -678,8 +676,9 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } -- cgit v1.2.3-59-g8ed1b From 2c474c03505677cfd987d52e8bf42abe8c270529 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Apr 2019 13:29:07 +0530 Subject: powerpc/mm/radix: Fix kernel crash when running subpage protect test This patch fixes the below crash by making sure we touch the subpage protection related structures only if we know they are allocated on the platform. With radix translation we don't allocate hash context at all and trying to access subpage_prot_table results in: Faulting instruction address: 0xc00000000008bdb4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV .... NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590 LR [c00000000000b688] system_call+0x5c/0x70 Call Trace: [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable) [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70 Instruction dump: fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068 39200000 e92d0968 ebe90630 e93f03e8 60000000 3860fffe e9410068 We also move the subpage_prot_table access so that it is done with mmap_sem held, to avoid a race between two parallel subpage_prot syscalls.
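The resulting locking pattern, as a simplified standalone sketch (a pthreads rwlock stands in for mmap_sem; the names are illustrative, not the kernel API):

#include <pthread.h>
#include <stdlib.h>

struct ctx {
        pthread_rwlock_t mmap_sem;
        void *spt; /* lazily allocated subpage protection table */
};

/* Read, and if needed install, the table only while holding the write
 * lock, so two racing callers cannot both see NULL and both allocate. */
static void *get_or_alloc_spt(struct ctx *c)
{
        void *spt;

        pthread_rwlock_wrlock(&c->mmap_sem);
        spt = c->spt;
        if (!spt) {
                spt = calloc(1, 4096);
                c->spt = spt;
        }
        pthread_rwlock_unlock(&c->mmap_sem);
        return spt;
}

int main(void)
{
        struct ctx c = { .mmap_sem = PTHREAD_RWLOCK_INITIALIZER };

        (void)get_or_alloc_spt(&c);
        free(c.spt);
        return 0;
}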
Fixes: 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Tested-by: Sachin Sant Signed-off-by: Michael Ellerman --- arch/powerpc/mm/subpage-prot.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c9dff4e1f295..473dd430e306 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -90,16 +90,18 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) - return ; + goto err_out; - down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) limit = spt->maxaddr; @@ -127,6 +129,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); } + +err_out: up_write(&mm->mmap_sem); } @@ -189,7 +193,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; @@ -219,6 +223,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, down_write(&mm->mmap_sem); + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) { /* * Allocate subpage prot table if not already done. -- cgit v1.2.3-59-g8ed1b From a1ac2a9c4f98482e49305ab5551b7b32f9cac39b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Mar 2019 13:03:45 +0000 Subject: powerpc/book3e: drop BUG_ON() in map_kernel_page() early_alloc_pgtable() never returns NULL as it panics on failure. This patch drops the three BUG_ON() which check that the value returned by early_alloc_pgtable() is non-NULL. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-book3e.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 1032ef7aaf62..390a6d0b216d 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -98,20 +98,17 @@ int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) #ifndef __PAGETABLE_PUD_FOLDED if (pgd_none(*pgdp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); pgd_populate(&init_mm, pgdp, pudp); } #endif /* !__PAGETABLE_PUD_FOLDED */ pudp = pud_offset(pgdp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); -- cgit v1.2.3-59-g8ed1b From 9d9f2cccde952126185e3336af0d4dc62eb254ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 09:59:59 +0000 Subject: powerpc/mm: change #include "mmu_decl.h" to <mm/mmu_decl.h> This patch makes inclusion of mmu_decl.h independent of the location of the file including it.
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/44x_mmu.c | 2 +- arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/dma-noncoherent.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/init_32.c | 2 +- arch/powerpc/mm/init_64.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 2 +- arch/powerpc/mm/pgtable-book3e.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/ppc_mmu_32.c | 2 +- arch/powerpc/mm/tlb_hash32.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index b9cf6f8764b0..460459b6f53e 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -49,7 +49,7 @@ #include #include -#include "mmu_decl.h" +#include extern int __map_without_ltlbs; /* diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index aad127acdbaa..c07983ebc02e 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -31,7 +31,7 @@ #include #include -#include "mmu_decl.h" +#include /* Used by the 44x TLB replacement exception handler. * Just needed it declared someplace. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 87648b58d295..70d55b615b62 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index b5d2658c26af..2f6154b76328 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include /* * This address range defaults to a value that is safe for all diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 210cbc1faf63..71a1a36751dd 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include unsigned int tlbcam_index; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 80cc97cd8878..3eb4cb09749c 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -47,7 +47,7 @@ #include #include -#include "mmu_decl.h" +#include #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) /* The amount of lowmem must be within 0xF0000000 - KERNELBASE. 
*/ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a4c155af1597..45b02fa11cd8 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,7 +66,7 @@ #include #include -#include "mmu_decl.h" +#include phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e12bec98366f..105c58f8900a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include #ifndef CPU_FTR_COHERENT_ICACHE #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 1945c5f19f5e..ae4505d5b4b8 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include /* * The MPC8xx has only 16 contexts. We rotate through them on each task switch. diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 390a6d0b216d..f296c2e88b09 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -15,7 +15,7 @@ #include #include -#include "mmu_decl.h" +#include #ifdef CONFIG_SPARSEMEM_VMEMMAP /* diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index a4341aba0af4..16bda049187a 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include #include unsigned long __pmd_frag_nr; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 097a3b3538b1..1fd025dba4a3 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -19,7 +19,7 @@ #include #include -#include "mmu_decl.h" +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6e56a6240bfa..c9cdbb84d31f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ad2a09501c..95ed76519411 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index bf1de3ca39bc..1db55159031c 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -34,7 +34,7 @@ #include #include -#include "mmu_decl.h" +#include struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index cf8472cf3d59..8d56f0417f87 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -32,7 +32,7 @@ #include #include -#include "mmu_decl.h" +#include /* * Called when unmapping pages to flush entries from the TLB/hash table. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 088e0a6b5ade..704e613a0b14 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -46,7 +46,7 @@ #include #include -#include "mmu_decl.h" +#include /* * This struct lists the sw-supported page sizes. 
The hardawre MMU may support -- cgit v1.2.3-59-g8ed1b From 47d99948eee48a84a4b242c17915a4ff59a29b5d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:00 +0000 Subject: powerpc/mm: Move book3s64 specifics in subdirectory mm/book3s64 Many files in arch/powerpc/mm are only for book3S64. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Update the selftest sym links, shorten new filenames, cleanup some whitespace and formatting in the new files.] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 25 +- arch/powerpc/mm/book3s64/Makefile | 24 + arch/powerpc/mm/book3s64/hash_4k.c | 124 ++ arch/powerpc/mm/book3s64/hash_64k.c | 333 +++++ arch/powerpc/mm/book3s64/hash_hugepage.c | 191 +++ arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 152 ++ arch/powerpc/mm/book3s64/hash_native.c | 884 ++++++++++++ arch/powerpc/mm/book3s64/hash_pgtable.c | 463 ++++++ arch/powerpc/mm/book3s64/hash_tlb.c | 265 ++++ arch/powerpc/mm/book3s64/hash_utils.c | 1946 ++++++++++++++++++++++++++ arch/powerpc/mm/book3s64/iommu_api.c | 482 +++++++ arch/powerpc/mm/book3s64/mmu_context.c | 263 ++++ arch/powerpc/mm/book3s64/pgtable.c | 449 ++++++ arch/powerpc/mm/book3s64/pkeys.c | 428 ++++++ arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 110 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 1124 +++++++++++++++ arch/powerpc/mm/book3s64/radix_tlb.c | 1101 +++++++++++++++ arch/powerpc/mm/book3s64/slb.c | 833 +++++++++++ arch/powerpc/mm/book3s64/subpage_prot.c | 289 ++++ arch/powerpc/mm/book3s64/vphn.c | 73 + arch/powerpc/mm/book3s64/vphn.h | 16 + arch/powerpc/mm/hash64_4k.c | 124 -- arch/powerpc/mm/hash64_64k.c | 333 ----- arch/powerpc/mm/hash_native_64.c | 884 ------------ arch/powerpc/mm/hash_utils_64.c | 1930 ------------------------- arch/powerpc/mm/hugepage-hash64.c | 191 --- arch/powerpc/mm/hugetlbpage-hash64.c | 147 -- arch/powerpc/mm/hugetlbpage-radix.c | 110 -- arch/powerpc/mm/mmu_context_book3s64.c | 263 ---- arch/powerpc/mm/mmu_context_iommu.c | 482 ------- arch/powerpc/mm/numa.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 449 ------ arch/powerpc/mm/pgtable-hash64.c | 463 ------ arch/powerpc/mm/pgtable-radix.c | 1124 --------------- arch/powerpc/mm/pkeys.c | 428 ------ arch/powerpc/mm/slb.c | 832 ----------- arch/powerpc/mm/subpage-prot.c | 289 ---- arch/powerpc/mm/tlb-radix.c | 1101 --------------- arch/powerpc/mm/tlb_hash64.c | 259 ---- arch/powerpc/mm/vphn.c | 71 - arch/powerpc/mm/vphn.h | 17 - tools/testing/selftests/powerpc/vphn/vphn.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.h | 2 +- 43 files changed, 9556 insertions(+), 9522 deletions(-) create mode 100644 arch/powerpc/mm/book3s64/Makefile create mode 100644 arch/powerpc/mm/book3s64/hash_4k.c create mode 100644 arch/powerpc/mm/book3s64/hash_64k.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugepage.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/hash_native.c create mode 100644 arch/powerpc/mm/book3s64/hash_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/hash_tlb.c create mode 100644 arch/powerpc/mm/book3s64/hash_utils.c create mode 100644 arch/powerpc/mm/book3s64/iommu_api.c create mode 100644 arch/powerpc/mm/book3s64/mmu_context.c create mode 100644 arch/powerpc/mm/book3s64/pgtable.c create mode 100644 arch/powerpc/mm/book3s64/pkeys.c create mode 100644 arch/powerpc/mm/book3s64/radix_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/radix_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/radix_tlb.c create 
mode 100644 arch/powerpc/mm/book3s64/slb.c create mode 100644 arch/powerpc/mm/book3s64/subpage_prot.c create mode 100644 arch/powerpc/mm/book3s64/vphn.c create mode 100644 arch/powerpc/mm/book3s64/vphn.h delete mode 100644 arch/powerpc/mm/hash64_4k.c delete mode 100644 arch/powerpc/mm/hash64_64k.c delete mode 100644 arch/powerpc/mm/hash_native_64.c delete mode 100644 arch/powerpc/mm/hash_utils_64.c delete mode 100644 arch/powerpc/mm/hugepage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-radix.c delete mode 100644 arch/powerpc/mm/mmu_context_book3s64.c delete mode 100644 arch/powerpc/mm/mmu_context_iommu.c delete mode 100644 arch/powerpc/mm/pgtable-book3s64.c delete mode 100644 arch/powerpc/mm/pgtable-hash64.c delete mode 100644 arch/powerpc/mm/pgtable-radix.c delete mode 100644 arch/powerpc/mm/pkeys.c delete mode 100644 arch/powerpc/mm/slb.c delete mode 100644 arch/powerpc/mm/subpage-prot.c delete mode 100644 arch/powerpc/mm/tlb-radix.c delete mode 100644 arch/powerpc/mm/tlb_hash64.c delete mode 100644 arch/powerpc/mm/vphn.c delete mode 100644 arch/powerpc/mm/vphn.h (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c1bd9fa23cd..a137fdf775e2 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,53 +5,34 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o \ - $(hash64-y) mmu_context_book3s64.o \ - pgtable-book3s64.o pgtable-frag.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o -ifdef CONFIG_PPC_BOOK3S_64 -obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o -obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o -endif +obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o -obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb_nohash.o := n KCOV_INSTRUMENT_fsl_booke_mmu.o := n - -# Instrumenting the SLB fault path can lead to duplicate SLB entries 
-KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 000000000000..974b4fc19f4f --- /dev/null +++ b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + +obj-y += hash_pgtable.o hash_utils.o slb.o \ + mmu_context.o pgtable.o hash_tlb.o +obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c new file mode 100644 index 000000000000..22e787123cdf --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -0,0 +1,124 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * PP bits. 
_PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* + * There MIGHT be an HPTE for this pte + */ + unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, + rpte, 0); + + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, + MMU_PAGE_4K, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_4K, + MMU_PAGE_4K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c new file mode 100644 index 000000000000..7084ce2951e6 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -0,0 +1,333 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +/* + * Return true, if the entry has a slot value which + * the software considers as invalid. 
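As an aside on the helper defined just below: the 4-bit hidx slot encoding reserves one value for software use. The following is a minimal stand-alone sketch (plain user-space C, illustrative only, not kernel code) of that convention:

#include <stdbool.h>
#include <stdio.h>

/* Stand-alone model of the soft-invalid slot test: a 4-bit hidx value of
 * 0xf is reserved by software to mean "no valid HPTE slot recorded", so
 * only 15 of the 16 encodings name real hash table slots. */
static bool hpte_soft_invalid(unsigned long hidx)
{
	return (hidx & 0xfUL) == 0xfUL;
}

int main(void)
{
	for (unsigned long hidx = 0; hidx < 16; hidx++)
		printf("hidx %2lu -> %s\n", hidx,
		       hpte_soft_invalid(hidx) ? "soft-invalid" : "valid slot");
	return 0;
}
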
+ */ +static inline bool hpte_soft_invalid(unsigned long hidx) +{ + return ((hidx & 0xfUL) == 0xfUL); +} + +/* + * index from 0 - 15 + */ +bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) +{ + return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); +} + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned int subpg_index; + unsigned long rflags, pa; + unsigned long old_pte, new_pte, subpg_pte; + unsigned long vpn, hash, slot, gslot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * Handle the subpage protection bits + */ + subpg_pte = new_pte & ~subpg_prot; + rflags = htab_convert_pte_flags(subpg_pte); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } + + subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; + vpn = hpt_vpn(ea, vsid, ssize); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + /* + *None of the sub 4k page is hashed + */ + if (!(old_pte & H_PAGE_HASHPTE)) + goto htab_insert_hpte; + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + if (!(old_pte & H_PAGE_COMBO)) { + flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); + /* + * clear the old slot details from the old and new pte. + * On hash insert failure we use old pte value and we don't + * want slot information there if we have a insert failure. + */ + old_pte &= ~H_PAGE_HASHPTE; + new_pte &= ~H_PAGE_HASHPTE; + goto htab_insert_hpte; + } + /* + * Check for sub page valid and update + */ + if (__rpte_sub_valid(rpte, subpg_index)) { + int ret; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, + subpg_index); + ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, flags); + + /* + * If we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + if (ret == -1) + goto htab_insert_hpte; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; + } + +htab_insert_hpte: + + /* + * Initialize all hidx entries to invalid value, the first time + * the PTE is about to allocate a 4K HPTE. + */ + if (!(old_pte & H_PAGE_COMBO)) + rpte.hidx = INVALID_RPTE_HIDX; + + /* + * handle H_PAGE_4K_PFN case + */ + if (old_pte & H_PAGE_4K_PFN) { + /* + * All the sub 4k page have the same + * physical address. 
+ */ + pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; + } else { + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + pa += (subpg_index << shift); + } + hash = hpt_hash(vpn, shift, ssize); +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + bool soft_invalid; + + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize); + + soft_invalid = hpte_soft_invalid(slot); + if (unlikely(soft_invalid)) { + /* + * We got a valid slot from a hardware point of view. + * but we cannot use it, because we use this special + * value; as defined by hpte_soft_invalid(), to track + * invalid slots. We cannot use it. So invalidate it. + */ + gslot = slot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, 0); + } + + if (unlikely(slot == -1 || soft_invalid)) { + /* + * For soft invalid slot, let's ensure that we release a + * slot from the primary, with the hope that we will + * acquire that slot next time we try. This will ensure + * that we do not get the same soft-invalid slot. + */ + if (soft_invalid || (mftb() & 0x1)) + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); + new_pte |= H_PAGE_HASHPTE; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +int __hash_page_64K(unsigned long ea, unsigned long access, + unsigned long vsid, pte_t *ptep, unsigned long trap, + unsigned long flags, int ssize) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Check if PTE has the cache-inhibit bit set + * If so, bail out and refault as a 4k page + */ + if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && + unlikely(pte_ci(pte))) + return 0; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. 
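The busy-bit loop above, repeated in each __hash_page_* variant, is a classic lock-free update. Below is a hedged user-space sketch of the same pattern with C11 atomics standing in for pte_xchg(); the bit positions are invented for illustration and do not match the real PTE layout:

#include <stdatomic.h>
#include <stdio.h>

#define H_PAGE_BUSY    (1UL << 0)	/* illustrative bit positions only */
#define _PAGE_ACCESSED (1UL << 1)
#define _PAGE_DIRTY    (1UL << 2)

/* Re-read the PTE, bail out if another CPU already holds the busy bit,
 * and publish the new value with one compare-and-swap, retrying on
 * contention; this mirrors the do/while pte_xchg() loop above. */
static int lock_pte(_Atomic unsigned long *ptep, int write_access)
{
	unsigned long old, new;

	do {
		old = atomic_load(ptep);
		if (old & H_PAGE_BUSY)
			return 0;	/* someone else is updating; retry the access */
		new = old | H_PAGE_BUSY | _PAGE_ACCESSED;
		if (write_access)
			new |= _PAGE_DIRTY;
	} while (!atomic_compare_exchange_strong(ptep, &old, new));

	return 1;	/* we own the PTE until BUSY is cleared */
}

int main(void)
{
	_Atomic unsigned long pte = 0;

	printf("locked=%d pte=%#lx\n", lock_pte(&pte, 1), (unsigned long)pte);
	return 0;
}
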
+ */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + unsigned long gslot; + + /* + * There MIGHT be an HPTE for this pte + */ + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, + MMU_PAGE_64K, ssize, + flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_64K, MMU_PAGE_64K, + ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_64K, + MMU_PAGE_64K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_64K, MMU_PAGE_64K, old_pte); + return -1; + } + + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c new file mode 100644 index 000000000000..440823797de7 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -0,0 +1,191 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include +#include + +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, unsigned long flags, + int ssize, unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + pmd_t pmd = READ_ONCE(*pmdp); + + old_pmd = pmd_val(pmd); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & H_PAGE_BUSY)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pmd))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pmd |= _PAGE_DIRTY; + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + + /* + * Make sure this is thp or devmap entry + */ + if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + return 0; + + rflags = htab_convert_pte_flags(new_pmd); + +#if 0 + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & ~HPAGE_PMD_MASK) >> shift; + BUG_ON(index >= PTE_FRAG_SIZE); + + vpn = hpt_vpn(ea, vsid, ssize); + hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* + * invalidate the old hpte entry if we have that mapped via 64K + * base page size. This is because demote_segment won't flush + * hash page table entries. + */ + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, + ssize, flags); + /* + * With THP, we also clear the slot information with + * respect to all the 64K hash pte mapping the 16MB + * page. They are all invalid now. This make sure we + * don't find the slot valid when we fault with 4k + * base page size. + * + */ + memset(hpte_slot_array, 0, PTE_FRAG_SIZE); + } + } + + valid = hpte_valid(hpte_slot_array, index); + if (valid) { + /* update the hpte bits */ + hash = hpt_hash(vpn, shift, ssize); + hidx = hpte_hash_index(hpte_slot_array, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, flags); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. 
+ */ + valid = 0; + hpte_slot_array[index] = 0; + } + } + + if (!valid) { + unsigned long hpte_group; + + hash = hpt_hash(vpn, shift, ssize); + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; + new_pmd |= H_PAGE_HASHPTE; + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + mark_hpte_slot_valid(hpte_slot_array, index, slot); + } + /* + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with + * base page size 4k. + */ + if (psize == MMU_PAGE_4K) + new_pmd |= H_PAGE_COMBO; + /* + * The hpte valid is stored in the pgtable whose address is in the + * second half of the PMD. Order this against clearing of the busy bit in + * huge pmd. + */ + smp_wmb(); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c new file mode 100644 index 000000000000..2d4e02aa15a3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth + */ + +#include +#include +#include +#include +#include +#include + +extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rlags, + unsigned long vflags, int psize, int ssize); + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, unsigned int shift, unsigned int mmu_psize) +{ + real_pte_t rpte; + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa; + long slot, offset; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. 
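Case 2 above hinges on software dirty-bit tracking: write permission in the HPTE is withheld until the first write fault sets _PAGE_DIRTY in the Linux PTE. A small illustrative sketch of that policy follows; the permission encoding is invented, not the real pp bits:

#include <stdio.h>

#define _PAGE_DIRTY (1UL << 2)	/* illustrative bit position */

/* Software DIRTY-bit scheme in miniature: the HPTE stays read-only until
 * the Linux PTE is marked dirty by a write fault, and only then is write
 * permission granted when the HPTE is (re)built. */
enum hpte_pp { PP_RO, PP_RW };

static enum hpte_pp pp_for_pte(unsigned long pte)
{
	return (pte & _PAGE_DIRTY) ? PP_RW : PP_RO;
}

int main(void)
{
	unsigned long pte = 0;

	printf("before write fault: %s\n",
	       pp_for_pte(pte) == PP_RO ? "read-only" : "read-write");
	pte |= _PAGE_DIRTY;	/* the write-fault path marks it dirty */
	printf("after write fault:  %s\n",
	       pp_for_pte(pte) == PP_RO ? "read-only" : "read-write");
	return 0;
}
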
+ */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + + rflags = htab_convert_pte_flags(new_pte); + if (unlikely(mmu_psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; + rpte = __real_pte(__pte(old_pte), ptep, offset); + + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long gslot; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c new file mode 100644 index 000000000000..aaa28fd918fe --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -0,0 +1,884 @@ +/* + * native hashtable management. + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG_LOW + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#ifdef __BIG_ENDIAN__ +#define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif + +DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + asm volatile("ptesync": : :"memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and any caching of partition table + * entries. Then flush the remaining sets of the TLB. Hash mode uses + * partition scoped TLB translations. + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + for (set = 1; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + /* + * Now invalidate the process table cache. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + asm volatile("ptesync": : :"memory"); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + +static inline unsigned long ___tlbie(unsigned long vpn, int psize, + int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* + * We need 14 to 65 bits of va for a tlbie of 4K page + * With vpn we ignore the lower VPN_SHIFT bits already. + * And top two bits are already ignored because we can + * only accommodate 76 bits in a 64 bit vpn with a VPN_SHIFT + * of 12. + */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64bit va, non SLS segment + * Older versions of the architecture (2.02 and earlier) require the + * masking of the top 16 bits.
+ */ + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after (52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; + asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn covers up to 65 bits of va (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); /* AVAL */ + va |= 1; /* L */ + asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } + return va; +} + +static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + /* Need the extra ptesync to ensure we don't reorder tlbie */ + asm volatile("ptesync": : :"memory"); + ___tlbie(vpn, psize, apsize, ssize); + } +} + +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long rb; + + rb = ___tlbie(vpn, psize, apsize, ssize); + trace_tlbie(0, 0, rb, 0, 0, 0, 0); +} + +static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* VPN_SHIFT can be at most 12 */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64 bit va, non SLS segment + * Older versions of the architecture (2.02 and earlier) require the + * masking of the top 16 bits. + */ + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after(52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; + asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn covers up to 65 bits of va (0...65) and we need + * 58..64 bits of va.
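The two switch statements above pack the same fields into the tlbie/tlbiel RB operand. The following stand-alone sketch isolates the large-page packing, with the shift values copied from the comments and VPN_SHIFT taken as 12 as stated earlier; it is an illustration, not a drop-in kernel helper:

#include <stdio.h>

/* Mask the VA down to the page boundary, then fold in the page-size
 * encoding (penc), the segment size, the AVAL bits and the L bit, in the
 * same positions the code above uses. */
static unsigned long pack_tlbie_rb(unsigned long vpn, unsigned long page_shift,
				   unsigned long penc, unsigned long ssize)
{
	const int VPN_SHIFT = 12;	/* assumption: matches the code above */
	unsigned long va = vpn << VPN_SHIFT;

	va &= ~((1UL << page_shift) - 1);	/* clear bits below the page size */
	va |= penc << 12;			/* page-size encoding */
	va |= ssize << 8;			/* segment size */
	va |= vpn & 0xfe;			/* AVAL bits */
	va |= 1;				/* L: large page */
	return va;
}

int main(void)
{
	/* example: a 16M page (shift 24), penc 0, 1T segment encoding 1 */
	printf("rb=%#lx\n", pack_tlbie_rb(0x123456UL, 24, 0, 1));
	return 0;
}
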
+ */ + va |= (vpn & 0xfe); + va |= 1; /* L */ + asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } + trace_tlbie(0, 1, va, 0, 0, 0, 0); + +} + +static inline void tlbie(unsigned long vpn, int psize, int apsize, + int ssize, int local) +{ + unsigned int use_local; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + raw_spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(vpn, psize, apsize, ssize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(vpn, psize, apsize, ssize); + fixup_tlbie(vpn, psize, apsize, ssize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + raw_spin_unlock(&native_tlbie_lock); +} + +static inline void native_lock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + while (1) { + if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) + break; + spin_begin(); + while(test_bit(HPTE_LOCK_BIT, word)) + spin_cpu_relax(); + spin_end(); + } +} + +static inline void native_unlock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + clear_bit_unlock(HPTE_LOCK_BIT, word); +} + +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int apsize, int ssize) +{ + struct hash_pte *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); + } + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + + hptep->r = cpu_to_be64(hpte_r); + /* Guarantee the second dword is visible before the valid bit */ + eieio(); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = cpu_to_be64(hpte_v); + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + struct hash_pte *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + DBG_LOW(" remove(group=%lx)\n", hpte_group); + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = be64_to_cpu(hptep->v); + + if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long vpn, int bpsize, + int apsize, int ssize, unsigned long flags) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0, local = 0; + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); + + hpte_v = hpte_get_old_v(hptep); + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". 
+ */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + ret = -1; + } else { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID))) { + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | + HPTE_R_C))); + } + native_unlock_hpte(hptep); + } + + if (flags & HPTE_LOCAL_UPDATE) + local = 1; + /* + * Ensure it is out of the tlb too if it is not a nohpte fault + */ + if (!(flags & HPTE_NOHPTE_UPDATE)) + tlbie(vpn, bpsize, apsize, ssize, local); + + return ret; +} + +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + struct hash_pte *hptep; + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* Bolted mappings are only ever in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + + hptep = htab_address + slot; + hpte_v = hpte_get_old_v(hptep); + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); +} + +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. 
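Both bolted-entry helpers lean on native_hpte_find(), which only ever probes the primary group. A toy sketch of the primary/secondary PTEG arithmetic used throughout this file follows; the mask value is illustrative and hpt_hash() is not modelled:

#include <stdio.h>

#define HPTES_PER_GROUP 8

/* The primary group starts at (hash & mask) * 8 and the secondary at
 * (~hash & mask) * 8; bolted entries are only ever inserted in the
 * primary group, so lookups scan just those eight slots. */
static unsigned long pteg_base(unsigned long hash, unsigned long mask,
			       int secondary)
{
	if (secondary)
		hash = ~hash;
	return (hash & mask) * HPTES_PER_GROUP;
}

int main(void)
{
	unsigned long mask = (1UL << 11) - 1;	/* small example hash table */
	unsigned long hash = 0xdeadbeefUL;

	printf("primary group base:   %lu\n", pteg_base(hash, mask, 0));
	printf("secondary group base: %lu\n", pteg_base(hash, mask, 1));
	return 0;
}
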
+ */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + return 0; +} + + +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + + local_irq_save(flags); + + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + else + native_unlock_hpte(hptep); + } + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(flags); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + int i; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + /* Even if we miss, we need to invalidate the TLB */ + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* recheck with locks held */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* + * Invalidate the hpte. 
NOTE: this also unlocks it + */ + + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } + /* + * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here + */ + tlbie(vpn, psize, actual_psize, ssize, local); + } + local_irq_restore(flags); +} +#else +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + WARN(1, "%s called without THP support\n", __func__); +} +#endif + +static void hpte_decode(struct hash_pte *hpte, unsigned long slot, + int *psize, int *apsize, int *ssize, unsigned long *vpn) +{ + unsigned long avpn, pteg, vpi; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); + unsigned long vsid, seg_off; + int size, a_size, shift; + /* Look at the 8 bit LP value */ + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } + if (!(hpte_v & HPTE_V_LARGE)) { + size = MMU_PAGE_4K; + a_size = MMU_PAGE_4K; + } else { + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; + } + /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; + + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { + vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + default: + *vpn = size = 0; + } + *psize = size; + *apsize = a_size; +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * This must be called with interrupts disabled. + * + * Taking the native_tlbie_lock is unsafe here due to the possibility of + * lockdep being on. On pre POWER5 hardware, not taking the lock could + * cause deadlock. POWER5 and newer not taking the lock is fine. This only + * gets called during boot before secondary CPUs have come up and during + * crashdump and all bets are off anyway. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * although there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long vpn = 0; + unsigned long slot, slots; + struct hash_pte *hptep = htab_address; + unsigned long hpte_v; + unsigned long pteg_count; + int psize, apsize, ssize; + + pteg_count = htab_hash_mask + 1; + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? 
and for crash dump, we probably + * don't want to wait for a maybe bad cpu. + */ + hpte_v = be64_to_cpu(hptep->v); + + /* + * Call __tlbie() here rather than tlbie() since we can't take the + * native_tlbie_lock. + */ + if (hpte_v & HPTE_V_VALID) { + hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); + hptep->v = 0; + ___tlbie(vpn, psize, apsize, ssize); + } + } + + asm volatile("eieio; tlbsync; ptesync":::"memory"); +} + +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long vpn = 0; + unsigned long hash, index, hidx, shift, slot; + struct hash_pte *hptep; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + unsigned long psize = batch->psize; + int ssize = batch->ssize; + int i; + unsigned int use_local; + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); + + local_irq_save(flags); + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + continue; + /* lock and try again */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + + } pte_iterate_hashed_end(); + } + + if (use_local) { + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + /* + * Just do one more with the last used values. 
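The batched path above ends with one extra invalidate of the last entry as the POWER9 tlbie workaround. Here is a compact sketch of that batch-then-fixup shape, with the tlbie sequence reduced to a stub:

#include <stdio.h>

static void tlbie_stub(unsigned long vpn)
{
	printf("tlbie vpn=%#lx\n", vpn);	/* stand-in for the real instruction */
}

/* Issue one invalidate per page in the batch, then repeat the final one
 * when the fixup is needed: "one more with the last used values". */
static void flush_batch(const unsigned long *vpns, int n, int need_fixup)
{
	for (int i = 0; i < n; i++)
		tlbie_stub(vpns[i]);
	if (need_fixup && n > 0)
		tlbie_stub(vpns[n - 1]);
}

int main(void)
{
	unsigned long batch[] = { 0x1000, 0x2000, 0x3000 };

	flush_batch(batch, 3, 1);
	return 0;
}
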
+ */ + fixup_tlbie(vpn, psize, psize, ssize); + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +void __init hpte_init_native(void) +{ + mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; + mmu_hash_ops.hpte_insert = native_hpte_insert; + mmu_hash_ops.hpte_remove = native_hpte_remove; + mmu_hash_ops.hpte_clear_all = native_hpte_clear; + mmu_hash_ops.flush_hash_range = native_flush_hash_range; + mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; +} diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c new file mode 100644 index 000000000000..1fd025dba4a3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -0,0 +1,463 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS +#include + +#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) +#warning Limited user VSID range means pagetable space is wasted +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. This saves memory by not allocating struct pages for PFNs + * which are not valid. 
+ * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. 
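The diagrams above boil down to one invariant: struct page address and PFN are related by constant pointer arithmetic against the vmemmap base, whether or not backing memory was allocated for a given PFN's entry. A stand-alone user-space sketch, with a trivial stand-in struct page and helper names mirroring the kernel macros:

#include <stdio.h>

struct page { unsigned long flags; /* ... */ };

static struct page *vmemmap;	/* base of the virtual struct page array */

static struct page *pfn_to_page(unsigned long pfn) { return vmemmap + pfn; }
static unsigned long page_to_pfn(struct page *pg)  { return pg - vmemmap; }

int main(void)
{
	static struct page fake[16];	/* stand-in for the mapped region */

	vmemmap = fake;
	printf("pfn 5 -> page %p -> pfn %lu\n",
	       (void *)pfn_to_page(5), page_to_pfn(pfn_to_page(5)));
	return 0;
}
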
+ * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. 
Such a hash page
+	 * can result in adding new HPTE entries for normal subpages.
+	 * That means we could be modifying the page content as we
+	 * copy them to a huge page. So wait for parallel hash_page
+	 * to finish before invalidating HPTE entries. We can do this
+	 * by sending an IPI to all the cpus and executing a dummy
+	 * function there.
+	 */
+	serialize_against_pte_lookup(vma->vm_mm);
+	/*
+	 * Now invalidate the hpte entries in the range
+	 * covered by pmd. This makes sure we take a
+	 * fault and will find the pmd as none, which will
+	 * result in a major fault which takes mmap_sem and
+	 * hence waits for collapse to complete. Without this
+	 * the __collapse_huge_page_copy can result in copying
+	 * the old content.
+	 */
+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+	return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				      pgtable_t pgtable)
+{
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+	/*
+	 * We store the pgtable in the second half of the PMD
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	*pgtable_slot = pgtable;
+	/*
+	 * Expose the deposited pgtable to other CPUs before we set
+	 * the hugepage PTE at the pmd level. The hash fault code
+	 * looks at the deposited pgtable to store hash index values.
+	 */
+	smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_t pgtable;
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Once we withdraw, mark the entry NULL.
+	 */
+	*pgtable_slot = NULL;
+	/*
+	 * We store HPTE information in the deposited PTE fragment.
+	 * Zero out the content on withdraw.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return pgtable;
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+			    pmd_t *pmdp, unsigned long old_pmd)
+{
+	int ssize;
+	unsigned int psize;
+	unsigned long vsid;
+	unsigned long flags = 0;
+
+	/* get the base page size, vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+	psize = get_slice_psize(mm, addr);
+	BUG_ON(psize == MMU_PAGE_16M);
+#endif
+	if (old_pmd & H_PAGE_COMBO)
+		psize = MMU_PAGE_4K;
+	else
+		psize = MMU_PAGE_64K;
+
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_user_vsid(&mm->context, addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+
+	if (mm_is_thread_local(mm))
+		flags |= HPTE_LOCAL_UPDATE;
+
+	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+				    unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old_pmd;
+	pgtable_t pgtable;
+	unsigned long old;
+	pgtable_t *pgtable_slot;
+
+	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+	old_pmd = __pmd(old);
+	/*
+	 * We have pmd == none and we are holding page_table_lock.
+	 * So we can safely go and clear the pgtable hash
+	 * index info.
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Let's zero out the old valid and hash index details;
+	 * the hash fault code looks at them.
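+	 *
+	 * (Illustrative layout note, following the deposit/withdraw
+	 * helpers above: the fragment zeroed below is the one stored at
+	 *
+	 *	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	 *
+	 * i.e. in the second half of the PMD page, where it doubles as
+	 * the per-subpage hash slot array.)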
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	/*
+	 * Serialize against find_current_mm_pte variants which do lock-less
+	 * lookups in page tables with local interrupts disabled. For huge pages
+	 * it casts pmd_t to pte_t. Since format of pte_t is different from
+	 * pmd_t we want to prevent transit from pmd pointing to page table
+	 * to pmd pointing to huge page (and back) while interrupts are disabled.
+	 * We clear pmd to possibly replace it with page table pointer in
+	 * different code paths. So make sure we wait for the parallel
+	 * find_current_mm_pte to finish.
+	 */
+	serialize_against_pte_lookup(mm);
+	return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return 0;
+	/*
+	 * We support THP only if PMD_SIZE is 16MB.
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+		return 0;
+	/*
+	 * We need to make sure that we support 16MB hugepage in a segment
+	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+	 * of 64K.
+	 */
+	/*
+	 * If we have 64K HPTE, we will be using that by default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+		return 0;
+	/*
+	 * Ok, we only have 4K HPTE
+	 */
+	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+		return 0;
+
+	return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+				      unsigned long newpp)
+{
+	unsigned long idx;
+	unsigned int step, shift;
+
+	shift = mmu_psize_defs[mmu_linear_psize].shift;
+	step = 1 << shift;
+
+	start = ALIGN_DOWN(start, step);
+	end = ALIGN(end, step); // aligns up
+
+	if (start >= end)
+		return false;
+
+	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+		 start, end, newpp, step);
+
+	for (idx = start; idx < end; idx += step)
+		/* Not sure if we can do much with the return value */
+		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+						 mmu_kernel_ssize);
+
+	return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+	unsigned long start, end;
+
+	start = (unsigned long)_stext;
+	end = (unsigned long)__init_begin;
+
+	WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+}
+
+void hash__mark_initmem_nx(void)
+{
+	unsigned long start, end, pp;
+
+	start = (unsigned long)__init_begin;
+	end = (unsigned long)__init_end;
+
+	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+
+	WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
new file mode 100644
index 000000000000..d4f0101447b1
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -0,0 +1,265 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ * Derived from arch/ppc64/mm/init.c:
+ *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen
+ *      Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); + +/* + * A linux PTE was changed and the corresponding hash table entry + * neesd to be flushed. This function will either perform the flush + * immediately or will batch it up if the current CPU has an active + * batch on it. + */ +void hpte_need_flush(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long pte, int huge) +{ + unsigned long vpn; + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + unsigned long vsid; + unsigned int psize; + int ssize; + real_pte_t rpte; + int i, offset; + + i = batch->index; + + /* + * Get page size (maybe move back to caller). + * + * NOTE: when using special 64K mappings in 4K environment like + * for SPEs, we obtain the page size from the slice, which thus + * must still exist (and thus the VMA not reused) at the time + * of this call + */ + if (huge) { +#ifdef CONFIG_HUGETLB_PAGE + psize = get_slice_psize(mm, addr); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); + if (unlikely(psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; +#else + BUG(); + psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ +#endif + } else { + psize = pte_pagesize_index(mm, addr, pte); + /* + * Mask the address for the standard page size. If we + * have a 64k page kernel, but the hardware does not + * support 64k pages, this might be different from the + * hardware page size encoded in the slice table. + */ + addr &= PAGE_MASK; + offset = PTRS_PER_PTE; + } + + + /* Build full vaddr */ + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_user_vsid(&mm->context, addr, ssize); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + WARN_ON(vsid == 0); + vpn = hpt_vpn(addr, vsid, ssize); + rpte = __real_pte(__pte(pte), ptep, offset); + + /* + * Check if we have an active batch on this CPU. If not, just + * flush now and return. + */ + if (!batch->active) { + flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); + put_cpu_var(ppc64_tlb_batch); + return; + } + + /* + * This can happen when we are in the middle of a TLB batch and + * we encounter memory pressure (eg copy_page_range when it tries + * to allocate a new pte). If we have to reclaim memory and end + * up scanning and resetting referenced bits then our batch context + * will change mid stream. + * + * We also need to ensure only one page size is present in a given + * batch + */ + if (i != 0 && (mm != batch->mm || batch->psize != psize || + batch->ssize != ssize)) { + __flush_tlb_pending(batch); + i = 0; + } + if (i == 0) { + batch->mm = mm; + batch->psize = psize; + batch->ssize = ssize; + } + batch->pte[i] = rpte; + batch->vpn[i] = vpn; + batch->index = ++i; + if (i >= PPC64_TLB_BATCH_NR) + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); +} + +/* + * This function is called when terminating an mmu batch or when a batch + * is full. It will perform the flush of all the entries currently stored + * in a batch. + * + * Must be called from within some kind of spinlock/non-preempt region... 
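+ *
+ * A minimal illustrative caller, modelled on __flush_hash_table_range()
+ * below (same locking context; ptep is assumed to walk valid PTEs):
+ *
+ *	local_irq_save(flags);
+ *	arch_enter_lazy_mmu_mode();	(marks the per-cpu batch active)
+ *	for (; start < end; start += PAGE_SIZE)
+ *		hpte_need_flush(mm, start, ptep, pte_val(*ptep), 0);
+ *	arch_leave_lazy_mmu_mode();	(drains via __flush_tlb_pending())
+ *	local_irq_restore(flags);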
+ */ +void __flush_tlb_pending(struct ppc64_tlb_batch *batch) +{ + int i, local; + + i = batch->index; + local = mm_is_thread_local(batch->mm); + if (i == 1) + flush_hash_page(batch->vpn[0], batch->pte[0], + batch->psize, batch->ssize, local); + else + flush_hash_range(i, local); + batch->index = 0; +} + +void hash__tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); + + /* + * If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + put_cpu_var(ppc64_tlb_batch); +} + +/** + * __flush_hash_table_range - Flush all HPTEs for a given address range + * from the hash table (and the TLB). But keeps + * the linux PTEs intact. + * + * @mm : mm_struct of the target address space (generally init_mm) + * @start : starting address + * @end : ending address (not included in the flush) + * + * This function is mostly to be used by some IO hotplug code in order + * to remove all hash entries from a given address range used to map IO + * space on a removed PCI-PCI bidge without tearing down the full mapping + * since 64K pages may overlap with other bridges when using 64K pages + * with 4K HW pages on IO space. + * + * Because of that usage pattern, it is implemented for small size rather + * than speed. + */ +void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + bool is_thp; + int hugepage_shift; + unsigned long flags; + + start = _ALIGN_DOWN(start, PAGE_SIZE); + end = _ALIGN_UP(end, PAGE_SIZE); + + BUG_ON(!mm->pgd); + + /* + * Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + for (; start < end; start += PAGE_SIZE) { + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); + unsigned long pte; + + if (ptep == NULL) + continue; + pte = pte_val(*ptep); + if (is_thp) + trace_hugepage_invalidate(start, pte); + if (!(pte & H_PAGE_HASHPTE)) + continue; + if (unlikely(is_thp)) + hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); + else + hpte_need_flush(mm, start, ptep, pte, hugepage_shift); + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} + +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + pte_t *start_pte; + unsigned long flags; + + addr = _ALIGN_DOWN(addr, PMD_SIZE); + /* + * Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. 
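+	 *
+	 * (Scale illustration, assuming the 64K base page configuration:
+	 * PTRS_PER_PTE == 256, so the loop below walks one 16M PMD's
+	 * worth of PTEs and queues a hash flush for every entry that has
+	 * H_PAGE_HASHPTE set.)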
+ */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & H_PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c new file mode 100644 index 000000000000..b21a81d42f15 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -0,0 +1,1946 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG +#undef DEBUG_LOW + +#define pr_fmt(fmt) "hash-mmu: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. + * + */ + +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); +int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_SPINLOCK(linear_map_hash_lock); +#endif /* CONFIG_DEBUG_PAGEALLOC */ +struct mmu_hash_ops mmu_hash_ops; +EXPORT_SYMBOL(mmu_hash_ops); + +/* + * These are definitions of page sizes arrays to be used when none + * is provided by the firmware. 
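+ *
+ * (Reading guide, as an illustration: penc[] is indexed by the actual
+ * page size used inside a segment of the given base page size. In the
+ * fallback table below, penc[MMU_PAGE_4K] == 0 means "a 4K page in a
+ * 4K segment encodes as LP value 0", while -1 marks an unsupported
+ * base/actual combination.)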
+ */ + +/* + * Fallback (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* + * POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +/* + * 'R' and 'C' update notes: + * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* + * create writeable HPTEs without C set, because the hcall H_PROTECT + * that we use in that case will not update C + * - The above is however not a problem, because we also don't do that + * fancy "no flush" variant of eviction and we use H_REMOVE which will + * do the right thing and thus we don't have the race I described earlier + * + * - Under bare metal, we do have the race, so we need R and C set + * - We make sure R is always set and never lost + * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping + */ +unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = 0; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + /* + * PPP bits: + * Linux uses slb key 0 for kernel and 1 for user. + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). + */ + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) { + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + rflags |= (HPTE_R_PP0 | 0x2); + else + rflags |= 0x3; + } + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) + rflags |= 0x1; + } + /* + * We can't allow hardware to update hpte bits. 
Hence always + * set 'R' bit and set 'C' if it is a write fault + */ + rflags |= HPTE_R_R; + + if (pteflags & _PAGE_DIRTY) + rflags |= HPTE_R_C; + /* + * Add in WIG bits + */ + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) + rflags |= HPTE_R_I; + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; + + rflags |= pte_to_hpte_pkey_bits(pteflags); + return rflags; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; + + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(vpn, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!mmu_hash_ops.hpte_insert); + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + + if (ret < 0) + break; + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled() && + (paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + } + return ret < 0 ? 
ret : 0; +} + +int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + int rc; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!mmu_hash_ops.hpte_removebolted) + return -ENODEV; + + for (vaddr = vstart; vaddr < vend; vaddr += step) { + rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + if (rc == -ENOENT) { + ret = -ENOENT; + continue; + } + if (rc < 0) + return rc; + } + + return ret; +} + +static bool disable_1tb_segments = false; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (be32_to_cpu(prop[0]) == 40) { + DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 0; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + 
"shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } + } + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. + */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. + */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S) && + firmware_has_feature(FW_FEATURE_SPLPAR); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + +static void __init htab_scan_page_sizes(void) +{ + int rc; + + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults, + sizeof(mmu_psize_defaults)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + } + +#ifdef CONFIG_HUGETLB_PAGE + if (!hugetlb_disabled) { + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + } +#endif /* CONFIG_HUGETLB_PAGE */ +} + +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. 
Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1 || !mmu_psize_defs[ap].shift) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + +static void __init htab_init_page_sizes(void) +{ + init_hpte_page_sizes(); + + if (!debug_pagealloc_enabled()) { + /* + * Pick a size for the linear mapping. Currently, we only + * support 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + } + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. + * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { + /* + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. 
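+			 *
+			 * (Sketch of the resulting policy, under the
+			 * CONFIG_PPC_64K_PAGES assumptions above: if an
+			 * HEA is possible, mmu_io_psize stays at
+			 * MMU_PAGE_4K while user and vmalloc mappings
+			 * keep using 64K pages.)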
+ */ + if (!might_have_hea()) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* + * We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + printk(KERN_DEBUG "Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = be32_to_cpu(prop[1]); + return 1; + } + return 0; +} + +unsigned htab_shift_for_mem_size(unsigned long mem_size) +{ + unsigned memshift = __ilog2(mem_size); + unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned pteg_shift; + + /* round mem_size up to next power of 2 */ + if ((1UL << memshift) < mem_size) + memshift += 1; + + /* aim for 2 pages / pteg */ + pteg_shift = memshift - (pshift + 1); + + /* + * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab + * size permitted by the architecture. + */ + return max(pteg_shift + 7, 18U); +} + +static unsigned long __init htab_get_table_size(void) +{ + /* + * If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. 
If it's not there either, we
+ * calculate it now based on the total RAM size
+ */
+	if (ppc64_pft_size == 0)
+		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
+	if (ppc64_pft_size)
+		return 1UL << ppc64_pft_size;
+
+	return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+	unsigned target_hpt_shift;
+
+	if (!mmu_hash_ops.resize_hpt)
+		return 0;
+
+	target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+	/*
+	 * To avoid lots of HPT resizes if memory size is fluctuating
+	 * across a boundary, we deliberately have some hysteresis
+	 * here: we immediately increase the HPT size if the target
+	 * shift exceeds the current shift, but we won't attempt to
+	 * reduce unless the target shift is at least 2 below the
+	 * current shift
+	 */
+	if (target_hpt_shift > ppc64_pft_size ||
+	    target_hpt_shift < ppc64_pft_size - 1)
+		return mmu_hash_ops.resize_hpt(target_hpt_shift);
+
+	return 0;
+}
+
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+	int rc;
+
+	if (end >= H_VMALLOC_START) {
+		pr_warn("Outside the supported range\n");
+		return -1;
+	}
+
+	rc = htab_bolt_mapping(start, end, __pa(start),
+			       pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+			       mmu_kernel_ssize);
+
+	if (rc < 0) {
+		int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
+					      mmu_kernel_ssize);
+		BUG_ON(rc2 && (rc2 != -ENOENT));
+	}
+	return rc;
+}
+
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
+{
+	int rc = htab_remove_mapping(start, end, mmu_linear_psize,
+				     mmu_kernel_ssize);
+	WARN_ON(rc < 0);
+	return rc;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+					     unsigned long htab_size)
+{
+	mmu_partition_table_init();
+
+	/*
+	 * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+	 * For now, UPRT is 0 and we have no segment table.
+	 */
+	htab_size = __ilog2(htab_size) - 18;
+	mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
+	pr_info("Partition table %p\n", partition_tb);
+}
+
+static void __init htab_initialize(void)
+{
+	unsigned long table;
+	unsigned long pteg_count;
+	unsigned long prot;
+	unsigned long base = 0, size = 0;
+	struct memblock_region *reg;
+
+	DBG(" -> htab_initialize()\n");
+
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+		mmu_kernel_ssize = MMU_SEGSIZE_1T;
+		mmu_highuser_ssize = MMU_SEGSIZE_1T;
+		printk(KERN_INFO "Using 1TB segments\n");
+	}
+
+	/*
+	 * Calculate the required size of the htab. We want the number of
+	 * PTEGs to equal one half the number of real pages.
+	 */
+	htab_size_bytes = htab_get_table_size();
+	pteg_count = htab_size_bytes >> 7;
+
+	htab_hash_mask = pteg_count - 1;
+
+	if (firmware_has_feature(FW_FEATURE_LPAR) ||
+	    firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+		/* Using a hypervisor which owns the htab */
+		htab_address = NULL;
+		_SDR1 = 0;
+		/*
+		 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+		 * to inform the hypervisor that we wish to use the HPT.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			register_process_table(0, 0, 0);
+#ifdef CONFIG_FA_DUMP
+		/*
+		 * If firmware assisted dump is active, firmware preserves
+		 * the contents of htab along with entire partition memory.
+		 * Clear the htab if firmware assisted dump is active so
+		 * that we don't end up using old mappings.
+ */ + if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +#endif + } else { + unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; + +#ifdef CONFIG_PPC_CELL + /* + * Cell may require the hash table down low when using the + * Axon IOMMU in order to fit the dynamic region over it, see + * comments in cell/iommu.c + */ + if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { + limit = 0x80000000; + pr_info("Hash table forced below 2G for Axon IOMMU\n"); + } +#endif /* CONFIG_PPC_CELL */ + + table = memblock_phys_alloc_range(htab_size_bytes, + htab_size_bytes, + 0, limit); + if (!table) + panic("ERROR: Failed to allocate %pa bytes below %pa\n", + &htab_size_bytes, &limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = __va(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(htab_size_bytes) - 18; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, htab_size_bytes); + } + + prot = pgprot_val(PAGE_KERNEL); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); + } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + + /* create bolted the linear mapping in the hash table */ + for_each_memblock(memory, reg) { + base = (unsigned long)__va(reg->base); + size = reg->size; + + DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", + base, size, prot); + + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + } + + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + +struct hash_mm_context init_hash_mm_context; +void __init hash__early_init_mmu(void) +{ +#ifndef CONFIG_PPC_64K_PAGES + /* + * We have code in __hash_page_4K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. 
For that code to work H_PAGE_F_SECOND and + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here + * with a BUILD_BUG_ON(). + */ + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); +#endif /* CONFIG_PPC_64K_PAGES */ + + htab_init_page_sizes(); + + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = H_PMD_FRAG_NR; + __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pud_cache_index = H_PUD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; + + __kernel_virt_start = H_KERN_VIRT_START; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; + vmemmap = (struct page *)H_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + + /* Select appropriate backend */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) + ps3_early_mm_init(); + else if (firmware_has_feature(FW_FEATURE_LPAR)) + hpte_init_pseries(); + else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + hpte_init_native(); + + if (!mmu_hash_ops.hpte_insert) + panic("hash__early_init_mmu: No MMU hash ops defined!\n"); + + /* + * Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before SLB initialization as this is + * currently where the page size encoding is obtained. 
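+	 *
+	 * (Worked sizing illustration: with 2GB of RAM and a 4K base
+	 * page size, htab_get_table_size() falls through to
+	 * htab_shift_for_mem_size(), where memshift == 31 and
+	 * pshift == 12, so pteg_shift == 31 - 13 == 18 and the result
+	 * is max(18 + 7, 18) == 25: a 32MB HPT holding 2^18 PTEGs of
+	 * 128 bytes each, i.e. the intended two pages per PTEG.)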
+ */ + htab_initialize(); + + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + + pr_info("Initializing hash mmu with SLB\n"); + /* Initialize SLB management */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +#ifdef CONFIG_SMP +void hash__early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } + /* Initialize SLB */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} +#endif /* CONFIG_SMP */ + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HPTE_R_N; + } + return pp; +} + +#ifdef CONFIG_PPC_MM_SLICES +static unsigned int get_paca_psize(unsigned long addr) +{ + unsigned char *psizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + psizes = get_paca()->mm_ctx_low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = get_paca()->mm_ctx_high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->mm_ctx_user_psize; +} +#endif + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); + copro_flush_all_slbs(mm); + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_RWX: no access. + */ +static int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (!spt) + return 0; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000UL) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + return 0; +} +#endif + +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + +/* + * Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page_mm(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap, + unsigned long flags) +{ + bool is_thp; + enum ctx_state prev_state = exception_enter(); + pgd_t *pgdir; + unsigned long vsid; + pte_t *ptep; + unsigned hugeshift; + int rc, user_region = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + trace_hash_fault(ea, access, trap); + + /* Get region & vsid */ + switch (get_region_id(ea)) { + case USER_REGION_ID: + user_region = 1; + if (! mm) { + DBG_LOW(" user region with no mm !\n"); + rc = 1; + goto bail; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + break; + case VMALLOC_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + break; + default: + /* + * Not a valid range + * Send the problem up to do_page_fault() + */ + rc = 1; + goto bail; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) { + rc = 1; + goto bail; + } + + /* Check CPU locality */ + if (user_region && mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + +#ifndef CONFIG_PPC_64K_PAGES + /* + * If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. 
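+	 *
+	 * (Illustration: for a special mapping with psize MMU_PAGE_16M,
+	 * shift == 24, so the mask below clears the low 24 bits and ea
+	 * is rounded down to the start of its 16M page.)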
+ */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + rc = 1; + goto bail; + } + + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + + /* + * Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (!check_pte_access(access, pte_val(*ptep))) { + DBG_LOW(" no access !\n"); + rc = 1; + goto bail; + } + + if (hugeshift) { + if (is_thp) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, flags, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + flags, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* + * If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + printk(KERN_ALERT "Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; + copro_flush_all_slbs(mm); + } + } + +#endif /* CONFIG_PPC_64K_PAGES */ + + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + { + int spp = subpage_protection(mm, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + flags, ssize, spp); + } + + /* + * Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + +bail: + exception_exit(prev_state); + return rc; +} +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap, + unsigned long dsisr) +{ + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + return hash_page_mm(mm, ea, access, trap, flags); 
+} +EXPORT_SYMBOL_GPL(hash_page); + +int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, + unsigned long dsisr) +{ + unsigned long access = _PAGE_PRESENT | _PAGE_READ; + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); + + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + if (dsisr & DSISR_ISSTORE) + access |= _PAGE_WRITE; + /* + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. + */ + access |= _PAGE_PRIVILEGED; + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) + access &= ~_PAGE_PRIVILEGED; + + if (trap == 0x400) + access |= _PAGE_EXEC; + + return hash_page_mm(mm, ea, access, trap, flags); +} + +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) +{ + int hugepage_shift; + unsigned long vsid; + pgd_t *pgdir; + pte_t *ptep; + unsigned long flags; + int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); + + BUG_ON(get_region_id(ea) != USER_REGION_ID); + + if (!should_hash_preload(mm, ea)) + return; + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); + if (!ptep) + goto out_exit; + + WARN_ON(hugepage_shift); +#ifdef CONFIG_PPC_64K_PAGES + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) + goto out_exit; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Is that local to this CPU ? 
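+	 *
+	 * (If it is, HPTE_LOCAL_UPDATE is set below; as an illustrative
+	 * consequence, the low-level code may invalidate with the
+	 * CPU-local tlbiel instead of a broadcast tlbie.)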
*/ + if (mm_is_thread_local(mm)) + update_flags |= HPTE_LOCAL_UPDATE; + + /* Hash it in */ +#ifdef CONFIG_PPC_64K_PAGES + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + update_flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, + ssize, subpage_protection(mm, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), + pte_val(*ptep)); +out_exit: + local_irq_restore(flags); +} + +#ifdef CONFIG_PPC_MEM_KEYS +/* + * Return the protection key associated with the given address and the + * mm_struct. + */ +u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) +{ + pte_t *ptep; + u16 pkey = 0; + unsigned long flags; + + if (!mm || !mm->pgd) + return 0; + + local_irq_save(flags); + ptep = find_linux_pte(mm->pgd, address, NULL, NULL); + if (ptep) + pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); + local_irq_restore(flags); + + return pkey; +} +#endif /* CONFIG_PPC_MEM_KEYS */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline void tm_flush_hash_page(int local) +{ + /* + * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a + * page back to a block device w/PIO could pick up transactional data + * (bad!) so we force an abort here. Before the sync the page will be + * made read-only, which will flush_hash_page. BIG ISSUE here: if the + * kernel uses a page from userspace without unmapping it first, it may + * see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +} +#else +static inline void tm_flush_hash_page(int local) +{ +} +#endif + +/* + * Return the global hash slot, corresponding to the given PTE, which contains + * the HPTE. + */ +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, + int ssize, real_pte_t rpte, unsigned int subpg_index) +{ + unsigned long hash, gslot, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(rpte, subpg_index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + gslot += hidx & _PTEIDX_GROUP_IX; + return gslot; +} + +/* + * WARNING: This is called from hash_low_64.S, if you change this prototype, + * do not forget to update the assembly call site ! 
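+ *
+ * (Slot arithmetic recap, illustrative: pte_get_hash_gslot() above
+ * flips hash to ~hash when _PTEIDX_SECONDARY is set, then forms
+ * (hash & htab_hash_mask) * HPTES_PER_GROUP + (hidx & _PTEIDX_GROUP_IX),
+ * i.e. one of the 8 slots of the selected PTEG.)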
+
+/*
+ * WARNING: This is called from hash_low_64.S, if you change this prototype,
+ *          do not forget to update the assembly call site!
+ */
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
+		     unsigned long flags)
+{
+	unsigned long index, shift, gslot;
+	int local = flags & HPTE_LOCAL_UPDATE;
+
+	DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+	pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+		gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
+		DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
+		/*
+		 * We use the same base page size and actual psize, because
+		 * we don't use these functions for hugepages.
+		 */
+		mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
+					     ssize, local);
+	} pte_iterate_hashed_end();
+
+	tm_flush_hash_page(local);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
+			 pmd_t *pmdp, unsigned int psize, int ssize,
+			 unsigned long flags)
+{
+	int i, max_hpte_count, valid;
+	unsigned long s_addr;
+	unsigned char *hpte_slot_array;
+	unsigned long hidx, shift, vpn, hash, slot;
+	int local = flags & HPTE_LOCAL_UPDATE;
+
+	s_addr = addr & HPAGE_PMD_MASK;
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+	/*
+	 * If we try to do a HUGE PTE update after a withdraw is done,
+	 * we will find the below NULL. This happens when we do
+	 * split_huge_page_pmd.
+	 */
+	if (!hpte_slot_array)
+		return;
+
+	if (mmu_hash_ops.hugepage_invalidate) {
+		mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+						 psize, ssize, local);
+		goto tm_abort;
+	}
+	/*
+	 * No bulk hpte removal support, invalidate each entry.
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HPAGE_PMD_SIZE >> shift;
+	for (i = 0; i < max_hpte_count; i++) {
+		/*
+		 * 8 bits per hpte entry:
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx = hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
+					     MMU_PAGE_16M, ssize, local);
+	}
+tm_abort:
+	tm_flush_hash_page(local);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void flush_hash_range(unsigned long number, int local)
+{
+	if (mmu_hash_ops.flush_hash_range)
+		mmu_hash_ops.flush_hash_range(number, local);
+	else {
+		int i;
+		struct ppc64_tlb_batch *batch =
+			this_cpu_ptr(&ppc64_tlb_batch);
+
+		for (i = 0; i < number; i++)
+			flush_hash_page(batch->vpn[i], batch->pte[i],
+					batch->psize, batch->ssize, local);
+	}
+}
+
+/*
+ * low_hash_fault is called when the low-level hash code fails to insert a
+ * PTE due to a hypervisor error.
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	if (user_mode(regs)) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+		if (rc == -2)
+			_exception(SIGSEGV, regs, SEGV_ACCERR, address);
+		else
+#endif
+			_exception(SIGBUS, regs, BUS_ADRERR, address);
+	} else
+		bad_page_fault(regs, address, SIGBUS);
+
+	exception_exit(prev_state);
+}
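+
+/*
+ * Helper for callers such as kernel_map_linear_page() below: try the
+ * primary hash group first, then the secondary, and if both are full
+ * evict an entry from a (pseudo-randomly chosen) group and retry, e.g.:
+ *
+ *	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+ *	slot = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ *				     HPTE_V_BOLTED, mmu_linear_psize,
+ *				     mmu_kernel_ssize);
+ */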
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+			   unsigned long pa, unsigned long rflags,
+			   unsigned long vflags, int psize, int ssize)
+{
+	unsigned long hpte_group;
+	long slot;
+
+repeat:
+	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+	/* Insert into the hash table, primary slot */
+	slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+					psize, psize, ssize);
+
+	/* Primary is full, try the secondary */
+	if (unlikely(slot == -1)) {
+		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
+						vflags | HPTE_V_SECONDARY,
+						psize, psize, ssize);
+		if (slot == -1) {
+			if (mftb() & 0x1)
+				hpte_group = (hash & htab_hash_mask) *
+						HPTES_PER_GROUP;
+
+			mmu_hash_ops.hpte_remove(hpte_group);
+			goto repeat;
+		}
+	}
+
+	return slot;
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+	unsigned long hash;
+	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+	unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+	long ret;
+
+	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+	/* Don't create HPTE entries for a bad address */
+	if (!vsid)
+		return;
+
+	ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+				    HPTE_V_BOLTED,
+				    mmu_linear_psize, mmu_kernel_ssize);
+
+	BUG_ON(ret < 0);
+	spin_lock(&linear_map_hash_lock);
+	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
+	linear_map_hash_slots[lmi] = ret | 0x80;
+	spin_unlock(&linear_map_hash_lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+	unsigned long hash, hidx, slot;
+	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+	spin_lock(&linear_map_hash_lock);
+	BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+	hidx = linear_map_hash_slots[lmi] & 0x7f;
+	linear_map_hash_slots[lmi] = 0;
+	spin_unlock(&linear_map_hash_lock);
+	if (hidx & _PTEIDX_SECONDARY)
+		hash = ~hash;
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	slot += hidx & _PTEIDX_GROUP_IX;
+	mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+				     mmu_linear_psize,
+				     mmu_kernel_ssize, 0);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	unsigned long flags, vaddr, lmi;
+	int i;
+
+	local_irq_save(flags);
+	for (i = 0; i < numpages; i++, page++) {
+		vaddr = (unsigned long)page_address(page);
+		lmi = __pa(vaddr) >> PAGE_SHIFT;
+		if (lmi >= linear_map_hash_count)
+			continue;
+		if (enable)
+			kernel_map_linear_page(vaddr, lmi);
+		else
+			kernel_unmap_linear_page(vaddr, lmi);
+	}
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				      phys_addr_t first_memblock_size)
+{
+	/*
+	 * We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors.
+	 */
+	BUG_ON(first_memblock_base != 0);
+
+	/*
+	 * On virtualized systems the first entry is our RMA region aka VRMA,
+	 * non-virtualized 64-bit hash MMU systems don't have a limitation
+	 * on real mode access.
+	 *
+	 * For guests on platforms before POWER9, we clamp the limit to 1G
+	 * to avoid some funky things such as RTAS bugs etc...
+	 */
+	if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
+		ppc64_rma_size = first_memblock_size;
+		if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
+			ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+
+		/* Finally limit subsequent allocations */
+		memblock_set_current_limit(ppc64_rma_size);
+	} else {
+		ppc64_rma_size = ULONG_MAX;
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+	*val = ppc64_pft_size;
+	return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+	if (!mmu_hash_ops.resize_hpt)
+		return -ENODEV;
+
+	return mmu_hash_ops.resize_hpt(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+	if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
+					NULL, &fops_hpt_order)) {
+		pr_err("lpar: unable to create hpt_order debugfs file\n");
+	}
+
+	return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
new file mode 100644
index 000000000000..e7a9c4f6bfca
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -0,0 +1,482 @@
+/*
+ * IOMMU helpers in MMU context.
+ *
+ * Copyright (C) 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY	0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK	~(SZ_4K - 1)
+
+struct mm_iommu_table_group_mem_t {
+	struct list_head next;
+	struct rcu_head rcu;
+	unsigned long used;
+	atomic64_t mapped;
+	unsigned int pageshift;
+	u64 ua;			/* userspace address */
+	u64 entries;		/* number of entries in hpas/hpages[] */
+	/*
+	 * In mm_iommu_get() we temporarily use this to store the
+	 * struct page address.
+	 *
+	 * We need to convert ua to hpa in real mode. Make it
+	 * simpler by storing the physical address.
+	 */
+	union {
+		struct page **hpages;	/* vmalloc'ed */
+		phys_addr_t *hpas;
+	};
+#define MM_IOMMU_TABLE_INVALID_HPA	((uint64_t)-1)
+	u64 dev_hpa;		/* Device memory base address */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+		unsigned long npages, bool incr)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!npages)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	if (incr) {
+		locked = mm->locked_vm + npages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			mm->locked_vm += npages;
+	} else {
+		if (WARN_ON_ONCE(npages > mm->locked_vm))
+			npages = mm->locked_vm;
+		mm->locked_vm -= npages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+			current ? current->pid : 0,
+			incr ? 
'+' : '-', + npages << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(&mm->mmap_sem); + + return ret; +} + +bool mm_iommu_preregistered(struct mm_struct *mm) +{ + return !list_empty(&mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem; + long i, ret, locked_entries = 0; + unsigned int pageshift; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, + next) { + /* Overlap? */ + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem->ua + + (mem->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + goto unlock_exit; + } + + } + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + ret = mm_iommu_adjust_locked_vm(mm, entries, true); + if (ret) + goto unlock_exit; + + locked_entries = entries; + } + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { + mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); + mem->dev_hpa = dev_hpa; + goto good_exit; + } + mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; + + /* + * For a starting point for a maximum page size calculation + * we use @ua and @entries natural alignment to allow IOMMU pages + * smaller than huge pages but still bigger than PAGE_SIZE. + */ + mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); + mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + down_read(&mm->mmap_sem); + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + up_read(&mm->mmap_sem); + if (ret != entries) { + /* free the reference taken */ + for (i = 0; i < ret; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + pageshift = PAGE_SHIFT; + for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. + */ + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { + struct page *head = compound_head(page); + + pageshift = compound_order(head) + PAGE_SHIFT; + } + mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. 
+ */ + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + +good_exit: + ret = 0; + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + *pmem = mem; + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + +unlock_exit: + if (locked_entries && ret) + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + + mutex_unlock(&mem_list_mutex); + + return ret; +} + +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, + pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_new); + +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_newdev); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + if (!mem->hpas) + return; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + + put_page(page); + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + unsigned long entries, dev_hpa; + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? 
*/ + if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + /* @mapped became 0 so now mappings are disabled, release the region */ + entries = mem->entries; + dev_hpa = mem->dev_hpa; + mm_iommu_release(mem); + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + mm_iommu_adjust_locked_vm(mm, entries, false); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} + +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, + unsigned long ua, unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + ++mem->used; + break; + } + } + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + va = &mem->hpas[entry]; + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + unsigned long *pa; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); + if (!pa) + return -EFAULT; + + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} + +bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + struct mm_iommu_table_group_mem_t *mem; + unsigned long end; + + list_for_each_entry_rcu(mem, 
+			&mm->context.iommu_group_mem_list, next) {
+		if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+			continue;
+
+		end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+		if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+			/*
+			 * Since the IOMMU page size might be bigger than
+			 * PAGE_SIZE, the amount of preregistered memory
+			 * starting from @hpa might be smaller than 1<<pageshift
+			 * and the caller needs to distinguish this situation.
+			 */
+			*size = min(1UL << pageshift, end - hpa);
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (atomic64_inc_not_zero(&mem->mapped))
+		return 0;
+
+	/* Last mm_iommu_put() has been called, no more mappings allowed */
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+	INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
new file mode 100644
index 000000000000..cb2b08635508
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -0,0 +1,263 @@
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+static DEFINE_IDA(mmu_context_ida);
+
+static int alloc_context_id(int min_id, int max_id)
+{
+	return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
+}
+
+void hash__reserve_context_id(int id)
+{
+	int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
+
+	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+	unsigned long max;
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		max = MAX_USER_CONTEXT;
+	else
+		max = MAX_USER_CONTEXT_65BIT_VA;
+
+	return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+void slb_setup_new_exec(void);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+	int index;
+
+	index = hash__alloc_context_id();
+	if (index < 0)
+		return index;
+
+	mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+					   GFP_KERNEL);
+	if (!mm->context.hash_context) {
+		ida_free(&mmu_context_ida, index);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The old code would re-promote on fork, we don't do that when using
+	 * slices as it could cause problems promoting slices that have been
+	 * forced down to 4K.
+	 *
+	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+	 * explicitly against context.id == 0. This ensures that we properly
+	 * initialize context slice details for newly allocated mm's (which will
+	 * have id == 0) and don't alter context slice inherited via fork (which
+	 * will have id != 0).
+	 *
+	 * We should not be calling init_new_context() on init_mm. Hence a
+	 * check against 0 is OK.
+	 */
+	if (mm->context.id == 0) {
+		memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
+		slice_init_new_context_exec(mm);
+	} else {
+		/* This is fork. Copy hash_context details from current->mm */
+		memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+		/* inherit subpage prot details if we have one. 
*/ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif + + } + + pkey_mm_init(mm); + return index; +} + +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); + + slb_setup_new_exec(); +} + +static int radix__init_new_context(struct mm_struct *mm) +{ + unsigned long rts_field; + int index, max_id; + + max_id = (1 << mmu_pid_bits) - 1; + index = alloc_context_id(mmu_base_pid, max_id); + if (index < 0) + return index; + + /* + * set the process table entry, + */ + rts_field = radix__get_tree_size(); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + + /* + * Order the above store with subsequent update of the PID + * register (at which point HW can start loading/caching + * the entry) and the corresponding load by the MMU from + * the L2 cache. + */ + asm volatile("ptesync;isync" : : : "memory"); + + mm->context.npu_context = NULL; + mm->context.hash_context = NULL; + + return index; +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + + if (index < 0) + return index; + + mm->context.id = index; + + mm->context.pte_frag = NULL; + mm->context.pmd_frag = NULL; +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(mm); +#endif + atomic_set(&mm->context.active_cpus, 0); + atomic_set(&mm->context.copros, 0); + + return 0; +} + +void __destroy_context(int context_id) +{ + ida_free(&mmu_context_ida, context_id); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +static void destroy_contexts(mm_context_t *ctx) +{ + int index, context_id; + + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +} + +static void pmd_frag_destroy(void *pmd_frag) +{ + int count; + struct page *page; + + page = virt_to_page(pmd_frag); + /* drop all the pending references */ + count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static void destroy_pagetable_cache(struct mm_struct *mm) +{ + void *frag; + + frag = mm->context.pte_frag; + if (frag) + pte_frag_destroy(frag); + + frag = mm->context.pmd_frag; + if (frag) + pmd_frag_destroy(frag); + return; +} + +void destroy_context(struct mm_struct *mm) +{ +#ifdef CONFIG_SPAPR_TCE_IOMMU + WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); +#endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_contexts(&mm->context); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + destroy_pagetable_cache(mm); + + if (radix_enabled()) { + /* + * Radix doesn't have a valid bit in the process table + * entries. However we know that at least P9 implementation + * will avoid caching an entry with an invalid RTS field, + * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. 
+ * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. + */ + process_tb[mm->context.id].prtb0 = 0; + } +} + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + isync(); +} +#endif diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c new file mode 100644 index 000000000000..16bda049187a --- /dev/null +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -0,0 +1,449 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +unsigned long __pmd_frag_nr; +EXPORT_SYMBOL(__pmd_frag_nr); +unsigned long __pmd_frag_size_shift; +EXPORT_SYMBOL(__pmd_frag_size_shift); + +int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pmdp_ptep(pmdp), + pmd_pte(entry), address, MMU_PAGE_2M); + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. 
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+	smp_mb();
+	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to a regular pmd entry.
+ */
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		      pmd_t *pmdp)
+{
+	unsigned long old_pmd;
+
+	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	/*
+	 * This ensures that generic code that relies on IRQ disabling
+	 * to prevent a parallel THP split works as expected.
+	 */
+	serialize_against_pte_lookup(vma->vm_mm);
+	return __pmd(old_pmd);
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+	unsigned long pmdv;
+
+	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+	return pmd_set_protbits(__pmd(pmdv), pgprot);
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+	return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+	unsigned long pmdv;
+
+	pmdv = pmd_val(pmd);
+	pmdv &= _HPAGE_CHG_MASK;
+	return pmd_set_protbits(__pmd(pmdv), newprot);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+			  pmd_t *pmd)
+{
+	if (radix_enabled())
+		prefetch((void *)addr);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/* For use by kexec */
+void mmu_cleanup_all(void)
+{
+	if (radix_enabled())
+		radix__mmu_cleanup_all();
+	else if (mmu_hash_ops.hpte_clear_all)
+		mmu_hash_ops.hpte_clear_all();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+	if (radix_enabled())
+		return radix__create_section_mapping(start, end, nid);
+
+	return hash__create_section_mapping(start, end, nid);
+}
+
+int __meminit remove_section_mapping(unsigned long start, unsigned long end)
+{
+	if (radix_enabled())
+		return radix__remove_section_mapping(start, end);
+
+	return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+	unsigned long ptcr;
+
+	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+	/* Initialize the Partition Table with no entries */
+	partition_tb = memblock_alloc(patb_size, patb_size);
+	if (!partition_tb)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+		      __func__, patb_size, patb_size);
+
+	/*
+	 * Update the partition table control register,
+	 * 64K size.
+	 */
+	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+	mtspr(SPRN_PTCR, ptcr);
+	powernv_set_nmmu_ptcr(ptcr);
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+				   unsigned long dw1)
+{
+	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+	/*
+	 * Global flush of TLBs and partition table caches for this lpid.
+	 * The type of flush (hash or radix) depends on what the previous
+	 * use of this partition ID was, not the new use.
+	 */
+	asm volatile("ptesync" : : : "memory");
+	if (old & PATB_HR) {
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+		asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+	} else {
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+	}
+	/* Do we need a fixup here? */
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+	void *pmd_frag, *ret;
+
+	if (PMD_FRAG_NR == 1)
+		return NULL;
+
+	spin_lock(&mm->page_table_lock);
+	ret = mm->context.pmd_frag;
+	if (ret) {
+		pmd_frag = ret + PMD_FRAG_SIZE;
+		/*
+		 * If we have taken up all the fragments, mark the PTE
+		 * page NULL.
+		 */
+		if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+			pmd_frag = NULL;
+		mm->context.pmd_frag = pmd_frag;
+	}
+	spin_unlock(&mm->page_table_lock);
+	return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+	void *ret = NULL;
+	struct page *page;
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	page = alloc_page(gfp);
+	if (!page)
+		return NULL;
+	if (!pgtable_pmd_page_ctor(page)) {
+		__free_pages(page, 0);
+		return NULL;
+	}
+
+	atomic_set(&page->pt_frag_refcount, 1);
+
+	ret = page_address(page);
+	/*
+	 * If we support only one fragment, just return the
+	 * allocated page.
+	 */
+	if (PMD_FRAG_NR == 1)
+		return ret;
+
+	spin_lock(&mm->page_table_lock);
+	/*
+	 * If we find pgtable_page set, we return
+	 * the allocated page with a single fragment
+	 * count.
+	 */
+	if (likely(!mm->context.pmd_frag)) {
+		atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+		mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+	pmd_t *pmd;
+
+	pmd = get_pmd_from_cache(mm);
+	if (pmd)
+		return pmd;
+
+	return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+	struct page *page = virt_to_page(pmd);
+
+	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+		pgtable_pmd_page_dtor(page);
+		__free_page(page);
+	}
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+	switch (index) {
+	case PTE_INDEX:
+		pte_fragment_free(table, 0);
+		break;
+	case PMD_INDEX:
+		pmd_fragment_free(table);
+		break;
+	case PUD_INDEX:
+		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+		break;
+#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
+		/* 16M hugepd directory at pud level */
+	case HTLB_16M_INDEX:
+		BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
+		kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
+		break;
+		/* 16G hugepd directory at the pgd level */
+	case HTLB_16G_INDEX:
+		BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
+		kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
+		break;
+#endif
+		/* We don't free pgd table via RCU callback */
+	default:
+		BUG();
+	}
+}
+
+#ifdef CONFIG_SMP
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+	unsigned long pgf = (unsigned long)table;
+
+	BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+	pgf |= index;
+	tlb_remove_table(tlb, (void *)pgf);
+}
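+
+/*
+ * The table pointer and its type index travel through tlb_remove_table()
+ * as one tagged pointer: page tables are sufficiently aligned that the
+ * low MAX_PGTABLE_INDEX_SIZE bits are free, so pgtable_free_tlb() above
+ * ORs the index into them and __tlb_remove_table() below masks it back
+ * out, e.g. (table | PMD_INDEX) -> (table, PMD_INDEX).
+ */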
+void __tlb_remove_table(void *_table)
+{
+	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+	unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+	return pgtable_free(table, index);
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+	return pgtable_free(table, index);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+	/*
+	 * Hash maps the memory with one size (mmu_linear_psize),
+	 * so don't bother to print these on hash.
+	 */
+	if (!radix_enabled())
+		return;
+	seq_printf(m, "DirectMap4k: %8lu kB\n",
+		   atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
+	seq_printf(m, "DirectMap64k: %8lu kB\n",
+		   atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
+	seq_printf(m, "DirectMap2M: %8lu kB\n",
+		   atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+	seq_printf(m, "DirectMap1G: %8lu kB\n",
+		   atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+}
+#endif /* CONFIG_PROC_FS */
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep)
+{
+	unsigned long pte_val;
+
+	/*
+	 * Clear the _PAGE_PRESENT so that no hardware parallel update is
+	 * possible. Also keep the pte_present true so that we don't take
+	 * a wrong fault.
+	 */
+	pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+	return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+	if (radix_enabled())
+		return radix__ptep_modify_prot_commit(vma, addr,
+						      ptep, old_pte, pte);
+	set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+/*
+ * For hash translation mode, we use the deposited table to store hash slot
+ * information; it is stored at a PTRS_PER_PMD offset from the related pmd
+ * location. Hence a pmd move requires a deposit and withdraw.
+ *
+ * For radix translation with split pmd ptl, we store the deposited table in the
+ * pmd page. Hence if we have a different pmd page we need to withdraw during a
+ * pmd move.
+ *
+ * With hash we always use the deposited table, irrespective of anon or not.
+ * With radix we use the deposited table only for anonymous mappings.
+ */
+int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+			   struct spinlock *old_pmd_ptl,
+			   struct vm_area_struct *vma)
+{
+	if (radix_enabled())
+		return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+
+	return true;
+}
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
new file mode 100644
index 000000000000..ae7fca40e5b3
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PowerPC Memory Protection Keys management
+ *
+ * Copyright 2017, Ram Pai, IBM Corporation.
+ */
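+
+/*
+ * A quick sketch of the register layout used below: AMR holds two bits
+ * per key (read-disable and write-disable) and IAMR one effective bit
+ * (execute-disable), with key 0 in the most significant bits. With
+ * 64-bit registers, pkeyshift(0) is 62 and pkeyshift(31) is 0, so
+ * "0x3ul << pkeyshift(pkey)" isolates one key's pair of bits.
+ */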
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+DEFINE_STATIC_KEY_TRUE(pkey_disabled);
+int  pkeys_total;		/* Total pkeys as per device tree */
+u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
+u32  reserved_allocation_mask;  /* Bits set for reserved keys */
+static bool pkey_execute_disable_supported;
+static bool pkeys_devtree_defined;	/* property exported by device tree */
+static u64 pkey_amr_mask;		/* Bits in AMR not to be touched */
+static u64 pkey_iamr_mask;		/* Bits in IAMR not to be touched */
+static u64 pkey_uamor_mask;		/* Bits in UAMOR not to be touched */
+static int execute_only_key = 2;
+
+#define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
+#define PKEY_REG_BITS (sizeof(u64)*8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+
+static void scan_pkey_feature(void)
+{
+	u32 vals[2];
+	struct device_node *cpu;
+
+	cpu = of_find_node_by_type(NULL, "cpu");
+	if (!cpu)
+		return;
+
+	if (of_property_read_u32_array(cpu,
+			"ibm,processor-storage-keys", vals, 2))
+		return;
+
+	/*
+	 * Since any pkey can be used for data or execute, we will just treat
+	 * all keys as equal and track them as one entity.
+	 */
+	pkeys_total = vals[0];
+	pkeys_devtree_defined = true;
+}
+
+static inline bool pkey_mmu_enabled(void)
+{
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		return pkeys_total;
+	else
+		return cpu_has_feature(CPU_FTR_PKEY);
+}
+
+static int pkey_initialize(void)
+{
+	int os_reserved, i;
+
+	/*
+	 * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+	 * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+	 * Ensure that the bits are distinct.
+	 */
+	BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+		     (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+	/*
+	 * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+	 * in the vmaflag. Make sure that is really the case.
+	 */
+	BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+		     __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+		     != (sizeof(u64) * BITS_PER_BYTE));
+
+	/* scan the device tree for pkey feature */
+	scan_pkey_feature();
+
+	/*
+	 * Let's assume 32 pkeys on P8 bare metal, if it's not defined by the
+	 * device tree. We make this exception since skiboot forgot to expose
+	 * this property on power8.
+	 */
+	if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
+	    cpu_has_feature(CPU_FTRS_POWER8))
+		pkeys_total = 32;
+
+	/*
+	 * Adjust the upper limit, based on the number of bits supported by
+	 * arch-neutral code.
+	 */
+	pkeys_total = min_t(int, pkeys_total,
+			    ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
+
+	if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
+		static_branch_enable(&pkey_disabled);
+	else
+		static_branch_disable(&pkey_disabled);
+
+	if (static_branch_likely(&pkey_disabled))
+		return 0;
+
+	/*
+	 * The device tree cannot be relied on to indicate support for
+	 * execute_disable. Instead we use a PVR check.
+	 */
+	if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+		pkey_execute_disable_supported = false;
+	else
+		pkey_execute_disable_supported = true;
+
+#ifdef CONFIG_PPC_4K_PAGES
+	/*
+	 * The OS can manage only 8 pkeys due to its inability to represent them
+	 * in the Linux 4K PTE.
+	 */
+	os_reserved = pkeys_total - 8;
+#else
+	os_reserved = 0;
+#endif
+	/* Bits are in LE format. 
*/ + reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); + + /* register mask is in BE format */ + pkey_amr_mask = ~0x0ul; + pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); + + pkey_iamr_mask = ~0x0ul; + pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); + pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + pkey_uamor_mask = ~0x0ul; + pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + /* mark the rest of the keys as reserved and hence unavailable */ + for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { + reserved_allocation_mask |= (0x1 << i); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); + } + initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); + + if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + * Any AMR, UAMOR, IAMR bit set for + * this key is irrelevant since this key + * can never be allocated. + */ + execute_only_key = -1; + } + + return 0; +} + +arch_initcall(pkey_initialize); + +void pkey_mm_init(struct mm_struct *mm) +{ + if (static_branch_likely(&pkey_disabled)) + return; + mm_pkey_allocation_map(mm) = initial_allocation_mask; + mm->context.execute_only_pkey = execute_only_key; +} + +static inline u64 read_amr(void) +{ + return mfspr(SPRN_AMR); +} + +static inline void write_amr(u64 value) +{ + mtspr(SPRN_AMR, value); +} + +static inline u64 read_iamr(void) +{ + if (!likely(pkey_execute_disable_supported)) + return 0x0UL; + + return mfspr(SPRN_IAMR); +} + +static inline void write_iamr(u64 value) +{ + if (!likely(pkey_execute_disable_supported)) + return; + + mtspr(SPRN_IAMR, value); +} + +static inline u64 read_uamor(void) +{ + return mfspr(SPRN_UAMOR); +} + +static inline void write_uamor(u64 value) +{ + mtspr(SPRN_UAMOR, value); +} + +static bool is_pkey_enabled(int pkey) +{ + u64 uamor = read_uamor(); + u64 pkey_bits = 0x3ul << pkeyshift(pkey); + u64 uamor_pkey_bits = (uamor & pkey_bits); + + /* + * Both the bits in UAMOR corresponding to the key should be set or + * reset. + */ + WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); + return !!(uamor_pkey_bits); +} + +static inline void init_amr(int pkey, u8 init_bits) +{ + u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); + u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); + + write_amr(old_amr | new_amr_bits); +} + +static inline void init_iamr(int pkey, u8 init_bits) +{ + u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); + u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); + + write_iamr(old_iamr | new_iamr_bits); +} + +/* + * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that + * specified in @init_val. 
+ */ +int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u64 new_amr_bits = 0x0ul; + u64 new_iamr_bits = 0x0ul; + + if (!is_pkey_enabled(pkey)) + return -EINVAL; + + if (init_val & PKEY_DISABLE_EXECUTE) { + if (!pkey_execute_disable_supported) + return -EINVAL; + new_iamr_bits |= IAMR_EX_BIT; + } + init_iamr(pkey, new_iamr_bits); + + /* Set the bits we need in AMR: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; + else if (init_val & PKEY_DISABLE_WRITE) + new_amr_bits |= AMR_WR_BIT; + + init_amr(pkey, new_amr_bits); + return 0; +} + +void thread_pkey_regs_save(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + /* + * TODO: Skip saving registers if @thread hasn't used any keys yet. + */ + thread->amr = read_amr(); + thread->iamr = read_iamr(); + thread->uamor = read_uamor(); +} + +void thread_pkey_regs_restore(struct thread_struct *new_thread, + struct thread_struct *old_thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + if (old_thread->amr != new_thread->amr) + write_amr(new_thread->amr); + if (old_thread->iamr != new_thread->iamr) + write_iamr(new_thread->iamr); + if (old_thread->uamor != new_thread->uamor) + write_uamor(new_thread->uamor); +} + +void thread_pkey_regs_init(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + thread->amr = pkey_amr_mask; + thread->iamr = pkey_iamr_mask; + thread->uamor = pkey_uamor_mask; + + write_uamor(pkey_uamor_mask); + write_amr(pkey_amr_mask); + write_iamr(pkey_iamr_mask); +} + +static inline bool pkey_allows_readwrite(int pkey) +{ + int pkey_shift = pkeyshift(pkey); + + if (!is_pkey_enabled(pkey)) + return true; + + return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); +} + +int __execute_only_pkey(struct mm_struct *mm) +{ + return mm->context.execute_only_pkey; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + + return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); +} + +/* + * This should only be called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, + int pkey) +{ + /* + * If the currently associated pkey is execute-only, but the requested + * protection is not execute-only, move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) + return 0; + + /* + * The requested protection is execute-only. Hence let's use an + * execute-only pkey. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + + /* Nothing to override. 
+	 */
+	return vma_pkey(vma);
+}
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+	int pkey_shift;
+	u64 amr;
+
+	if (!is_pkey_enabled(pkey))
+		return true;
+
+	pkey_shift = pkeyshift(pkey);
+	if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
+		return true;
+
+	amr = read_amr(); /* Delay reading amr until absolutely needed */
+	return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
+		(write && !(amr & (AMR_WR_BIT << pkey_shift))));
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return true;
+
+	return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+	if (!current->mm)
+		return true;
+
+	/* if it is not our ->mm, it has to be foreign */
+	if (current->mm != vma->vm_mm)
+		return true;
+
+	return false;
+}
+
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+			       bool execute, bool foreign)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return true;
+	/*
+	 * Do not enforce our key-permissions on a foreign vma.
+	 */
+	if (foreign || vma_is_foreign(vma))
+		return true;
+
+	return pkey_access_permitted(vma_pkey(vma), write, execute);
+}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return;
+
+	/* Duplicate the oldmm pkey state in mm: */
+	mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+	mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
new file mode 100644
index 000000000000..cab06331c0c0
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+				    unsigned long end)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+}
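+
+/*
+ * The three helpers above exist so hugetlb flushes carry the real page
+ * size: e.g. a fault on a 2M hugetlb VMA reaches
+ * radix__flush_tlb_page_psize(mm, vmaddr, MMU_PAGE_2M) rather than a
+ * PAGE_SIZE-sized flush.
+ */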
+
+/*
+ * A variant of hugetlb_get_unmapped_area() doing a topdown search.
+ * FIXME!! should we do as x86 does or as non-hugetlb areas do?
+ * i.e., use topdown or not based on a mmap_is_legacy() check?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+				 unsigned long len, unsigned long pgoff,
+				 unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct hstate *h = hstate_file(file);
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
+	struct vm_unmapped_area_info info;
+
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
+
+	if (len & ~huge_page_mask(h))
+		return -EINVAL;
+	if (len > high_limit)
+		return -ENOMEM;
+
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
+		if (prepare_hugepage_range(file, addr, len))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (addr) {
+		addr = ALIGN(addr, huge_page_size(h));
+		vma = find_vma(mm, addr);
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
+		    (!vma || addr + len <= vm_start_gap(vma)))
+			return addr;
+	}
+	/*
+	 * We are always doing a topdown search here. Slice code
+	 * does that too.
+	 */
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+
+	return vm_unmapped_area(&info);
+}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+					 unsigned long addr, pte_t *ptep,
+					 pte_t old_pte, pte_t pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	/*
+	 * To avoid a NMMU hang while relaxing access, we need to flush the
+	 * TLB before we set the new value.
+	 */
+	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+	    (atomic_read(&mm->context.copros) > 0))
+		radix__flush_hugetlb_page(vma, addr);
+
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
new file mode 100644
index 000000000000..c9bcf428dd2b
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -0,0 +1,1124 @@
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
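+
+/*
+ * pr_fmt below prefixes every pr_*() in this file, so e.g. the boot
+ * message "Page sizes from device-tree:" is printed as
+ * "radix-mmu: Page sizes from device-tree:".
+ */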
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
+static int native_register_process_table(unsigned long base, unsigned long pg_sz,
+					  unsigned long table_size)
+{
+	unsigned long patb0, patb1;
+
+	patb0 = be64_to_cpu(partition_tb[0].patb0);
+	patb1 = base | table_size | PATB_GR;
+
+	mmu_partition_table_set_entry(0, patb0, patb1);
+
+	return 0;
+}
+
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+			unsigned long region_start, unsigned long region_end)
+{
+	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+	void *ptr;
+
+	if (region_start)
+		min_addr = region_start;
+	if (region_end)
+		max_addr = region_end;
+
+	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
+
+	if (!ptr)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+		      __func__, size, size, nid, &min_addr, &max_addr);
+
+	return ptr;
+}
+
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
+			  pgprot_t flags,
+			  unsigned int map_page_size,
+			  int nid,
+			  unsigned long region_start, unsigned long region_end)
+{
+	unsigned long pfn = pa >> PAGE_SHIFT;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	pgdp = pgd_offset_k(ea);
+	if (pgd_none(*pgdp)) {
+		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+					   region_start, region_end);
+		pgd_populate(&init_mm, pgdp, pudp);
+	}
+	pudp = pud_offset(pgdp, ea);
+	if (map_page_size == PUD_SIZE) {
+		ptep = (pte_t *)pudp;
+		goto set_the_pte;
+	}
+	if (pud_none(*pudp)) {
+		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+					   region_start, region_end);
+		pud_populate(&init_mm, pudp, pmdp);
+	}
+	pmdp = pmd_offset(pudp, ea);
+	if (map_page_size == PMD_SIZE) {
+		ptep = pmdp_ptep(pmdp);
+		goto set_the_pte;
+	}
+	if (!pmd_present(*pmdp)) {
+		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+					   region_start, region_end);
+		pmd_populate_kernel(&init_mm, pmdp, ptep);
+	}
+	ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+	smp_wmb();
+	return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+			  pgprot_t flags,
+			  unsigned int map_page_size,
+			  int nid,
+			  unsigned long region_start, unsigned long region_end)
+{
+	unsigned long pfn = pa >> PAGE_SHIFT;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+	/*
+	 * Make sure task size is correct as per the max addr.
+	 */
+	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
+	if (unlikely(!slab_is_available()))
+		return early_map_kernel_page(ea, pa, flags, map_page_size,
+					     nid, region_start, region_end);
+
+	/*
+	 * Should make page table allocation functions be able to take a
+	 * node, so we can place kernel page tables on the right nodes after
+	 * boot. 
+ */ + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + smp_wmb(); + return 0; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) +{ + unsigned long idx; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); + + for (idx = start; idx < end; idx += PAGE_SIZE) { + pgdp = pgd_offset_k(idx); + pudp = pud_alloc(&init_mm, pgdp, idx); + if (!pudp) + continue; + if (pud_huge(*pudp)) { + ptep = (pte_t *)pudp; + goto update_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, idx); + if (!pmdp) + continue; + if (pmd_huge(*pmdp)) { + ptep = pmdp_ptep(pmdp); + goto update_the_pte; + } + ptep = pte_alloc_kernel(pmdp, idx); + if (!ptep) + continue; +update_the_pte: + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); + } + + radix__flush_tlb_kernel_range(start, end); +} + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + radix__change_memory_range(start, end, _PAGE_WRITE); +} + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) +{ + char buf[10]; + + if (end <= start) + return; + + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? 
" (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; +} + +static int __meminit create_physical_mapping(unsigned long start, + unsigned long end, + int nid) +{ + unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; + pgprot_t prot; + int psize; + + start = _ALIGN_UP(start, PAGE_SIZE); + for (addr = start; addr < end; addr += mapping_size) { + unsigned long gap, previous_size; + int rc; + + gap = next_boundary(addr, end) - addr; + previous_size = mapping_size; + prev_exec = exec; + + if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && + mmu_psize_defs[MMU_PAGE_1G].shift) { + mapping_size = PUD_SIZE; + psize = MMU_PAGE_1G; + } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) { + mapping_size = PMD_SIZE; + psize = MMU_PAGE_2M; + } else { + mapping_size = PAGE_SIZE; + psize = mmu_virtual_psize; + } + + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { + prot = PAGE_KERNEL_X; + exec = true; + } else { + prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } + + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); + if (rc) + return rc; + + update_page_count(psize, 1); + } + + print_mapping(start, addr, mapping_size, exec); + return 0; +} + +void __init radix_init_pgtable(void) +{ + unsigned long rts_field; + struct memblock_region *reg; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + for_each_memblock(memory, reg) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + WARN_ON(create_physical_mapping(reg->base, + reg->base + reg->size, + -1)); + } + + /* Find out how many PID bits are supported */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (!mmu_pid_bits) + mmu_pid_bits = 20; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * When KVM is possible, we only use the top half of the + * PID space to avoid collisions between host and guest PIDs + * which can cause problems due to prefetch when exiting the + * guest with AIL=3 + */ + mmu_base_pid = 1 << (mmu_pid_bits - 1); +#else + mmu_base_pid = 1; +#endif + } else { + /* The guest uses the bottom half of the PID space */ + if (!mmu_pid_bits) + mmu_pid_bits = 19; + mmu_base_pid = 1; + } + + /* + * Allocate Partition table and process table for the + * host. + */ + BUG_ON(PRTB_SIZE_SHIFT > 36); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); + /* + * Fill in the process table. + */ + rts_field = radix__get_tree_size(); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. 
+ */
+ register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
+ pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
+ asm volatile("ptesync" : : : "memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
+
+ /*
+ * The init_mm context is given the first available (non-zero) PID,
+ * which is the "guard PID" and contains no page table. PIDR should
+ * never be set to zero because that duplicates the kernel address
+ * space at the 0x0... offset (quadrant 0)!
+ *
+ * An arbitrary PID that may later be allocated by the PID allocator
+ * for userspace processes must not be used either, because that
+ * would cause stale user mappings for that PID on CPUs outside of
+ * the TLB invalidation scheme (because it won't be in mm_cpumask).
+ *
+ * So permanently carve out one PID for the purpose of a guard PID.
+ */
+ init_mm.context.id = mmu_base_pid;
+ mmu_base_pid++;
+}
+
+static void __init radix_init_partition_table(void)
+{
+ unsigned long rts_field, dw0;
+
+ mmu_partition_table_init();
+ rts_field = radix__get_tree_size();
+ dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+ mmu_partition_table_set_entry(0, dw0, 0);
+
+ pr_info("Initializing Radix MMU\n");
+ pr_info("Partition table %p\n", partition_tb);
+}
+
+void __init radix_init_native(void)
+{
+ register_process_table = native_register_process_table;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x15:
+ idx = MMU_PAGE_2M;
+ break;
+ case 0x1e:
+ idx = MMU_PAGE_1G;
+ break;
+ }
+ return idx;
+}
+
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ int shift, idx;
+ unsigned int ap;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ /* Find the MMU PID size */
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+ if (prop && size == 4)
+ mmu_pid_bits = be32_to_cpup(prop);
+
+ /* Grab page size encodings */
+ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ for (; size >= 4; size -= 4, ++prop) {
+
+ struct mmu_psize_def *def;
+
+ /* the top 3 bits are the AP encoding */
+ shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
+ ap = be32_to_cpu(prop[0]) >> 29;
+ pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ def = &mmu_psize_defs[idx];
+ def->shift = shift;
+ def->ap = ap;
+ }
+
+ /* needed ? 
*/ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +void __init radix__early_init_devtree(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +static void radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPRN_AMOR, (3ul << 62)); +} + +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPRN_IAMR, (1ul << 62)); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pud_cache_index = RADIX_PUD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; + vmemmap = (struct page *)RADIX_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + __pte_frag_nr = RADIX_PTE_FRAG_NR; + __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = RADIX_PMD_FRAG_NR; + __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + radix_init_native(); + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + radix_init_partition_table(); + radix_init_amor(); + } else { + radix_init_pseries(); + } + + 
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + radix_init_pgtable(); + /* Switch to the guard PID before turning on MMU */ + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * update partition table control register and UPRT + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); + } + + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__mmu_cleanup_all(void) +{ + unsigned long lpcr; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); + mtspr(SPRN_PTCR, 0); + powernv_set_nmmu_ptcr(0); + radix__flush_tlb_all(); + } +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * Radix mode is not limited by RMA / VRMA addressing. + */ + ppc64_rma_size = ULONG_MAX; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + pmd_clear(pmd); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + pud_clear(pud); +} + +struct change_mapping_params { + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long aligned_start; + unsigned long aligned_end; +}; + +static int __meminit stop_machine_change_mapping(void *data) +{ + struct change_mapping_params *params = + (struct change_mapping_params *)data; + + if (!data) + return -1; + + spin_unlock(&init_mm.page_table_lock); + pte_clear(&init_mm, params->aligned_start, params->pte); + create_physical_mapping(params->aligned_start, params->start, -1); + create_physical_mapping(params->end, params->aligned_end, -1); + spin_lock(&init_mm.page_table_lock); + return 0; +} + +static void remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { + /* + * The vmemmap_free() and remove_section_mapping() + * codepaths call us with aligned addresses. 
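+ * Anything else is unexpected, so warn once and skip the
+ * entry rather than tear down part of a page.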
+ */
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+/*
+ * Helper that clears the pte and potentially splits the mapping.
+ */
+static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
+ unsigned long size, pte_t *pte)
+{
+ unsigned long mask = ~(size - 1);
+ unsigned long aligned_start = addr & mask;
+ unsigned long aligned_end = addr + size;
+ struct change_mapping_params params;
+ bool split_region = false;
+
+ if ((end - addr) < size) {
+ /*
+ * We're going to clear the PTE, but have not yet flushed
+ * the mapping, so it is time to remap and flush. If the
+ * effects are visible outside the processor, or we are
+ * running in code close to the mapping we cleared, we
+ * are in trouble.
+ */
+ if (overlaps_kernel_text(aligned_start, addr) ||
+ overlaps_kernel_text(end, aligned_end)) {
+ /*
+ * Hack, just return, don't pte_clear
+ */
+ WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
+ "text, not splitting\n", addr, end);
+ return;
+ }
+ split_region = true;
+ }
+
+ if (split_region) {
+ params.pte = pte;
+ params.start = addr;
+ params.end = end;
+ params.aligned_start = addr & ~(size - 1);
+ params.aligned_end = min_t(unsigned long, aligned_end,
+ (unsigned long)__va(memblock_end_of_DRAM()));
+ stop_machine(stop_machine_change_mapping, &params, NULL);
+ return;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte_base;
+ pmd_t *pmd;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_huge(*pmd)) {
+ split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next);
+ free_pte_table(pte_base, pmd);
+ }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd_base;
+ pud_t *pud;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_huge(*pud)) {
+ split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next);
+ free_pmd_table(pmd_base, pud);
+ }
+}
+
+static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+{
+ unsigned long addr, next;
+ pud_t *pud_base;
+ pgd_t *pgd;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (pgd_huge(*pgd)) {
+ split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+ continue;
+ }
+
+ pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud_base, addr, next);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+ if (end >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ return create_physical_mapping(start, end, nid);
+}
+
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end);
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
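+/*
+ * Thin wrapper around __map_kernel_page() for callers that have a node
+ * hint but no region hints.
+ */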
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+ pgprot_t flags, unsigned int map_page_size,
+ int nid)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding */
+ unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+ int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+ int ret;
+
+ if ((start + page_size) >= RADIX_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ BUG_ON(ret);
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+ remove_pagetable(start, start + page_size);
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+ old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+ trace_hugepage_update(addr, old, clr, set);
+
+ return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+ VM_BUG_ON(pmd_devmap(*pmdp));
+ /*
+ * khugepaged calls this for a normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+
+ /* FIXME!! Verify whether we need this kick below */
+ serialize_against_pte_lookup(vma->vm_mm);
+
+ radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
+ return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. In order to save the deposited
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ struct list_head *lh = (struct list_head *) pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pte_t *ptep;
+ pgtable_t pgtable;
+ struct list_head *lh;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ *ptep = __pte(0);
+ ptep++;
+ *ptep = __pte(0);
+ return pgtable;
+}
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ unsigned long old;
+
+ old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. 
Since the format of pte_t is different from
+ * pmd_t, we want to prevent transit from a pmd pointing to a page table
+ * to a pmd pointing to a huge page (and back) while interrupts are disabled.
+ * We clear the pmd to possibly replace it with a page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+ serialize_against_pte_lookup(mm);
+ return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+ /* For radix, 2M at the PMD level means thp */
+ if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+ return 1;
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+ pte_t entry, unsigned long address, int psize)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+
+ unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to mark
+ * the pte invalid in between.
+ */
+ if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ unsigned long old_pte, new_pte;
+
+ old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+ /*
+ * new value of pte
+ */
+ new_pte = old_pte | set;
+ radix__flush_tlb_page_psize(mm, address, psize);
+ __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+ } else {
+ __radix_pte_update(ptep, 0, set);
+ /*
+ * Book3S does not require a TLB flush when relaxing access
+ * restrictions when the address space is not attached to a
+ * NMMU, because the core MMU will reload the pte after taking
+ * an access fault, which is defined by the architecture.
+ */
+ }
+ /* See the ptesync comment in radix__set_pte_at */
+}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to flush the TLB
+ * before we set the new value. We need to do this only for radix, because
+ * hash translation does flush when updating the linux pte.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
new file mode 100644
index 000000000000..6a23b9ebd2a1
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -0,0 +1,1101 @@
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +/* + * tlbiel instruction for radix, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + /* Do the same for process scoped entries. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + asm volatile("ptesync": : :"memory"); +} + +void radix__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); + else + WARN(1, "%s called on pre-POWER9 CPU\n", __func__); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid(unsigned long lpid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid_guest(unsigned long lpid, int 
set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie(void)
+{
+ unsigned long pid = 0;
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+/*
+ * We use 128 sets in radix mode and 256 sets in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+ int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Work around the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time constraint to match the "i" constraint
+ * in the asm statement.
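+ *
+ * An "i" asm operand must be a compile-time constant, so a runtime
+ * "ric" value has to be dispatched through a switch with one literal
+ * per case, as below.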
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid(pid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid(pid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie();
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Work around the fact that the "ric" argument to __tlbie_lpid
+ * must be a compile-time constraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
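+ *
+ * (tlbiel only invalidates the translation caches of the executing
+ * core, hence the VM_BUG_ON above: the caller must already be running
+ * with the target LPID in LPIDR.)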
+ */ + __tlbiel_lpid_guest(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); +} + + +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_va_range(start, end, pid, page_size, psize); + asm volatile("ptesync": : :"memory"); +} + +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, ric); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); + __tlbie_va_range(start, end, pid, page_size, psize); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +#ifndef CONFIG_SMP +void radix__local_flush_all_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + 
preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_all_mm); +#endif /* CONFIG_SMP */ + +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); +#endif + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +static bool mm_is_singlethreaded(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.copros) > 0) + return false; + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) + return true; + return false; +} + +static bool mm_needs_flush_escalation(struct mm_struct *mm) +{ + /* + * P9 nest MMU has issues with the page walk cache + * caching PTEs and not flushing them properly when + * RIC = 0 for a PID/LPID invalidate + */ + if (atomic_read(&mm->context.copros) > 0) + return true; + return false; +} + +#ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + unsigned long pid = mm->context.id; + + if (current->mm == mm) + return; /* Local CPU */ + + if (current->active_mm == mm) { + /* + * Must be a kernel thread because sender is single-threaded. + */ + BUG_ON(current->mm); + mmgrab(&init_mm); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); + } + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +static void exit_flush_lazy_tlbs(struct mm_struct *mm) +{ + /* + * Would be nice if this was async so it could be run in + * parallel with our local flush, but generic code does not + * give a good API for it. Could extend the generic code or + * make a special powerpc IPI for flushing TLBs. + * For now it's not too performance critical. + */ + smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, + (void *)mm, 1); + mm_reset_thread_local(mm); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + /* + * Order loads of mm_cpumask vs previous stores to clear ptes before + * the invalidate. 
See barrier in switch_mm_irqs_off + */ + smp_mb(); + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (!fullmm) { + exit_flush_lazy_tlbs(mm); + goto local; + } + } + _tlbie_pid(pid, RIC_FLUSH_ALL); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } + preempt_enable(); +} +void radix__flush_all_mm(struct mm_struct *mm) +{ + __flush_all_mm(mm, false); +} +EXPORT_SYMBOL(radix__flush_all_mm); + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} +EXPORT_SYMBOL(radix__flush_tlb_pwc); + +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); +#endif + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + _tlbie_pid(0, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +#define TLB_FLUSH_ALL -1UL + +/* + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid flushes. 
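+ *
+ * With the defaults below, tlbie falls back to a full-PID flush above
+ * 33 pages, while the local tlbiel variant only does so above
+ * POWER9_TLB_SETS_RADIX * 2 (i.e. 256) pages.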
+ */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; + +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) + +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } + } else { + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; + unsigned long hstart, hend; + unsigned long gstart, gend; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; + } + + asm volatile("ptesync": : :"memory"); + if (local) { + __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbiel_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbie_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + } + preempt_enable(); +} + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + +static int radix_get_mmu_psize(int page_size) +{ + int psize; + + if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) + psize = mmu_virtual_psize; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) + psize = MMU_PAGE_2M; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) + psize = MMU_PAGE_1G; + else + return -1; + return psize; +} + +/* + * Flush partition scoped LPID address translation for all CPUs. + */ +void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + int psize = radix_get_mmu_psize(page_size); + + _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); + +/* + * Flush partition scoped PWC from LPID for all CPUs. 
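+ * RIC_FLUSH_PWC (RIC=1) invalidates cached page-walk entries without
+ * touching the TLB entries themselves.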
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR) on the local CPU
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * An important difference: the guest normally manages its own translations,
+ * but some cases, e.g. vCPU migration, require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+ _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize);
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+ int psize = 0;
+ struct mm_struct *mm = tlb->mm;
+ int page_size = tlb->page_size;
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
+
+ /*
+ * If the page size is not something we understand, do a full mm flush.
+ *
+ * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+ * that flushes the process table entry cache upon process teardown.
+ * See the comment for radix in arch_exit_mmap().
+ */
+ if (tlb->fullmm) {
+ __flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+ } else if (mm_tlb_flush_nested(mm)) {
+ /*
+ * If there is a concurrent invalidation that is clearing ptes,
+ * then it's possible this invalidation will miss one of those
+ * cleared ptes and miss flushing the TLB. If this invalidate
+ * returns before the other one flushes TLBs, that can result
+ * in it returning while there are still valid TLBs inside the
+ * range to be invalidated.
+ *
+ * See mm/memory.c:tlb_finish_mmu() for more details.
+ *
+ * The solution to this is to ensure the entire range is always
+ * flushed here. The problem for powerpc is that the flushes
+ * are page size specific, so this "forced flush" would not
+ * do the right thing if there is a mix of page sizes in
+ * the range to be invalidated. So use __flush_tlb_range
+ * which invalidates all possible page sizes in the range.
+ *
+ * A PWC flush is probably not required because the core code
+ * shouldn't free page tables in this path, but accounting
+ * for the possibility makes us a bit more robust.
+ *
+ * need_flush_all is an uncommon case because page table
+ * teardown should be done with exclusive locks held (but
+ * after locks are dropped another invalidate could come
+ * in), it could be optimized further if necessary.
+ */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->need_flush_all) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + if (!tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } + tlb->need_flush_all = 0; +} + +static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } + } else { + if (local) + _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + } + preempt_enable(); +} + +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + return __radix__flush_tlb_range_psize(mm, start, end, psize, false); +} + +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + __radix__flush_tlb_range_psize(mm, start, end, psize, true); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + unsigned long pid, end; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + end = addr + HPAGE_PMD_SIZE; + + /* Otherwise first do the PWC, then iterate the pages. 
*/
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else {
+local:
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ }
+
+ preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+ unsigned long rb,prs,r,rs;
+ unsigned long ric = RIC_FLUSH_ALL;
+
+ rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+ asm volatile("ptesync": : :"memory");
+ /*
+ * now flush guest entries by passing PRS = 1 and LPID != 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+ /*
+ * now flush host entries by passing PRS = 0 and LPID == 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+ unsigned long pid = mm->context.id;
+
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ /*
+ * If this context hasn't run on that CPU before and KVM is
+ * around, there's a slim chance that the guest on another
+ * CPU just brought an obsolete translation into the TLB of
+ * this CPU due to a bad prefetch using the guest PID on
+ * the way into the hypervisor.
+ *
+ * We work around this here. If KVM is possible, we check if
+ * any sibling thread is in KVM. If it is, the window may exist
+ * and thus we flush that PID from the core.
+ *
+ * A potential future improvement would be to mark which PIDs
+ * have never been used on the system and avoid it if the PID
+ * is new and the process has no other cpumask bit set.
+ */
+ if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+ int cpu = smp_processor_id();
+ int sib = cpu_first_thread_sibling(cpu);
+ bool flush = false;
+
+ for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+ if (sib == cpu)
+ continue;
+ if (!cpu_possible(sib))
+ continue;
+ if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
+ flush = true;
+ }
+ if (flush)
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ }
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
new file mode 100644
index 000000000000..c22742218bd3
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -0,0 +1,833 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson , IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard , IBM
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +enum slb_index { + LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ + KSTACK_INDEX = 1, /* Kernel stack map */ +}; + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, + enum slb_index index) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; +} + +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +static void assert_slb_presence(bool present, unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << 28) - 1); + asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); + + WARN_ON(present == (tmp == 0)); +#endif +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + struct slb_shadow *p = get_slb_shadow(); + + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + WRITE_ONCE(p->save_area[index].esid, 0); + WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); + WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); +} + +static inline void slb_shadow_clear(enum slb_index index) +{ + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, index); + + assert_slb_presence(false, ea); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, index)) + : "memory" ); +} + +/* + * Insert bolted entries into SLB (which may not be empty, so don't clear + * slb_cache_ptr). + */ +void __slb_restore_bolted_realmode(void) +{ + struct slb_shadow *p = get_slb_shadow(); + enum slb_index index; + + /* No isync needed because realmode. */ + for (index = 0; index < SLB_NUM_BOLTED; index++) { + asm volatile("slbmte %0,%1" : + : "r" (be64_to_cpu(p->save_area[index].vsid)), + "r" (be64_to_cpu(p->save_area[index].esid))); + } + + assert_slb_presence(true, local_paca->kstack); +} + +/* + * Insert the bolted entries into an empty SLB. 
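+ * Unlike __slb_restore_bolted_realmode() above, this also resets the
+ * SLB cache pointer and the kernel/used bitmaps, since the SLB is
+ * known to be empty.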
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries; it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ asm volatile("isync\n"
+ "slbia\n"
+ "slbmte %0, %1\n"
+ "isync\n"
+ :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+ "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+ : "memory");
+ assert_slb_presence(true, get_paca()->kstack);
+
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+ int i;
+ unsigned long e, v;
+
+ /* Save slb_cache_ptr value. */
+ get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+ if (!slb_ptr)
+ return;
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+ asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+ slb_ptr->esid = e;
+ slb_ptr->vsid = v;
+ slb_ptr++;
+ }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+ int i, n;
+ unsigned long e, v;
+ unsigned long llp;
+
+ if (!slb_ptr)
+ return;
+
+ pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+ pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ e = slb_ptr->esid;
+ v = slb_ptr->vsid;
+ slb_ptr++;
+
+ if (!e && !v)
+ continue;
+
+ pr_err("%02d %016lx %016lx\n", i, e, v);
+
+ if (!(e & SLB_ESID_V)) {
+ pr_err("\n");
+ continue;
+ }
+ llp = v & SLB_VSID_LLP;
+ if (v & SLB_VSID_B_1T) {
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID_1T(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+ } else {
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+ }
+ }
+ pr_err("----------------------------------\n");
+
+ /* Dump slb cache entries as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+}
+
+void slb_vmalloc_update(void)
+{
+ /*
+ * vmalloc is not bolted, so just have to flush non-bolted.
+	 */
+	slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+	unsigned char i;
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		if (esid == ti->slb_preload_esid[idx])
+			return true;
+	}
+	return false;
+}
+
+static bool preload_add(struct thread_info *ti, unsigned long ea)
+{
+	unsigned char idx;
+	unsigned long esid;
+
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+		/* EAs are stored >> 28 so 256MB segments don't need clearing */
+		if (ea & ESID_MASK_1T)
+			ea &= ESID_MASK_1T;
+	}
+
+	esid = ea >> SID_SHIFT;
+
+	if (preload_hit(ti, esid))
+		return false;
+
+	idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+	ti->slb_preload_esid[idx] = esid;
+	if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+		ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+	else
+		ti->slb_preload_nr++;
+
+	return true;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+	if (!ti->slb_preload_nr)
+		return;
+	ti->slb_preload_nr--;
+	ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+void slb_setup_new_exec(void)
+{
+	struct thread_info *ti = current_thread_info();
+	struct mm_struct *mm = current->mm;
+	unsigned long exec = 0x10000000;
+
+	WARN_ON(irqs_disabled());
+
+	/*
+	 * preload cache can only be used to determine whether an SLB
+	 * entry exists if it does not start to overflow.
+	 */
+	if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+		return;
+
+	hard_irq_disable();
+
+	/*
+	 * We have no good place to clear the slb preload cache on exec,
+	 * flush_thread is about the earliest arch hook but that happens
+	 * after we switch to the mm and have already preloaded the SLBEs.
+	 *
+	 * For the most part that's probably okay to use entries from the
+	 * previous exec, they will age out if unused. It may turn out to
+	 * be an advantage to clear the cache before switching to it,
+	 * however.
+	 */
+
+	/*
+	 * preload some userspace segments into the SLB.
+	 * Almost all 32 and 64bit PowerPC executables are linked at
+	 * 0x10000000 so it makes sense to preload this segment.
+	 */
+	if (!is_kernel_addr(exec)) {
+		if (preload_add(ti, exec))
+			slb_allocate_user(mm, exec);
+	}
+
+	/* Libraries and mmaps. */
+	if (!is_kernel_addr(mm->mmap_base)) {
+		if (preload_add(ti, mm->mmap_base))
+			slb_allocate_user(mm, mm->mmap_base);
+	}
+
+	/* see switch_slb */
+	asm volatile("isync" : : : "memory");
+
+	local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+	struct thread_info *ti = current_thread_info();
+	struct mm_struct *mm = current->mm;
+	unsigned long heap = mm->start_brk;
+
+	WARN_ON(irqs_disabled());
+
+	/* see above */
+	if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+		return;
+
+	hard_irq_disable();
+
+	/* Userspace entry address. */
+	if (!is_kernel_addr(start)) {
+		if (preload_add(ti, start))
+			slb_allocate_user(mm, start);
+	}
+
+	/* Top of stack, grows down. */
+	if (!is_kernel_addr(sp)) {
+		if (preload_add(ti, sp))
+			slb_allocate_user(mm, sp);
+	}
+
+	/* Bottom of heap, grows up. */
+	if (heap && !is_kernel_addr(heap)) {
+		if (preload_add(ti, heap))
+			slb_allocate_user(mm, heap);
+	}
+
+	/* see switch_slb */
+	asm volatile("isync" : : : "memory");
+
+	local_irq_enable();
+}
+
+
+/* Flush all user entries from the segment table of the current processor.
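+ *
+ * (Hedged illustration, not part of the patch: the preload cache above is
+ * a small FIFO of ESIDs. Assuming SLB_PRELOAD_NR is 16, adding a 17th
+ * distinct ESID lands on the oldest slot and advances the tail:
+ *
+ *	idx = (tail + nr) % SLB_PRELOAD_NR;	wraps onto the oldest slot
+ *	esid[idx] = new;
+ *	tail = (tail + 1) % SLB_PRELOAD_NR;	nr stays at the maximum
+ *
+ * and preload_age() drops the oldest entry by just bumping the tail.)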
+ */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct thread_info *ti = task_thread_info(tsk);
+	unsigned char i;
+
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	asm volatile("isync" : : : "memory");
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/*
+		 * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+		 * associated lookaside structures, which matches what
+		 * switch_slb wants. So ARCH_300 does not use the slb
+		 * cache.
+		 */
+		asm volatile(PPC_SLBIA(3));
+	} else {
+		unsigned long offset = get_paca()->slb_cache_ptr;
+
+		if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+		    offset <= SLB_CACHE_ENTRIES) {
+			unsigned long slbie_data = 0;
+
+			for (i = 0; i < offset; i++) {
+				unsigned long ea;
+
+				ea = (unsigned long)
+					get_paca()->slb_cache[i] << SID_SHIFT;
+				/*
+				 * Could assert_slb_presence(true) here, but
+				 * hypervisor or machine check could have come
+				 * in and removed the entry at this point.
+				 */
+
+				slbie_data = ea;
+				slbie_data |= user_segment_size(slbie_data)
+						<< SLBIE_SSIZE_SHIFT;
+				slbie_data |= SLBIE_C; /* user slbs have C=1 */
+				asm volatile("slbie %0" : : "r" (slbie_data));
+			}
+
+			/* Workaround POWER5 < DD2.1 issue */
+			if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+				asm volatile("slbie %0" : : "r" (slbie_data));
+
+		} else {
+			struct slb_shadow *p = get_slb_shadow();
+			unsigned long ksp_esid_data =
+				be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+			unsigned long ksp_vsid_data =
+				be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+			asm volatile(PPC_SLBIA(1) "\n"
+				     "slbmte %0,%1\n"
+				     "isync"
+				     :: "r"(ksp_vsid_data),
+					"r"(ksp_esid_data));
+
+			get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+		}
+
+		get_paca()->slb_cache_ptr = 0;
+	}
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+	copy_mm_to_paca(mm);
+
+	/*
+	 * We gradually age out SLBs after a number of context switches to
+	 * reduce reload overhead of unused entries (like we do with FP/VEC
+	 * reload). Each time we wrap 256 switches, take an entry out of the
+	 * SLB preload cache.
+	 */
+	tsk->thread.load_slb++;
+	if (!tsk->thread.load_slb) {
+		unsigned long pc = KSTK_EIP(tsk);
+
+		preload_age(ti);
+		preload_add(ti, pc);
+	}
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+		unsigned long ea;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+		slb_allocate_user(mm, ea);
+	}
+
+	/*
+	 * Synchronize slbmte preloads with possible subsequent user memory
+	 * address accesses by the kernel (user mode won't happen until
+	 * rfid, which is safe).
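+	 *
+	 * (Illustrative recap, hedged: the slbie operand assembled in the
+	 * cache loop above is, roughly,
+	 *
+	 *	slbie_data  = cached_ea << SID_SHIFT;
+	 *	slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+	 *	slbie_data |= SLBIE_C;		user entries have Class=1
+	 *
+	 * which is why only user SLBEs are invalidated by that path.)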
+	 */
+	asm volatile("isync" : : : "memory");
+}
+
+void slb_set_size(u16 size)
+{
+	mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+	unsigned long linear_llp, vmalloc_llp, io_llp;
+	unsigned long lflags;
+	static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	unsigned long vmemmap_llp;
+#endif
+
+	/* Prepare our SLB miss handler based on our page size */
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+	get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	if (!slb_encoding_inited) {
+		slb_encoding_inited = 1;
+		pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
+		pr_devel("SLB: io LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+		pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+	}
+
+	get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+	lflags = SLB_VSID_KERNEL | linear_llp;
+
+	/* Invalidate the entire SLB (even entry 0) & all the ERATS */
+	asm volatile("isync":::"memory");
+	asm volatile("slbmte %0,%0"::"r" (0) : "memory");
+	asm volatile("isync; slbia; isync":::"memory");
+	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+	/*
+	 * For the boot cpu, we're running on the stack in init_thread_union,
+	 * which is in the first segment of the linear mapping, and also
+	 * get_paca()->kstack hasn't been initialized yet.
+	 * For secondary cpus, we need to bolt the kernel stack entry now.
+	 */
+	slb_shadow_clear(KSTACK_INDEX);
+	if (raw_smp_processor_id() != boot_cpuid &&
+	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+		create_shadowed_slbe(get_paca()->kstack,
+				     mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+	asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+	int slb_cache_index;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return; /* ISAv3.0B and later does not use slb_cache */
+
+	/*
+	 * Now update slb cache entries
+	 */
+	slb_cache_index = local_paca->slb_cache_ptr;
+	if (slb_cache_index < SLB_CACHE_ENTRIES) {
+		/*
+		 * We have space in slb cache for optimized switch_slb().
+		 * Top 36 bits from esid_data as per ISA
+		 */
+		local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+		local_paca->slb_cache_ptr++;
+	} else {
+		/*
+		 * Our cache is full and the current cache content strictly
+		 * doesn't indicate the active SLB contents. Bump the ptr
+		 * so that switch_slb() will ignore the cache.
+		 */
+		local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+	}
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+	enum slb_index index;
+
+	/*
+	 * The allocation bitmaps can become out of sync with the SLB
+	 * when the _switch code does slbie when bolting a new stack
+	 * segment and it must not be anywhere else in the SLB. This leaves
+	 * a kernel allocated entry that is unused in the SLB. With very
+	 * large systems or small segment sizes, the bitmaps could slowly
+	 * fill with these entries. They will eventually be cleared out
+	 * by the round robin allocator in that case, so it's probably not
+	 * worth accounting for.
+	 */
+
+	/*
+	 * SLBs beyond 32 entries are allocated with stab_rr only.
+	 * POWER7/8/9 have 32 SLB entries, this could be expanded if a
+	 * future CPU has more.
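+	 *
+	 * (Worked example, illustrative only: with slb_used_bitmap = 0x2f,
+	 * ffz() below returns 4, slot 4 is handed out and the bitmap
+	 * becomes 0x3f; once all bits are set, allocation falls back to
+	 * the stab_rr round robin, which wraps to SLB_NUM_BOLTED rather
+	 * than to slot 0, so the bolted entries are never victimized.)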
+	 */
+	if (local_paca->slb_used_bitmap != U32_MAX) {
+		index = ffz(local_paca->slb_used_bitmap);
+		local_paca->slb_used_bitmap |= 1U << index;
+		if (kernel)
+			local_paca->slb_kern_bitmap |= 1U << index;
+	} else {
+		/* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+		index = local_paca->stab_rr;
+		if (index < (mmu_slb_size - 1))
+			index++;
+		else
+			index = SLB_NUM_BOLTED;
+		local_paca->stab_rr = index;
+		if (index < 32) {
+			if (kernel)
+				local_paca->slb_kern_bitmap |= 1U << index;
+			else
+				local_paca->slb_kern_bitmap &= ~(1U << index);
+		}
+	}
+	BUG_ON(index < SLB_NUM_BOLTED);
+
+	return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+				unsigned long flags, int ssize, bool kernel)
+{
+	unsigned long vsid;
+	unsigned long vsid_data, esid_data;
+	enum slb_index index;
+
+	vsid = get_vsid(context, ea, ssize);
+	if (!vsid)
+		return -EFAULT;
+
+	/*
+	 * There must not be a kernel SLB fault in alloc_slb_index or before
+	 * slbmte here or the allocation bitmaps could get out of whack with
+	 * the SLB.
+	 *
+	 * User SLB faults or preloads take this path which might get inlined
+	 * into the caller, so add compiler barriers here to ensure unsafe
+	 * memory accesses do not come between.
+	 */
+	barrier();
+
+	index = alloc_slb_index(kernel);
+
+	vsid_data = __mk_vsid_data(vsid, ssize, flags);
+	esid_data = mk_esid_data(ea, ssize, index);
+
+	/*
+	 * No need for an isync before or after this slbmte. The exception
+	 * we enter with and the rfid we exit with are context synchronizing.
+	 * User preloads should add isync afterwards in case the kernel
+	 * accesses user memory before it returns to userspace with rfid.
+	 */
+	assert_slb_presence(false, ea);
+	asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+	barrier();
+
+	if (!kernel)
+		slb_cache_update(esid_data);
+
+	return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+	unsigned long context;
+	unsigned long flags;
+	int ssize;
+
+	if (id == LINEAR_MAP_REGION_ID) {
+
+		/* We only support up to MAX_PHYSMEM_BITS */
+		if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	} else if (id == VMEMMAP_REGION_ID) {
+
+		if (ea >= H_VMEMMAP_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	} else if (id == VMALLOC_REGION_ID) {
+
+		if (ea >= H_VMALLOC_END)
+			return -EFAULT;
+
+		flags = local_paca->vmalloc_sllp;
+
+	} else if (id == IO_REGION_ID) {
+
+		if (ea >= H_KERN_IO_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+	} else {
+		return -EFAULT;
+	}
+
+	ssize = MMU_SEGSIZE_1T;
+	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		ssize = MMU_SEGSIZE_256M;
+
+	context = get_kernel_context(ea);
+
+	return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+	unsigned long context;
+	unsigned long flags;
+	int bpsize;
+	int ssize;
+
+	/*
+	 * Consider this a bad access if we take an SLB miss
+	 * on an address above the addr limit.
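+	 *
+	 * (Hedged note: mm_ctx_slb_addr_limit() reflects the task's address
+	 * space limit; a miss above it can never correspond to a valid
+	 * user mapping, so we fail early with -EFAULT instead of inserting
+	 * an SLB entry for it.)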
+	 */
+	if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+		return -EFAULT;
+
+	context = get_user_context(&mm->context, ea);
+	if (!context)
+		return -EFAULT;
+
+	if (unlikely(ea >= H_PGTABLE_RANGE)) {
+		WARN_ON(1);
+		return -EFAULT;
+	}
+
+	ssize = user_segment_size(ea);
+
+	bpsize = get_slice_psize(mm, ea);
+	flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+	return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+{
+	unsigned long id = get_region_id(ea);
+
+	/* IRQs are not reconciled here, so can't check irqs_disabled */
+	VM_WARN_ON(mfmsr() & MSR_EE);
+
+	if (unlikely(!(regs->msr & MSR_RI)))
+		return -EINVAL;
+
+	/*
+	 * SLB kernel faults must be very careful not to touch anything
+	 * that is not bolted. E.g., PACA and global variables are okay,
+	 * mm->context stuff is not.
+	 *
+	 * SLB user faults can access all of kernel memory, but must be
+	 * careful not to touch things like IRQ state because it is not
+	 * "reconciled" here. The difficulty is that we must use
+	 * fast_exception_return to return from kernel SLB faults without
+	 * looking at possible non-bolted memory. We could test user vs
+	 * kernel faults in the interrupt handler asm and do a full fault,
+	 * reconcile, ret_from_except for user faults which would make them
+	 * first class kernel code. But for performance it's probably nicer
+	 * if they go via fast_exception_return too.
+	 */
+	if (id >= LINEAR_MAP_REGION_ID) {
+		long err;
+#ifdef CONFIG_DEBUG_VM
+		/* Catch recursive kernel SLB faults. */
+		BUG_ON(local_paca->in_kernel_slb_handler);
+		local_paca->in_kernel_slb_handler = 1;
+#endif
+		err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+		local_paca->in_kernel_slb_handler = 0;
+#endif
+		return err;
+	} else {
+		struct mm_struct *mm = current->mm;
+		long err;
+
+		if (unlikely(!mm))
+			return -EFAULT;
+
+		err = slb_allocate_user(mm, ea);
+		if (!err)
+			preload_add(current_thread_info(), ea);
+
+		return err;
+	}
+}
+
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+{
+	if (err == -EFAULT) {
+		if (user_mode(regs))
+			_exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+		else
+			bad_page_fault(regs, ea, SIGSEGV);
+	} else if (err == -EINVAL) {
+		unrecoverable_exception(regs);
+	} else {
+		BUG();
+	}
+}
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
new file mode 100644
index 000000000000..473dd430e306
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2007-2008 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+
+/*
+ * Free all pages allocated for subpage protection maps and pointers.
+ * Also makes sure that the subpage_prot_table structure is
+ * reinitialized for the next user.
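+ *
+ * (Hedged sketch of the layout being torn down below, not part of the
+ * patch:
+ *
+ *	ea below 4GB:	spt->low_prot[ea >> 30]			one leaf page per GB
+ *	otherwise:	spt->protptrs[ea >> SBP_L3_SHIFT]	page of u32 pointers
+ *	second level:	[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]
+ *	leaf:		one u32 of subpage bits per 64k page
+ *
+ * so the loops free the leaf pages first, then the pointer pages.)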
+ */
+void subpage_prot_free(struct mm_struct *mm)
+{
+	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+	unsigned long i, j, addr;
+	u32 **p;
+
+	if (!spt)
+		return;
+
+	for (i = 0; i < 4; ++i) {
+		if (spt->low_prot[i]) {
+			free_page((unsigned long)spt->low_prot[i]);
+			spt->low_prot[i] = NULL;
+		}
+	}
+	addr = 0;
+	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
+		p = spt->protptrs[i];
+		if (!p)
+			continue;
+		spt->protptrs[i] = NULL;
+		for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
+		     ++j, addr += PAGE_SIZE)
+			if (p[j])
+				free_page((unsigned long)p[j]);
+		free_page((unsigned long)p);
+	}
+	spt->maxaddr = 0;
+	kfree(spt);
+}
+
+static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
+			     int npages)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return;
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return;
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return;
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
+	for (; npages > 0; --npages) {
+		pte_update(mm, addr, pte, 0, 0, 0);
+		addr += PAGE_SIZE;
+		++pte;
+	}
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Clear the subpage protection map for an address range, allowing
+ * all accesses that are allowed by the pte permissions.
+ */
+static void subpage_prot_clear(unsigned long addr, unsigned long len)
+{
+	struct mm_struct *mm = current->mm;
+	struct subpage_prot_table *spt;
+	u32 **spm, *spp;
+	unsigned long i;
+	size_t nw;
+	unsigned long next, limit;
+
+	down_write(&mm->mmap_sem);
+
+	spt = mm_ctx_subpage_prot(&mm->context);
+	if (!spt)
+		goto err_out;
+
+	limit = addr + len;
+	if (limit > spt->maxaddr)
+		limit = spt->maxaddr;
+	for (; addr < limit; addr = next) {
+		next = pmd_addr_end(addr, limit);
+		if (addr < 0x100000000UL) {
+			spm = spt->low_prot;
+		} else {
+			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+			if (!spm)
+				continue;
+		}
+		spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+		if (!spp)
+			continue;
+		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+		nw = PTRS_PER_PTE - i;
+		if (addr + (nw << PAGE_SHIFT) > next)
+			nw = (next - addr) >> PAGE_SHIFT;
+
+		memset(spp, 0, nw * sizeof(u32));
+
+		/* now flush any existing HPTEs for the range */
+		hpte_flush_range(mm, addr, nw);
+	}
+
+err_out:
+	up_write(&mm->mmap_sem);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	split_huge_pmd(vma, pmd, addr);
+	return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	struct vm_area_struct *vma;
+	struct mm_walk subpage_proto_walk = {
+		.mm = mm,
+		.pmd_entry = subpage_walk_pmd_entry,
+	};
+
+	/*
+	 * We don't try too hard, we just mark all the vma in that range
+	 * VM_NOHUGEPAGE and split them.
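+	 *
+	 * (Design note, hedged: subpage protection is applied at 4k
+	 * granularity, so any transparent huge page overlapping the range
+	 * must be split back to base pages first; VM_NOHUGEPAGE also keeps
+	 * khugepaged from re-collapsing the region afterwards.)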
+	 */
+	vma = find_vma(mm, addr);
+	/*
+	 * If the range is entirely unmapped, just return.
+	 */
+	if (vma && ((addr + len) <= vma->vm_start))
+		return;
+
+	while (vma) {
+		if (vma->vm_start >= (addr + len))
+			break;
+		vma->vm_flags |= VM_NOHUGEPAGE;
+		walk_page_vma(vma, &subpage_proto_walk);
+		vma = vma->vm_next;
+	}
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	return;
+}
+#endif
+
+/*
+ * Copy in a subpage protection map for an address range.
+ * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
+ * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
+ * 2 or 3 to prevent all accesses.
+ * Note that the normal page protections also apply; the subpage
+ * protection mechanism is an additional constraint, so putting 0
+ * in a 2-bit field won't allow writes to a page that is otherwise
+ * write-protected.
+ */
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+		unsigned long, len, u32 __user *, map)
+{
+	struct mm_struct *mm = current->mm;
+	struct subpage_prot_table *spt;
+	u32 **spm, *spp;
+	unsigned long i;
+	size_t nw;
+	unsigned long next, limit;
+	int err;
+
+	if (radix_enabled())
+		return -ENOENT;
+
+	/* Check parameters */
+	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
+	    addr >= mm->task_size || len >= mm->task_size ||
+	    addr + len > mm->task_size)
+		return -EINVAL;
+
+	if (is_hugepage_only_range(mm, addr, len))
+		return -EINVAL;
+
+	if (!map) {
+		/* Clear out the protection map for the address range */
+		subpage_prot_clear(addr, len);
+		return 0;
+	}
+
+	if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
+		return -EFAULT;
+
+	down_write(&mm->mmap_sem);
+
+	spt = mm_ctx_subpage_prot(&mm->context);
+	if (!spt) {
+		/*
+		 * Allocate subpage prot table if not already done.
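+		 *
+		 * (A hedged userspace sketch of this interface, for
+		 * illustration only and assuming a 64k-page kernel:
+		 * making one 4k subpage of a 64k page read-only looks
+		 * roughly like
+		 *
+		 *	u32 map = 1;	a 2-bit field of 1 means no writes
+		 *	syscall(__NR_subpage_prot, addr, 0x10000, &map);
+		 *
+		 * with addr 64k-aligned; passing a NULL map clears the
+		 * protections again, as handled earlier in this function.)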
+		 * Do this with mmap_sem held.
+		 */
+		spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
+		if (!spt) {
+			err = -ENOMEM;
+			goto out;
+		}
+		mm->context.hash_context->spt = spt;
+	}
+
+	subpage_mark_vma_nohuge(mm, addr, len);
+	for (limit = addr + len; addr < limit; addr = next) {
+		next = pmd_addr_end(addr, limit);
+		err = -ENOMEM;
+		if (addr < 0x100000000UL) {
+			spm = spt->low_prot;
+		} else {
+			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+			if (!spm) {
+				spm = (u32 **)get_zeroed_page(GFP_KERNEL);
+				if (!spm)
+					goto out;
+				spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
+			}
+		}
+		spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
+		spp = *spm;
+		if (!spp) {
+			spp = (u32 *)get_zeroed_page(GFP_KERNEL);
+			if (!spp)
+				goto out;
+			*spm = spp;
+		}
+		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+		local_irq_disable();
+		demote_segment_4k(mm, addr);
+		local_irq_enable();
+
+		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+		nw = PTRS_PER_PTE - i;
+		if (addr + (nw << PAGE_SHIFT) > next)
+			nw = (next - addr) >> PAGE_SHIFT;
+
+		up_write(&mm->mmap_sem);
+		if (__copy_from_user(spp, map, nw * sizeof(u32)))
+			return -EFAULT;
+		map += nw;
+		down_write(&mm->mmap_sem);
+
+		/* now flush any existing HPTEs for the range */
+		hpte_flush_range(mm, addr, nw);
+	}
+	if (limit > spt->maxaddr)
+		spt->maxaddr = limit;
+	err = 0;
+ out:
+	up_write(&mm->mmap_sem);
+	return err;
+}
diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c
new file mode 100644
index 000000000000..0ee7734afb50
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/vphn.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/byteorder.h>
+#include "vphn.h"
+
+/*
+ * The associativity domain numbers are returned from the hypervisor as a
+ * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
+ * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
+ * bytes.
+ *
+ *    --- 16-bit fields -->
+ *  _________________________
+ *  |  0  |  1  |  2  |  3  |   be_packed[0]
+ *  ------+-----+-----+------
+ *  _________________________
+ *  |  4  |  5  |  6  |  7  |   be_packed[1]
+ *  -------------------------
+ *            ...
+ *  _________________________
+ *  | 20  | 21  | 22  | 23  |   be_packed[5]
+ *  -------------------------
+ *
+ * Convert to the sequence they would appear in the ibm,associativity property.
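+ *
+ * (Worked example, illustrative only: a stream of 16-bit fields
+ *	0x8001, 0x8002, 0x0003, 0x0004, 0xffff
+ * unpacks to unpacked[1] = 1, unpacked[2] = 2, unpacked[3] = 0x30004 and
+ * unpacked[0] = 3, the domain count: an MSB-set field carries a 15-bit
+ * value on its own, an MSB-clear field is the upper half of a 32-bit
+ * value completed by the following field, and 0xffff terminates the list.)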
+ */
+int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+	__be64 be_packed[VPHN_REGISTER_COUNT];
+	int i, nr_assoc_doms = 0;
+	const __be16 *field = (const __be16 *) be_packed;
+	u16 last = 0;
+	bool is_32bit = false;
+
+#define VPHN_FIELD_UNUSED	(0xffff)
+#define VPHN_FIELD_MSB		(0x8000)
+#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)
+
+	/* Let's fix the values returned by plpar_hcall9() */
+	for (i = 0; i < VPHN_REGISTER_COUNT; i++)
+		be_packed[i] = cpu_to_be64(packed[i]);
+
+	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+		u16 new = be16_to_cpup(field++);
+
+		if (is_32bit) {
+			/*
+			 * Let's concatenate the 16 bits of this field to the
+			 * 15 lower bits of the previous field
+			 */
+			unpacked[++nr_assoc_doms] =
+				cpu_to_be32(last << 16 | new);
+			is_32bit = false;
+		} else if (new == VPHN_FIELD_UNUSED)
+			/* This is the list terminator */
+			break;
+		else if (new & VPHN_FIELD_MSB) {
+			/* Data is in the lower 15 bits of this field */
+			unpacked[++nr_assoc_doms] =
+				cpu_to_be32(new & VPHN_FIELD_MASK);
+		} else {
+			/*
+			 * Data is in the lower 15 bits of this field
+			 * concatenated with the next 16 bit field
+			 */
+			last = new;
+			is_32bit = true;
+		}
+	}
+
+	/* The first cell contains the length of the property */
+	unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+	return nr_assoc_doms;
+}
diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h
new file mode 100644
index 000000000000..f0b93c2dd578
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/vphn.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_POWERPC_MM_VPHN_H_
+#define _ARCH_POWERPC_MM_VPHN_H_
+
+/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */
+#define VPHN_REGISTER_COUNT 6
+
+/*
+ * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
+ * form the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
+
+extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
+
+#endif
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
deleted file mode 100644
index 6fa6765a10eb..000000000000
--- a/arch/powerpc/mm/hash64_4k.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * - */ - -#include -#include -#include - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * PP bits. _PAGE_USER is already PP bit 0x2, so we only - * need to add in 0x1 if it's a read-only user page - */ - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* - * There MIGHT be an HPTE for this pte - */ - unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, - rpte, 0); - - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, - MMU_PAGE_4K, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_4K, - MMU_PAGE_4K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c deleted file mode 100644 index 3afa253d7f52..000000000000 --- a/arch/powerpc/mm/hash64_64k.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -#include -#include -#include - -/* - * Return true, if the entry has a slot value which - * the software considers as invalid. - */ -static inline bool hpte_soft_invalid(unsigned long hidx) -{ - return ((hidx & 0xfUL) == 0xfUL); -} - -/* - * index from 0 - 15 - */ -bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) -{ - return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); -} - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned int subpg_index; - unsigned long rflags, pa; - unsigned long old_pte, new_pte, subpg_pte; - unsigned long vpn, hash, slot, gslot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * Handle the subpage protection bits - */ - subpg_pte = new_pte & ~subpg_prot; - rflags = htab_convert_pte_flags(subpg_pte); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } - - subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; - vpn = hpt_vpn(ea, vsid, ssize); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - /* - *None of the sub 4k page is hashed - */ - if (!(old_pte & H_PAGE_HASHPTE)) - goto htab_insert_hpte; - /* - * Check if the pte was already inserted into the hash table - * as a 64k HW page, and invalidate the 64k HPTE if so. - */ - if (!(old_pte & H_PAGE_COMBO)) { - flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); - /* - * clear the old slot details from the old and new pte. 
- * On hash insert failure we use old pte value and we don't - * want slot information there if we have a insert failure. - */ - old_pte &= ~H_PAGE_HASHPTE; - new_pte &= ~H_PAGE_HASHPTE; - goto htab_insert_hpte; - } - /* - * Check for sub page valid and update - */ - if (__rpte_sub_valid(rpte, subpg_index)) { - int ret; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, - subpg_index); - ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, flags); - - /* - * If we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - if (ret == -1) - goto htab_insert_hpte; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; - } - -htab_insert_hpte: - - /* - * Initialize all hidx entries to invalid value, the first time - * the PTE is about to allocate a 4K HPTE. - */ - if (!(old_pte & H_PAGE_COMBO)) - rpte.hidx = INVALID_RPTE_HIDX; - - /* - * handle H_PAGE_4K_PFN case - */ - if (old_pte & H_PAGE_4K_PFN) { - /* - * All the sub 4k page have the same - * physical address. - */ - pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; - } else { - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - pa += (subpg_index << shift); - } - hash = hpt_hash(vpn, shift, ssize); -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - bool soft_invalid; - - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize); - - soft_invalid = hpte_soft_invalid(slot); - if (unlikely(soft_invalid)) { - /* - * We got a valid slot from a hardware point of view. - * but we cannot use it, because we use this special - * value; as defined by hpte_soft_invalid(), to track - * invalid slots. We cannot use it. So invalidate it. - */ - gslot = slot & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, 0); - } - - if (unlikely(slot == -1 || soft_invalid)) { - /* - * For soft invalid slot, let's ensure that we release a - * slot from the primary, with the hope that we will - * acquire that slot next time we try. This will ensure - * that we do not get the same soft-invalid slot. - */ - if (soft_invalid || (mftb() & 0x1)) - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); - new_pte |= H_PAGE_HASHPTE; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -int __hash_page_64K(unsigned long ea, unsigned long access, - unsigned long vsid, pte_t *ptep, unsigned long trap, - unsigned long flags, int ssize) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Check if PTE has the cache-inhibit bit set - * If so, bail out and refault as a 4k page - */ - if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(pte_ci(pte))) - return 0; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - unsigned long gslot; - - /* - * There MIGHT be an HPTE for this pte - */ - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, - MMU_PAGE_64K, ssize, - flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_64K, MMU_PAGE_64K, - ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_64K, - MMU_PAGE_64K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_64K, MMU_PAGE_64K, old_pte); - return -1; - } - - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c deleted file mode 100644 index aaa28fd918fe..000000000000 --- a/arch/powerpc/mm/hash_native_64.c +++ /dev/null @@ -1,884 +0,0 @@ -/* - * native hashtable management. - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG_LOW - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#ifdef __BIG_ENDIAN__ -#define HPTE_LOCK_BIT 3 -#else -#define HPTE_LOCK_BIT (56+3) -#endif - -DEFINE_RAW_SPINLOCK(native_tlbie_lock); - -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} - -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) - : "memory"); -} - - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - asm volatile("ptesync": : :"memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - /* - * Now invalidate the process table cache. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) 
- */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - asm volatile("ptesync": : :"memory"); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -void hash__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); -} - -static inline unsigned long ___tlbie(unsigned long vpn, int psize, - int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* - * We need 14 to 65 bits of va for a tlibe of 4K page - * With vpn we ignore the lower VPN_SHIFT bits already. - * And top two bits are already ignored because we can - * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT - * of 12. - */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. - */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after (52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); /* AVAL */ - va |= 1; /* L */ - asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - return va; -} - -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - /* Need the extra ptesync to ensure we don't reorder tlbie*/ - asm volatile("ptesync": : :"memory"); - ___tlbie(vpn, psize, apsize, ssize); - } -} - -static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long rb; - - rb = ___tlbie(vpn, psize, apsize, ssize); - trace_tlbie(0, 0, rb, 0, 0, 0, 0); -} - -static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* VPN_SHIFT can be atmost 12 */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64 bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. 
- */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after(52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); - va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - trace_tlbie(0, 1, va, 0, 0, 0, 0); - -} - -static inline void tlbie(unsigned long vpn, int psize, int apsize, - int ssize, int local) -{ - unsigned int use_local; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); - - if (use_local) - use_local = mmu_psize_defs[psize].tlbiel; - if (lock_tlbie && !use_local) - raw_spin_lock(&native_tlbie_lock); - asm volatile("ptesync": : :"memory"); - if (use_local) { - __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - if (lock_tlbie && !use_local) - raw_spin_unlock(&native_tlbie_lock); -} - -static inline void native_lock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - while (1) { - if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) - break; - spin_begin(); - while(test_bit(HPTE_LOCK_BIT, word)) - spin_cpu_relax(); - spin_end(); - } -} - -static inline void native_unlock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - clear_bit_unlock(HPTE_LOCK_BIT, word); -} - -static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, - unsigned long pa, unsigned long rflags, - unsigned long vflags, int psize, int apsize, int ssize) -{ - struct hash_pte *hptep = htab_address + hpte_group; - unsigned long hpte_v, hpte_r; - int i; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," - " rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, vpn, pa, rflags, vflags, psize); - } - - for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { - /* retry with lock held */ - native_lock_hpte(hptep); - if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) - break; - native_unlock_hpte(hptep); - } - - hptep++; - } - - if (i == HPTES_PER_GROUP) - return -1; - - hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", - i, hpte_v, hpte_r); - } - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); - hpte_v = hpte_old_to_new_v(hpte_v); - } - - hptep->r = cpu_to_be64(hpte_r); - /* Guarantee the second dword is visible before the valid bit */ - eieio(); - /* - * Now set the first dword including the valid bit - * NOTE: this also unlocks the hpte - */ - hptep->v = cpu_to_be64(hpte_v); - - __asm__ __volatile__ ("ptesync" : : : "memory"); - - return i | (!!(vflags & HPTE_V_SECONDARY) << 3); -} - -static long native_hpte_remove(unsigned long hpte_group) -{ - struct hash_pte *hptep; - int i; - int slot_offset; - unsigned long hpte_v; - - DBG_LOW(" remove(group=%lx)\n", hpte_group); - - /* pick a random entry to start at */ - slot_offset = mftb() & 0x7; - - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + hpte_group + slot_offset; - hpte_v = be64_to_cpu(hptep->v); - - if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { - /* retry with lock held */ - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if ((hpte_v & HPTE_V_VALID) - && !(hpte_v & HPTE_V_BOLTED)) - break; - native_unlock_hpte(hptep); - } - - slot_offset++; - slot_offset &= 0x7; - } - - if (i == HPTES_PER_GROUP) - return -1; - - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - - return i; -} - -static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long vpn, int bpsize, - int apsize, int ssize, unsigned long flags) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v, want_v; - int ret = 0, local = 0; - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - - DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", - vpn, want_v & HPTE_V_AVPN, slot, newpp); - - hpte_v = hpte_get_old_v(hptep); - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". 
- */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { - DBG_LOW(" -> miss\n"); - ret = -1; - } else { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID))) { - ret = -1; - } else { - DBG_LOW(" -> hit\n"); - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N | - HPTE_R_C))); - } - native_unlock_hpte(hptep); - } - - if (flags & HPTE_LOCAL_UPDATE) - local = 1; - /* - * Ensure it is out of the tlb too if it is not a nohpte fault - */ - if (!(flags & HPTE_NOHPTE_UPDATE)) - tlbie(vpn, bpsize, apsize, ssize, local); - - return ret; -} - -static long native_hpte_find(unsigned long vpn, int psize, int ssize) -{ - struct hash_pte *hptep; - unsigned long hash; - unsigned long i; - long slot; - unsigned long want_v, hpte_v; - - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - - hptep = htab_address + slot; - hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* HPTE matches */ - return slot; - ++slot; - } - - return -1; -} - -/* - * Update the page protection bits. Intended to be used to create - * guard pages for kernel data structures on pages which are bolted - * in the HPT. Assumes pages being operated on will not be stolen. - * - * No need to lock here because we should be the only user. - */ -static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, - int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - panic("could not find page to bolt\n"); - hptep = htab_address + slot; - - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N))); - /* - * Ensure it is out of the tlb too. Bolted entries base and - * actual page size will be same. - */ - tlbie(vpn, psize, psize, ssize, 0); -} - -/* - * Remove a bolted kernel entry. Memory hotplug uses this. - * - * No need to lock here because we should be the only user. 
- */ -static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - return -ENOENT; - - hptep = htab_address + slot; - - VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); - - /* Invalidate the hpte */ - hptep->v = 0; - - /* Invalidate the TLB */ - tlbie(vpn, psize, psize, ssize, 0); - return 0; -} - - -static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, - int bpsize, int apsize, int ssize, int local) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - - local_irq_save(flags); - - DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - else - native_unlock_hpte(hptep); - } - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". - */ - tlbie(vpn, bpsize, apsize, ssize, local); - - local_irq_restore(flags); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - int i; - struct hash_pte *hptep; - int actual_psize = MMU_PAGE_16M; - unsigned int max_hpte_count, valid; - unsigned long flags, s_addr = addr; - unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 0, hash, slot; - - shift = mmu_psize_defs[psize].shift; - max_hpte_count = 1U << (PMD_SHIFT - shift); - - local_irq_save(flags); - for (i = 0; i < max_hpte_count; i++) { - valid = hpte_valid(hpte_slot_array, i); - if (!valid) - continue; - hidx = hpte_hash_index(hpte_slot_array, i); - - /* get the vpn */ - addr = s_addr + (i * (1ul << shift)); - vpn = hpt_vpn(addr, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - /* Even if we miss, we need to invalidate the TLB */ - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* recheck with locks held */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. 
NOTE: this also unlocks it - */ - - hptep->v = 0; - } else - native_unlock_hpte(hptep); - } - /* - * We need to do tlb invalidate for all the address, tlbie - * instruction compares entry_VA in tlb with the VA specified - * here - */ - tlbie(vpn, psize, actual_psize, ssize, local); - } - local_irq_restore(flags); -} -#else -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - WARN(1, "%s called without THP support\n", __func__); -} -#endif - -static void hpte_decode(struct hash_pte *hpte, unsigned long slot, - int *psize, int *apsize, int *ssize, unsigned long *vpn) -{ - unsigned long avpn, pteg, vpi; - unsigned long hpte_v = be64_to_cpu(hpte->v); - unsigned long hpte_r = be64_to_cpu(hpte->r); - unsigned long vsid, seg_off; - int size, a_size, shift; - /* Look at the 8 bit LP value */ - unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); - hpte_r = hpte_new_to_old_r(hpte_r); - } - if (!(hpte_v & HPTE_V_LARGE)) { - size = MMU_PAGE_4K; - a_size = MMU_PAGE_4K; - } else { - size = hpte_page_sizes[lp] & 0xf; - a_size = hpte_page_sizes[lp] >> 4; - } - /* This works for all page sizes, and for 256M and 1T segments */ - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; - shift = mmu_psize_defs[size].shift; - - avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); - pteg = slot / HPTES_PER_GROUP; - if (hpte_v & HPTE_V_SECONDARY) - pteg = ~pteg; - - switch (*ssize) { - case MMU_SEGSIZE_256M: - /* We only have 28 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1f) << 23; - vsid = avpn >> 5; - /* We can find more bits from the pteg value */ - if (shift < 23) { - vpi = (vsid ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - case MMU_SEGSIZE_1T: - /* We only have 40 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1ffff) << 23; - vsid = avpn >> 17; - if (shift < 23) { - vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - default: - *vpn = size = 0; - } - *psize = size; - *apsize = a_size; -} - -/* - * clear all mappings on kexec. All cpus are in real mode (or they will - * be when they isi), and we are the only one left. We rely on our kernel - * mapping being 0xC0's and the hardware ignoring those two real bits. - * - * This must be called with interrupts disabled. - * - * Taking the native_tlbie_lock is unsafe here due to the possibility of - * lockdep being on. On pre POWER5 hardware, not taking the lock could - * cause deadlock. POWER5 and newer not taking the lock is fine. This only - * gets called during boot before secondary CPUs have come up and during - * crashdump and all bets are off anyway. - * - * TODO: add batching support when enabled. remember, no dynamic memory here, - * although there is the control page available... - */ -static void native_hpte_clear(void) -{ - unsigned long vpn = 0; - unsigned long slot, slots; - struct hash_pte *hptep = htab_address; - unsigned long hpte_v; - unsigned long pteg_count; - int psize, apsize, ssize; - - pteg_count = htab_hash_mask + 1; - - slots = pteg_count * HPTES_PER_GROUP; - - for (slot = 0; slot < slots; slot++, hptep++) { - /* - * we could lock the pte here, but we are the only cpu - * running, right? 
and for crash dump, we probably - * don't want to wait for a maybe bad cpu. - */ - hpte_v = be64_to_cpu(hptep->v); - - /* - * Call __tlbie() here rather than tlbie() since we can't take the - * native_tlbie_lock. - */ - if (hpte_v & HPTE_V_VALID) { - hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); - hptep->v = 0; - ___tlbie(vpn, psize, apsize, ssize); - } - } - - asm volatile("eieio; tlbsync; ptesync":::"memory"); -} - -/* - * Batched hash table flush, we batch the tlbie's to avoid taking/releasing - * the lock all the time - */ -static void native_flush_hash_range(unsigned long number, int local) -{ - unsigned long vpn = 0; - unsigned long hash, index, hidx, shift, slot; - struct hash_pte *hptep; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - real_pte_t pte; - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); - unsigned long psize = batch->psize; - int ssize = batch->ssize; - int i; - unsigned int use_local; - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && - mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); - - local_irq_save(flags); - - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - continue; - /* lock and try again */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - hptep->v = 0; - - } pte_iterate_hashed_end(); - } - - if (use_local) { - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbiel(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - asm volatile("ptesync":::"memory"); - } else { - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - raw_spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbie(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - /* - * Just do one more with the last used values. 
- */ - fixup_tlbie(vpn, psize, psize, ssize); - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - raw_spin_unlock(&native_tlbie_lock); - } - - local_irq_restore(flags); -} - -void __init hpte_init_native(void) -{ - mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; - mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; - mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; - mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; - mmu_hash_ops.hpte_insert = native_hpte_insert; - mmu_hash_ops.hpte_remove = native_hpte_remove; - mmu_hash_ops.hpte_clear_all = native_hpte_clear; - mmu_hash_ops.flush_hash_range = native_flush_hash_range; - mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; -} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c deleted file mode 100644 index 6eb89643ce58..000000000000 --- a/arch/powerpc/mm/hash_utils_64.c +++ /dev/null @@ -1,1930 +0,0 @@ -/* - * PowerPC64 port by Mike Corrigan and Dave Engebretsen - * {mikejc|engebret}@us.ibm.com - * - * Copyright (c) 2000 Mike Corrigan - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * Module name: htab.c - * - * Description: - * PowerPC Hashed Page Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG -#undef DEBUG_LOW - -#define pr_fmt(fmt) "hash-mmu: " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#define KB (1024) -#define MB (1024*KB) -#define GB (1024L*MB) - -/* - * Note: pte --> Linux PTE - * HPTE --> PowerPC Hashed Page Table Entry - * - * Execution context: - * htab_initialize is called with the MMU off (of course), but - * the kernel has been copied down to zero so it can directly - * reference global data. At this point it is very difficult - * to print debug info. 
- * - */ - -static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); - -u8 hpte_page_sizes[1 << LP_BITS]; -EXPORT_SYMBOL_GPL(hpte_page_sizes); - -struct hash_pte *htab_address; -unsigned long htab_size_bytes; -unsigned long htab_hash_mask; -EXPORT_SYMBOL_GPL(htab_hash_mask); -int mmu_linear_psize = MMU_PAGE_4K; -EXPORT_SYMBOL_GPL(mmu_linear_psize); -int mmu_virtual_psize = MMU_PAGE_4K; -int mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif -int mmu_io_psize = MMU_PAGE_4K; -int mmu_kernel_ssize = MMU_SEGSIZE_256M; -EXPORT_SYMBOL_GPL(mmu_kernel_ssize); -int mmu_highuser_ssize = MMU_SEGSIZE_256M; -u16 mmu_slb_size = 64; -EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_PPC_64K_PAGES -int mmu_ci_restrictions; -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ -struct mmu_hash_ops mmu_hash_ops; -EXPORT_SYMBOL(mmu_hash_ops); - -/* There are definitions of page sizes arrays to be used when none - * is provided by the firmware. - */ - -/* - * Fallback (4k pages only) - */ -static struct mmu_psize_def mmu_psize_defaults[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 0, - }, -}; - -/* POWER4, GPUL, POWER5 - * - * Support for 16Mb large pages - */ -static struct mmu_psize_def mmu_psize_defaults_gp[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 1, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .sllp = SLB_VSID_L, - .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, - [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, - .avpnm = 0x1UL, - .tlbiel = 0, - }, -}; - -/* - * 'R' and 'C' update notes: - * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* - * create writeable HPTEs without C set, because the hcall H_PROTECT - * that we use in that case will not update C - * - The above is however not a problem, because we also don't do that - * fancy "no flush" variant of eviction and we use H_REMOVE which will - * do the right thing and thus we don't have the race I described earlier - * - * - Under bare metal, we do have the race, so we need R and C set - * - We make sure R is always set and never lost - * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping - */ -unsigned long htab_convert_pte_flags(unsigned long pteflags) -{ - unsigned long rflags = 0; - - /* _PAGE_EXEC -> NOEXEC */ - if ((pteflags & _PAGE_EXEC) == 0) - rflags |= HPTE_R_N; - /* - * PPP bits: - * Linux uses slb key 0 for kernel and 1 for user. - * kernel RW areas are mapped with PPP=0b000 - * User area is mapped with PPP=0b010 for read/write - * or PPP=0b011 for read-only (including writeable but clean pages). - */ - if (pteflags & _PAGE_PRIVILEGED) { - /* - * Kernel read only mapped with ppp bits 0b110 - */ - if (!(pteflags & _PAGE_WRITE)) { - if (mmu_has_feature(MMU_FTR_KERNEL_RO)) - rflags |= (HPTE_R_PP0 | 0x2); - else - rflags |= 0x3; - } - } else { - if (pteflags & _PAGE_RWX) - rflags |= 0x2; - if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) - rflags |= 0x1; - } - /* - * We can't allow hardware to update hpte bits. 
Hence always - * set 'R' bit and set 'C' if it is a write fault - */ - rflags |= HPTE_R_R; - - if (pteflags & _PAGE_DIRTY) - rflags |= HPTE_R_C; - /* - * Add in WIG bits - */ - - if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) - rflags |= HPTE_R_I; - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) - rflags |= (HPTE_R_I | HPTE_R_G); - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) - rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); - else - /* - * Add memory coherence if cache inhibited is not set - */ - rflags |= HPTE_R_M; - - rflags |= pte_to_hpte_pkey_bits(pteflags); - return rflags; -} - -int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long prot, - int psize, int ssize) -{ - unsigned long vaddr, paddr; - unsigned int step, shift; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - prot = htab_convert_pte_flags(prot); - - DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", - vstart, vend, pstart, prot, psize, ssize); - - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { - unsigned long hash, hpteg; - unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); - unsigned long tprot = prot; - - /* - * If we hit a bad address return error. - */ - if (!vsid) - return -1; - /* Make kernel text executable */ - if (overlaps_kernel_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* - * If relocatable, check if it overlaps interrupt vectors that - * are copied down to real 0. For relocatable kernel - * (e.g. kdump case) we copy interrupt vectors down to real - * address 0. Mark that region as executable. This is - * because on p8 system with relocation on exception feature - * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence - * in order to execute the interrupt handlers in virtual - * mode the vector region need to be marked as executable. - */ - if ((PHYSICAL_START > MEMORY_START) && - overlaps_interrupt_vector_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - hash = hpt_hash(vpn, shift, ssize); - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - BUG_ON(!mmu_hash_ops.hpte_insert); - ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, - HPTE_V_BOLTED, psize, psize, - ssize); - - if (ret < 0) - break; - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - } - return ret < 0 ? 
ret : 0; -} - -int htab_remove_mapping(unsigned long vstart, unsigned long vend, - int psize, int ssize) -{ - unsigned long vaddr; - unsigned int step, shift; - int rc; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - if (!mmu_hash_ops.hpte_removebolted) - return -ENODEV; - - for (vaddr = vstart; vaddr < vend; vaddr += step) { - rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); - if (rc == -ENOENT) { - ret = -ENOENT; - continue; - } - if (rc < 0) - return rc; - } - - return ret; -} - -static bool disable_1tb_segments = false; - -static int __init parse_disable_1tb_segments(char *p) -{ - disable_1tb_segments = true; - return 0; -} -early_param("disable_1tb_segments", parse_disable_1tb_segments); - -static int __init htab_dt_scan_seg_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); - if (prop == NULL) - return 0; - for (; size >= 4; size -= 4, ++prop) { - if (be32_to_cpu(prop[0]) == 40) { - DBG("1T segment support detected\n"); - - if (disable_1tb_segments) { - DBG("1T segments disabled by command line\n"); - break; - } - - cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; - return 1; - } - } - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 0; -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x14: - idx = MMU_PAGE_1M; - break; - case 0x18: - idx = MMU_PAGE_16M; - break; - case 0x22: - idx = MMU_PAGE_16G; - break; - } - return idx; -} - -static int __init htab_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); - if (!prop) - return 0; - - pr_info("Page sizes from device-tree:\n"); - size /= 4; - cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); - while(size > 0) { - unsigned int base_shift = be32_to_cpu(prop[0]); - unsigned int slbenc = be32_to_cpu(prop[1]); - unsigned int lpnum = be32_to_cpu(prop[2]); - struct mmu_psize_def *def; - int idx, base_idx; - - size -= 3; prop += 3; - base_idx = get_idx_from_shift(base_shift); - if (base_idx < 0) { - /* skip the pte encoding also */ - prop += lpnum * 2; size -= lpnum * 2; - continue; - } - def = &mmu_psize_defs[base_idx]; - if (base_idx == MMU_PAGE_16M) - cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; - - def->shift = base_shift; - if (base_shift <= 23) - def->avpnm = 0; - else - def->avpnm = (1 << (base_shift - 23)) - 1; - def->sllp = slbenc; - /* - * We don't know for sure what's up with tlbiel, so - * for now we only set it for 4K and 64K pages - */ - if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) - def->tlbiel = 1; - else - def->tlbiel = 0; - - while (size > 0 && lpnum) { - unsigned int shift = be32_to_cpu(prop[0]); - int penc = be32_to_cpu(prop[1]); - - prop += 2; size -= 2; - lpnum--; - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - if (penc == -1) - pr_err("Invalid penc for base_shift=%d " - 
"shift=%d\n", base_shift, shift); - - def->penc[idx] = penc; - pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," - " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", - base_shift, shift, def->sllp, - def->avpnm, def->tlbiel, def->penc[idx]); - } - } - - return 1; -} - -#ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages - * and reserve those blocks for 16G huge pages. - */ -static int __init htab_dt_scan_hugepage_blocks(unsigned long node, - const char *uname, int depth, - void *data) { - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be64 *addr_prop; - const __be32 *page_count_prop; - unsigned int expected_pages; - long unsigned int phys_addr; - long unsigned int block_size; - - /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; - - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ - page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); - if (page_count_prop == NULL) - return 0; - expected_pages = (1 << be32_to_cpu(page_count_prop[0])); - addr_prop = of_get_flat_dt_prop(node, "reg", NULL); - if (addr_prop == NULL) - return 0; - phys_addr = be64_to_cpu(addr_prop[0]); - block_size = be64_to_cpu(addr_prop[1]); - if (block_size != (16 * GB)) - return 0; - printk(KERN_INFO "Huge page(16GB) memory: " - "addr = 0x%lX size = 0x%lX pages = %d\n", - phys_addr, block_size, expected_pages); - if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { - memblock_reserve(phys_addr, block_size * expected_pages); - pseries_add_gpage(phys_addr, block_size, expected_pages); - } - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - -static void mmu_psize_set_default_penc(void) -{ - int bpsize, apsize; - for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) - for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) - mmu_psize_defs[bpsize].penc[apsize] = -1; -} - -#ifdef CONFIG_PPC_64K_PAGES - -static bool might_have_hea(void) -{ - /* - * The HEA ethernet adapter requires awareness of the - * GX bus. Without that awareness we can easily assume - * we will never see an HEA ethernet device. - */ -#ifdef CONFIG_IBMEBUS - return !cpu_has_feature(CPU_FTR_ARCH_207S) && - firmware_has_feature(FW_FEATURE_SPLPAR); -#else - return false; -#endif -} - -#endif /* #ifdef CONFIG_PPC_64K_PAGES */ - -static void __init htab_scan_page_sizes(void) -{ - int rc; - - /* se the invalid penc to -1 */ - mmu_psize_set_default_penc(); - - /* Default to 4K pages only */ - memcpy(mmu_psize_defs, mmu_psize_defaults, - sizeof(mmu_psize_defaults)); - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { - /* - * Nothing in the device-tree, but the CPU supports 16M pages, - * so let's fallback on a known size list for 16M capable CPUs. - */ - memcpy(mmu_psize_defs, mmu_psize_defaults_gp, - sizeof(mmu_psize_defaults_gp)); - } - -#ifdef CONFIG_HUGETLB_PAGE - if (!hugetlb_disabled) { - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); - } -#endif /* CONFIG_HUGETLB_PAGE */ -} - -/* - * Fill in the hpte_page_sizes[] array. - * We go through the mmu_psize_defs[] array looking for all the - * supported base/actual page size combinations. Each combination - * has a unique pagesize encoding (penc) value in the low bits of - * the LP field of the HPTE. 
For actual page sizes less than 1MB, - * some of the upper LP bits are used for RPN bits, meaning that - * we need to fill in several entries in hpte_page_sizes[]. - * - * In diagrammatic form, with r = RPN bits and z = page size bits: - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ... - * - * The zzzz bits are implementation-specific but are chosen so that - * no encoding for a larger page size uses the same value in its - * low-order N bits as the encoding for the 2^(12+N) byte page size - * (if it exists). - */ -static void init_hpte_page_sizes(void) -{ - long int ap, bp; - long int shift, penc; - - for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { - if (!mmu_psize_defs[bp].shift) - continue; /* not a supported page size */ - for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { - penc = mmu_psize_defs[bp].penc[ap]; - if (penc == -1 || !mmu_psize_defs[ap].shift) - continue; - shift = mmu_psize_defs[ap].shift - LP_SHIFT; - if (shift <= 0) - continue; /* should never happen */ - /* - * For page sizes less than 1MB, this loop - * replicates the entry for all possible values - * of the rrrr bits. - */ - while (penc < (1 << LP_BITS)) { - hpte_page_sizes[penc] = (ap << 4) | bp; - penc += 1 << shift; - } - } - } -} - -static void __init htab_init_page_sizes(void) -{ - init_hpte_page_sizes(); - - if (!debug_pagealloc_enabled()) { - /* - * Pick a size for the linear mapping. Currently, we only - * support 16M, 1M and 4K which is the default - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - mmu_linear_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - mmu_linear_psize = MMU_PAGE_1M; - } - -#ifdef CONFIG_PPC_64K_PAGES - /* - * Pick a size for the ordinary pages. Default is 4K, we support - * 64K for user mappings and vmalloc if supported by the processor. - * We only use 64k for ioremap if the processor - * (and firmware) support cache-inhibited large pages. - * If not, we use 4k and set mmu_ci_restrictions so that - * hash_page knows to switch processes that use cache-inhibited - * mappings to 4k pages. - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift) { - mmu_virtual_psize = MMU_PAGE_64K; - mmu_vmalloc_psize = MMU_PAGE_64K; - if (mmu_linear_psize == MMU_PAGE_4K) - mmu_linear_psize = MMU_PAGE_64K; - if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { - /* - * When running on pSeries using 64k pages for ioremap - * would stop us accessing the HEA ethernet. So if we - * have the chance of ever seeing one, stay at 4k. 
- */ - if (!might_have_hea()) - mmu_io_psize = MMU_PAGE_64K; - } else - mmu_ci_restrictions = 1; - } -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported - * and we have at least 1G of RAM at boot - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift && - memblock_phys_mem_size() >= 0x40000000) - mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; - else - mmu_vmemmap_psize = MMU_PAGE_4K; -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - printk(KERN_DEBUG "Page orders: linear mapping = %d, " - "virtual = %d, io = %d" -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ", vmemmap = %d" -#endif - "\n", - mmu_psize_defs[mmu_linear_psize].shift, - mmu_psize_defs[mmu_virtual_psize].shift, - mmu_psize_defs[mmu_io_psize].shift -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ,mmu_psize_defs[mmu_vmemmap_psize].shift -#endif - ); -} - -static int __init htab_dt_scan_pftsize(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (prop != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = be32_to_cpu(prop[1]); - return 1; - } - return 0; -} - -unsigned htab_shift_for_mem_size(unsigned long mem_size) -{ - unsigned memshift = __ilog2(mem_size); - unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned pteg_shift; - - /* round mem_size up to next power of 2 */ - if ((1UL << memshift) < mem_size) - memshift += 1; - - /* aim for 2 pages / pteg */ - pteg_shift = memshift - (pshift + 1); - - /* - * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab - * size permitted by the architecture. - */ - return max(pteg_shift + 7, 18U); -} - -static unsigned long __init htab_get_table_size(void) -{ - /* If hash size isn't already provided by the platform, we try to - * retrieve it from the device-tree. 
If it's not there either, we
- * calculate it now based on the total RAM size
- */
-	if (ppc64_pft_size == 0)
-		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
-	if (ppc64_pft_size)
-		return 1UL << ppc64_pft_size;
-
-	return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
-{
-	unsigned target_hpt_shift;
-
-	if (!mmu_hash_ops.resize_hpt)
-		return 0;
-
-	target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
-
-	/*
-	 * To avoid lots of HPT resizes if memory size is fluctuating
-	 * across a boundary, we deliberately have some hysteresis
-	 * here: we immediately increase the HPT size if the target
-	 * shift exceeds the current shift, but we won't attempt to
-	 * reduce unless the target shift is at least 2 below the
-	 * current shift
-	 */
-	if (target_hpt_shift > ppc64_pft_size ||
-	    target_hpt_shift < ppc64_pft_size - 1)
-		return mmu_hash_ops.resize_hpt(target_hpt_shift);
-
-	return 0;
-}
-
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
-	int rc;
-
-	if (end >= H_VMALLOC_START) {
-		pr_warn("Outside the supported range\n");
-		return -1;
-	}
-
-	rc = htab_bolt_mapping(start, end, __pa(start),
-			       pgprot_val(PAGE_KERNEL), mmu_linear_psize,
-			       mmu_kernel_ssize);
-
-	if (rc < 0) {
-		int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
-					      mmu_kernel_ssize);
-		BUG_ON(rc2 && (rc2 != -ENOENT));
-	}
-	return rc;
-}
-
-int hash__remove_section_mapping(unsigned long start, unsigned long end)
-{
-	int rc = htab_remove_mapping(start, end, mmu_linear_psize,
-				     mmu_kernel_ssize);
-	WARN_ON(rc < 0);
-	return rc;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-static void __init hash_init_partition_table(phys_addr_t hash_table,
-					     unsigned long htab_size)
-{
-	mmu_partition_table_init();
-
-	/*
-	 * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
-	 * For now, UPRT is 0 and we have no segment table.
-	 */
-	htab_size = __ilog2(htab_size) - 18;
-	mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
-	pr_info("Partition table %p\n", partition_tb);
-}
-
-static void __init htab_initialize(void)
-{
-	unsigned long table;
-	unsigned long pteg_count;
-	unsigned long prot;
-	unsigned long base = 0, size = 0;
-	struct memblock_region *reg;
-
-	DBG(" -> htab_initialize()\n");
-
-	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
-		mmu_kernel_ssize = MMU_SEGSIZE_1T;
-		mmu_highuser_ssize = MMU_SEGSIZE_1T;
-		printk(KERN_INFO "Using 1TB segments\n");
-	}
-
-	/*
-	 * Calculate the required size of the htab.  We want the number of
-	 * PTEGs to equal one half the number of real pages.
-	 */
-	htab_size_bytes = htab_get_table_size();
-	pteg_count = htab_size_bytes >> 7;
-
-	htab_hash_mask = pteg_count - 1;
-
-	if (firmware_has_feature(FW_FEATURE_LPAR) ||
-	    firmware_has_feature(FW_FEATURE_PS3_LV1)) {
-		/* Using a hypervisor which owns the htab */
-		htab_address = NULL;
-		_SDR1 = 0;
-		/*
-		 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
-		 * to inform the hypervisor that we wish to use the HPT.
-		 */
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			register_process_table(0, 0, 0);
-#ifdef CONFIG_FA_DUMP
-		/*
-		 * If firmware assisted dump is active, firmware preserves
-		 * the contents of htab along with entire partition memory.
-		 * Clear the htab if firmware assisted dump is active so
-		 * that we don't end up using old mappings.

- */ - if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -#endif - } else { - unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; - -#ifdef CONFIG_PPC_CELL - /* - * Cell may require the hash table down low when using the - * Axon IOMMU in order to fit the dynamic region over it, see - * comments in cell/iommu.c - */ - if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { - limit = 0x80000000; - pr_info("Hash table forced below 2G for Axon IOMMU\n"); - } -#endif /* CONFIG_PPC_CELL */ - - table = memblock_phys_alloc_range(htab_size_bytes, - htab_size_bytes, - 0, limit); - if (!table) - panic("ERROR: Failed to allocate %pa bytes below %pa\n", - &htab_size_bytes, &limit); - - DBG("Hash table allocated at %lx, size: %lx\n", table, - htab_size_bytes); - - htab_address = __va(table); - - /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(htab_size_bytes) - 18; - - /* Initialize the HPT with no entries */ - memset((void *)table, 0, htab_size_bytes); - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); - else - hash_init_partition_table(table, htab_size_bytes); - } - - prot = pgprot_val(PAGE_KERNEL); - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = memblock_alloc_try_nid( - linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, - ppc64_rma_size, NUMA_NO_NODE); - if (!linear_map_hash_slots) - panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", - __func__, linear_map_hash_count, &ppc64_rma_size); - } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - - /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; - - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); - - if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - continue; - } - - BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - prot, mmu_linear_psize, mmu_kernel_ssize)); - } - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - /* - * If we have a memory_limit and we've allocated TCEs then we need to - * explicitly map the TCE area at the top of RAM. We also cope with the - * case that the TCEs start below memory_limit. - * tce_alloc_start/end are 16MB aligned so the mapping should work - * for either 4K or 16MB pages. - */ - if (tce_alloc_start) { - tce_alloc_start = (unsigned long)__va(tce_alloc_start); - tce_alloc_end = (unsigned long)__va(tce_alloc_end); - - if (base + size >= tce_alloc_start) - tce_alloc_start = base + size + 1; - - BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), prot, - mmu_linear_psize, mmu_kernel_ssize)); - } - - - DBG(" <- htab_initialize()\n"); -} -#undef KB -#undef MB - -void __init hash__early_init_devtree(void) -{ - /* Initialize segment sizes */ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); - - /* Initialize page sizes */ - htab_scan_page_sizes(); -} - -struct hash_mm_context init_hash_mm_context; -void __init hash__early_init_mmu(void) -{ -#ifndef CONFIG_PPC_64K_PAGES - /* - * We have code in __hash_page_4K() and elsewhere, which assumes it can - * do the following: - * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); - * - * Where the slot number is between 0-15, and values of 8-15 indicate - * the secondary bucket. 
For that code to work H_PAGE_F_SECOND and - * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and - * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here - * with a BUILD_BUG_ON(). - */ - BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); -#endif /* CONFIG_PPC_64K_PAGES */ - - htab_init_page_sizes(); - - /* - * initialize page table size - */ - __pte_frag_nr = H_PTE_FRAG_NR; - __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; - __pmd_frag_nr = H_PMD_FRAG_NR; - __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; - - __pte_index_size = H_PTE_INDEX_SIZE; - __pmd_index_size = H_PMD_INDEX_SIZE; - __pud_index_size = H_PUD_INDEX_SIZE; - __pgd_index_size = H_PGD_INDEX_SIZE; - __pud_cache_index = H_PUD_CACHE_INDEX; - __pte_table_size = H_PTE_TABLE_SIZE; - __pmd_table_size = H_PMD_TABLE_SIZE; - __pud_table_size = H_PUD_TABLE_SIZE; - __pgd_table_size = H_PGD_TABLE_SIZE; - /* - * 4k use hugepd format, so for hash set then to - * zero - */ - __pmd_val_bits = HASH_PMD_VAL_BITS; - __pud_val_bits = HASH_PUD_VAL_BITS; - __pgd_val_bits = HASH_PGD_VAL_BITS; - - __kernel_virt_start = H_KERN_VIRT_START; - __vmalloc_start = H_VMALLOC_START; - __vmalloc_end = H_VMALLOC_END; - __kernel_io_start = H_KERN_IO_START; - __kernel_io_end = H_KERN_IO_END; - vmemmap = (struct page *)H_VMEMMAP_START; - ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PCI - pci_io_base = ISA_IO_BASE; -#endif - - /* Select appropriate backend */ - if (firmware_has_feature(FW_FEATURE_PS3_LV1)) - ps3_early_mm_init(); - else if (firmware_has_feature(FW_FEATURE_LPAR)) - hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) - hpte_init_native(); - - if (!mmu_hash_ops.hpte_insert) - panic("hash__early_init_mmu: No MMU hash ops defined!\n"); - - /* Initialize the MMU Hash table and create the linear mapping - * of memory. Has to be done before SLB initialization as this is - * currently where the page size encoding is obtained. 
- */ - htab_initialize(); - - init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; - - pr_info("Initializing hash mmu with SLB\n"); - /* Initialize SLB management */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -#ifdef CONFIG_SMP -void hash__early_init_mmu_secondary(void) -{ - /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - mtspr(SPRN_SDR1, _SDR1); - else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - } - /* Initialize SLB */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} -#endif /* CONFIG_SMP */ - -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) -{ - struct page *page; - - if (!pfn_valid(pte_pfn(pte))) - return pp; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } else - pp |= HPTE_R_N; - } - return pp; -} - -#ifdef CONFIG_PPC_MM_SLICES -static unsigned int get_paca_psize(unsigned long addr) -{ - unsigned char *psizes; - unsigned long index, mask_index; - - if (addr < SLICE_LOW_TOP) { - psizes = get_paca()->mm_ctx_low_slices_psize; - index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = get_paca()->mm_ctx_high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); - } - mask_index = index & 0x1; - return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; -} - -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif - -/* - * Demote a segment to using 4k pages. - * For now this makes the whole process use 4k pages. - */ -#ifdef CONFIG_PPC_64K_PAGES -void demote_segment_4k(struct mm_struct *mm, unsigned long addr) -{ - if (get_slice_psize(mm, addr) == MMU_PAGE_4K) - return; - slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); - copro_flush_all_slbs(mm); - if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } -} -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_SUBPAGE_PROT -/* - * This looks up a 2-bit protection code for a 4k subpage of a 64k page. - * Userspace sets the subpage permissions using the subpage_prot system call. - * - * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_RWX: no access. - */ -static int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - u32 spp = 0; - u32 **sbpm, *sbpp; - - if (!spt) - return 0; - - if (ea >= spt->maxaddr) - return 0; - if (ea < 0x100000000UL) { - /* addresses below 4GB use spt->low_prot */ - sbpm = spt->low_prot; - } else { - sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; - if (!sbpm) - return 0; - } - sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!sbpp) - return 0; - spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; - - /* extract 2-bit bitfield for this 4k subpage */ - spp >>= 30 - 2 * ((ea >> 12) & 0xf); - - /* - * 0 -> full premission - * 1 -> Read only - * 2 -> no access. - * We return the flag that need to be cleared. - */ - spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); - return spp; -} - -#else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - return 0; -} -#endif - -void hash_failure_debug(unsigned long ea, unsigned long access, - unsigned long vsid, unsigned long trap, - int ssize, int psize, int lpsize, unsigned long pte) -{ - if (!printk_ratelimit()) - return; - pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", - ea, access, current->comm); - pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", - trap, vsid, ssize, psize, lpsize, pte); -} - -static void check_paca_psize(unsigned long ea, struct mm_struct *mm, - int psize, bool user_region) -{ - if (user_region) { - if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } - } else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } -} - -/* Result code is: - * 0 - handled - * 1 - normal page fault - * -1 - critical hash insertion error - * -2 - access not permitted by subpage protection mechanism - */ -int hash_page_mm(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap, - unsigned long flags) -{ - bool is_thp; - enum ctx_state prev_state = exception_enter(); - pgd_t *pgdir; - unsigned long vsid; - pte_t *ptep; - unsigned hugeshift; - int rc, user_region = 0; - int psize, ssize; - - DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", - ea, access, trap); - trace_hash_fault(ea, access, trap); - - /* Get region & vsid */ - switch (get_region_id(ea)) { - case USER_REGION_ID: - user_region = 1; - if (! mm) { - DBG_LOW(" user region with no mm !\n"); - rc = 1; - goto bail; - } - psize = get_slice_psize(mm, ea); - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - break; - case VMALLOC_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_vmalloc_psize; - ssize = mmu_kernel_ssize; - break; - - case IO_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_io_psize; - ssize = mmu_kernel_ssize; - break; - default: - /* Not a valid range - * Send the problem up to do_page_fault - */ - rc = 1; - goto bail; - } - DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); - - /* Bad address. */ - if (!vsid) { - DBG_LOW("Bad address!\n"); - rc = 1; - goto bail; - } - /* Get pgdir */ - pgdir = mm->pgd; - if (pgdir == NULL) { - rc = 1; - goto bail; - } - - /* Check CPU locality */ - if (user_region && mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - -#ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might - * be hitting a special driver mapping, and need to align the - * address before we fetch the PTE. - * - * It could also be a hugepage mapping, in which case this is - * not necessary, but it's not harmful, either. 
- */ - if (psize != MMU_PAGE_4K) - ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); - if (ptep == NULL || !pte_present(*ptep)) { - DBG_LOW(" no PTE !\n"); - rc = 1; - goto bail; - } - - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; - - /* Pre-check access permissions (will be re-checked atomically - * in __hash_page_XX but this pre-check is a fast path - */ - if (!check_pte_access(access, pte_val(*ptep))) { - DBG_LOW(" no access !\n"); - rc = 1; - goto bail; - } - - if (hugeshift) { - if (is_thp) - rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, - trap, flags, ssize, psize); -#ifdef CONFIG_HUGETLB_PAGE - else - rc = __hash_page_huge(ea, access, vsid, ptep, trap, - flags, ssize, hugeshift, psize); -#else - else { - /* - * if we have hugeshift, and is not transhuge with - * hugetlb disabled, something is really wrong. - */ - rc = 1; - WARN_ON(1); - } -#endif - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - - goto bail; - } - -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - /* Do actual hashing */ -#ifdef CONFIG_PPC_64K_PAGES - /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } - - /* If this PTE is non-cacheable and we have restrictions on - * using non cacheable large pages, then we switch to 4k - */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { - if (user_region) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } else if (ea < VMALLOC_END) { - /* - * some driver did a non-cacheable mapping - * in vmalloc space, so switch vmalloc - * to 4k pages - */ - printk(KERN_ALERT "Reducing vmalloc segment " - "to 4kB pages because of " - "non-cacheable mapping\n"); - psize = mmu_vmalloc_psize = MMU_PAGE_4K; - copro_flush_all_slbs(mm); - } - } - -#endif /* CONFIG_PPC_64K_PAGES */ - - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - { - int spp = subpage_protection(mm, ea); - if (access & spp) - rc = -2; - else - rc = __hash_page_4K(ea, access, vsid, ptep, trap, - flags, ssize, spp); - } - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, psize, - psize, pte_val(*ptep)); -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - DBG_LOW(" -> rc=%d\n", rc); - -bail: - exception_exit(prev_state); - return rc; -} -EXPORT_SYMBOL_GPL(hash_page_mm); - -int hash_page(unsigned long ea, unsigned long access, unsigned long trap, - unsigned long dsisr) -{ - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - - if ((get_region_id(ea) == VMALLOC_REGION_ID) || - (get_region_id(ea) == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - return hash_page_mm(mm, ea, access, trap, flags); -} 
-EXPORT_SYMBOL_GPL(hash_page); - -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) -{ - unsigned long access = _PAGE_PRESENT | _PAGE_READ; - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - unsigned int region_id = get_region_id(ea); - - if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - if (dsisr & DSISR_ISSTORE) - access |= _PAGE_WRITE; - /* - * We set _PAGE_PRIVILEGED only when - * kernel mode access kernel space. - * - * _PAGE_PRIVILEGED is NOT set - * 1) when kernel mode access user space - * 2) user space access kernel space. - */ - access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) - access &= ~_PAGE_PRIVILEGED; - - if (trap == 0x400) - access |= _PAGE_EXEC; - - return hash_page_mm(mm, ea, access, trap, flags); -} - -#ifdef CONFIG_PPC_MM_SLICES -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - int psize = get_slice_psize(mm, ea); - - /* We only prefault standard pages for now */ - if (unlikely(psize != mm_ctx_user_psize(&mm->context))) - return false; - - /* - * Don't prefault if subpage protection is enabled for the EA. - */ - if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) - return false; - - return true; -} -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif - -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) -{ - int hugepage_shift; - unsigned long vsid; - pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; - int rc, ssize, update_flags = 0; - unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); - - BUG_ON(get_region_id(ea) != USER_REGION_ID); - - if (!should_hash_preload(mm, ea)) - return; - - DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," - " trap=%lx\n", mm, mm->pgd, ea, access, trap); - - /* Get Linux PTE if available */ - pgdir = mm->pgd; - if (pgdir == NULL) - return; - - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - if (!vsid) - return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); -#ifdef CONFIG_PPC_64K_PAGES - /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on - * a 64K kernel), then we don't preload, hash_page() will take - * care of it once we actually try to access the page. - * That way we don't have to duplicate all of the logic for segment - * page size demotion here - */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Is that local to this CPU ? 
*/ - if (mm_is_thread_local(mm)) - update_flags |= HPTE_LOCAL_UPDATE; - - /* Hash it in */ -#ifdef CONFIG_PPC_64K_PAGES - if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - update_flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, - ssize, subpage_protection(mm, ea)); - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, - mm_ctx_user_psize(&mm->context), - mm_ctx_user_psize(&mm->context), - pte_val(*ptep)); -out_exit: - local_irq_restore(flags); -} - -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. - */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static inline void tm_flush_hash_page(int local) -{ - /* - * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a - * page back to a block device w/PIO could pick up transactional data - * (bad!) so we force an abort here. Before the sync the page will be - * made read-only, which will flush_hash_page. BIG ISSUE here: if the - * kernel uses a page from userspace without unmapping it first, it may - * see the speculated version. - */ - if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && - MSR_TM_ACTIVE(current->thread.regs->msr)) { - tm_enable(); - tm_abort(TM_CAUSE_TLBI); - } -} -#else -static inline void tm_flush_hash_page(int local) -{ -} -#endif - -/* - * Return the global hash slot, corresponding to the given PTE, which contains - * the HPTE. - */ -unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, - int ssize, real_pte_t rpte, unsigned int subpg_index) -{ - unsigned long hash, gslot, hidx; - - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(rpte, subpg_index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - gslot += hidx & _PTEIDX_GROUP_IX; - return gslot; -} - -/* WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! 
- */
-void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
-		     unsigned long flags)
-{
-	unsigned long index, shift, gslot;
-	int local = flags & HPTE_LOCAL_UPDATE;
-
-	DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
-	pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
-		gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
-		DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
-		/*
-		 * We use same base page size and actual psize, because we don't
-		 * use these functions for hugepage
-		 */
-		mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
-					     ssize, local);
-	} pte_iterate_hashed_end();
-
-	tm_flush_hash_page(local);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
-			 pmd_t *pmdp, unsigned int psize, int ssize,
-			 unsigned long flags)
-{
-	int i, max_hpte_count, valid;
-	unsigned long s_addr;
-	unsigned char *hpte_slot_array;
-	unsigned long hidx, shift, vpn, hash, slot;
-	int local = flags & HPTE_LOCAL_UPDATE;
-
-	s_addr = addr & HPAGE_PMD_MASK;
-	hpte_slot_array = get_hpte_slot_array(pmdp);
-	/*
-	 * If we try to do a HUGE PTE update after a withdraw is done,
-	 * we will find the below NULL. This happens when we do
-	 * split_huge_page_pmd
-	 */
-	if (!hpte_slot_array)
-		return;
-
-	if (mmu_hash_ops.hugepage_invalidate) {
-		mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
-						 psize, ssize, local);
-		goto tm_abort;
-	}
-	/*
-	 * No bulk hpte removal support, invalidate each entry
-	 */
-	shift = mmu_psize_defs[psize].shift;
-	max_hpte_count = HPAGE_PMD_SIZE >> shift;
-	for (i = 0; i < max_hpte_count; i++) {
-		/*
-		 * 8 bits for each hpte entry:
-		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
-		 */
-		valid = hpte_valid(hpte_slot_array, i);
-		if (!valid)
-			continue;
-		hidx = hpte_hash_index(hpte_slot_array, i);
-
-		/* get the vpn */
-		addr = s_addr + (i * (1ul << shift));
-		vpn = hpt_vpn(addr, vsid, ssize);
-		hash = hpt_hash(vpn, shift, ssize);
-		if (hidx & _PTEIDX_SECONDARY)
-			hash = ~hash;
-
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += hidx & _PTEIDX_GROUP_IX;
-		mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
-					     MMU_PAGE_16M, ssize, local);
-	}
-tm_abort:
-	tm_flush_hash_page(local);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void flush_hash_range(unsigned long number, int local)
-{
-	if (mmu_hash_ops.flush_hash_range)
-		mmu_hash_ops.flush_hash_range(number, local);
-	else {
-		int i;
-		struct ppc64_tlb_batch *batch =
-			this_cpu_ptr(&ppc64_tlb_batch);
-
-		for (i = 0; i < number; i++)
-			flush_hash_page(batch->vpn[i], batch->pte[i],
-					batch->psize, batch->ssize, local);
-	}
-}
-
-/*
- * low_hash_fault is called when the low level hash code fails
- * to insert a PTE due to a hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
-	enum ctx_state prev_state = exception_enter();
-
-	if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-		if (rc == -2)
-			_exception(SIGSEGV, regs, SEGV_ACCERR, address);
-		else
-#endif
-			_exception(SIGBUS, regs, BUS_ADRERR, address);
-	} else
-		bad_page_fault(regs, address, SIGBUS);
-
-	exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
-			   unsigned long pa, unsigned long rflags,
-			   unsigned long vflags, int psize, int ssize)
-{
-	unsigned long hpte_group;
-	long slot;
-
-repeat:
-	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
-	/* Insert into the hash table, primary slot */
-	slot = mmu_hash_ops.hpte_insert(hpte_group, 
vpn, pa, rflags, vflags, - psize, psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, - vflags | HPTE_V_SECONDARY, - psize, psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - - return slot; -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - long ret; - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - - /* Don't create HPTE entries for bad address */ - if (!vsid) - return; - - ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, - HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); - - BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); -} - -static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash, hidx, slot; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); - hidx = linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, - mmu_linear_psize, - mmu_kernel_ssize, 0); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long flags, vaddr, lmi; - int i; - - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); - } - local_irq_restore(flags); -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * On virtualized systems the first entry is our RMA region aka VRMA, - * non-virtualized 64-bit hash MMU systems don't have a limitation - * on real mode access. - * - * For guests on platforms before POWER9, we clamp the it limit to 1G - * to avoid some funky things such as RTAS bugs etc... 
- */ - if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { - ppc64_rma_size = first_memblock_size; - if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) - ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(ppc64_rma_size); - } else { - ppc64_rma_size = ULONG_MAX; - } -} - -#ifdef CONFIG_DEBUG_FS - -static int hpt_order_get(void *data, u64 *val) -{ - *val = ppc64_pft_size; - return 0; -} - -static int hpt_order_set(void *data, u64 val) -{ - if (!mmu_hash_ops.resize_hpt) - return -ENODEV; - - return mmu_hash_ops.resize_hpt(val); -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); - -static int __init hash64_debugfs(void) -{ - if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { - pr_err("lpar: unable to create hpt_order debugsfs file\n"); - } - - return 0; -} -machine_device_initcall(pseries, hash64_debugfs); -#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c deleted file mode 100644 index dfbc3b32f09b..000000000000 --- a/arch/powerpc/mm/hugepage-hash64.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* - * PPC64 THP Support for hash based MMUs - */ -#include -#include - -int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, - pmd_t *pmdp, unsigned long trap, unsigned long flags, - int ssize, unsigned int psize) -{ - unsigned int index, valid; - unsigned char *hpte_slot_array; - unsigned long rflags, pa, hidx; - unsigned long old_pmd, new_pmd; - int ret, lpsize = MMU_PAGE_16M; - unsigned long vpn, hash, shift, slot; - - /* - * atomically mark the linux large page PMD busy and dirty - */ - do { - pmd_t pmd = READ_ONCE(*pmdp); - - old_pmd = pmd_val(pmd); - /* If PMD busy, retry the access */ - if (unlikely(old_pmd & H_PAGE_BUSY)) - return 0; - /* If PMD permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pmd))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access - */ - new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pmd |= _PAGE_DIRTY; - } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); - - /* - * Make sure this is thp or devmap entry - */ - if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) - return 0; - - rflags = htab_convert_pte_flags(new_pmd); - -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif - /* - * Find the slot index details for this ea, using base page size. 
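Concretely (invented numbers, not part of the patch), the index computed below works out as:

/*
 * Illustration: a 16M THP backed by 64K base pages gives shift = 16,
 * so there are 16M / 64K = 256 subpage slots and
 *   index = (ea & ~HPAGE_PMD_MASK) >> 16
 * selects one of them.
 */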
- */ - shift = mmu_psize_defs[psize].shift; - index = (ea & ~HPAGE_PMD_MASK) >> shift; - BUG_ON(index >= PTE_FRAG_SIZE); - - vpn = hpt_vpn(ea, vsid, ssize); - hpte_slot_array = get_hpte_slot_array(pmdp); - if (psize == MMU_PAGE_4K) { - /* - * invalidate the old hpte entry if we have that mapped via 64K - * base page size. This is because demote_segment won't flush - * hash page table entries. - */ - if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { - flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, - ssize, flags); - /* - * With THP, we also clear the slot information with - * respect to all the 64K hash pte mapping the 16MB - * page. They are all invalid now. This make sure we - * don't find the slot valid when we fault with 4k - * base page size. - * - */ - memset(hpte_slot_array, 0, PTE_FRAG_SIZE); - } - } - - valid = hpte_valid(hpte_slot_array, index); - if (valid) { - /* update the hpte bits */ - hash = hpt_hash(vpn, shift, ssize); - hidx = hpte_hash_index(hpte_slot_array, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, - psize, lpsize, ssize, flags); - /* - * We failed to update, try to insert a new entry. - */ - if (ret == -1) { - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - valid = 0; - hpte_slot_array[index] = 0; - } - } - - if (!valid) { - unsigned long hpte_group; - - hash = hpt_hash(vpn, shift, ssize); - /* insert new entry */ - pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= H_PAGE_HASHPTE; - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - psize, lpsize, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - psize, lpsize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - /* - * Hypervisor failure. Restore old pmd and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *pmdp = __pmd(old_pmd); - hash_failure_debug(ea, access, vsid, trap, ssize, - psize, lpsize, old_pmd); - return -1; - } - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - mark_hpte_slot_valid(hpte_slot_array, index, slot); - } - /* - * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with - * base page size 4k. - */ - if (psize == MMU_PAGE_4K) - new_pmd |= H_PAGE_COMBO; - /* - * The hpte valid is stored in the pgtable whose address is in the - * second half of the PMD. Order this against clearing of the busy bit in - * huge pmd. - */ - smp_wmb(); - *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c deleted file mode 100644 index b0d9209d9a86..000000000000 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) - * - * Copyright (C) 2003 David Gibson, IBM Corporation. 
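The hpte_slot_array manipulated above keeps one byte of bookkeeping per 4K subpage of the 16M page: a valid flag plus hidx, the combined secondary-hash and group-index value that locates the HPTE. A sketch of helpers consistent with how the array is used here; treat the exact bit layout as an assumption:

    #include <stdint.h>

    /* One byte of bookkeeping per 4K subpage (assumed layout:
     * bit 0 = valid, remaining bits = hidx). */
    static inline unsigned int hpte_valid(const uint8_t *slots, int index)
    {
        return slots[index] & 0x1;
    }

    static inline unsigned int hpte_hash_index(const uint8_t *slots, int index)
    {
        return slots[index] >> 1;  /* _PTEIDX_SECONDARY | _PTEIDX_GROUP_IX */
    }

    static inline void mark_hpte_slot_valid(uint8_t *slots, int index,
                                            unsigned int hidx)
    {
        slots[index] = (uint8_t)(hidx << 1 | 0x1);
    }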
- * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth - */ - -#include -#include -#include -#include -#include -#include - -extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rlags, - unsigned long vflags, int psize, int ssize); - -int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, unsigned int shift, unsigned int mmu_psize) -{ - real_pte_t rpte; - unsigned long vpn; - unsigned long old_pte, new_pte; - unsigned long rflags, pa; - long slot, offset; - - BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); - - /* Search the Linux page table for a match with va */ - vpn = hpt_vpn(ea, vsid, ssize); - - /* At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - do { - old_pte = pte_val(*ptep); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* Make sure this is a hugetlb entry */ - if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) - return 0; - - rflags = htab_convert_pte_flags(new_pte); - if (unlikely(mmu_psize == MMU_PAGE_16G)) - offset = PTRS_PER_PUD; - else - offset = PTRS_PER_PMD; - rpte = __real_pte(__pte(old_pte), ptep, offset); - - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long gslot; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, - mmu_psize, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, shift, ssize); - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - - /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - - slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, - mmu_psize, ssize); - - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - mmu_psize, mmu_psize, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - unsigned long pte_val; - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. - */ - pte_val = pte_update(vma->vm_mm, addr, ptep, - _PAGE_PRESENT, _PAGE_INVALID, 1); - - return __pte(pte_val); -} - -void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - - if (radix_enabled()) - return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, - old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c deleted file mode 100644 index cab06331c0c0..000000000000 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); -} - -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. 
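For the topdown search set up just below this comment, align_mask = PAGE_MASK & ~huge_page_mask(h) keeps exactly the bits between the base page size and the huge page size, so vm_unmapped_area() only proposes huge-page-aligned addresses. A quick check of that arithmetic (64K base pages assumed for the example):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size  = 1UL << 16;  /* 64K, assumed */
        unsigned long huge_size  = 1UL << 24;  /* 16M */
        unsigned long page_mask  = ~(page_size - 1);
        unsigned long huge_mask  = ~(huge_size - 1);
        unsigned long align_mask = page_mask & ~huge_mask;

        /* vm_unmapped_area() rejects candidates where
         * (addr & align_mask) != align_offset, i.e. anything
         * not 16M-aligned here. */
        printf("align_mask = 0x%lx\n", align_mask); /* 0xff0000 */
        return 0;
    }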
- */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); -} - -void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t old_pte, pte_t pte) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. - */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) - radix__flush_hugetlb_page(vma, addr); - - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c deleted file mode 100644 index cb2b08635508..000000000000 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static DEFINE_IDA(mmu_context_ida); - -static int alloc_context_id(int min_id, int max_id) -{ - return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); -} - -void hash__reserve_context_id(int id) -{ - int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); - - WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); -} - -int hash__alloc_context_id(void) -{ - unsigned long max; - - if (mmu_has_feature(MMU_FTR_68_BIT_VA)) - max = MAX_USER_CONTEXT; - else - max = MAX_USER_CONTEXT_65BIT_VA; - - return alloc_context_id(MIN_USER_CONTEXT, max); -} -EXPORT_SYMBOL_GPL(hash__alloc_context_id); - -void slb_setup_new_exec(void); - -static int hash__init_new_context(struct mm_struct *mm) -{ - int index; - - index = hash__alloc_context_id(); - if (index < 0) - return index; - - mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), - GFP_KERNEL); - if (!mm->context.hash_context) { - ida_free(&mmu_context_ida, index); - return -ENOMEM; - } - - /* - * The old code would re-promote on fork, we don't do that when using - * slices as it could cause problem promoting slices that have been - * forced down to 4K. - * - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - * - * We should not be calling init_new_context() on init_mm. Hence a - * check against 0 is OK. - */ - if (mm->context.id == 0) { - memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); - slice_init_new_context_exec(mm); - } else { - /* This is fork. Copy hash_context details from current->mm */ - memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); -#ifdef CONFIG_PPC_SUBPAGE_PROT - /* inherit subpage prot detalis if we have one. 
*/ - if (current->mm->context.hash_context->spt) { - mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), - GFP_KERNEL); - if (!mm->context.hash_context->spt) { - ida_free(&mmu_context_ida, index); - kfree(mm->context.hash_context); - return -ENOMEM; - } - } -#endif - - } - - pkey_mm_init(mm); - return index; -} - -void hash__setup_new_exec(void) -{ - slice_setup_new_exec(); - - slb_setup_new_exec(); -} - -static int radix__init_new_context(struct mm_struct *mm) -{ - unsigned long rts_field; - int index, max_id; - - max_id = (1 << mmu_pid_bits) - 1; - index = alloc_context_id(mmu_base_pid, max_id); - if (index < 0) - return index; - - /* - * set the process table entry, - */ - rts_field = radix__get_tree_size(); - process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - - /* - * Order the above store with subsequent update of the PID - * register (at which point HW can start loading/caching - * the entry) and the corresponding load by the MMU from - * the L2 cache. - */ - asm volatile("ptesync;isync" : : : "memory"); - - mm->context.npu_context = NULL; - mm->context.hash_context = NULL; - - return index; -} - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - - if (radix_enabled()) - index = radix__init_new_context(mm); - else - index = hash__init_new_context(mm); - - if (index < 0) - return index; - - mm->context.id = index; - - mm->context.pte_frag = NULL; - mm->context.pmd_frag = NULL; -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_init(mm); -#endif - atomic_set(&mm->context.active_cpus, 0); - atomic_set(&mm->context.copros, 0); - - return 0; -} - -void __destroy_context(int context_id) -{ - ida_free(&mmu_context_ida, context_id); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -static void destroy_contexts(mm_context_t *ctx) -{ - int index, context_id; - - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); - } - kfree(ctx->hash_context); -} - -static void pmd_frag_destroy(void *pmd_frag) -{ - int count; - struct page *page; - - page = virt_to_page(pmd_frag); - /* drop all the pending references */ - count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; - /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static void destroy_pagetable_cache(struct mm_struct *mm) -{ - void *frag; - - frag = mm->context.pte_frag; - if (frag) - pte_frag_destroy(frag); - - frag = mm->context.pmd_frag; - if (frag) - pmd_frag_destroy(frag); - return; -} - -void destroy_context(struct mm_struct *mm) -{ -#ifdef CONFIG_SPAPR_TCE_IOMMU - WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); -#endif - if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); - else - subpage_prot_free(mm); - destroy_contexts(&mm->context); - mm->context.id = MMU_NO_CONTEXT; -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - destroy_pagetable_cache(mm); - - if (radix_enabled()) { - /* - * Radix doesn't have a valid bit in the process table - * entries. However we know that at least P9 implementation - * will avoid caching an entry with an invalid RTS field, - * and 0 is invalid. So this will do. - * - * This runs before the "fullmm" tlb flush in exit_mmap, - * which does a RIC=2 tlbie to clear the process table - * entry. See the "fullmm" comments in tlb-radix.c. 
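radix__init_new_context() above packs the whole MMU context into a single double word: prtb0 = rts_field | __pa(pgd) | RADIX_PGD_INDEX_SIZE, stored big-endian in the process table. A sketch of the packing (the RADIX_PGD_INDEX_SIZE value here is an assumption for illustration):

    #include <stdint.h>

    #define RADIX_PGD_INDEX_SIZE 13UL  /* assumed: log2 of PGD entries */

    /* Compose the first double word of a process-table entry, as
     * radix__init_new_context() does; the real table stores this
     * big-endian (cpu_to_be64). rts_field encodes the radix tree
     * size, pgd_pa is __pa(mm->pgd). */
    static uint64_t make_prtb0(uint64_t rts_field, uint64_t pgd_pa)
    {
        return rts_field | pgd_pa | RADIX_PGD_INDEX_SIZE;
    }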
- * - * No barrier required here after the store because - * this process will do the invalidate, which starts with - * ptesync. - */ - process_tb[mm->context.id].prtb0 = 0; - } -} - -#ifdef CONFIG_PPC_RADIX_MMU -void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) -{ - mtspr(SPRN_PID, next->context.id); - isync(); -} -#endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c deleted file mode 100644 index e7a9c4f6bfca..000000000000 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * IOMMU helpers in MMU context. - * - * Copyright (C) 2015 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_MUTEX(mem_list_mutex); - -#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 -#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) - -struct mm_iommu_table_group_mem_t { - struct list_head next; - struct rcu_head rcu; - unsigned long used; - atomic64_t mapped; - unsigned int pageshift; - u64 ua; /* userspace address */ - u64 entries; /* number of entries in hpas/hpages[] */ - /* - * in mm_iommu_get we temporarily use this to store - * struct page address. - * - * We need to convert ua to hpa in real mode. Make it - * simpler by storing physical address. - */ - union { - struct page **hpages; /* vmalloc'ed */ - phys_addr_t *hpas; - }; -#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) - u64 dev_hpa; /* Device memory base address */ -}; - -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - -bool mm_iommu_preregistered(struct mm_struct *mm) -{ - return !list_empty(&mm->context.iommu_group_mem_list); -} -EXPORT_SYMBOL_GPL(mm_iommu_preregistered); - -static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; - unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? 
*/ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); - if (ret) - goto unlock_exit; - - locked_entries = entries; - } - - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) { - ret = -ENOMEM; - goto unlock_exit; - } - - if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { - mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); - mem->dev_hpa = dev_hpa; - goto good_exit; - } - mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; - - /* - * For a starting point for a maximum page size calculation - * we use @ua and @entries natural alignment to allow IOMMU pages - * smaller than huge pages but still bigger than PAGE_SIZE. - */ - mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); - mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); - if (!mem->hpas) { - kfree(mem); - ret = -ENOMEM; - goto unlock_exit; - } - - down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); - up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; - } - - pageshift = PAGE_SHIFT; - for (i = 0; i < entries; ++i) { - struct page *page = mem->hpages[i]; - - /* - * Allow to use larger than 64k IOMMU pages. Only do that - * if we are backed by hugetlb. - */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } - mem->pageshift = min(mem->pageshift, pageshift); - /* - * We don't need struct page reference any more, switch - * to physical address. 
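The starting pageshift in mm_iommu_do_alloc() comes from __ffs(ua | (entries << PAGE_SHIFT)): the lowest set bit of (base | size) bounds the largest page size to which both the user address and the region length are naturally aligned. A worked example (user-space model; __builtin_ctzl stands in for __ffs, 64K pages assumed):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_shift = 16;           /* 64K pages, assumed */
        unsigned long ua = 0x20000000;           /* 512M-aligned VA */
        unsigned long entries = 1UL << 8;        /* 256 pages = 16M */
        unsigned long size = entries << page_shift;

        /* Lowest set bit of (ua | size): beyond this shift, either the
         * base address or the length would lose its alignment. */
        unsigned long max_shift = __builtin_ctzl(ua | size);

        printf("max usable pageshift: %lu\n", max_shift); /* 24 -> 16M */
        return 0;
    }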
- */ - mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; - } - -good_exit: - ret = 0; - atomic64_set(&mem->mapped, 1); - mem->used = 1; - mem->ua = ua; - mem->entries = entries; - *pmem = mem; - - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); - -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); - - mutex_unlock(&mem_list_mutex); - - return ret; -} - -long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, - pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_new); - -long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_newdev); - -static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) -{ - long i; - struct page *page = NULL; - - if (!mem->hpas) - return; - - for (i = 0; i < mem->entries; ++i) { - if (!mem->hpas[i]) - continue; - - page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); - if (!page) - continue; - - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) - SetPageDirty(page); - - put_page(page); - mem->hpas[i] = 0; - } -} - -static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) -{ - - mm_iommu_unpin(mem); - vfree(mem->hpas); - kfree(mem); -} - -static void mm_iommu_free(struct rcu_head *head) -{ - struct mm_iommu_table_group_mem_t *mem = container_of(head, - struct mm_iommu_table_group_mem_t, rcu); - - mm_iommu_do_free(mem); -} - -static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) -{ - list_del_rcu(&mem->next); - call_rcu(&mem->rcu, mm_iommu_free); -} - -long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) -{ - long ret = 0; - unsigned long entries, dev_hpa; - - mutex_lock(&mem_list_mutex); - - if (mem->used == 0) { - ret = -ENOENT; - goto unlock_exit; - } - - --mem->used; - /* There are still users, exit */ - if (mem->used) - goto unlock_exit; - - /* Are there still mappings? 
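The mapped counter set to 1 in the good_exit path is a small reference-count protocol: mm_iommu_mapped_inc() bumps it only while non-zero, mm_iommu_mapped_dec() never drops it below the baseline of 1, and the release path in mm_iommu_put() (the atomic_cmpxchg() just below) may only flip 1 to 0 once no mappings remain. A stand-alone C11 model of the three operations:

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic long mapped = 1;   /* region starts "alive, unmapped" */

    static bool mapped_inc(void)      /* cf. mm_iommu_mapped_inc() */
    {
        long old = atomic_load(&mapped);

        /* increment only while non-zero (not being torn down) */
        while (old != 0)
            if (atomic_compare_exchange_weak(&mapped, &old, old + 1))
                return true;
        return false;
    }

    static void mapped_dec(void)      /* cf. mm_iommu_mapped_dec() */
    {
        long old = atomic_load(&mapped);

        /* decrement, but never below the baseline of 1 */
        while (old > 1)
            if (atomic_compare_exchange_weak(&mapped, &old, old - 1))
                break;
    }

    static bool try_release(void)     /* cf. the atomic_cmpxchg() below */
    {
        long expected = 1;

        return atomic_compare_exchange_strong(&mapped, &expected, 0);
    }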
*/ - if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { - ++mem->used; - ret = -EBUSY; - goto unlock_exit; - } - - /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; - mm_iommu_release(mem); - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - -unlock_exit: - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_put); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_lookup); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - -struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, - unsigned long ua, unsigned long entries) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua == ua) && (mem->entries == entries)) { - ret = mem; - ++mem->used; - break; - } - } - - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_get); - -long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - u64 *va; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - va = &mem->hpas[entry]; - *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); - -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - -bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, - unsigned int pageshift, unsigned long *size) -{ - struct mm_iommu_table_group_mem_t *mem; - unsigned long end; - - list_for_each_entry_rcu(mem, 
&mm->context.iommu_group_mem_list, next) { - if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - continue; - - end = mem->dev_hpa + (mem->entries << PAGE_SHIFT); - if ((mem->dev_hpa <= hpa) && (hpa < end)) { - /* - * Since the IOMMU page size might be bigger than - * PAGE_SIZE, the amount of preregistered memory - * starting from @hpa might be smaller than 1<<pageshift - * and the caller needs to distinguish this situation. - */ - *size = min(1UL << pageshift, end - hpa); - return true; - } - } - - return false; -} -EXPORT_SYMBOL_GPL(mm_iommu_is_devmem); - -long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) -{ - if (atomic64_inc_not_zero(&mem->mapped)) - return 0; - - /* Last mm_iommu_put() has been called, no more mappings allowed() */ - return -ENXIO; -} -EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); - -void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) -{ - atomic64_add_unless(&mem->mapped, -1, 1); -} -EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); - -void mm_iommu_init(struct mm_struct *mm) -{ - INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list); -} diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 6ef36d553cde..57e64273cb33 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1068,7 +1068,7 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR -#include "vphn.h" +#include "book3s64/vphn.h" struct topology_update_data { struct topology_update_data *next; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c deleted file mode 100644 index 16bda049187a..000000000000 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -unsigned long __pmd_frag_nr; -EXPORT_SYMBOL(__pmd_frag_nr); -unsigned long __pmd_frag_size_shift; -EXPORT_SYMBOL(__pmd_frag_size_shift); - -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - /* - * We can use MMU_PAGE_2M here, because only radix - * path look at the psize. - */ - __ptep_set_access_flags(vma, pmdp_ptep(pmdp), - pmd_pte(entry), address, MMU_PAGE_2M); - } - return changed; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - /* - * Make sure hardware valid bit is not set. We don't do - * tlb flush for this update.
- */ - - WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -static void do_nothing(void *unused) -{ - -} -/* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ -void serialize_against_pte_lookup(struct mm_struct *mm) -{ - smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - unsigned long old_pmd; - - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. 
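serialize_against_pte_lookup() above relies on the lockless walkers running with interrupts disabled: the IPI carrying do_nothing() cannot be delivered into such a section, so once smp_call_function_many() returns, every walker that could have observed the old PMD has finished. The walker side of that contract looks roughly like this (kernel-style sketch, not meant to build stand-alone; walk_page_tables() is a hypothetical stand-in for find_current_mm_pte()):

    /* Walker side of the contract (assumed helper names). */
    static unsigned long lockless_walk(struct mm_struct *mm, unsigned long addr)
    {
        unsigned long flags, val;

        local_irq_save(flags);    /* blocks the serializing IPI */
        /*
         * Between here and local_irq_restore() the tables we
         * dereference cannot change category: anyone converting a PMD
         * between "page table pointer" and "huge page" must first run
         * serialize_against_pte_lookup(), which waits for this section
         * to end on every CPU in mm_cpumask(mm).
         */
        val = walk_page_tables(mm, addr);    /* hypothetical */
        local_irq_restore(flags);

        return val;
    }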
- */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - if (radix_enabled()) - prefetch((void *)addr); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* For use by kexec */ -void mmu_cleanup_all(void) -{ - if (radix_enabled()) - radix__mmu_cleanup_all(); - else if (mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - if (radix_enabled()) - return radix__create_section_mapping(start, end, nid); - - return hash__create_section_mapping(start, end, nid); -} - -int __meminit remove_section_mapping(unsigned long start, unsigned long end) -{ - if (radix_enabled()) - return radix__remove_section_mapping(start, end); - - return hash__remove_section_mapping(start, end); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -void __init mmu_partition_table_init(void) -{ - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; - unsigned long ptcr; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); - /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - - /* - * update partition table control register, - * 64 K size. - */ - ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); - powernv_set_nmmu_ptcr(ptcr); -} - -void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) -{ - unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); - - partition_tb[lpid].patb0 = cpu_to_be64(dw0); - partition_tb[lpid].patb1 = cpu_to_be64(dw1); - - /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. - */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); - } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} -EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); - -static pmd_t *get_pmd_from_cache(struct mm_struct *mm) -{ - void *pmd_frag, *ret; - - if (PMD_FRAG_NR == 1) - return NULL; - - spin_lock(&mm->page_table_lock); - ret = mm->context.pmd_frag; - if (ret) { - pmd_frag = ret + PMD_FRAG_SIZE; - /* - * If we have taken up all the fragments mark PTE page NULL - */ - if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) - pmd_frag = NULL; - mm->context.pmd_frag = pmd_frag; - } - spin_unlock(&mm->page_table_lock); - return (pmd_t *)ret; -} - -static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) -{ - void *ret = NULL; - struct page *page; - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) - return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); - return NULL; - } - - atomic_set(&page->pt_frag_refcount, 1); - - ret = page_address(page); - /* - * if we support only one fragment just return the - * allocated page. 
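get_pmd_from_cache() above and __alloc_for_pmdcache() below carve one page into PMD_FRAG_NR fragments handed out sequentially; the page is exhausted exactly when the next-fragment pointer wraps to a page boundary. The pointer arithmetic, modeled in user space (the fragment geometry is assumed for the example):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE     (1UL << 16)          /* 64K, assumed */
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define PMD_FRAG_SIZE (PAGE_SIZE / 4)      /* assumed: 4 frags/page */

    int main(void)
    {
        char *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        char *frag = page;
        int handed_out = 0;

        while (frag) {
            char *next = frag + PMD_FRAG_SIZE;

            handed_out++;
            /* offset 0 within a page means we wrapped: page used up */
            if (((unsigned long)next & ~PAGE_MASK) == 0)
                next = NULL;
            frag = next;
        }
        printf("fragments handed out: %d\n", handed_out); /* 4 */
        free(page);
        return 0;
    }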
- */ - if (PMD_FRAG_NR == 1) - return ret; - - spin_lock(&mm->page_table_lock); - /* - * If we find pgtable_page set, we return - * the allocated page with single fragement - * count. - */ - if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); - mm->context.pmd_frag = ret + PMD_FRAG_SIZE; - } - spin_unlock(&mm->page_table_lock); - - return (pmd_t *)ret; -} - -pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) -{ - pmd_t *pmd; - - pmd = get_pmd_from_cache(mm); - if (pmd) - return pmd; - - return __alloc_for_pmdcache(mm); -} - -void pmd_fragment_free(unsigned long *pmd) -{ - struct page *page = virt_to_page(pmd); - - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static inline void pgtable_free(void *table, int index) -{ - switch (index) { - case PTE_INDEX: - pte_fragment_free(table, 0); - break; - case PMD_INDEX: - pmd_fragment_free(table); - break; - case PUD_INDEX: - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); - break; -#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) - /* 16M hugepd directory at pud level */ - case HTLB_16M_INDEX: - BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); - break; - /* 16G hugepd directory at the pgd level */ - case HTLB_16G_INDEX: - BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); - break; -#endif - /* We don't free pgd table via RCU callback */ - default: - BUG(); - } -} - -#ifdef CONFIG_SMP -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); - pgf |= index; - tlb_remove_table(tlb, (void *)pgf); -} - -void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - return pgtable_free(table, index); -} -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - return pgtable_free(table, index); -} -#endif - -#ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; - -void arch_report_meminfo(struct seq_file *m) -{ - /* - * Hash maps the memory with one size mmu_linear_psize. - * So don't bother to print these on hash - */ - if (!radix_enabled()) - return; - seq_printf(m, "DirectMap4k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); - seq_printf(m, "DirectMap64k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); - seq_printf(m, "DirectMap2M: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); - seq_printf(m, "DirectMap1G: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); -} -#endif /* CONFIG_PROC_FS */ - -pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) -{ - unsigned long pte_val; - - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. 
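ptep_modify_prot_start()/ptep_modify_prot_commit() below (like the huge_ptep_* pair earlier) form a two-phase update: start trades _PAGE_PRESENT for _PAGE_INVALID so the hardware stops using the entry while pte_present() remains true for software, and commit installs the final PTE. A condensed model of the state machine (placeholder bits; the kernel performs the first step atomically via pte_update()):

    #include <stdint.h>

    #define _PAGE_PRESENT (1UL << 0)  /* placeholder bit positions */
    #define _PAGE_INVALID (1UL << 1)

    /* start: fence the entry off from hardware and parallel updates,
     * but keep it "present" as far as software checks go. */
    static uint64_t prot_start(uint64_t *pte)
    {
        uint64_t old = *pte;

        *pte = (old & ~_PAGE_PRESENT) | _PAGE_INVALID;
        return old;    /* caller derives the new protections from this */
    }

    /* commit: install the fully-formed new PTE */
    static void prot_commit(uint64_t *pte, uint64_t new_pte)
    {
        *pte = new_pte;
    }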
- */ - pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); - - return __pte(pte_val); - -} - -void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - if (radix_enabled()) - return radix__ptep_modify_prot_commit(vma, addr, - ptep, old_pte, pte); - set_pte_at(vma->vm_mm, addr, ptep, pte); -} - -/* - * For hash translation mode, we use the deposited table to store hash slot - * information and they are stored at PTRS_PER_PMD offset from related pmd - * location. Hence a pmd move requires deposit and withdraw. - * - * For radix translation with split pmd ptl, we store the deposited table in the - * pmd page. Hence if we have different pmd page we need to withdraw during pmd - * move. - * - * With hash we use deposited table always irrespective of anon or not. - * With radix we use deposited table only for anonymous mapping. - */ -int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl, - struct vm_area_struct *vma) -{ - if (radix_enabled()) - return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); - - return true; -} diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c deleted file mode 100644 index 1fd025dba4a3..000000000000 --- a/arch/powerpc/mm/pgtable-hash64.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright 2005, Paul Mackerras, IBM Corporation. - * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#define CREATE_TRACE_POINTS -#include - -#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) -#warning Limited user VSID range means pagetable space is wasted -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP -/* - * vmemmap is the starting address of the virtual address space where - * struct pages are allocated for all possible PFNs present on the system - * including holes and bad memory (hence sparse). These virtual struct - * pages are stored in sequence in this virtual address space irrespective - * of the fact whether the corresponding PFN is valid or not. This achieves - * constant relationship between address of struct page and its PFN. - * - * During boot or memory hotplug operation when a new memory section is - * added, physical memory allocation (including hash table bolting) will - * be performed for the set of struct pages which are part of the memory - * section. This saves memory by not allocating struct pages for PFNs - * which are not valid. 
- * - * ---------------------------------------------- - * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| - * ---------------------------------------------- - * - * f000000000000000 c000000000000000 - * vmemmap +--------------+ +--------------+ - * + | page struct | +--------------> | page struct | - * | +--------------+ +--------------+ - * | | page struct | +--------------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | + +------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | | +--> | page struct | - * | +--------------+ | | +--------------+ - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | +-------+ | - * | +--------------+ | - * | | page struct | +-----------+ - * | +--------------+ - * | | page struct | No mapping - * | +--------------+ - * | | page struct | No mapping - * v +--------------+ - * - * ----------------------------------------- - * | RELATION BETWEEN STRUCT PAGES AND PFNS| - * ----------------------------------------- - * - * vmemmap +--------------+ +---------------+ - * + | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * v +--------------+ +---------------+ - */ -/* - * On hash-based CPUs, the vmemmap is bolted in the hash table. 
- * - */ -int __meminit hash__vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc; - - if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outside the supported range\n"); - return -1; - } - - rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void hash__vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - -/* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); - } else { - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } - } - - smp_wmb(); - return 0; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - __be64 old_be, tmp; - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(mm, pmdp)); -#endif - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - and. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), - "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) - : "cc" ); - - old = be64_to_cpu(old_be); - - trace_hugepage_update(addr, old, clr, set); - if (old & H_PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. 
Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - serialize_against_pte_lookup(vma->vm_mm); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & H_PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_user_vsid(&mm->context, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - if (mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. 
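The ldarx/stdcx. sequence in hash__pmd_hugepage_update() further up is the powerpc spelling of "spin while H_PAGE_BUSY, then compare-and-swap in the update". The same loop in portable C11 (the bit value is a placeholder):

    #include <stdatomic.h>
    #include <stdint.h>

    #define H_PAGE_BUSY (1UL << 2)    /* placeholder bit */

    /* Clear 'clr' bits and set 'set' bits, waiting out H_PAGE_BUSY;
     * returns the pre-update value, like hash__pmd_hugepage_update(). */
    static uint64_t pmd_update_sketch(_Atomic uint64_t *pmd,
                                      uint64_t clr, uint64_t set)
    {
        uint64_t old;

        for (;;) {
            old = atomic_load(pmd);
            if (old & H_PAGE_BUSY)
                continue;    /* "bne- 1b": someone holds it busy */
            if (atomic_compare_exchange_weak(pmd, &old,
                                             (old & ~clr) | set))
                return old;  /* stdcx. succeeded */
        }
    }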
- */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); - return old_pmd; -} - -int hash__has_transparent_hugepage(void) -{ - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_STRICT_KERNEL_RWX -static bool hash__change_memory_range(unsigned long start, unsigned long end, - unsigned long newpp) -{ - unsigned long idx; - unsigned int step, shift; - - shift = mmu_psize_defs[mmu_linear_psize].shift; - step = 1 << shift; - - start = ALIGN_DOWN(start, step); - end = ALIGN(end, step); // aligns up - - if (start >= end) - return false; - - pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", - start, end, newpp, step); - - for (idx = start; idx < end; idx += step) - /* Not sure if we can do much with the return value */ - mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, - mmu_kernel_ssize); - - return true; -} - -void hash__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); -} - -void hash__mark_initmem_nx(void) -{ - unsigned long start, end, pp; - - start = (unsigned long)__init_begin; - end = (unsigned long)__init_end; - - pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - - WARN_ON(!hash__change_memory_range(start, end, pp)); -} -#endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c deleted file mode 100644 index fcb0169e2d32..000000000000 --- a/arch/powerpc/mm/pgtable-radix.c +++ /dev/null @@ -1,1124 +0,0 @@ -/* - * Page table handling routines for radix page table. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#define pr_fmt(fmt) "radix-mmu: " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int mmu_pid_bits; -unsigned int mmu_base_pid; - -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - -static __ref void *early_alloc_pgtable(unsigned long size, int nid, - unsigned long region_start, unsigned long region_end) -{ - phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; - phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; - void *ptr; - - if (region_start) - min_addr = region_start; - if (region_end) - max_addr = region_end; - - ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", - __func__, size, size, nid, &min_addr, &max_addr); - - return ptr; -} - -static int early_map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, - region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); - } - pudp = pud_offset(pgdp, ea); - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, - region_start, region_end); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE, nid, - region_start, region_end); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -/* - * nid, region_start, and region_end are hints to try to place the page - * table memory in the same node or region. - */ -static int __map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - /* - * Make sure task size is correct as per the max adddr - */ - BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - -#ifdef CONFIG_PPC_64K_PAGES - BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); -#endif - - if (unlikely(!slab_is_available())) - return early_map_kernel_page(ea, pa, flags, map_page_size, - nid, region_start, region_end); - - /* - * Should make page table allocation functions be able to take a - * node, so we can place kernel page tables on the right nodes after - * boot. 
- */ - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -int radix__map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void radix__change_memory_range(unsigned long start, unsigned long end, - unsigned long clear) -{ - unsigned long idx; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - start = ALIGN_DOWN(start, PAGE_SIZE); - end = PAGE_ALIGN(end); // aligns up - - pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", - start, end, clear); - - for (idx = start; idx < end; idx += PAGE_SIZE) { - pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); - if (!pudp) - continue; - if (pud_huge(*pudp)) { - ptep = (pte_t *)pudp; - goto update_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, idx); - if (!pmdp) - continue; - if (pmd_huge(*pmdp)) { - ptep = pmdp_ptep(pmdp); - goto update_the_pte; - } - ptep = pte_alloc_kernel(pmdp, idx); - if (!ptep) - continue; -update_the_pte: - radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); - } - - radix__flush_tlb_kernel_range(start, end); -} - -void radix__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - radix__change_memory_range(start, end, _PAGE_WRITE); -} - -void radix__mark_initmem_nx(void) -{ - unsigned long start = (unsigned long)__init_begin; - unsigned long end = (unsigned long)__init_end; - - radix__change_memory_range(start, end, _PAGE_EXEC); -} -#endif /* CONFIG_STRICT_KERNEL_RWX */ - -static inline void __meminit -print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) -{ - char buf[10]; - - if (end <= start) - return; - - string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - - pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, - exec ? 
" (exec)" : "");
-}
-
-static unsigned long next_boundary(unsigned long addr, unsigned long end)
-{
-#ifdef CONFIG_STRICT_KERNEL_RWX
-	if (addr < __pa_symbol(__init_begin))
-		return __pa_symbol(__init_begin);
-#endif
-	return end;
-}
-
-static int __meminit create_physical_mapping(unsigned long start,
-					     unsigned long end,
-					     int nid)
-{
-	unsigned long vaddr, addr, mapping_size = 0;
-	bool prev_exec, exec = false;
-	pgprot_t prot;
-	int psize;
-
-	start = _ALIGN_UP(start, PAGE_SIZE);
-	for (addr = start; addr < end; addr += mapping_size) {
-		unsigned long gap, previous_size;
-		int rc;
-
-		gap = next_boundary(addr, end) - addr;
-		previous_size = mapping_size;
-		prev_exec = exec;
-
-		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
-		    mmu_psize_defs[MMU_PAGE_1G].shift) {
-			mapping_size = PUD_SIZE;
-			psize = MMU_PAGE_1G;
-		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
-			   mmu_psize_defs[MMU_PAGE_2M].shift) {
-			mapping_size = PMD_SIZE;
-			psize = MMU_PAGE_2M;
-		} else {
-			mapping_size = PAGE_SIZE;
-			psize = mmu_virtual_psize;
-		}
-
-		vaddr = (unsigned long)__va(addr);
-
-		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
-		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
-			prot = PAGE_KERNEL_X;
-			exec = true;
-		} else {
-			prot = PAGE_KERNEL;
-			exec = false;
-		}
-
-		if (mapping_size != previous_size || exec != prev_exec) {
-			print_mapping(start, addr, previous_size, prev_exec);
-			start = addr;
-		}
-
-		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
-		if (rc)
-			return rc;
-
-		update_page_count(psize, 1);
-	}
-
-	print_mapping(start, addr, mapping_size, exec);
-	return 0;
-}
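/*
 * A userspace sketch of the page size selection in create_physical_mapping()
 * above: a 1G or 2M page is used only when the address is aligned to it and
 * the gap to the next boundary is big enough, otherwise base pages are used.
 * The 64K base page and the demo range are assumptions, and the
 * mmu_psize_defs availability checks are elided.
 */
#include <stdio.h>

#define PUD_SZ	(1UL << 30)			/* 1G */
#define PMD_SZ	(1UL << 21)			/* 2M */
#define PAGE_SZ	(1UL << 16)			/* assumed 64K base page */
#define DEMO_IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long addr, end = 0x1000000UL, mapping_size;

	for (addr = 0x3f0000UL; addr < end; addr += mapping_size) {
		unsigned long gap = end - addr;

		if (DEMO_IS_ALIGNED(addr, PUD_SZ) && gap >= PUD_SZ)
			mapping_size = PUD_SZ;
		else if (DEMO_IS_ALIGNED(addr, PMD_SZ) && gap >= PMD_SZ)
			mapping_size = PMD_SZ;
		else
			mapping_size = PAGE_SZ;

		printf("map 0x%07lx with a 0x%lx-byte page\n", addr, mapping_size);
	}
	return 0;
}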
-
-void __init radix_init_pgtable(void)
-{
-	unsigned long rts_field;
-	struct memblock_region *reg;
-
-	/* We don't support slb for radix */
-	mmu_slb_size = 0;
-	/*
-	 * Create the linear mapping, using standard page size for now
-	 */
-	for_each_memblock(memory, reg) {
-		/*
-		 * The memblock allocator is up at this point, so the
-		 * page tables will be allocated within the range. No
-		 * need for a node (which we don't have yet).
-		 */
-
-		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
-			pr_warn("Outside the supported range\n");
-			continue;
-		}
-
-		WARN_ON(create_physical_mapping(reg->base,
-						reg->base + reg->size,
-						-1));
-	}
-
-	/* Find out how many PID bits are supported */
-	if (cpu_has_feature(CPU_FTR_HVMODE)) {
-		if (!mmu_pid_bits)
-			mmu_pid_bits = 20;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-		/*
-		 * When KVM is possible, we only use the top half of the
-		 * PID space to avoid collisions between host and guest PIDs
-		 * which can cause problems due to prefetch when exiting the
-		 * guest with AIL=3
-		 */
-		mmu_base_pid = 1 << (mmu_pid_bits - 1);
-#else
-		mmu_base_pid = 1;
-#endif
-	} else {
-		/* The guest uses the bottom half of the PID space */
-		if (!mmu_pid_bits)
-			mmu_pid_bits = 19;
-		mmu_base_pid = 1;
-	}
-
-	/*
-	 * Allocate Partition table and process table for the
-	 * host.
-	 */
-	BUG_ON(PRTB_SIZE_SHIFT > 36);
-	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
-	/*
-	 * Fill in the process table.
-	 */
-	rts_field = radix__get_tree_size();
-	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
-	/*
-	 * Fill in the partition table. We are supposed to use the effective
-	 * address of the process table here, but our linear mapping also
-	 * enables us to use the physical address here.
-	 */
-	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
-	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
-	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
-
-	/*
-	 * The init_mm context is given the first available (non-zero) PID,
-	 * which is the "guard PID" and contains no page table. PIDR should
-	 * never be set to zero because that duplicates the kernel address
-	 * space at the 0x0... offset (quadrant 0)!
-	 *
-	 * An arbitrary PID that may later be allocated by the PID allocator
-	 * for userspace processes must not be used either, because that
-	 * would cause stale user mappings for that PID on CPUs outside of
-	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
-	 *
-	 * So permanently carve out one PID for the purpose of a guard PID.
-	 */
-	init_mm.context.id = mmu_base_pid;
-	mmu_base_pid++;
-}
-
-static void __init radix_init_partition_table(void)
-{
-	unsigned long rts_field, dw0;
-
-	mmu_partition_table_init();
-	rts_field = radix__get_tree_size();
-	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
-	mmu_partition_table_set_entry(0, dw0, 0);
-
-	pr_info("Initializing Radix MMU\n");
-	pr_info("Partition table %p\n", partition_tb);
-}
-
-void __init radix_init_native(void)
-{
-	register_process_table = native_register_process_table;
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
-	int idx = -1;
-
-	switch (shift) {
-	case 0xc:
-		idx = MMU_PAGE_4K;
-		break;
-	case 0x10:
-		idx = MMU_PAGE_64K;
-		break;
-	case 0x15:
-		idx = MMU_PAGE_2M;
-		break;
-	case 0x1e:
-		idx = MMU_PAGE_1G;
-		break;
-	}
-	return idx;
-}
-
-static int __init radix_dt_scan_page_sizes(unsigned long node,
-					   const char *uname, int depth,
-					   void *data)
-{
-	int size = 0;
-	int shift, idx;
-	unsigned int ap;
-	const __be32 *prop;
-	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-
-	/* We are scanning "cpu" nodes only */
-	if (type == NULL || strcmp(type, "cpu") != 0)
-		return 0;
-
-	/* Find MMU PID size */
-	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
-	if (prop && size == 4)
-		mmu_pid_bits = be32_to_cpup(prop);
-
-	/* Grab page size encodings */
-	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
-	if (!prop)
-		return 0;
-
-	pr_info("Page sizes from device-tree:\n");
-	for (; size >= 4; size -= 4, ++prop) {
-
-		struct mmu_psize_def *def;
-
-		/* the top 3 bits are the AP encoding */
-		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
-		ap = be32_to_cpu(prop[0]) >> 29;
-		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
-
-		idx = get_idx_from_shift(shift);
-		if (idx < 0)
-			continue;
-
-		def = &mmu_psize_defs[idx];
-		def->shift = shift;
-		def->ap = ap;
-	}
-
-	/* needed ?
*/ - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 1; -} - -void __init radix__early_init_devtree(void) -{ - int rc; - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - /* - * let's assume we have page 4k and 64k support - */ - mmu_psize_defs[MMU_PAGE_4K].shift = 12; - mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; - - mmu_psize_defs[MMU_PAGE_64K].shift = 16; - mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; -found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - return; -} - -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - /* - * Radix always uses key0 of the IAMR to determine if an access is - * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction - * fetch. - */ - mtspr(SPRN_IAMR, (1ul << 62)); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void setup_kuap(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) { - pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; - } - - /* Make sure userspace can't change the AMR */ - mtspr(SPRN_UAMOR, 0); - mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); - isync(); -} -#endif - -void __init radix__early_init_mmu(void) -{ - unsigned long lpcr; - -#ifdef CONFIG_PPC_64K_PAGES - /* PAGE_SIZE mappings */ - mmu_virtual_psize = MMU_PAGE_64K; -#else - mmu_virtual_psize = MMU_PAGE_4K; -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; -#endif - /* - * initialize page table size - */ - __pte_index_size = RADIX_PTE_INDEX_SIZE; - __pmd_index_size = RADIX_PMD_INDEX_SIZE; - __pud_index_size = RADIX_PUD_INDEX_SIZE; - __pgd_index_size = RADIX_PGD_INDEX_SIZE; - __pud_cache_index = RADIX_PUD_INDEX_SIZE; - __pte_table_size = RADIX_PTE_TABLE_SIZE; - __pmd_table_size = RADIX_PMD_TABLE_SIZE; - __pud_table_size = RADIX_PUD_TABLE_SIZE; - __pgd_table_size = RADIX_PGD_TABLE_SIZE; - - __pmd_val_bits = RADIX_PMD_VAL_BITS; - __pud_val_bits = RADIX_PUD_VAL_BITS; - __pgd_val_bits = RADIX_PGD_VAL_BITS; - - __kernel_virt_start = RADIX_KERN_VIRT_START; - __vmalloc_start = RADIX_VMALLOC_START; - __vmalloc_end = RADIX_VMALLOC_END; - __kernel_io_start = RADIX_KERN_IO_START; - __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_START; - ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PCI - pci_io_base = ISA_IO_BASE; -#endif - __pte_frag_nr = RADIX_PTE_FRAG_NR; - __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; - __pmd_frag_nr = RADIX_PMD_FRAG_NR; - __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - radix_init_partition_table(); - radix_init_amor(); - } else { - radix_init_pseries(); - } - - 
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - radix_init_pgtable(); - /* Switch to the guard PID before turning on MMU */ - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__early_init_mmu_secondary(void) -{ - unsigned long lpcr; - /* - * update partition table control register and UPRT - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - radix_init_amor(); - } - - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__mmu_cleanup_all(void) -{ - unsigned long lpcr; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); - powernv_set_nmmu_ptcr(0); - radix__flush_tlb_all(); - } -} - -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. - */ - ppc64_rma_size = ULONG_MAX; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_pte_table(pte_t *pte_start, pmd_t *pmd) -{ - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = pte_start + i; - if (!pte_none(*pte)) - return; - } - - pte_free_kernel(&init_mm, pte_start); - pmd_clear(pmd); -} - -static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) -{ - pmd_t *pmd; - int i; - - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd = pmd_start + i; - if (!pmd_none(*pmd)) - return; - } - - pmd_free(&init_mm, pmd_start); - pud_clear(pud); -} - -struct change_mapping_params { - pte_t *pte; - unsigned long start; - unsigned long end; - unsigned long aligned_start; - unsigned long aligned_end; -}; - -static int __meminit stop_machine_change_mapping(void *data) -{ - struct change_mapping_params *params = - (struct change_mapping_params *)data; - - if (!data) - return -1; - - spin_unlock(&init_mm.page_table_lock); - pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); - spin_lock(&init_mm.page_table_lock); - return 0; -} - -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pte_t *pte; - - pte = pte_start + pte_index(addr); - for (; addr < end; addr = next, pte++) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - if (next > end) - next = end; - - if (!pte_present(*pte)) - continue; - - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. 
- */
-			WARN_ONCE(1, "%s: unaligned range\n", __func__);
-			continue;
-		}
-
-		pte_clear(&init_mm, addr, pte);
-	}
-}
-
-/*
- * clear the pte and potentially split the mapping helper
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
-				unsigned long size, pte_t *pte)
-{
-	unsigned long mask = ~(size - 1);
-	unsigned long aligned_start = addr & mask;
-	unsigned long aligned_end = addr + size;
-	struct change_mapping_params params;
-	bool split_region = false;
-
-	if ((end - addr) < size) {
-		/*
-		 * We're going to clear the PTE, but have not yet flushed
-		 * the mapping, so it is time to remap and flush. If the
-		 * effects are visible outside the processor, or if we are
-		 * running in code close to the mapping we cleared, we are
-		 * in trouble.
-		 */
-		if (overlaps_kernel_text(aligned_start, addr) ||
-			overlaps_kernel_text(end, aligned_end)) {
-			/*
-			 * Hack, just return, don't pte_clear
-			 */
-			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
-				  "text, not splitting\n", addr, end);
-			return;
-		}
-		split_region = true;
-	}
-
-	if (split_region) {
-		params.pte = pte;
-		params.start = addr;
-		params.end = end;
-		params.aligned_start = addr & ~(size - 1);
-		params.aligned_end = min_t(unsigned long, aligned_end,
-				(unsigned long)__va(memblock_end_of_DRAM()));
-		stop_machine(stop_machine_change_mapping, &params, NULL);
-		return;
-	}
-
-	pte_clear(&init_mm, addr, pte);
-}
-
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
-			     unsigned long end)
-{
-	unsigned long next;
-	pte_t *pte_base;
-	pmd_t *pmd;
-
-	pmd = pmd_start + pmd_index(addr);
-	for (; addr < end; addr = next, pmd++) {
-		next = pmd_addr_end(addr, end);
-
-		if (!pmd_present(*pmd))
-			continue;
-
-		if (pmd_huge(*pmd)) {
-			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
-			continue;
-		}
-
-		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
-		remove_pte_table(pte_base, addr, next);
-		free_pte_table(pte_base, pmd);
-	}
-}
-
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
-			     unsigned long end)
-{
-	unsigned long next;
-	pmd_t *pmd_base;
-	pud_t *pud;
-
-	pud = pud_start + pud_index(addr);
-	for (; addr < end; addr = next, pud++) {
-		next = pud_addr_end(addr, end);
-
-		if (!pud_present(*pud))
-			continue;
-
-		if (pud_huge(*pud)) {
-			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
-			continue;
-		}
-
-		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
-		remove_pmd_table(pmd_base, addr, next);
-		free_pmd_table(pmd_base, pud);
-	}
-}
-
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
-{
-	unsigned long addr, next;
-	pud_t *pud_base;
-	pgd_t *pgd;
-
-	spin_lock(&init_mm.page_table_lock);
-
-	for (addr = start; addr < end; addr = next) {
-		next = pgd_addr_end(addr, end);
-
-		pgd = pgd_offset_k(addr);
-		if (!pgd_present(*pgd))
-			continue;
-
-		if (pgd_huge(*pgd)) {
-			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
-			continue;
-		}
-
-		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
-		remove_pud_table(pud_base, addr, next);
-	}
-
-	spin_unlock(&init_mm.page_table_lock);
-	radix__flush_tlb_kernel_range(start, end);
-}
-
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
-	if (end >= RADIX_VMALLOC_START) {
-		pr_warn("Outside the supported range\n");
-		return -1;
-	}
-
-	return create_physical_mapping(start, end, nid);
-}
-
-int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
-{
-	remove_pagetable(start, end);
-	return 0;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
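/*
 * A userspace sketch of the arithmetic behind split_kernel_mapping() above:
 * when the range being removed does not cover the whole large page, the
 * leftover pieces [aligned_start, start) and [end, aligned_end) must be
 * remapped with smaller pages once the large PTE is cleared. The 2M page
 * size and the demo addresses are assumptions.
 */
#include <stdio.h>

int main(void)
{
	unsigned long size = 1UL << 21;			/* PMD_SIZE */
	unsigned long addr = 0x40210000UL;		/* inside a 2M page */
	unsigned long end = addr + 0x20000;		/* removing only 128K */
	unsigned long aligned_start = addr & ~(size - 1);
	unsigned long aligned_end = aligned_start + size;

	if (end - addr < size)
		printf("split: remap [0x%lx, 0x%lx) and [0x%lx, 0x%lx)\n",
		       aligned_start, addr, end, aligned_end);
	else
		printf("no split: the large PTE can simply be cleared\n");
	return 0;
}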
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
-				 pgprot_t flags, unsigned int map_page_size,
-				 int nid)
-{
-	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
-}
-
-int __meminit radix__vmemmap_create_mapping(unsigned long start,
-				      unsigned long page_size,
-				      unsigned long phys)
-{
-	/* Create a PTE encoding */
-	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
-	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
-	int ret;
-
-	if ((start + page_size) >= RADIX_VMEMMAP_END) {
-		pr_warn("Outside the supported range\n");
-		return -1;
-	}
-
-	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
-	BUG_ON(ret);
-
-	return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
-{
-	remove_pagetable(start, start + page_size);
-}
-#endif
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-				  pmd_t *pmdp, unsigned long clr,
-				  unsigned long set)
-{
-	unsigned long old;
-
-#ifdef CONFIG_DEBUG_VM
-	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
-	assert_spin_locked(pmd_lockptr(mm, pmdp));
-#endif
-
-	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
-	trace_hugepage_update(addr, old, clr, set);
-
-	return old;
-}
-
-pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
-			pmd_t *pmdp)
-
-{
-	pmd_t pmd;
-
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
-	VM_BUG_ON(pmd_devmap(*pmdp));
-	/*
-	 * khugepaged calls this for normal pmd
-	 */
-	pmd = *pmdp;
-	pmd_clear(pmdp);
-
-	/*FIXME!!  Verify whether we need this kick below */
-	serialize_against_pte_lookup(vma->vm_mm);
-
-	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
-
-	return pmd;
-}
-
-/*
- * For us pgtable_t is pte_t *. In order to save the deposited
- * page table, we consider the allocated page table as a list
- * head. On withdraw we need to make sure we zero out the used
- * list_head memory area.
- */
-void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-				 pgtable_t pgtable)
-{
-	struct list_head *lh = (struct list_head *) pgtable;
-
-	assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-	/* FIFO */
-	if (!pmd_huge_pte(mm, pmdp))
-		INIT_LIST_HEAD(lh);
-	else
-		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
-	pmd_huge_pte(mm, pmdp) = pgtable;
-}
-
-pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-	pte_t *ptep;
-	pgtable_t pgtable;
-	struct list_head *lh;
-
-	assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-	/* FIFO */
-	pgtable = pmd_huge_pte(mm, pmdp);
-	lh = (struct list_head *) pgtable;
-	if (list_empty(lh))
-		pmd_huge_pte(mm, pmdp) = NULL;
-	else {
-		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
-		list_del(lh);
-	}
-	ptep = (pte_t *) pgtable;
-	*ptep = __pte(0);
-	ptep++;
-	*ptep = __pte(0);
-	return pgtable;
-}
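/*
 * A userspace model of the deposit/withdraw trick above: each deposited
 * page-table page doubles as the list node that links the deposits, so no
 * extra allocation is needed, and withdraw must zero the words that served
 * as the list head before handing the page back (the two __pte(0) stores).
 * The exact ordering of the kernel list is simplified to a plain FIFO here.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct lh { struct lh *next, *prev; };	/* overlays the start of a pte page */

static struct lh *oldest;		/* stand-in for pmd_huge_pte() */

static void deposit(void *pgtable)
{
	struct lh *n = pgtable;

	if (!oldest) {
		n->next = n->prev = n;	/* INIT_LIST_HEAD() equivalent */
		oldest = n;
	} else {			/* link in at the tail of the ring */
		n->next = oldest;
		n->prev = oldest->prev;
		oldest->prev->next = n;
		oldest->prev = n;
	}
}

static void *withdraw(void)
{
	struct lh *n = oldest;

	if (!n)
		return NULL;
	if (n->next == n) {
		oldest = NULL;			/* list is now empty */
	} else {
		oldest = n->next;
		n->prev->next = n->next;	/* list_del() equivalent */
		n->next->prev = n->prev;
	}
	memset(n, 0, sizeof(*n));	/* zero the words used as list head */
	return n;
}

int main(void)
{
	void *a = calloc(1, 4096), *b = calloc(1, 4096);

	deposit(a);
	deposit(b);
	printf("first withdraw returns %s\n", withdraw() == a ? "a" : "b");
	free(a);
	free(b);
	return 0;
}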
-
-
-pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
-				     unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t old_pmd;
-	unsigned long old;
-
-	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
-	old_pmd = __pmd(old);
-	/*
-	 * Serialize against find_current_mm_pte which does lock-less
-	 * lookup in page tables with local interrupts disabled. For huge pages
-	 * it casts pmd_t to pte_t. Since format of pte_t is different from
-	 * pmd_t we want to prevent transit from pmd pointing to page table
-	 * to pmd pointing to huge page (and back) while interrupts are disabled.
-	 * We clear pmd to possibly replace it with page table pointer in
-	 * different code paths. So make sure we wait for the parallel
-	 * find_current_mm_pte to finish.
-	 */
-	serialize_against_pte_lookup(mm);
-	return old_pmd;
-}
-
-int radix__has_transparent_hugepage(void)
-{
-	/* For radix 2M at PMD level means thp */
-	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
-		return 1;
-	return 0;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
-				  pte_t entry, unsigned long address, int psize)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
-					      _PAGE_RW | _PAGE_EXEC);
-
-	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
-	/*
-	 * To avoid NMMU hang while relaxing access, we need to mark
-	 * the pte invalid in between.
-	 */
-	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
-		unsigned long old_pte, new_pte;
-
-		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
-		/*
-		 * new value of pte
-		 */
-		new_pte = old_pte | set;
-		radix__flush_tlb_page_psize(mm, address, psize);
-		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
-	} else {
-		__radix_pte_update(ptep, 0, set);
-		/*
-		 * Book3S does not require a TLB flush when relaxing access
-		 * restrictions when the address space is not attached to a
-		 * NMMU, because the core MMU will reload the pte after taking
-		 * an access fault, which is defined by the architecture.
-		 */
-	}
-	/* See ptesync comment in radix__set_pte_at */
-}
-
-void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
-				    unsigned long addr, pte_t *ptep,
-				    pte_t old_pte, pte_t pte)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	/*
-	 * To avoid NMMU hang while relaxing access we need to flush the tlb
-	 * before we set the new value. We need to do this only for radix,
-	 * because hash translation does flush when updating the linux pte.
-	 */
-	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
-	    (atomic_read(&mm->context.copros) > 0))
-		radix__flush_tlb_page(vma, addr);
-
-	set_pte_at(mm, addr, ptep, pte);
-}
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
deleted file mode 100644
index ae7fca40e5b3..000000000000
--- a/arch/powerpc/mm/pkeys.c
+++ /dev/null
@@ -1,428 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * PowerPC Memory Protection Keys management
- *
- * Copyright 2017, Ram Pai, IBM Corporation.
- */
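/*
 * A userspace model of the AMR layout used throughout this file: each pkey
 * owns a 2-bit field counted from the top of the 64-bit register, so key 0
 * occupies bits 63:62. The constants match the pkeyshift()/AMR_*_BIT
 * definitions below; the pkey value and read-modify-write sequence are
 * illustrative assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
#define AMR_WR_BIT 0x2UL
#define PKEY_REG_BITS 64
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))

int main(void)
{
	uint64_t amr = 0;
	int pkey = 2;

	/* deny writes (but not reads) for pkey 2, read-modify-write style */
	amr = (amr & ~(0x3UL << pkeyshift(pkey))) |
	      (AMR_WR_BIT << pkeyshift(pkey));

	printf("pkeyshift(%d) = %d, AMR = 0x%016llx\n",
	       pkey, pkeyshift(pkey), (unsigned long long)amr);
	printf("write denied: %s\n",
	       (amr & (AMR_WR_BIT << pkeyshift(pkey))) ? "yes" : "no");
	return 0;
}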
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-int  pkeys_total;		/* Total pkeys as per device tree */
-u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
-u32  reserved_allocation_mask;  /* Bits set for reserved keys */
-static bool pkey_execute_disable_supported;
-static bool pkeys_devtree_defined;	/* property exported by device tree */
-static u64 pkey_amr_mask;		/* Bits in AMR not to be touched */
-static u64 pkey_iamr_mask;		/* Bits in IAMR not to be touched */
-static u64 pkey_uamor_mask;		/* Bits in UAMOR not to be touched */
-static int execute_only_key = 2;
-
-#define AMR_BITS_PER_PKEY 2
-#define AMR_RD_BIT 0x1UL
-#define AMR_WR_BIT 0x2UL
-#define IAMR_EX_BIT 0x1UL
-#define PKEY_REG_BITS (sizeof(u64)*8)
-#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
-
-static void scan_pkey_feature(void)
-{
-	u32 vals[2];
-	struct device_node *cpu;
-
-	cpu = of_find_node_by_type(NULL, "cpu");
-	if (!cpu)
-		return;
-
-	if (of_property_read_u32_array(cpu,
-			"ibm,processor-storage-keys", vals, 2))
-		return;
-
-	/*
-	 * Since any pkey can be used for data or execute, we will just treat
-	 * all keys as equal and track them as one entity.
-	 */
-	pkeys_total = vals[0];
-	pkeys_devtree_defined = true;
-}
-
-static inline bool pkey_mmu_enabled(void)
-{
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		return pkeys_total;
-	else
-		return cpu_has_feature(CPU_FTR_PKEY);
-}
-
-static int pkey_initialize(void)
-{
-	int os_reserved, i;
-
-	/*
-	 * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
-	 * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
-	 * Ensure that the bits are distinct.
-	 */
-	BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
-		     (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-	/*
-	 * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
-	 * in the vmaflag. Make sure that is really the case.
-	 */
-	BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
-		     __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
-				!= (sizeof(u64) * BITS_PER_BYTE));
-
-	/* scan the device tree for pkey feature */
-	scan_pkey_feature();
-
-	/*
-	 * Let's assume 32 pkeys on P8 bare metal, if it's not defined by the
-	 * device tree. We make this exception since skiboot forgot to expose
-	 * this property on power8.
-	 */
-	if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
-			cpu_has_feature(CPU_FTRS_POWER8))
-		pkeys_total = 32;
-
-	/*
-	 * Adjust the upper limit, based on the number of bits supported by
-	 * arch-neutral code.
-	 */
-	pkeys_total = min_t(int, pkeys_total,
-			((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
-
-	if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
-		static_branch_enable(&pkey_disabled);
-	else
-		static_branch_disable(&pkey_disabled);
-
-	if (static_branch_likely(&pkey_disabled))
-		return 0;
-
-	/*
-	 * The device tree cannot be relied upon to indicate support for
-	 * execute_disable. Instead we use a PVR check.
-	 */
-	if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
-		pkey_execute_disable_supported = false;
-	else
-		pkey_execute_disable_supported = true;
-
-#ifdef CONFIG_PPC_4K_PAGES
-	/*
-	 * The OS can manage only 8 pkeys due to its inability to represent them
-	 * in the Linux 4K PTE.
-	 */
-	os_reserved = pkeys_total - 8;
-#else
-	os_reserved = 0;
-#endif
-	/* Bits are in LE format.
*/ - reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); - - /* register mask is in BE format */ - pkey_amr_mask = ~0x0ul; - pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); - - pkey_iamr_mask = ~0x0ul; - pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - pkey_uamor_mask = ~0x0ul; - pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - /* mark the rest of the keys as reserved and hence unavailable */ - for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { - reserved_allocation_mask |= (0x1 << i); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); - } - initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); - - if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { - /* - * Insufficient number of keys to support - * execute only key. Mark it unavailable. - * Any AMR, UAMOR, IAMR bit set for - * this key is irrelevant since this key - * can never be allocated. - */ - execute_only_key = -1; - } - - return 0; -} - -arch_initcall(pkey_initialize); - -void pkey_mm_init(struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - mm_pkey_allocation_map(mm) = initial_allocation_mask; - mm->context.execute_only_pkey = execute_only_key; -} - -static inline u64 read_amr(void) -{ - return mfspr(SPRN_AMR); -} - -static inline void write_amr(u64 value) -{ - mtspr(SPRN_AMR, value); -} - -static inline u64 read_iamr(void) -{ - if (!likely(pkey_execute_disable_supported)) - return 0x0UL; - - return mfspr(SPRN_IAMR); -} - -static inline void write_iamr(u64 value) -{ - if (!likely(pkey_execute_disable_supported)) - return; - - mtspr(SPRN_IAMR, value); -} - -static inline u64 read_uamor(void) -{ - return mfspr(SPRN_UAMOR); -} - -static inline void write_uamor(u64 value) -{ - mtspr(SPRN_UAMOR, value); -} - -static bool is_pkey_enabled(int pkey) -{ - u64 uamor = read_uamor(); - u64 pkey_bits = 0x3ul << pkeyshift(pkey); - u64 uamor_pkey_bits = (uamor & pkey_bits); - - /* - * Both the bits in UAMOR corresponding to the key should be set or - * reset. - */ - WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); - return !!(uamor_pkey_bits); -} - -static inline void init_amr(int pkey, u8 init_bits) -{ - u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); - u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); - - write_amr(old_amr | new_amr_bits); -} - -static inline void init_iamr(int pkey, u8 init_bits) -{ - u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); - u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); - - write_iamr(old_iamr | new_iamr_bits); -} - -/* - * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that - * specified in @init_val. 
- */ -int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) -{ - u64 new_amr_bits = 0x0ul; - u64 new_iamr_bits = 0x0ul; - - if (!is_pkey_enabled(pkey)) - return -EINVAL; - - if (init_val & PKEY_DISABLE_EXECUTE) { - if (!pkey_execute_disable_supported) - return -EINVAL; - new_iamr_bits |= IAMR_EX_BIT; - } - init_iamr(pkey, new_iamr_bits); - - /* Set the bits we need in AMR: */ - if (init_val & PKEY_DISABLE_ACCESS) - new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; - else if (init_val & PKEY_DISABLE_WRITE) - new_amr_bits |= AMR_WR_BIT; - - init_amr(pkey, new_amr_bits); - return 0; -} - -void thread_pkey_regs_save(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* - * TODO: Skip saving registers if @thread hasn't used any keys yet. - */ - thread->amr = read_amr(); - thread->iamr = read_iamr(); - thread->uamor = read_uamor(); -} - -void thread_pkey_regs_restore(struct thread_struct *new_thread, - struct thread_struct *old_thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - if (old_thread->amr != new_thread->amr) - write_amr(new_thread->amr); - if (old_thread->iamr != new_thread->iamr) - write_iamr(new_thread->iamr); - if (old_thread->uamor != new_thread->uamor) - write_uamor(new_thread->uamor); -} - -void thread_pkey_regs_init(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - thread->amr = pkey_amr_mask; - thread->iamr = pkey_iamr_mask; - thread->uamor = pkey_uamor_mask; - - write_uamor(pkey_uamor_mask); - write_amr(pkey_amr_mask); - write_iamr(pkey_iamr_mask); -} - -static inline bool pkey_allows_readwrite(int pkey) -{ - int pkey_shift = pkeyshift(pkey); - - if (!is_pkey_enabled(pkey)) - return true; - - return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); -} - -int __execute_only_pkey(struct mm_struct *mm) -{ - return mm->context.execute_only_pkey; -} - -static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) -{ - /* Do this check first since the vm_flags should be hot */ - if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) - return false; - - return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); -} - -/* - * This should only be called for *plain* mprotect calls. - */ -int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, - int pkey) -{ - /* - * If the currently associated pkey is execute-only, but the requested - * protection is not execute-only, move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) - return 0; - - /* - * The requested protection is execute-only. Hence let's use an - * execute-only pkey. - */ - if (prot == PROT_EXEC) { - pkey = execute_only_pkey(vma->vm_mm); - if (pkey > 0) - return pkey; - } - - /* Nothing to override. 
*/ - return vma_pkey(vma); -} - -static bool pkey_access_permitted(int pkey, bool write, bool execute) -{ - int pkey_shift; - u64 amr; - - if (!is_pkey_enabled(pkey)) - return true; - - pkey_shift = pkeyshift(pkey); - if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) - return true; - - amr = read_amr(); /* Delay reading amr until absolutely needed */ - return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || - (write && !(amr & (AMR_WR_BIT << pkey_shift)))); -} - -bool arch_pte_access_permitted(u64 pte, bool write, bool execute) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - - return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); -} - -/* - * We only want to enforce protection keys on the current thread because we - * effectively have no access to AMR/IAMR for other threads or any way to tell - * which AMR/IAMR in a threaded process we could use. - * - * So do not enforce things if the VMA is not from the current mm, or if we are - * in a kernel thread. - */ -static inline bool vma_is_foreign(struct vm_area_struct *vma) -{ - if (!current->mm) - return true; - - /* if it is not our ->mm, it has to be foreign */ - if (current->mm != vma->vm_mm) - return true; - - return false; -} - -bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, - bool execute, bool foreign) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - /* - * Do not enforce our key-permissions on a foreign vma. - */ - if (foreign || vma_is_foreign(vma)) - return true; - - return pkey_access_permitted(vma_pkey(vma), write, execute); -} - -void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* Duplicate the oldmm pkey state in mm: */ - mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); - mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; -} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c deleted file mode 100644 index 89e4531de64b..000000000000 --- a/arch/powerpc/mm/slb.c +++ /dev/null @@ -1,832 +0,0 @@ -/* - * PowerPC64 SLB support. - * - * Copyright (C) 2004 David Gibson , IBM - * Based on earlier code written by: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -enum slb_index { - LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - KSTACK_INDEX = 1, /* Kernel stack map */ -}; - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); - -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} - -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, - unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, - unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} - -static void assert_slb_presence(bool present, unsigned long ea) -{ -#ifdef CONFIG_DEBUG_VM - unsigned long tmp; - - WARN_ON_ONCE(mfmsr() & MSR_EE); - - if (!cpu_has_feature(CPU_FTR_ARCH_206)) - return; - - /* - * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware - * ignores all other bits from 0-27, so just clear them all. - */ - ea &= ~((1UL << 28) - 1); - asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); - - WARN_ON(present == (tmp == 0)); -#endif -} - -static inline void slb_shadow_update(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - struct slb_shadow *p = get_slb_shadow(); - - /* - * Clear the ESID first so the entry is not valid while we are - * updating it. No write barriers are needed here, provided - * we only update the current CPU's SLB shadow buffer. - */ - WRITE_ONCE(p->save_area[index].esid, 0); - WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); - WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); -} - -static inline void slb_shadow_clear(enum slb_index index) -{ - WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); -} - -static inline void create_shadowed_slbe(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - /* - * Updating the shadow buffer before writing the SLB ensures - * we don't get a stale entry here if we get preempted by PHYP - * between these two statements. - */ - slb_shadow_update(ea, ssize, flags, index); - - assert_slb_presence(false, ea); - asm volatile("slbmte %0,%1" : - : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, index)) - : "memory" ); -} - -/* - * Insert bolted entries into SLB (which may not be empty, so don't clear - * slb_cache_ptr). - */ -void __slb_restore_bolted_realmode(void) -{ - struct slb_shadow *p = get_slb_shadow(); - enum slb_index index; - - /* No isync needed because realmode. */ - for (index = 0; index < SLB_NUM_BOLTED; index++) { - asm volatile("slbmte %0,%1" : - : "r" (be64_to_cpu(p->save_area[index].vsid)), - "r" (be64_to_cpu(p->save_area[index].esid))); - } - - assert_slb_presence(true, local_paca->kstack); -} - -/* - * Insert the bolted entries into an empty SLB. - */ -void slb_restore_bolted_realmode(void) -{ - __slb_restore_bolted_realmode(); - get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; -} - -/* - * This flushes all SLB entries including 0, so it must be realmode. - */ -void slb_flush_all_realmode(void) -{ - asm volatile("slbmte %0,%0; slbia" : : "r" (0)); -} - -/* - * This flushes non-bolted entries, it can be run in virtual mode. Must - * be called with interrupts disabled. 
- */
-void slb_flush_and_restore_bolted(void)
-{
-	struct slb_shadow *p = get_slb_shadow();
-
-	BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
-
-	WARN_ON(!irqs_disabled());
-
-	/*
-	 * We can't take a PMU exception in the following code, so hard
-	 * disable interrupts.
-	 */
-	hard_irq_disable();
-
-	asm volatile("isync\n"
-		     "slbia\n"
-		     "slbmte  %0, %1\n"
-		     "isync\n"
-		     :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
-			"r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
-		     : "memory");
-	assert_slb_presence(true, get_paca()->kstack);
-
-	get_paca()->slb_cache_ptr = 0;
-
-	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-}
-
-void slb_save_contents(struct slb_entry *slb_ptr)
-{
-	int i;
-	unsigned long e, v;
-
-	/* Save slb_cache_ptr value. */
-	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
-
-	if (!slb_ptr)
-		return;
-
-	for (i = 0; i < mmu_slb_size; i++) {
-		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
-		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
-		slb_ptr->esid = e;
-		slb_ptr->vsid = v;
-		slb_ptr++;
-	}
-}
-
-void slb_dump_contents(struct slb_entry *slb_ptr)
-{
-	int i, n;
-	unsigned long e, v;
-	unsigned long llp;
-
-	if (!slb_ptr)
-		return;
-
-	pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
-	pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
-
-	for (i = 0; i < mmu_slb_size; i++) {
-		e = slb_ptr->esid;
-		v = slb_ptr->vsid;
-		slb_ptr++;
-
-		if (!e && !v)
-			continue;
-
-		pr_err("%02d %016lx %016lx\n", i, e, v);
-
-		if (!(e & SLB_ESID_V)) {
-			pr_err("\n");
-			continue;
-		}
-		llp = v & SLB_VSID_LLP;
-		if (v & SLB_VSID_B_1T) {
-			pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
-			       GET_ESID_1T(e),
-			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
-		} else {
-			pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
-			       GET_ESID(e),
-			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
-		}
-	}
-	pr_err("----------------------------------\n");
-
-	/* Dump slb cache entries as well. */
-	pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
-	pr_err("Valid SLB cache entries:\n");
-	n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
-	for (i = 0; i < n; i++)
-		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
-	pr_err("Rest of SLB cache entries:\n");
-	for (i = n; i < SLB_CACHE_ENTRIES; i++)
-		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
-}
-
-void slb_vmalloc_update(void)
-{
-	/*
-	 * vmalloc is not bolted, so just have to flush non-bolted.
-	 */
-	slb_flush_and_restore_bolted();
-}
-
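/*
 * A userspace model of the ordering contract in slb_shadow_update() above:
 * the ESID (which carries the valid bit) is cleared first, the VSID is
 * written while the entry is invalid, and a valid ESID is written last, so
 * the hypervisor can never observe a valid entry paired with a stale VSID.
 * SLB_ESID_V and the demo values are assumptions made for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define SLB_ESID_V 0x0000000008000000UL		/* assumed valid bit */

struct shadow_entry { uint64_t esid, vsid; };

static void shadow_update(struct shadow_entry *e, uint64_t esid, uint64_t vsid)
{
	e->esid = 0;			/* 1. invalidate the entry */
	e->vsid = vsid;			/* 2. write the VSID while invalid */
	e->esid = esid | SLB_ESID_V;	/* 3. publish as valid, last */
}

int main(void)
{
	struct shadow_entry e = { 0, 0 };

	shadow_update(&e, 0xc000000000000000UL, 0x4000000000000400UL);
	printf("esid=0x%016llx vsid=0x%016llx\n",
	       (unsigned long long)e.esid, (unsigned long long)e.vsid);
	return 0;
}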
-static bool preload_hit(struct thread_info *ti, unsigned long esid)
-{
-	unsigned char i;
-
-	for (i = 0; i < ti->slb_preload_nr; i++) {
-		unsigned char idx;
-
-		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
-		if (esid == ti->slb_preload_esid[idx])
-			return true;
-	}
-	return false;
-}
-
-static bool preload_add(struct thread_info *ti, unsigned long ea)
-{
-	unsigned char idx;
-	unsigned long esid;
-
-	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
-		/* EAs are stored >> 28 so 256MB segments don't need clearing */
-		if (ea & ESID_MASK_1T)
-			ea &= ESID_MASK_1T;
-	}
-
-	esid = ea >> SID_SHIFT;
-
-	if (preload_hit(ti, esid))
-		return false;
-
-	idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
-	ti->slb_preload_esid[idx] = esid;
-	if (ti->slb_preload_nr == SLB_PRELOAD_NR)
-		ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
-	else
-		ti->slb_preload_nr++;
-
-	return true;
-}
-
-static void preload_age(struct thread_info *ti)
-{
-	if (!ti->slb_preload_nr)
-		return;
-	ti->slb_preload_nr--;
-	ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
-}
-
-void slb_setup_new_exec(void)
-{
-	struct thread_info *ti = current_thread_info();
-	struct mm_struct *mm = current->mm;
-	unsigned long exec = 0x10000000;
-
-	WARN_ON(irqs_disabled());
-
-	/*
-	 * preload cache can only be used to determine whether an SLB
-	 * entry exists if it does not start to overflow.
-	 */
-	if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
-		return;
-
-	hard_irq_disable();
-
-	/*
-	 * We have no good place to clear the slb preload cache on exec,
-	 * flush_thread is about the earliest arch hook but that happens
-	 * after we switch to the mm and have already preloaded the SLBEs.
-	 *
-	 * For the most part that's probably okay to use entries from the
-	 * previous exec, they will age out if unused. It may turn out to
-	 * be an advantage to clear the cache before switching to it,
-	 * however.
-	 */
-
-	/*
-	 * preload some userspace segments into the SLB.
-	 * Almost all 32 and 64bit PowerPC executables are linked at
-	 * 0x10000000 so it makes sense to preload this segment.
-	 */
-	if (!is_kernel_addr(exec)) {
-		if (preload_add(ti, exec))
-			slb_allocate_user(mm, exec);
-	}
-
-	/* Libraries and mmaps. */
-	if (!is_kernel_addr(mm->mmap_base)) {
-		if (preload_add(ti, mm->mmap_base))
-			slb_allocate_user(mm, mm->mmap_base);
-	}
-
-	/* see switch_slb */
-	asm volatile("isync" : : : "memory");
-
-	local_irq_enable();
-}
-
-void preload_new_slb_context(unsigned long start, unsigned long sp)
-{
-	struct thread_info *ti = current_thread_info();
-	struct mm_struct *mm = current->mm;
-	unsigned long heap = mm->start_brk;
-
-	WARN_ON(irqs_disabled());
-
-	/* see above */
-	if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
-		return;
-
-	hard_irq_disable();
-
-	/* Userspace entry address. */
-	if (!is_kernel_addr(start)) {
-		if (preload_add(ti, start))
-			slb_allocate_user(mm, start);
-	}
-
-	/* Top of stack, grows down. */
-	if (!is_kernel_addr(sp)) {
-		if (preload_add(ti, sp))
-			slb_allocate_user(mm, sp);
-	}
-
-	/* Bottom of heap, grows up. */
-	if (heap && !is_kernel_addr(heap)) {
-		if (preload_add(ti, heap))
-			slb_allocate_user(mm, heap);
-	}
-
-	/* see switch_slb */
-	asm volatile("isync" : : : "memory");
-
-	local_irq_enable();
-}
-
-
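/*
 * A userspace model of the SLB preload cache managed by preload_hit(),
 * preload_add() and preload_age() above: a small ring buffer of ESIDs with a
 * tail index and a count, where aging drops the oldest entry and adding an
 * already-cached ESID is a no-op. SLB_PRELOAD_NR matches the kernel's cache
 * size; the demo ESID values are arbitrary.
 */
#include <stdio.h>
#include <stdbool.h>

#define SLB_PRELOAD_NR 16

static unsigned int esids[SLB_PRELOAD_NR];
static unsigned char tail, nr;

static bool hit(unsigned int esid)
{
	unsigned char i;

	for (i = 0; i < nr; i++)
		if (esids[(tail + i) % SLB_PRELOAD_NR] == esid)
			return true;
	return false;
}

static bool add(unsigned int esid)
{
	if (hit(esid))
		return false;			/* already cached */
	esids[(tail + nr) % SLB_PRELOAD_NR] = esid;
	if (nr == SLB_PRELOAD_NR)
		tail = (tail + 1) % SLB_PRELOAD_NR;	/* overwrite oldest */
	else
		nr++;
	return true;
}

static void age(void)
{
	if (!nr)
		return;
	nr--;					/* drop the oldest entry */
	tail = (tail + 1) % SLB_PRELOAD_NR;
}

int main(void)
{
	add(0x100);
	add(0x200);
	add(0x100);				/* duplicate is a no-op */
	age();					/* drops 0x100 */
	printf("0x100 cached: %s, 0x200 cached: %s\n",
	       hit(0x100) ? "yes" : "no", hit(0x200) ? "yes" : "no");
	return 0;
}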
-/* Flush all user entries from the segment table of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	unsigned char i;
-
-	/*
-	 * We need interrupts hard-disabled here, not just soft-disabled,
-	 * so that a PMU interrupt can't occur, which might try to access
-	 * user memory (to get a stack trace) and possibly cause an SLB miss
-	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
-	 */
-	hard_irq_disable();
-	asm volatile("isync" : : : "memory");
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-		/*
-		 * SLBIA IH=3 invalidates all Class=1 SLBEs and their
-		 * associated lookaside structures, which matches what
-		 * switch_slb wants. So ARCH_300 does not use the slb
-		 * cache.
-		 */
-		asm volatile(PPC_SLBIA(3));
-	} else {
-		unsigned long offset = get_paca()->slb_cache_ptr;
-
-		if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
-		    offset <= SLB_CACHE_ENTRIES) {
-			unsigned long slbie_data = 0;
-
-			for (i = 0; i < offset; i++) {
-				unsigned long ea;
-
-				ea = (unsigned long)
-					get_paca()->slb_cache[i] << SID_SHIFT;
-				/*
-				 * Could assert_slb_presence(true) here, but
-				 * hypervisor or machine check could have come
-				 * in and removed the entry at this point.
-				 */
-
-				slbie_data = ea;
-				slbie_data |= user_segment_size(slbie_data)
-						<< SLBIE_SSIZE_SHIFT;
-				slbie_data |= SLBIE_C; /* user slbs have C=1 */
-				asm volatile("slbie %0" : : "r" (slbie_data));
-			}
-
-			/* Workaround POWER5 < DD2.1 issue */
-			if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
-				asm volatile("slbie %0" : : "r" (slbie_data));
-
-		} else {
-			struct slb_shadow *p = get_slb_shadow();
-			unsigned long ksp_esid_data =
-				be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
-			unsigned long ksp_vsid_data =
-				be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
-			asm volatile(PPC_SLBIA(1) "\n"
-				     "slbmte	%0,%1\n"
-				     "isync"
-				     :: "r"(ksp_vsid_data),
-					"r"(ksp_esid_data));
-
-			get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-		}
-
-		get_paca()->slb_cache_ptr = 0;
-	}
-	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
-	copy_mm_to_paca(mm);
-
-	/*
-	 * We gradually age out SLBs after a number of context switches to
-	 * reduce reload overhead of unused entries (like we do with FP/VEC
-	 * reload). Each time we wrap 256 switches, take an entry out of the
-	 * SLB preload cache.
-	 */
-	tsk->thread.load_slb++;
-	if (!tsk->thread.load_slb) {
-		unsigned long pc = KSTK_EIP(tsk);
-
-		preload_age(ti);
-		preload_add(ti, pc);
-	}
-
-	for (i = 0; i < ti->slb_preload_nr; i++) {
-		unsigned char idx;
-		unsigned long ea;
-
-		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
-		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
-
-		slb_allocate_user(mm, ea);
-	}
-
-	/*
-	 * Synchronize slbmte preloads with possible subsequent user memory
-	 * address accesses by the kernel (user mode won't happen until
-	 * rfid, which is safe).
-	 */
-	asm volatile("isync" : : : "memory");
-}
-
-void slb_set_size(u16 size)
-{
-	mmu_slb_size = size;
-}
-
-void slb_initialize(void)
-{
-	unsigned long linear_llp, vmalloc_llp, io_llp;
-	unsigned long lflags;
-	static int slb_encoding_inited;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-	unsigned long vmemmap_llp;
-#endif
-
-	/* Prepare our SLB miss handler based on our page size */
-	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
-	io_llp = mmu_psize_defs[mmu_io_psize].sllp;
-	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
-	get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-	vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
-	if (!slb_encoding_inited) {
-		slb_encoding_inited = 1;
-		pr_devel("SLB: linear  LLP = %04lx\n", linear_llp);
-		pr_devel("SLB: io      LLP = %04lx\n", io_llp);
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-		pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
-#endif
-	}
-
-	get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
-	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
-	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
-	lflags = SLB_VSID_KERNEL | linear_llp;
-
-	/* Invalidate the entire SLB (even entry 0) & all the ERATS */
-	asm volatile("isync":::"memory");
-	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
-	asm volatile("isync; slbia; isync":::"memory");
-	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
-
-	/*
-	 * For the boot cpu, we're running on the stack in init_thread_union,
-	 * which is in the first segment of the linear mapping, and also
-	 * get_paca()->kstack hasn't been initialized yet.
-	 * For secondary cpus, we need to bolt the kernel stack entry now.
-	 */
-	slb_shadow_clear(KSTACK_INDEX);
-	if (raw_smp_processor_id() != boot_cpuid &&
-	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
-		create_shadowed_slbe(get_paca()->kstack,
-				     mmu_kernel_ssize, lflags, KSTACK_INDEX);
-
-	asm volatile("isync":::"memory");
-}
-
-static void slb_cache_update(unsigned long esid_data)
-{
-	int slb_cache_index;
-
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		return; /* ISAv3.0B and later does not use slb_cache */
-
-	/*
-	 * Now update slb cache entries
-	 */
-	slb_cache_index = local_paca->slb_cache_ptr;
-	if (slb_cache_index < SLB_CACHE_ENTRIES) {
-		/*
-		 * We have space in slb cache for optimized switch_slb().
-		 * Top 36 bits from esid_data as per ISA
-		 */
-		local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
-		local_paca->slb_cache_ptr++;
-	} else {
-		/*
-		 * Our cache is full and the current cache content strictly
-		 * doesn't indicate the active SLB contents. Bump the ptr
-		 * so that switch_slb() will ignore the cache.
-		 */
-		local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
-	}
-}
-
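/*
 * A userspace model of slb_cache_update() above: recently used user ESIDs
 * are remembered so that switch_slb() can invalidate them one by one with
 * slbie; once the cache overflows, the pointer is bumped past the end so
 * the next switch falls back to flushing the whole SLB. SLB_CACHE_ENTRIES
 * matches the kernel; SID_SHIFT is 28 for 256MB segments.
 */
#include <stdio.h>

#define SLB_CACHE_ENTRIES 8

static unsigned int cache[SLB_CACHE_ENTRIES];
static unsigned int ptr;

static void cache_update(unsigned long esid_data)
{
	if (ptr < SLB_CACHE_ENTRIES)
		cache[ptr++] = esid_data >> 28;	/* keep the top 36 bits */
	else
		ptr = SLB_CACHE_ENTRIES + 1;	/* tell switch_slb to flush all */
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)		/* one fault per 256M segment */
		cache_update(0x10000000UL * (i + 1));

	printf("ptr = %u (%s)\n", ptr,
	       ptr > SLB_CACHE_ENTRIES ? "overflowed: full flush" : "per-entry slbie");
	return 0;
}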
-static enum slb_index alloc_slb_index(bool kernel)
-{
-	enum slb_index index;
-
-	/*
-	 * The allocation bitmaps can become out of sync with the SLB
-	 * when the _switch code does slbie when bolting a new stack
-	 * segment and it must not be anywhere else in the SLB. This leaves
-	 * a kernel allocated entry that is unused in the SLB. With very
-	 * large systems or small segment sizes, the bitmaps could slowly
-	 * fill with these entries. They will eventually be cleared out
-	 * by the round robin allocator in that case, so it's probably not
-	 * worth accounting for.
-	 */
-
-	/*
-	 * SLBs beyond 32 entries are allocated with stab_rr only.
-	 * POWER7/8/9 have 32 SLB entries; this could be expanded if a
-	 * future CPU has more.
-	 */
-	if (local_paca->slb_used_bitmap != U32_MAX) {
-		index = ffz(local_paca->slb_used_bitmap);
-		local_paca->slb_used_bitmap |= 1U << index;
-		if (kernel)
-			local_paca->slb_kern_bitmap |= 1U << index;
-	} else {
-		/* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
-		index = local_paca->stab_rr;
-		if (index < (mmu_slb_size - 1))
-			index++;
-		else
-			index = SLB_NUM_BOLTED;
-		local_paca->stab_rr = index;
-		if (index < 32) {
-			if (kernel)
-				local_paca->slb_kern_bitmap |= 1U << index;
-			else
-				local_paca->slb_kern_bitmap &= ~(1U << index);
-		}
-	}
-	BUG_ON(index < SLB_NUM_BOLTED);
-
-	return index;
-}
-
-static long slb_insert_entry(unsigned long ea, unsigned long context,
-				unsigned long flags, int ssize, bool kernel)
-{
-	unsigned long vsid;
-	unsigned long vsid_data, esid_data;
-	enum slb_index index;
-
-	vsid = get_vsid(context, ea, ssize);
-	if (!vsid)
-		return -EFAULT;
-
-	/*
-	 * There must not be a kernel SLB fault in alloc_slb_index or before
-	 * slbmte here or the allocation bitmaps could get out of whack with
-	 * the SLB.
-	 *
-	 * User SLB faults or preloads take this path which might get inlined
-	 * into the caller, so add compiler barriers here to ensure unsafe
-	 * memory accesses do not come between.
-	 */
-	barrier();
-
-	index = alloc_slb_index(kernel);
-
-	vsid_data = __mk_vsid_data(vsid, ssize, flags);
-	esid_data = mk_esid_data(ea, ssize, index);
-
-	/*
-	 * No need for an isync before or after this slbmte. The exception
-	 * we enter with and the rfid we exit with are context synchronizing.
-	 * User preloads should add isync afterwards in case the kernel
-	 * accesses user memory before it returns to userspace with rfid.
-	 */
-	assert_slb_presence(false, ea);
-	asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
-
-	barrier();
-
-	if (!kernel)
-		slb_cache_update(esid_data);
-
-	return 0;
-}
-
-static long slb_allocate_kernel(unsigned long ea, unsigned long id)
-{
-	unsigned long context;
-	unsigned long flags;
-	int ssize;
-
-	if (id == LINEAR_MAP_REGION_ID) {
-
-		/* We only support up to MAX_PHYSMEM_BITS */
-		if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
-			return -EFAULT;
-
-		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-	} else if (id == VMEMMAP_REGION_ID) {
-
-		if (ea >= H_VMEMMAP_END)
-			return -EFAULT;
-
-		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
-	} else if (id == VMALLOC_REGION_ID) {
-
-		if (ea >= H_VMALLOC_END)
-			return -EFAULT;
-
-		flags = local_paca->vmalloc_sllp;
-
-	} else if (id == IO_REGION_ID) {
-
-		if (ea >= H_KERN_IO_END)
-			return -EFAULT;
-
-		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
-
-	} else {
-		return -EFAULT;
-	}
-
-	ssize = MMU_SEGSIZE_1T;
-	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
-		ssize = MMU_SEGSIZE_256M;
-
-	context = get_kernel_context(ea);
-
-	return slb_insert_entry(ea, context, flags, ssize, true);
-}
-
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
-{
-	unsigned long context;
-	unsigned long flags;
-	int bpsize;
-	int ssize;
-
-	/*
-	 * Consider this a bad access if we take an SLB miss
-	 * on an address above the addr limit.
- */ - if (ea >= mm_ctx_slb_addr_limit(&mm->context)) - return -EFAULT; - - context = get_user_context(&mm->context, ea); - if (!context) - return -EFAULT; - - if (unlikely(ea >= H_PGTABLE_RANGE)) { - WARN_ON(1); - return -EFAULT; - } - - ssize = user_segment_size(ea); - - bpsize = get_slice_psize(mm, ea); - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - - return slb_insert_entry(ea, context, flags, ssize, false); -} - -long do_slb_fault(struct pt_regs *regs, unsigned long ea) -{ - unsigned long id = get_region_id(ea); - - /* IRQs are not reconciled here, so can't check irqs_disabled */ - VM_WARN_ON(mfmsr() & MSR_EE); - - if (unlikely(!(regs->msr & MSR_RI))) - return -EINVAL; - - /* - * SLB kernel faults must be very careful not to touch anything - * that is not bolted. E.g., PACA and global variables are okay, - * mm->context stuff is not. - * - * SLB user faults can access all of kernel memory, but must be - * careful not to touch things like IRQ state because it is not - * "reconciled" here. The difficulty is that we must use - * fast_exception_return to return from kernel SLB faults without - * looking at possible non-bolted memory. We could test user vs - * kernel faults in the interrupt handler asm and do a full fault, - * reconcile, ret_from_except for user faults which would make them - * first class kernel code. But for performance it's probably nicer - * if they go via fast_exception_return too. - */ - if (id >= LINEAR_MAP_REGION_ID) { - long err; -#ifdef CONFIG_DEBUG_VM - /* Catch recursive kernel SLB faults. */ - BUG_ON(local_paca->in_kernel_slb_handler); - local_paca->in_kernel_slb_handler = 1; -#endif - err = slb_allocate_kernel(ea, id); -#ifdef CONFIG_DEBUG_VM - local_paca->in_kernel_slb_handler = 0; -#endif - return err; - } else { - struct mm_struct *mm = current->mm; - long err; - - if (unlikely(!mm)) - return -EFAULT; - - err = slb_allocate_user(mm, ea); - if (!err) - preload_add(current_thread_info(), ea); - - return err; - } -} - -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) -{ - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c deleted file mode 100644 index 473dd430e306..000000000000 --- a/arch/powerpc/mm/subpage-prot.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright 2007-2008 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * Free all pages allocated for subpage protection maps and pointers. - * Also makes sure that the subpage_prot_table structure is - * reinitialized for the next user. 
- */ -void subpage_prot_free(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - unsigned long i, j, addr; - u32 **p; - - if (!spt) - return; - - for (i = 0; i < 4; ++i) { - if (spt->low_prot[i]) { - free_page((unsigned long)spt->low_prot[i]); - spt->low_prot[i] = NULL; - } - } - addr = 0; - for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { - p = spt->protptrs[i]; - if (!p) - continue; - spt->protptrs[i] = NULL; - for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; - ++j, addr += PAGE_SIZE) - if (p[j]) - free_page((unsigned long)p[j]); - free_page((unsigned long)p); - } - spt->maxaddr = 0; - kfree(spt); -} - -static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, - int npages) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return; - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) - return; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - arch_enter_lazy_mmu_mode(); - for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0, 0); - addr += PAGE_SIZE; - ++pte; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); -} - -/* - * Clear the subpage protection map for an address range, allowing - * all accesses that are allowed by the pte permissions. - */ -static void subpage_prot_clear(unsigned long addr, unsigned long len) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) - goto err_out; - - limit = addr + len; - if (limit > spt->maxaddr) - limit = spt->maxaddr; - for (; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) - continue; - } - spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!spp) - continue; - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - memset(spp, 0, nw * sizeof(u32)); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - -err_out: - up_write(&mm->mmap_sem); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; - - /* - * We don't try too hard, we just mark all the vma in that range - * VM_NOHUGEPAGE and split them. 
- */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); - vma = vma->vm_next; - } -} -#else -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - return; -} -#endif - -/* - * Copy in a subpage protection map for an address range. - * The map has 2 bits per 4k subpage, so 32 bits per 64k page. - * Each 2-bit field is 0 to allow any access, 1 to prevent writes, - * 2 or 3 to prevent all accesses. - * Note that the normal page protections also apply; the subpage - * protection mechanism is an additional constraint, so putting 0 - * in a 2-bit field won't allow writes to a page that is otherwise - * write-protected. - */ -SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, - unsigned long, len, u32 __user *, map) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - int err; - - if (radix_enabled()) - return -ENOENT; - - /* Check parameters */ - if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= mm->task_size || len >= mm->task_size || - addr + len > mm->task_size) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - - if (!map) { - /* Clear out the protection map for the address range */ - subpage_prot_clear(addr, len); - return 0; - } - - if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) - return -EFAULT; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) { - /* - * Allocate subpage prot table if not already done. - * Do this with mmap_sem held - */ - spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); - if (!spt) { - err = -ENOMEM; - goto out; - } - mm->context.hash_context->spt = spt; - } - - subpage_mark_vma_nohuge(mm, addr, len); - for (limit = addr + len; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - err = -ENOMEM; - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) { - spm = (u32 **)get_zeroed_page(GFP_KERNEL); - if (!spm) - goto out; - spt->protptrs[addr >> SBP_L3_SHIFT] = spm; - } - } - spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); - spp = *spm; - if (!spp) { - spp = (u32 *)get_zeroed_page(GFP_KERNEL); - if (!spp) - goto out; - *spm = spp; - } - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - local_irq_disable(); - demote_segment_4k(mm, addr); - local_irq_enable(); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - up_write(&mm->mmap_sem); - if (__copy_from_user(spp, map, nw * sizeof(u32))) - return -EFAULT; - map += nw; - down_write(&mm->mmap_sem); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - if (limit > spt->maxaddr) - spt->maxaddr = limit; - err = 0; - out: - up_write(&mm->mmap_sem); - return err; -} diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c deleted file mode 100644 index 6a23b9ebd2a1..000000000000 --- a/arch/powerpc/mm/tlb-radix.c +++ /dev/null @@ -1,1101 +0,0 @@ -/* - * TLB flush routines for radix kernels. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 
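One detail of the subpage protection scheme above is worth decoding explicitly: one u32 covers a 64k page, sixteen 4k subpages at 2 bits each. A sketch of the lookup follows; the high-to-low field order mirrors the hash fault path's shift and should be treated as an assumption of the sketch, since the deleted file does not spell it out:

#include <stdint.h>

/* 0 = full access, 1 = no write, 2 or 3 = no access */
static unsigned int subpage_prot_decode(uint32_t map_word, unsigned long ea)
{
	unsigned int sub = (ea >> 12) & 0xf;	/* 4k subpage within the 64k page */

	return (map_word >> (30 - 2 * sub)) & 0x3;
}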
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define RIC_FLUSH_TLB 0 -#define RIC_FLUSH_PWC 1 -#define RIC_FLUSH_ALL 2 - -/* - * tlbiel instruction for radix, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) - : "memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and the entire Page Walk Cache - * and partition table entries. Then flush the remaining sets of the - * TLB. - */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - - /* Do the same for process scoped entries. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - - asm volatile("ptesync": : :"memory"); -} - -void radix__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); - else - WARN(1, "%s called on pre-POWER9 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void __tlbiel_pid(unsigned long pid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rb |= set << PPC_BITLSHIFT(51); - rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rs = lpid; - prs = 0; /* partition 
scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - - -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = lpid; - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void fixup_tlbie(void) -{ - unsigned long pid = 0; - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -static inline void fixup_tlbie_lpid(unsigned long lpid) -{ - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -/* - * We use 128 set in radix mode and 256 set in hpt mode. - */ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) -{ - int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
- */
-	__tlbiel_pid(pid, 0, ric);
-
-	/* For PWC, only one flush is needed */
-	if (ric == RIC_FLUSH_PWC) {
-		asm volatile("ptesync": : :"memory");
-		return;
-	}
-
-	/* For the remaining sets, just flush the TLB */
-	for (set = 1; set < POWER9_TLB_SETS_RADIX; set++)
-		__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
-
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
-{
-	asm volatile("ptesync": : :"memory");
-
-	/*
-	 * Work around the fact that the "ric" argument to __tlbie_pid
-	 * must be a compile-time constant to match the "i" constraint
-	 * in the asm statement.
-	 */
-	switch (ric) {
-	case RIC_FLUSH_TLB:
-		__tlbie_pid(pid, RIC_FLUSH_TLB);
-		break;
-	case RIC_FLUSH_PWC:
-		__tlbie_pid(pid, RIC_FLUSH_PWC);
-		break;
-	case RIC_FLUSH_ALL:
-	default:
-		__tlbie_pid(pid, RIC_FLUSH_ALL);
-	}
-	fixup_tlbie();
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
-{
-	int set;
-
-	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-	asm volatile("ptesync": : :"memory");
-
-	/*
-	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-	 * also flush the entire Page Walk Cache.
-	 */
-	__tlbiel_lpid(lpid, 0, ric);
-
-	/* For PWC, only one flush is needed */
-	if (ric == RIC_FLUSH_PWC) {
-		asm volatile("ptesync": : :"memory");
-		return;
-	}
-
-	/* For the remaining sets, just flush the TLB */
-	for (set = 1; set < POWER9_TLB_SETS_RADIX; set++)
-		__tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
-
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
-{
-	asm volatile("ptesync": : :"memory");
-
-	/*
-	 * Work around the fact that the "ric" argument to __tlbie_lpid
-	 * must be a compile-time constant to match the "i" constraint
-	 * in the asm statement.
-	 */
-	switch (ric) {
-	case RIC_FLUSH_TLB:
-		__tlbie_lpid(lpid, RIC_FLUSH_TLB);
-		break;
-	case RIC_FLUSH_PWC:
-		__tlbie_lpid(lpid, RIC_FLUSH_PWC);
-		break;
-	case RIC_FLUSH_ALL:
-	default:
-		__tlbie_lpid(lpid, RIC_FLUSH_ALL);
-	}
-	fixup_tlbie_lpid(lpid);
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
-{
-	int set;
-
-	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-	asm volatile("ptesync": : :"memory");
-
-	/*
-	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-	 * also flush the entire Page Walk Cache.
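The switch statements above look redundant at first sight; they exist because each case hands the inner helper a literal, and the helper's inline asm uses an "i" (immediate) constraint for the RIC field, which must be a compile-time constant. An asm-free sketch of the same pattern, with hypothetical names:

/* inner() stands in for __tlbie_pid()/__tlbie_lpid(). */
static inline void inner(unsigned long ric)
{
	/* imagine: asm volatile(... : : "i"(ric), ...) in the real helper */
}

static void outer(unsigned long ric)	/* ric is a run-time value here */
{
	switch (ric) {
	case 0:			/* RIC_FLUSH_TLB */
		inner(0);
		break;
	case 1:			/* RIC_FLUSH_PWC */
		inner(1);
		break;
	default:		/* RIC_FLUSH_ALL */
		inner(2);
	}
}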
- */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); -} - - -static inline void __tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); -} - -static inline void _tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); - __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); -} - -static inline void __tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbie_pid(pid, RIC_FLUSH_PWC); - __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -/* - * Base TLB flushing operations: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * - local_* variants of page and mm only apply to the current - * processor - */ -void radix__local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); - preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_tlb_mm); - -#ifndef CONFIG_SMP -void radix__local_flush_all_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); - 
preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_all_mm); -#endif /* CONFIG_SMP */ - -void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - preempt_enable(); -} - -void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - /* need the return fix for nohash.c */ - if (is_vm_hugetlb_page(vma)) - return radix__local_flush_hugetlb_page(vma, vmaddr); -#endif - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__local_flush_tlb_page); - -static bool mm_is_singlethreaded(struct mm_struct *mm) -{ - if (atomic_read(&mm->context.copros) > 0) - return false; - if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) - return true; - return false; -} - -static bool mm_needs_flush_escalation(struct mm_struct *mm) -{ - /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate - */ - if (atomic_read(&mm->context.copros) > 0) - return true; - return false; -} - -#ifdef CONFIG_SMP -static void do_exit_flush_lazy_tlb(void *arg) -{ - struct mm_struct *mm = arg; - unsigned long pid = mm->context.id; - - if (current->mm == mm) - return; /* Local CPU */ - - if (current->active_mm == mm) { - /* - * Must be a kernel thread because sender is single-threaded. - */ - BUG_ON(current->mm); - mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; - mmdrop(mm); - } - _tlbiel_pid(pid, RIC_FLUSH_ALL); -} - -static void exit_flush_lazy_tlbs(struct mm_struct *mm) -{ - /* - * Would be nice if this was async so it could be run in - * parallel with our local flush, but generic code does not - * give a good API for it. Could extend the generic code or - * make a special powerpc IPI for flushing TLBs. - * For now it's not too performance critical. - */ - smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, - (void *)mm, 1); - mm_reset_thread_local(mm); -} - -void radix__flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - /* - * Order loads of mm_cpumask vs previous stores to clear ptes before - * the invalidate. 
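The predicates defined above, mm_is_thread_local() together with mm_needs_flush_escalation(), feed a three-way choice in the flush routines that follow. A stand-alone sketch with editorial names; the single-threaded repatriation path, which downgrades to a local flush after the lazy-TLB IPI, is left out for brevity:

#include <stdbool.h>

enum flush_kind {
	FLUSH_LOCAL_TLB,	/* tlbiel: this CPU only */
	FLUSH_GLOBAL_TLB,	/* tlbie: broadcast, TLB only */
	FLUSH_GLOBAL_ALL,	/* tlbie RIC=ALL: escalated for the nest MMU */
};

static enum flush_kind pick_flush(bool thread_local, bool has_copros)
{
	if (thread_local)
		return FLUSH_LOCAL_TLB;
	if (has_copros)		/* P9 nest MMU PWC issue: escalate */
		return FLUSH_GLOBAL_ALL;
	return FLUSH_GLOBAL_TLB;
}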
See barrier in switch_mm_irqs_off - */ - smp_mb(); - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } - preempt_enable(); -} -EXPORT_SYMBOL(radix__flush_tlb_mm); - -static void __flush_all_mm(struct mm_struct *mm, bool fullmm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (!fullmm) { - exit_flush_lazy_tlbs(mm); - goto local; - } - } - _tlbie_pid(pid, RIC_FLUSH_ALL); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_ALL); - } - preempt_enable(); -} -void radix__flush_all_mm(struct mm_struct *mm) -{ - __flush_all_mm(mm, false); -} -EXPORT_SYMBOL(radix__flush_all_mm); - -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) -{ - tlb->need_flush_all = 1; -} -EXPORT_SYMBOL(radix__flush_tlb_pwc); - -void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } - preempt_enable(); -} - -void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_page(vma, vmaddr); -#endif - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__flush_tlb_page); - -#else /* CONFIG_SMP */ -#define radix__flush_all_mm radix__local_flush_all_mm -#endif /* CONFIG_SMP */ - -void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - _tlbie_pid(0, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL(radix__flush_tlb_kernel_range); - -#define TLB_FLUSH_ALL -1UL - -/* - * Number of pages above which we invalidate the entire PID rather than - * flush individual pages, for local and global flushes respectively. - * - * tlbie goes out to the interconnect and individual ops are more costly. - * It also does not iterate over sets like the local tlbiel variant when - * invalidating a full PID, so it has a far lower threshold to change from - * individual page flushes to full-pid flushes. 
- */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; - -static inline void __radix__flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool flush_all_sizes) - -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } - } else { - bool hflush = flush_all_sizes; - bool gflush = flush_all_sizes; - unsigned long hstart, hend; - unsigned long gstart, gend; - - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - hflush = true; - - if (hflush) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - } - - if (gflush) { - gstart = (start + PUD_SIZE - 1) & PUD_MASK; - gend = end & PUD_MASK; - if (gstart == gend) - gflush = false; - } - - asm volatile("ptesync": : :"memory"); - if (local) { - __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbiel_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbiel_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbie_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbie_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - } - preempt_enable(); -} - -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - - __radix__flush_tlb_range(vma->vm_mm, start, end, false); -} -EXPORT_SYMBOL(radix__flush_tlb_range); - -static int radix_get_mmu_psize(int page_size) -{ - int psize; - - if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) - psize = mmu_virtual_psize; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) - psize = MMU_PAGE_2M; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) - psize = MMU_PAGE_1G; - else - return -1; - return psize; -} - -/* - * Flush partition scoped LPID address translation for all CPUs. - */ -void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size) -{ - int psize = radix_get_mmu_psize(page_size); - - _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); - -/* - * Flush partition scoped PWC from LPID for all CPUs. 
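The two tunables declared right after this comment make the trade-off concrete. A sketch of the resulting predicate, folding in the deleted defaults (33 for global tlbie; POWER9_TLB_SETS_RADIX * 2 for local tlbiel, which works out to 256 assuming the usual 128 sets):

#include <stdbool.h>

#define TLB_FLUSH_ALL	(~0UL)

static bool flush_full_pid(unsigned long start, unsigned long end,
			   unsigned int page_shift, bool local)
{
	unsigned long ceiling = local ? 256 : 33;	/* deleted defaults */
	unsigned long nr_pages = (end - start) >> page_shift;

	return end == TLB_FLUSH_ALL || nr_pages > ceiling;
}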
- */ -void radix__flush_pwc_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_PWC); -} -EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__flush_tlb_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); - -/* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. - */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) -{ - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize); - -void radix__tlb_flush(struct mmu_gather *tlb) -{ - int psize = 0; - struct mm_struct *mm = tlb->mm; - int page_size = tlb->page_size; - unsigned long start = tlb->start; - unsigned long end = tlb->end; - - /* - * if page size is not something we understand, do a full mm flush - * - * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush - * that flushes the process table entry cache upon process teardown. - * See the comment for radix in arch_exit_mmap(). - */ - if (tlb->fullmm) { - __flush_all_mm(mm, true); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) - } else if (mm_tlb_flush_nested(mm)) { - /* - * If there is a concurrent invalidation that is clearing ptes, - * then it's possible this invalidation will miss one of those - * cleared ptes and miss flushing the TLB. If this invalidate - * returns before the other one flushes TLBs, that can result - * in it returning while there are still valid TLBs inside the - * range to be invalidated. - * - * See mm/memory.c:tlb_finish_mmu() for more details. - * - * The solution to this is ensure the entire range is always - * flushed here. The problem for powerpc is that the flushes - * are page size specific, so this "forced flush" would not - * do the right thing if there are a mix of page sizes in - * the range to be invalidated. So use __flush_tlb_range - * which invalidates all possible page sizes in the range. - * - * PWC flush probably is not be required because the core code - * shouldn't free page tables in this path, but accounting - * for the possibility makes us a bit more robust. - * - * need_flush_all is an uncommon case because page table - * teardown should be done with exclusive locks held (but - * after locks are dropped another invalidate could come - * in), it could be optimized further if necessary. 
- */ - if (!tlb->need_flush_all) - __radix__flush_tlb_range(mm, start, end, true); - else - radix__flush_all_mm(mm); -#endif - } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { - if (!tlb->need_flush_all) - radix__flush_tlb_mm(mm); - else - radix__flush_all_mm(mm); - } else { - if (!tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, start, end, psize); - else - radix__flush_tlb_pwc_range_psize(mm, start, end, psize); - } - tlb->need_flush_all = 0; -} - -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, - unsigned long start, unsigned long end, - int psize, bool also_pwc) -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; - - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } - } else { - if (local) - _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else - _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); - } - preempt_enable(); -} - -void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - return __radix__flush_tlb_range_psize(mm, start, end, psize, false); -} - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - __radix__flush_tlb_range_psize(mm, start, end, psize, true); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) -{ - unsigned long pid, end; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - /* 4k page size, just blow the world */ - if (PAGE_SIZE == 0x1000) { - radix__flush_all_mm(mm); - return; - } - - end = addr + HPAGE_PMD_SIZE; - - /* Otherwise first do the PWC, then iterate the pages. 
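radix__tlb_flush() above is essentially a four-way dispatch, which is easier to see flattened out. An editorial restatement with stand-in booleans; in the real code need_flush_all upgrades the last two cases to their PWC-flushing variants:

#include <stdbool.h>

enum tlb_flush_action {
	FLUSH_ALL_MM,		/* RIC=ALL: TLB plus page walk cache */
	FLUSH_RANGE_ALL_SIZES,	/* overflush every page size in the range */
	FLUSH_MM,		/* whole-PID TLB flush */
	FLUSH_RANGE_PSIZE,	/* range flush for one known page size */
};

static enum tlb_flush_action tlb_flush_dispatch(bool fullmm, bool nested,
						int psize)
{
	if (fullmm)
		return FLUSH_ALL_MM;		/* also drops cached process table entries */
	if (nested)
		return FLUSH_RANGE_ALL_SIZES;	/* concurrent unmap: be safe */
	if (psize == -1)
		return FLUSH_MM;		/* page size not understood */
	return FLUSH_RANGE_PSIZE;
}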
*/
-	preempt_disable();
-	smp_mb(); /* see radix__flush_tlb_mm */
-	if (!mm_is_thread_local(mm)) {
-		if (unlikely(mm_is_singlethreaded(mm))) {
-			exit_flush_lazy_tlbs(mm);
-			goto local;
-		}
-		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-	} else {
-local:
-		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-	}
-
-	preempt_enable();
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
-				unsigned long start, unsigned long end)
-{
-	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
-}
-EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
-
-void radix__flush_tlb_all(void)
-{
-	unsigned long rb, prs, r, rs;
-	unsigned long ric = RIC_FLUSH_ALL;
-
-	rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
-	prs = 0; /* partition scoped */
-	r = 1; /* radix format */
-	rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
-
-	asm volatile("ptesync": : :"memory");
-	/*
-	 * now flush guest entries by passing PRS = 1 and LPID != 0
-	 */
-	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
-	/*
-	 * now flush host entries by passing PRS = 0 and LPID == 0
-	 */
-	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
-{
-	unsigned long pid = mm->context.id;
-
-	if (unlikely(pid == MMU_NO_CONTEXT))
-		return;
-
-	/*
-	 * If this context hasn't run on that CPU before and KVM is
-	 * around, there's a slim chance that the guest on another
-	 * CPU just brought in obsolete translation into the TLB of
-	 * this CPU due to a bad prefetch using the guest PID on
-	 * the way into the hypervisor.
-	 *
-	 * We work around this here. If KVM is possible, we check if
-	 * any sibling thread is in KVM. If it is, the window may exist
-	 * and thus we flush that PID from the core.
-	 *
-	 * A potential future improvement would be to mark which PIDs
-	 * have never been used on the system and avoid it if the PID
-	 * is new and the process has no other cpumask bit set.
-	 */
-	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
-		int cpu = smp_processor_id();
-		int sib = cpu_first_thread_sibling(cpu);
-		bool flush = false;
-
-		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
-			if (sib == cpu)
-				continue;
-			if (!cpu_possible(sib))
-				continue;
-			if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
-				flush = true;
-		}
-		if (flush)
-			_tlbiel_pid(pid, RIC_FLUSH_ALL);
-	}
-}
-EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
-#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
deleted file mode 100644
index 87d71dd25441..000000000000
--- a/arch/powerpc/mm/tlb_hash64.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- * Derived from arch/ppc64/mm/init.c:
- *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *   Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * Dave Engebretsen
- *	Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-
-#include 
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * needs to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, unsigned long pte, int huge)
-{
-	unsigned long vpn;
-	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid;
-	unsigned int psize;
-	int ssize;
-	real_pte_t rpte;
-	int i, offset;
-
-	i = batch->index;
-
-	/* Get page size (maybe move back to caller).
-	 *
-	 * NOTE: when using special 64K mappings in 4K environment like
-	 * for SPEs, we obtain the page size from the slice, which thus
-	 * must still exist (and thus the VMA not reused) at the time
-	 * of this call
-	 */
-	if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
-		psize = get_slice_psize(mm, addr);
-		/* Mask the address for the correct page size */
-		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
-		if (unlikely(psize == MMU_PAGE_16G))
-			offset = PTRS_PER_PUD;
-		else
-			offset = PTRS_PER_PMD;
-#else
-		BUG();
-		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
-	} else {
-		psize = pte_pagesize_index(mm, addr, pte);
-		/* Mask the address for the standard page size. If we
-		 * have a 64k page kernel, but the hardware does not
-		 * support 64k pages, this might be different from the
-		 * hardware page size encoded in the slice table. */
-		addr &= PAGE_MASK;
-		offset = PTRS_PER_PTE;
-	}
-
-
-	/* Build full vaddr */
-	if (!is_kernel_addr(addr)) {
-		ssize = user_segment_size(addr);
-		vsid = get_user_vsid(&mm->context, addr, ssize);
-	} else {
-		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-		ssize = mmu_kernel_ssize;
-	}
-	WARN_ON(vsid == 0);
-	vpn = hpt_vpn(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep, offset);
-
-	/*
-	 * Check if we have an active batch on this CPU. If not, just
-	 * flush now and return.
-	 */
-	if (!batch->active) {
-		flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
-		put_cpu_var(ppc64_tlb_batch);
-		return;
-	}
-
-	/*
-	 * This can happen when we are in the middle of a TLB batch and
-	 * we encounter memory pressure (e.g. copy_page_range when it tries
-	 * to allocate a new pte). If we have to reclaim memory and end
-	 * up scanning and resetting referenced bits then our batch context
-	 * will change mid stream.
-	 *
-	 * We also need to ensure only one page size is present in a given
-	 * batch
-	 */
-	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
-		       batch->ssize != ssize)) {
-		__flush_tlb_pending(batch);
-		i = 0;
-	}
-	if (i == 0) {
-		batch->mm = mm;
-		batch->psize = psize;
-		batch->ssize = ssize;
-	}
-	batch->pte[i] = rpte;
-	batch->vpn[i] = vpn;
-	batch->index = ++i;
-	if (i >= PPC64_TLB_BATCH_NR)
-		__flush_tlb_pending(batch);
-	put_cpu_var(ppc64_tlb_batch);
-}
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-	int i, local;
-
-	i = batch->index;
-	local = mm_is_thread_local(batch->mm);
-	if (i == 1)
-		flush_hash_page(batch->vpn[0], batch->pte[0],
-				batch->psize, batch->ssize, local);
-	else
-		flush_hash_range(i, local);
-	batch->index = 0;
-}
-
-void hash__tlb_flush(struct mmu_gather *tlb)
-{
-	struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
-
-	/* If there's a TLB batch pending, then we must flush it because the
-	 * pages are going to be freed and we really don't want to have a CPU
-	 * access a freed page because of a stale TLB entry
-	 */
-	if (tlbbatch->index)
-		__flush_tlb_pending(tlbbatch);
-
-	put_cpu_var(ppc64_tlb_batch);
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- *                            from the hash table (and the TLB). But keeps
- *                            the linux PTEs intact.
- *
- * @mm		: mm_struct of the target address space (generally init_mm)
- * @start	: starting address
- * @end		: ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bridge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it is implemented for small size rather
- * than speed.
- */
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-			      unsigned long end)
-{
-	bool is_thp;
-	int hugepage_shift;
-	unsigned long flags;
-
-	start = _ALIGN_DOWN(start, PAGE_SIZE);
-	end = _ALIGN_UP(end, PAGE_SIZE);
-
-	BUG_ON(!mm->pgd);
-
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
-	 */
-	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
-	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
-						  &hugepage_shift);
-		unsigned long pte;
-
-		if (ptep == NULL)
-			continue;
-		pte = pte_val(*ptep);
-		if (is_thp)
-			trace_hugepage_invalidate(start, pte);
-		if (!(pte & H_PAGE_HASHPTE))
-			continue;
-		if (unlikely(is_thp))
-			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
-		else
-			hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
-	}
-	arch_leave_lazy_mmu_mode();
-	local_irq_restore(flags);
-}
-
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
-{
-	pte_t *pte;
-	pte_t *start_pte;
-	unsigned long flags;
-
-	addr = _ALIGN_DOWN(addr, PMD_SIZE);
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
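One rule in the deleted batching logic deserves spelling out: a batch is drained early whenever the incoming entry does not match what is already queued. A tiny sketch of that check, with stand-in parameters (the real code compares fields of ppc64_tlb_batch):

#include <stdbool.h>

/* One mm, one page size and one segment size per batch; anything
 * else forces __flush_tlb_pending() before the entry is queued. */
static bool batch_mismatch(int index, const void *batch_mm, const void *mm,
			   int batch_psize, int psize,
			   int batch_ssize, int ssize)
{
	return index != 0 && (mm != batch_mm || psize != batch_psize ||
			      ssize != batch_ssize);
}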
- */ - local_irq_save(flags); - arch_enter_lazy_mmu_mode(); - start_pte = pte_offset_map(pmd, addr); - for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { - unsigned long pteval = pte_val(*pte); - if (pteval & H_PAGE_HASHPTE) - hpte_need_flush(mm, addr, pte, pteval, 0); - addr += PAGE_SIZE; - } - arch_leave_lazy_mmu_mode(); - local_irq_restore(flags); -} diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c deleted file mode 100644 index f83044faac23..000000000000 --- a/arch/powerpc/mm/vphn.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h deleted file mode 100644 index f9ffdb3942fc..000000000000 --- a/arch/powerpc/mm/vphn.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. - */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. 
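The VPHN_ASSOC_BUFSIZE expression defined just below packs some arithmetic worth unfolding: 6 registers of 8 bytes are 48 bytes, i.e. 24 16-bit fields, and one leading length cell brings it to 25 entries. A C11 compile-time check of that reading (u64/u16 swapped for the stdint types):

#include <assert.h>
#include <stdint.h>

#define VPHN_REGISTER_COUNT	6
#define VPHN_ASSOC_BUFSIZE \
	(VPHN_REGISTER_COUNT * sizeof(uint64_t) / sizeof(uint16_t) + 1)

static_assert(VPHN_ASSOC_BUFSIZE == 25,
	      "24 sixteen-bit fields plus the leading length cell");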
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
-
-#endif
diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c
index 186b906e66d5..1d1f5f2be3b2 120000
--- a/tools/testing/selftests/powerpc/vphn/vphn.c
+++ b/tools/testing/selftests/powerpc/vphn/vphn.c
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.c
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.c
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/vphn/vphn.h b/tools/testing/selftests/powerpc/vphn/vphn.h
index 7131efe38c65..45fe160f8288 120000
--- a/tools/testing/selftests/powerpc/vphn/vphn.h
+++ b/tools/testing/selftests/powerpc/vphn/vphn.h
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.h
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.h
\ No newline at end of file
-- cgit v1.2.3-59-g8ed1b

From 17312f258cf6eb584f276ad592972ade7e16e318 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 29 Mar 2019 10:00:01 +0000
Subject: powerpc/mm: Move book3s32 specifics in subdirectory mm/book3s32

Several files in arch/powerpc/mm are only for book3s32. This patch
creates a subdirectory for them.

Signed-off-by: Christophe Leroy
[mpe: Shorten new filenames]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/Makefile               |   3 +-
 arch/powerpc/mm/book3s32/Makefile      |   3 +
 arch/powerpc/mm/book3s32/hash_low.S    | 705 +++++++++++++++++++++++++++++++++
 arch/powerpc/mm/book3s32/mmu.c         | 419 ++++++++++++++++++++
 arch/powerpc/mm/book3s32/mmu_context.c | 118 ++++++
 arch/powerpc/mm/book3s32/tlb.c         | 173 ++++++++
 arch/powerpc/mm/hash_low_32.S          | 705 ---------------------------------
 arch/powerpc/mm/mmu_context_hash32.c   | 118 ------
 arch/powerpc/mm/ppc_mmu_32.c           | 419 --------------------
 arch/powerpc/mm/tlb_hash32.c           | 173 --------
 10 files changed, 1419 insertions(+), 1417 deletions(-)
 create mode 100644 arch/powerpc/mm/book3s32/Makefile
 create mode 100644 arch/powerpc/mm/book3s32/hash_low.S
 create mode 100644 arch/powerpc/mm/book3s32/mmu.c
 create mode 100644 arch/powerpc/mm/book3s32/mmu_context.c
 create mode 100644 arch/powerpc/mm/book3s32/tlb.c
 delete mode 100644 arch/powerpc/mm/hash_low_32.S
 delete mode 100644 arch/powerpc/mm/mmu_context_hash32.c
 delete mode 100644 arch/powerpc/mm/ppc_mmu_32.c
 delete mode 100644 arch/powerpc/mm/tlb_hash32.c
(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index a137fdf775e2..68cb1e840b5e 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -12,11 +12,10 @@ obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(BITS)e.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= pgtable-book3e.o
+obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
 obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64/
 obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-frag.o
 obj-$(CONFIG_PPC32)		+= pgtable-frag.o
-obj-$(CONFIG_PPC_BOOK3S_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_BOOK3S_32)	+= tlb_hash32.o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_PPC_8xx)		+= 8xx_mmu.o
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
new file mode 100644
index 000000000000..a4e217d0f3b7
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += mmu.o hash_low.o mmu_context.o tlb.o
diff --git
a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S new file mode 100644 index 000000000000..e27792d0b744 --- /dev/null +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -0,0 +1,705 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * + * This file contains low-level assembler routines for managing + * the PowerPC MMU hash table. (PPC 8xx processors don't use a + * hash table, so this file is not used on them.) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP + .section .bss + .align 2 +mmu_hash_lock: + .space 4 +#endif /* CONFIG_SMP */ + +/* + * Load a PTE into the hash table, if possible. + * The address is in r4, and r3 contains an access flag: + * _PAGE_RW (0x400) if a write. + * r9 contains the SRR1 value, from which we use the MSR_PR bit. + * SPRG_THREAD contains the physical address of the current task's thread. + * + * Returns to the caller if the access is illegal or there is no + * mapping for the address. Otherwise it places an appropriate PTE + * in the hash table and returns from the exception. + * Uses r0, r3 - r6, r8, r10, ctr, lr. + */ + .text +_GLOBAL(hash_page) +#ifdef CONFIG_SMP + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l + lis r0,0x0fff + b 10f +11: lwz r6,0(r8) + cmpwi 0,r6,0 + bne 11b +10: lwarx r6,0,r8 + cmpwi 0,r6,0 + bne- 11b + stwcx. r0,0,r8 + bne- 10b + isync +#endif + /* Get PTE (linux-style) and check access */ + lis r0,KERNELBASE@h /* check if kernel address */ + cmplw 0,r4,r0 + ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ + mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ + blt+ 112f /* assume user more likely */ + lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ + rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +112: +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ + lwz r8,0(r5) /* get pmd entry */ + rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif +#ifdef CONFIG_SMP + beq- hash_page_out /* return if no mapping */ +#else + /* XXX it seems like the 601 will give a machine fault on the + rfi if its alignment is wrong (bottom 4 bits of address are + 8 or 0xc) and we have had a not-taken conditional branch + to the address following the rfi. */ + beqlr- +#endif +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif + rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE + + /* + * Update the linux PTE atomically. 
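The lwarx/stwcx. retry loop that follows is the classic load-reserve/store-conditional idiom; in portable C11 the same update (give up on a permission violation, otherwise set the accessed/hashpte bits) is a compare-exchange loop. A sketch with assumed flag values, not the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGE_ACCESSED	0x100	/* assumed bit values for the sketch */
#define PAGE_HASHPTE	0x002

static bool pte_try_update(_Atomic uint32_t *pte, uint32_t access)
{
	uint32_t old = atomic_load(pte);
	uint32_t new;

	do {
		if (access & ~old)	/* access & ~permission: reject */
			return false;
		new = old | PAGE_ACCESSED | PAGE_HASHPTE;
	} while (!atomic_compare_exchange_weak(pte, &old, new));

	return true;
}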
We do the lwarx up-front + * because almost always, there won't be a permission violation + * and there won't already be an HPTE, and thus we will have + * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif +retry: + lwarx r6,0,r8 /* get linux-style pte, flag word */ + andc. r5,r3,r6 /* check access & ~permission */ +#ifdef CONFIG_SMP + bne- hash_page_out /* return if access not permitted */ +#else + bnelr- +#endif + or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + stwcx. r5,0,r8 /* attempt to update PTE */ + bne- retry /* retry if someone got there first */ + + mfsrin r3,r4 /* get segment reg for segment */ + mfctr r0 + stw r0,_CTR(r11) + bl create_hpte /* add the hash table entry */ + +#ifdef CONFIG_SMP + eieio + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + li r0,0 + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) +#endif + + /* Return from the exception */ + lwz r5,_CTR(r11) + mtctr r5 + lwz r0,GPR0(r11) + lwz r8,GPR8(r11) + b fast_exception_return + +#ifdef CONFIG_SMP +hash_page_out: + eieio + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + li r0,0 + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) + blr +#endif /* CONFIG_SMP */ + +/* + * Add an entry for a particular page to the hash table. + * + * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) + * + * We assume any necessary modifications to the pte (e.g. setting + * the accessed bit) have already been done and that there is actually + * a hash table in use (i.e. we're not on a 603). + */ +_GLOBAL(add_hash_page) + mflr r0 + stw r0,4(r1) + + /* Convert context and va to VSID */ + mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note create_hpte trims to 24 bits */ + +#ifdef CONFIG_SMP + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ + oris r8,r8,12 +#endif /* CONFIG_SMP */ + + /* + * We disable interrupts here, even on UP, because we don't + * want to race with hash_page, and because we want the + * _PAGE_HASHPTE bit to be a reliable indication of whether + * the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r9 + SYNC + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + +#ifdef CONFIG_SMP + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r6 + beq+ 12f +11: lwz r0,0(r6) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. + * If _PAGE_HASHPTE was already set, we don't replace the existing + * HPTE, so we just unlock and return. + */ + mr r8,r5 +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif +1: lwarx r6,0,r8 + andi. 
r0,r6,_PAGE_HASHPTE + bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + ori r5,r6,_PAGE_HASHPTE + stwcx. r5,0,r8 + bne- 1b + + bl create_hpte + +9: +#ifdef CONFIG_SMP + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l + eieio + li r0,0 + stw r0,0(r6) /* clear mmu_hash_lock */ +#endif + + /* reenable interrupts and DR */ + mtmsr r9 + SYNC_601 + isync + + lwz r0,4(r1) + mtlr r0 + blr + +/* + * This routine adds a hardware PTE to the hash table. + * It is designed to be called with the MMU either on or off. + * r3 contains the VSID, r4 contains the virtual address, + * r5 contains the linux PTE, r6 contains the old value of the + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. + * On SMP, the caller should have the mmu_hash_lock held. + * We assume that the caller has (or will) set the _PAGE_HASHPTE + * bit in the linux PTE in memory. The value passed in r6 should + * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set + * this routine will skip the search for an existing HPTE. + * This procedure modifies r0, r3 - r6, r8, cr0. + * -- paulus. + * + * For speed, 4 of the instructions get patched once the size and + * physical address of the hash table are known. These definitions + * of Hash_base and Hash_bits below are just an example. + */ +Hash_base = 0xc0180000 +Hash_bits = 12 /* e.g. 256kB hash table */ +Hash_msk = (((1 << Hash_bits) - 1) * 64) + +/* defines for the PTE format for 32-bit PPCs */ +#define HPTE_SIZE 8 +#define PTEG_SIZE 64 +#define LG_PTEG_SIZE 6 +#define LDPTEu lwzu +#define LDPTE lwz +#define STPTE stw +#define CMPPTE cmpw +#define PTE_H 0x40 +#define PTE_V 0x80000000 +#define TST_V(r) rlwinm. r,r,0,0,0 +#define SET_V(r) oris r,r,PTE_V@h +#define CLR_V(r,t) rlwinm r,r,0,1,31 + +#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) +#define HASH_RIGHT 31-LG_PTEG_SIZE + +_GLOBAL(create_hpte) + /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ + and r8,r8,r0 /* writable if _RW & _DIRTY */ + rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r8,r8,0xe04 /* clear out reserved bits */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 
1: 3): 0 */ +BEGIN_FTR_SECTION + rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif + + /* Construct the high word of the PPC-style PTE (r5) */ + rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r5) /* set V (valid) bit */ + + patch_site 0f, patch__hash_page_A0 + patch_site 1f, patch__hash_page_A1 + patch_site 2f, patch__hash_page_A2 + /* Get the address of the primary PTE group in the hash table (r3) */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r3,r3,r0 /* make primary hash */ + li r0,8 /* PTEs/group */ + + /* + * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search + * if it is clear, meaning that the HPTE isn't there already... + */ + andi. r6,r6,_PAGE_HASHPTE + beq+ 10f /* no PTE: go look for an empty slot */ + tlbie r4 + + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + addi r6,r6,1 /* count how many searches we do */ + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + mtctr r0 + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + CMPPTE 0,r6,r5 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_slot + + patch_site 0f, patch__hash_page_B + /* Search the secondary PTEG for a matching PTE */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + CMPPTE 0,r6,r5 + bdnzf 2,2b + beq+ found_slot + xori r5,r5,PTE_H /* clear H bit again */ + + /* Search the primary PTEG for an empty slot */ +10: mtctr r0 + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + TST_V(r6) /* test valid bit */ + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_empty + + /* update counter of times that the primary PTEG is full */ + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + addi r6,r6,1 + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + + patch_site 0f, patch__hash_page_C + /* Search the secondary PTEG for an empty slot */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + TST_V(r6) + bdnzf 2,2b + beq+ found_empty + xori r5,r5,PTE_H /* clear H bit again */ + + /* + * Choose an arbitrary slot in the primary PTEG to overwrite. + * Since both the primary and secondary PTEGs are full, and we + * have no information that the PTEs in the primary PTEG are + * more important or useful than those in the secondary PTEG, + * and we know there is a definite (although small) speed + * advantage to putting the PTE in the primary PTEG, we always + * put the PTE in the primary PTEG. + * + * In addition, we skip any slot that is mapping kernel text in + * order to avoid a deadlock when not using BAT mappings if + * trying to hash in the kernel hash code itself after it has + * already taken the hash table lock. 
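The round-robin eviction implemented just below can be restated compactly in C. This is a sketch with a self-contained HPTE type (field names hypothetical); as in the assembly, a slot is rejected while its physical page number is below tophys(etext), i.e. while it maps kernel text.

#include <stdint.h>

struct hpte {			/* one 8-byte hash PTE */
	uint32_t hi;		/* V | VSID | H | API */
	uint32_t lo;		/* RPN | WIMG | PP */
};

static unsigned int next_slot;	/* rotates 0..7, like the .bss word below */

/* Pick a victim in the primary PTEG, skipping slots whose physical
 * address lies below etext, as described above. */
static struct hpte *pick_victim(struct hpte pteg[8], uint32_t etext_phys)
{
	struct hpte *slot;

	do {
		next_slot = (next_slot + 1) & 7;	/* round robin */
		slot = &pteg[next_slot];
	} while ((slot->lo & ~0xfffu) < etext_phys);	/* kernel text: keep looking */

	return slot;
}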
This works in conjunction + * with pre-faulting of the kernel text. + * + * If the hash table bucket is full of kernel text entries, we'll + * lockup here but that shouldn't happen + */ + +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE + stw r6,next_slot@l(r4) + add r4,r3,r6 + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ + clrrwi r0,r0,12 + lis r6,etext@h + ori r6,r6,etext@l /* get etext */ + tophys(r6,r6) + cmpl cr0,r0,r6 /* compare and try again */ + blt 1b + +#ifndef CONFIG_SMP + /* Store PTE in PTEG */ +found_empty: + STPTE r5,0(r4) +found_slot: + STPTE r8,HPTE_SIZE/2(r4) + +#else /* CONFIG_SMP */ +/* + * Between the tlbie above and updating the hash table entry below, + * another CPU could read the hash table entry and put it in its TLB. + * There are 3 cases: + * 1. using an empty slot + * 2. updating an earlier entry to change permissions (i.e. enable write) + * 3. taking over the PTE for an unrelated address + * + * In each case it doesn't really matter if the other CPUs have the old + * PTE in their TLB. So we don't need to bother with another tlbie here, + * which is convenient as we've overwritten the register that had the + * address. :-) The tlbie above is mainly to make sure that this CPU comes + * and gets the new PTE from the hash table. + * + * We do however have to make sure that the PTE is never in an invalid + * state with the V bit set. + */ +found_empty: +found_slot: + CLR_V(r5,r0) /* clear V (valid) bit in PTE */ + STPTE r5,0(r4) + sync + TLBSYNC + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + sync + SET_V(r5) + STPTE r5,0(r4) /* finally set V bit in PTE */ +#endif /* CONFIG_SMP */ + + sync /* make sure pte updates get to memory */ + blr + + .section .bss + .align 2 +next_slot: + .space 4 +primary_pteg_full: + .space 4 +htab_hash_searches: + .space 4 + .previous + +/* + * Flush the entry for a particular page from the hash table. + * + * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, + * int count) + * + * We assume that there is a hash table in use (Hash != 0). + */ +_GLOBAL(flush_hash_pages) + /* + * We disable interrupts here, even on UP, because we want + * the _PAGE_HASHPTE bit to be a reliable indication of + * whether the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,22,20,29 +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) + cmpwi cr1,r6,1 + andi. 
r0,r0,_PAGE_HASHPTE + bne 2f + ble cr1,19f + addi r4,r4,0x1000 + addi r5,r5,PTE_SIZE + addi r6,r6,-1 + b 1b + + /* Convert context and va to VSID */ +2: mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note code below trims to 24 bits */ + + /* Construct the high word of the PPC-style PTE (r11) */ + rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r11) /* set V (valid) bit */ + +#ifdef CONFIG_SMP + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l + lwz r8,TASK_CPU(r2) + oris r8,r8,9 +10: lwarx r0,0,r9 + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r9 + beq+ 12f +11: lwz r0,0(r9) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Check the _PAGE_HASHPTE bit in the linux PTE. If it is + * already clear, we're done (for this pte). If not, + * clear it (atomically) and proceed. -- paulus. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ + andi. r0,r8,_PAGE_HASHPTE + beq 8f /* done if HASHPTE is already clear */ + rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ + stwcx. r8,0,r5 /* update the pte */ + bne- 33b + + patch_site 0f, patch__flush_hash_A0 + patch_site 1f, patch__flush_hash_A1 + patch_site 2f, patch__flush_hash_A2 + /* Get the address of the primary PTE group in the hash table (r3) */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r8,r0,r8 /* make primary hash */ + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + li r0,8 /* PTEs/group */ + mtctr r0 + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ + CMPPTE 0,r0,r11 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ 3f + + patch_site 0f, patch__flush_hash_B + /* Search the secondary PTEG for a matching PTE */ + ori r11,r11,PTE_H /* set H (secondary hash) bit */ + li r0,8 /* PTEs/group */ +0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ + xori r12,r12,(-PTEG_SIZE & 0xffff) + addi r12,r12,-HPTE_SIZE + mtctr r0 +2: LDPTEu r0,HPTE_SIZE(r12) + CMPPTE 0,r0,r11 + bdnzf 2,2b + xori r11,r11,PTE_H /* clear H again */ + bne- 4f /* should rarely fail to find it */ + +3: li r0,0 + STPTE r0,0(r12) /* invalidate entry */ +4: sync + tlbie r4 /* in hw tlb too */ + sync + +8: ble cr1,9f /* if all ptes checked */ +81: addi r6,r6,-1 + addi r5,r5,PTE_SIZE + addi r4,r4,0x1000 + lwz r0,0(r5) /* check next pte */ + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 33b + bgt cr1,81b + +9: +#ifdef CONFIG_SMP + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ +#endif + +19: mtmsr r10 + SYNC_601 + isync + blr +EXPORT_SYMBOL(flush_hash_pages) + +/* + * Flush an entry from the TLB + */ +_GLOBAL(_tlbie) +#ifdef CONFIG_SMP + lwz r8,TASK_CPU(r2) + oris r8,r8,11 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. 
r8,0,r9 + bne- 10b + eieio + tlbie r3 + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + tlbie r3 + sync +#endif /* CONFIG_SMP */ + blr + +/* + * Flush the entire TLB. 603/603e only + */ +_GLOBAL(_tlbia) +#if defined(CONFIG_SMP) + lwz r8,TASK_CPU(r2) + oris r8,r8,10 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + sync + tlbia + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + sync + tlbia + sync +#endif /* CONFIG_SMP */ + blr diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c new file mode 100644 index 000000000000..1db55159031c --- /dev/null +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -0,0 +1,419 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct hash_pte *Hash, *Hash_end; +unsigned long Hash_size, Hash_mask; +unsigned long _SDR1; + +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ + +struct batrange { /* stores address ranges mapped by BATs */ + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} bat_addrs[8]; + +/* + * Return PA for this VA if it is mapped by a BAT, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) + return bat_addrs[b].phys + (va - bat_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (pa >= bat_addrs[b].phys + && pa < (bat_addrs[b].limit-bat_addrs[b].start) + +bat_addrs[b].phys) + return bat_addrs[b].start+(pa-bat_addrs[b].phys); + return 0; +} + +static int find_free_bat(void) +{ + int b; + + if (cpu_has_feature(CPU_FTR_601)) { + for (b = 0; b < 4; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[0].batl & 0x40)) + return b; + } + } else { + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + } + return -1; +} + +static unsigned int block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 
8 : 256) << 20; + unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + * Only for 603+ ... + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int idx; + + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = block_size(base, top); + + if (size < 128 << 10) + break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + + return base; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int done; + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + + if (__map_without_bats) { + pr_debug("RAM mapped without BATs\n"); + return base; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border - base) + return done; + + return done + __mmu_mapin_ram(border, top); +} + +void mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long size; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + size = block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + if (base < top) { + size = block_size(base, top); + size = max(size, 128UL << 10); + if ((top - base) > size) { + if (strict_kernel_rwx_enabled()) + pr_warn("Kernel _etext not properly aligned\n"); + size <<= 1; + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + for (; i < nb; i++) + clearibat(i); + + update_bats(); + + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (IS_ENABLED(CONFIG_MODULES) && + (VMALLOC_START & 0xf0000000) == i << 28) + break; + mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); + } +} + +void mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; + + if (bat_addrs[i].start < (unsigned long)__init_begin) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; + } + + update_bats(); +} + +/* + * Set up one of the I/D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. 
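block_size() above picks the largest BAT block that is simultaneously within the hardware limit (8MB on 601, 256MB otherwise), naturally aligned at base, and no bigger than the remaining range. A standalone restatement with a worked example:

#include <stdio.h>

static unsigned int fls32(unsigned int x)	/* 1-based index of highest set bit */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static unsigned int bat_block_size(unsigned int base, unsigned int top, int is_601)
{
	unsigned int max_size = (is_601 ? 8 : 256) << 20;
	unsigned int align = 1U << ((fls32(base) - 1) & 31); /* base == 0 => 2GB, no limit */
	unsigned int span  = 1U << ((fls32(top - base) - 1) & 31);
	unsigned int size = max_size;

	if (align < size)
		size = align;
	if (span < size)
		size = span;
	return size;
}

int main(void)
{
	/* 16MB-aligned base, 128MB left to map: alignment wins, 16MB block */
	printf("%uMB\n", bat_block_size(0x01000000, 0x09000000, 0) >> 20);
	return 0;
}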
+ * On 603+, only set IBAT when _PAGE_EXEC is set + */ +void __init setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if ((flags & _PAGE_NO_CACHE) || + (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) + flags &= ~_PAGE_COHERENT; + + bl = (size >> 17) - 1; + if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + /* 603, 604, etc. */ + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + flags &= ~_PAGE_EXEC; + } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; + } else { + /* 601 cpu */ + if (bl > BL_8M) + bl = BL_8M; + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT); + wimgxpp |= (flags & _PAGE_RW)? + ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; + bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ + bat->batl = phys | bl | 0x40; /* V=1 */ + } + + bat_addrs[index].start = virt; + bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; + bat_addrs[index].phys = phys; +} + +/* + * Preload a translation in the hash table + */ +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) +{ + pmd_t *pmd; + + if (!Hash) + return; + pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context.id, ea, pmd_val(*pmd)); +} + +/* + * Initialize the hash table and patch the instructions in hashtable.S. + */ +void __init MMU_init_hw(void) +{ + unsigned int hmask, mb, mb2; + unsigned int n_hpteg, lg_n_hpteg; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + + if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); + +#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ +#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) +#define MIN_N_HPTEG 1024 /* min 64kB hash table */ + + /* + * Allow 1 HPTE (1/8 HPTEG) for each page of memory. + * This is less than the recommended amount, but then + * Linux ain't AIX. + */ + n_hpteg = total_memory / (PAGE_SIZE * 8); + if (n_hpteg < MIN_N_HPTEG) + n_hpteg = MIN_N_HPTEG; + lg_n_hpteg = __ilog2(n_hpteg); + if (n_hpteg & (n_hpteg - 1)) { + ++lg_n_hpteg; /* round up if not power of 2 */ + n_hpteg = 1 << lg_n_hpteg; + } + Hash_size = n_hpteg << LG_HPTEG_SIZE; + + /* + * Find some memory for the hash table. 
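The sizing rule in MMU_init_hw() — one 8-byte HPTE per page of RAM, at least 64kB of table, rounded up to a power of two — works out as follows; a standalone sketch assuming 4kB pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define LG_HPTEG_SIZE	6	/* 64 bytes per group of 8 HPTEs */
#define MIN_N_HPTEG	1024	/* i.e. a 64kB minimum table */

static unsigned long hash_size_for(unsigned long total_memory)
{
	unsigned long n_hpteg = total_memory / (PAGE_SIZE * 8);
	unsigned int lg = 0;

	if (n_hpteg < MIN_N_HPTEG)
		n_hpteg = MIN_N_HPTEG;
	while ((1UL << lg) < n_hpteg)	/* round up to a power of two */
		lg++;
	return (1UL << lg) << LG_HPTEG_SIZE;
}

int main(void)
{
	/* 512MB of RAM => 16384 HPTEGs => a 1MB hash table */
	printf("%lukB\n", hash_size_for(512UL << 20) >> 10);
	return 0;
}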
+ */ + if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); + Hash = memblock_alloc(Hash_size, Hash_size); + if (!Hash) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, Hash_size, Hash_size); + _SDR1 = __pa(Hash) | SDR1_LOW_BITS; + + Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); + + printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ + if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); + Hash_mask = n_hpteg - 1; + hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + mb2 = 16 - LG_HPTEG_SIZE; + + modify_instruction_site(&patch__hash_page_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); + modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); + + /* + * Patch up the instructions in hashtable.S:flush_hash_page + */ + modify_instruction_site(&patch__flush_hash_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); + + if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 601 can only access 16MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 1) + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); + else /* Anything else has 256M mapped */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); +} + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (cpu_has_feature(CPU_FTR_601)) + pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c new file mode 100644 index 000000000000..921c1e33e941 --- /dev/null +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -0,0 +1,118 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. 
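The modify_instruction_site() calls above patch the runtime Hash_base and the mask widths (mb/mb2, hmask) into the lis/rlwimi/rlwinm triples at the patch_site labels in hash_low.S. What the patched sequences end up computing is, in C terms, the following; a sketch assuming 4kB pages, with hash_mask being n_hpteg - 1:

#include <stdint.h>

#define PTE_V	0x80000000u	/* valid bit in the HPTE first word */

/* Address of the primary (or secondary) PTEG for a translation. */
static uint32_t pteg_addr(uint32_t hash_base, uint32_t hash_mask,
			  uint32_t vsid, uint32_t va, int secondary)
{
	uint32_t hash = (vsid ^ (va >> 12)) & hash_mask;

	if (secondary)			/* the xoris/xori pair complements */
		hash = ~hash & hash_mask;	/* the hash within the mask */
	return hash_base + hash * 64;	/* 64 bytes per PTEG */
}

/* The first word searched for within a PTEG: V, the VSID in bits
 * 0x7fffff80, and the abbreviated page index (top 6 bits). */
static uint32_t hpte_match_word(uint32_t vsid, uint32_t va)
{
	return PTE_V | ((vsid << 7) & 0x7fffff80) | ((va >> 22) & 0x3f);
}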
+ * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include + +#include + +/* + * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs + * (virtual segment identifiers) for each context. Although the + * hardware supports 24-bit VSIDs, and thus >1 million contexts, + * we only use 32,768 of them. That is ample, since there can be + * at most around 30,000 tasks in the system anyway, and it means + * that we can use a bitmap to indicate which contexts are in use. + * Using a bitmap means that we entirely avoid all of the problems + * that we used to have when the context number overflowed, + * particularly on SMP systems. + * -- paulus. + */ +#define NO_CONTEXT ((unsigned long) -1) +#define LAST_CONTEXT 32767 +#define FIRST_CONTEXT 1 + +/* + * This function defines the mapping from contexts to VSIDs (virtual + * segment IDs). We use a skew on both the context and the high 4 bits + * of the 32-bit virtual address (the "effective segment ID") in order + * to spread out the entries in the MMU hash table. Note, if this + * function is changed then arch/ppc/mm/hashtable.S will have to be + * changed to correspond. + * + * + * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ + * & 0xffffff) + */ + +static unsigned long next_mmu_context; +static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; + +unsigned long __init_new_context(void) +{ + unsigned long ctx = next_mmu_context; + + while (test_and_set_bit(ctx, context_map)) { + ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); + if (ctx > LAST_CONTEXT) + ctx = 0; + } + next_mmu_context = (ctx + 1) & LAST_CONTEXT; + + return ctx; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + mm->context.id = __init_new_context(); + + return 0; +} + +/* + * Free a context ID. Make sure to call this with preempt disabled! + */ +void __destroy_context(unsigned long ctx) +{ + clear_bit(ctx, context_map); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + preempt_disable(); + if (mm->context.id != NO_CONTEXT) { + __destroy_context(mm->context.id); + mm->context.id = NO_CONTEXT; + } + preempt_enable(); +} + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Reserve context 0 for kernel use */ + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_mmu_context = FIRST_CONTEXT; +} diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c new file mode 100644 index 000000000000..8d56f0417f87 --- /dev/null +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -0,0 +1,173 @@ +/* + * This file contains the routines for TLB flushing. 
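The CTX_TO_VSID() mapping quoted in the comment above is what add_hash_page() and flush_hash_pages() open-code with their mulli 897*16 and mulli 0x111 sequences; as runnable C:

#include <stdint.h>

/* Skewed context-to-VSID mapping; the multipliers spread consecutive
 * contexts and consecutive segments across the hash table, and the
 * result is trimmed to the 24-bit VSID width. */
static uint32_t ctx_to_vsid(uint32_t ctx, uint32_t va)
{
	return (ctx * (897 * 16) + (va >> 28) * 0x111) & 0xffffff;
}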
+ * On machines where the MMU uses a hash table to store virtual to + * physical translations, these routines flush entries from the + * hash table also. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * Called when unmapping pages to flush entries from the TLB/hash table. + */ +void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) +{ + unsigned long ptephys; + + if (Hash) { + ptephys = __pa(ptep) & PAGE_MASK; + flush_hash_pages(mm->context.id, addr, ptephys, 1); + } +} +EXPORT_SYMBOL(flush_hash_entry); + +/* + * Called at the end of a mmu_gather operation to make sure the + * TLB flush is completely done. + */ +void tlb_flush(struct mmu_gather *tlb) +{ + if (!Hash) { + /* + * 603 needs to flush the whole TLB here since + * it doesn't use a hash table. + */ + _tlbia(); + } +} + +/* + * TLB flushing: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * since the hardware hash table functions as an extension of the + * tlb as far as the linux tables are concerned, flush it too. + * -- Cort + */ + +static void flush_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + pmd_t *pmd; + unsigned long pmd_end; + int count; + unsigned int ctx = mm->context.id; + + if (!Hash) { + _tlbia(); + return; + } + start &= PAGE_MASK; + if (start >= end) + return; + end = (end - 1) | ~PAGE_MASK; + pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + for (;;) { + pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; + if (pmd_end > end) + pmd_end = end; + if (!pmd_none(*pmd)) { + count = ((pmd_end - start) >> PAGE_SHIFT) + 1; + flush_hash_pages(ctx, start, pmd_val(*pmd), count); + } + if (pmd_end == end) + break; + start = pmd_end + 1; + ++pmd; + } +} + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_range(&init_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Flush all the (user) entries for the address space described by mm. + */ +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mp; + + if (!Hash) { + _tlbia(); + return; + } + + /* + * It is safe to go down the mm's list of vmas when called + * from dup_mmap, holding mmap_sem. It would also be safe from + * unmap_region or exit_mmap, but not from vmtruncate on SMP - + * but it seems dup_mmap is the only SMP case which gets here. 
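flush_range() above turns an arbitrary byte range into one flush_hash_pages() call per present PMD. The boundary arithmetic is easy to get wrong, so here it is as a standalone sketch that just prints the batches; 4kB pages and 4MB page directories are assumed.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define PGDIR_SIZE	(1UL << 22)	/* 4MB per PMD on 32-bit hash */
#define PGDIR_MASK	(~(PGDIR_SIZE - 1))

static void flush_batches(unsigned long start, unsigned long end)
{
	start &= PAGE_MASK;
	if (start >= end)
		return;
	end = (end - 1) | ~PAGE_MASK;	/* inclusive last byte */
	for (;;) {
		unsigned long pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;

		if (pmd_end > end)
			pmd_end = end;
		printf("flush %lu pages at %#lx\n",
		       ((pmd_end - start) >> PAGE_SHIFT) + 1, start);
		if (pmd_end == end)
			break;
		start = pmd_end + 1;
	}
}

int main(void)
{
	/* crosses one 4MB boundary: one batch of 1 page, one of 2 */
	flush_batches(0x103ff000, 0x10402000);
	return 0;
}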
+ */ + for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct mm_struct *mm; + pmd_t *pmd; + + if (!Hash) { + _tlbie(vmaddr); + return; + } + mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; + pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + if (!pmd_none(*pmd)) + flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); +} +EXPORT_SYMBOL(flush_tlb_page); + +/* + * For each address in the range, find the pte for the address + * and check _PAGE_HASHPTE bit; if it is set, find and destroy + * the corresponding HPTE. + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + flush_range(vma->vm_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_range); + +void __init early_init_mmu(void) +{ +} diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S deleted file mode 100644 index e27792d0b744..000000000000 --- a/arch/powerpc/mm/hash_low_32.S +++ /dev/null @@ -1,705 +0,0 @@ -/* - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP - * Copyright (C) 1996 Cort Dougan - * Adapted for Power Macintosh by Paul Mackerras. - * Low-level exception handlers and MMU support - * rewritten by Paul Mackerras. - * Copyright (C) 1996 Paul Mackerras. - * - * This file contains low-level assembler routines for managing - * the PowerPC MMU hash table. (PPC 8xx processors don't use a - * hash table, so this file is not used on them.) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP - .section .bss - .align 2 -mmu_hash_lock: - .space 4 -#endif /* CONFIG_SMP */ - -/* - * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. - * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG_THREAD contains the physical address of the current task's thread. - * - * Returns to the caller if the access is illegal or there is no - * mapping for the address. Otherwise it places an appropriate PTE - * in the hash table and returns from the exception. - * Uses r0, r3 - r6, r8, r10, ctr, lr. - */ - .text -_GLOBAL(hash_page) -#ifdef CONFIG_SMP - lis r8, (mmu_hash_lock - PAGE_OFFSET)@h - ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l - lis r0,0x0fff - b 10f -11: lwz r6,0(r8) - cmpwi 0,r6,0 - bne 11b -10: lwarx r6,0,r8 - cmpwi 0,r6,0 - bne- 11b - stwcx. 
r0,0,r8 - bne- 10b - isync -#endif - /* Get PTE (linux-style) and check access */ - lis r0,KERNELBASE@h /* check if kernel address */ - cmplw 0,r4,r0 - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ - blt+ 112f /* assume user more likely */ - lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: -#ifndef CONFIG_PTE_64BIT - rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ - lwz r8,0(r5) /* get pmd entry */ - rlwinm. r8,r8,0,0,19 /* extract address of pte page */ -#else - rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ - lwzx r8,r8,r5 /* Get L1 entry */ - rlwinm. r8,r8,0,0,20 /* extract pt base address */ -#endif -#ifdef CONFIG_SMP - beq- hash_page_out /* return if no mapping */ -#else - /* XXX it seems like the 601 will give a machine fault on the - rfi if its alignment is wrong (bottom 4 bits of address are - 8 or 0xc) and we have had a not-taken conditional branch - to the address following the rfi. */ - beqlr- -#endif -#ifndef CONFIG_PTE_64BIT - rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ -#else - rlwimi r8,r4,23,20,28 /* compute pte address */ -#endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE - - /* - * Update the linux PTE atomically. We do the lwarx up-front - * because almost always, there won't be a permission violation - * and there won't already be an HPTE, and thus we will have - * to update the PTE to set _PAGE_HASHPTE. -- paulus. - * - * If PTE_64BIT is set, the low word is the flags word; use that - * word for locking since it contains all the interesting bits. - */ -#if (PTE_FLAGS_OFFSET != 0) - addi r8,r8,PTE_FLAGS_OFFSET -#endif -retry: - lwarx r6,0,r8 /* get linux-style pte, flag word */ - andc. r5,r3,r6 /* check access & ~permission */ -#ifdef CONFIG_SMP - bne- hash_page_out /* return if access not permitted */ -#else - bnelr- -#endif - or r5,r0,r6 /* set accessed/dirty bits */ -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r10,r6,r8 /* create false data dependency */ - subi r10,r10,PTE_FLAGS_OFFSET - lwzx r10,r6,r10 /* Get upper PTE word */ -#else - lwz r10,-PTE_FLAGS_OFFSET(r8) -#endif /* CONFIG_SMP */ -#endif /* CONFIG_PTE_64BIT */ - stwcx. r5,0,r8 /* attempt to update PTE */ - bne- retry /* retry if someone got there first */ - - mfsrin r3,r4 /* get segment reg for segment */ - mfctr r0 - stw r0,_CTR(r11) - bl create_hpte /* add the hash table entry */ - -#ifdef CONFIG_SMP - eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha - li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) -#endif - - /* Return from the exception */ - lwz r5,_CTR(r11) - mtctr r5 - lwz r0,GPR0(r11) - lwz r8,GPR8(r11) - b fast_exception_return - -#ifdef CONFIG_SMP -hash_page_out: - eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha - li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) - blr -#endif /* CONFIG_SMP */ - -/* - * Add an entry for a particular page to the hash table. - * - * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) - * - * We assume any necessary modifications to the pte (e.g. setting - * the accessed bit) have already been done and that there is actually - * a hash table in use (i.e. we're not on a 603). 
- */ -_GLOBAL(add_hash_page) - mflr r0 - stw r0,4(r1) - - /* Convert context and va to VSID */ - mulli r3,r3,897*16 /* multiply context by context skew */ - rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ - mulli r0,r0,0x111 /* multiply by ESID skew */ - add r3,r3,r0 /* note create_hpte trims to 24 bits */ - -#ifdef CONFIG_SMP - lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ - oris r8,r8,12 -#endif /* CONFIG_SMP */ - - /* - * We disable interrupts here, even on UP, because we don't - * want to race with hash_page, and because we want the - * _PAGE_HASHPTE bit to be a reliable indication of whether - * the HPTE exists (or at least whether one did once). - * We also turn off the MMU for data accesses so that we - * we can't take a hash table miss (assuming the code is - * covered by a BAT). -- paulus - */ - mfmsr r9 - SYNC - rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear MSR_DR */ - mtmsr r0 - SYNC_601 - isync - -#ifdef CONFIG_SMP - lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l -10: lwarx r0,0,r6 /* take the mmu_hash_lock */ - cmpi 0,r0,0 - bne- 11f - stwcx. r8,0,r6 - beq+ 12f -11: lwz r0,0(r6) - cmpi 0,r0,0 - beq 10b - b 11b -12: isync -#endif - - /* - * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. - * If _PAGE_HASHPTE was already set, we don't replace the existing - * HPTE, so we just unlock and return. - */ - mr r8,r5 -#ifndef CONFIG_PTE_64BIT - rlwimi r8,r4,22,20,29 -#else - rlwimi r8,r4,23,20,28 - addi r8,r8,PTE_FLAGS_OFFSET -#endif -1: lwarx r6,0,r8 - andi. r0,r6,_PAGE_HASHPTE - bne 9f /* if HASHPTE already set, done */ -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r10,r6,r8 /* create false data dependency */ - subi r10,r10,PTE_FLAGS_OFFSET - lwzx r10,r6,r10 /* Get upper PTE word */ -#else - lwz r10,-PTE_FLAGS_OFFSET(r8) -#endif /* CONFIG_SMP */ -#endif /* CONFIG_PTE_64BIT */ - ori r5,r6,_PAGE_HASHPTE - stwcx. r5,0,r8 - bne- 1b - - bl create_hpte - -9: -#ifdef CONFIG_SMP - lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l - eieio - li r0,0 - stw r0,0(r6) /* clear mmu_hash_lock */ -#endif - - /* reenable interrupts and DR */ - mtmsr r9 - SYNC_601 - isync - - lwz r0,4(r1) - mtlr r0 - blr - -/* - * This routine adds a hardware PTE to the hash table. - * It is designed to be called with the MMU either on or off. - * r3 contains the VSID, r4 contains the virtual address, - * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE). r10 contains the - * upper half of the PTE if CONFIG_PTE_64BIT. - * On SMP, the caller should have the mmu_hash_lock held. - * We assume that the caller has (or will) set the _PAGE_HASHPTE - * bit in the linux PTE in memory. The value passed in r6 should - * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set - * this routine will skip the search for an existing HPTE. - * This procedure modifies r0, r3 - r6, r8, cr0. - * -- paulus. - * - * For speed, 4 of the instructions get patched once the size and - * physical address of the hash table are known. These definitions - * of Hash_base and Hash_bits below are just an example. - */ -Hash_base = 0xc0180000 -Hash_bits = 12 /* e.g. 
256kB hash table */ -Hash_msk = (((1 << Hash_bits) - 1) * 64) - -/* defines for the PTE format for 32-bit PPCs */ -#define HPTE_SIZE 8 -#define PTEG_SIZE 64 -#define LG_PTEG_SIZE 6 -#define LDPTEu lwzu -#define LDPTE lwz -#define STPTE stw -#define CMPPTE cmpw -#define PTE_H 0x40 -#define PTE_V 0x80000000 -#define TST_V(r) rlwinm. r,r,0,0,0 -#define SET_V(r) oris r,r,PTE_V@h -#define CLR_V(r,t) rlwinm r,r,0,1,31 - -#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) -#define HASH_RIGHT 31-LG_PTEG_SIZE - -_GLOBAL(create_hpte) - /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ - rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ - and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ -BEGIN_FTR_SECTION - rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ -END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) -#ifdef CONFIG_PTE_64BIT - /* Put the XPN bits into the PTE */ - rlwimi r8,r10,8,20,22 - rlwimi r8,r10,2,29,29 -#endif - - /* Construct the high word of the PPC-style PTE (r5) */ - rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ - rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ - SET_V(r5) /* set V (valid) bit */ - - patch_site 0f, patch__hash_page_A0 - patch_site 1f, patch__hash_page_A1 - patch_site 2f, patch__hash_page_A2 - /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ -1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ -2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ - xor r3,r3,r0 /* make primary hash */ - li r0,8 /* PTEs/group */ - - /* - * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search - * if it is clear, meaning that the HPTE isn't there already... - */ - andi. 
r6,r6,_PAGE_HASHPTE - beq+ 10f /* no PTE: go look for an empty slot */ - tlbie r4 - - lis r4, (htab_hash_searches - PAGE_OFFSET)@ha - lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) - addi r6,r6,1 /* count how many searches we do */ - stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) - - /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ - mtctr r0 - addi r4,r3,-HPTE_SIZE -1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ - CMPPTE 0,r6,r5 - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_slot - - patch_site 0f, patch__hash_page_B - /* Search the secondary PTEG for a matching PTE */ - ori r5,r5,PTE_H /* set H (secondary hash) bit */ -0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ - xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-HPTE_SIZE - mtctr r0 -2: LDPTEu r6,HPTE_SIZE(r4) - CMPPTE 0,r6,r5 - bdnzf 2,2b - beq+ found_slot - xori r5,r5,PTE_H /* clear H bit again */ - - /* Search the primary PTEG for an empty slot */ -10: mtctr r0 - addi r4,r3,-HPTE_SIZE /* search primary PTEG */ -1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ - TST_V(r6) /* test valid bit */ - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_empty - - /* update counter of times that the primary PTEG is full */ - lis r4, (primary_pteg_full - PAGE_OFFSET)@ha - lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) - addi r6,r6,1 - stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) - - patch_site 0f, patch__hash_page_C - /* Search the secondary PTEG for an empty slot */ - ori r5,r5,PTE_H /* set H (secondary hash) bit */ -0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ - xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-HPTE_SIZE - mtctr r0 -2: LDPTEu r6,HPTE_SIZE(r4) - TST_V(r6) - bdnzf 2,2b - beq+ found_empty - xori r5,r5,PTE_H /* clear H bit again */ - - /* - * Choose an arbitrary slot in the primary PTEG to overwrite. - * Since both the primary and secondary PTEGs are full, and we - * have no information that the PTEs in the primary PTEG are - * more important or useful than those in the secondary PTEG, - * and we know there is a definite (although small) speed - * advantage to putting the PTE in the primary PTEG, we always - * put the PTE in the primary PTEG. - * - * In addition, we skip any slot that is mapping kernel text in - * order to avoid a deadlock when not using BAT mappings if - * trying to hash in the kernel hash code itself after it has - * already taken the hash table lock. This works in conjunction - * with pre-faulting of the kernel text. - * - * If the hash table bucket is full of kernel text entries, we'll - * lockup here but that shouldn't happen - */ - -1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ - lwz r6, (next_slot - PAGE_OFFSET)@l(r4) - addi r6,r6,HPTE_SIZE /* search for candidate */ - andi. r6,r6,7*HPTE_SIZE - stw r6,next_slot@l(r4) - add r4,r3,r6 - LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ - clrrwi r0,r0,12 - lis r6,etext@h - ori r6,r6,etext@l /* get etext */ - tophys(r6,r6) - cmpl cr0,r0,r6 /* compare and try again */ - blt 1b - -#ifndef CONFIG_SMP - /* Store PTE in PTEG */ -found_empty: - STPTE r5,0(r4) -found_slot: - STPTE r8,HPTE_SIZE/2(r4) - -#else /* CONFIG_SMP */ -/* - * Between the tlbie above and updating the hash table entry below, - * another CPU could read the hash table entry and put it in its TLB. - * There are 3 cases: - * 1. using an empty slot - * 2. updating an earlier entry to change permissions (i.e. enable write) - * 3. 
taking over the PTE for an unrelated address - * - * In each case it doesn't really matter if the other CPUs have the old - * PTE in their TLB. So we don't need to bother with another tlbie here, - * which is convenient as we've overwritten the register that had the - * address. :-) The tlbie above is mainly to make sure that this CPU comes - * and gets the new PTE from the hash table. - * - * We do however have to make sure that the PTE is never in an invalid - * state with the V bit set. - */ -found_empty: -found_slot: - CLR_V(r5,r0) /* clear V (valid) bit in PTE */ - STPTE r5,0(r4) - sync - TLBSYNC - STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ - sync - SET_V(r5) - STPTE r5,0(r4) /* finally set V bit in PTE */ -#endif /* CONFIG_SMP */ - - sync /* make sure pte updates get to memory */ - blr - - .section .bss - .align 2 -next_slot: - .space 4 -primary_pteg_full: - .space 4 -htab_hash_searches: - .space 4 - .previous - -/* - * Flush the entry for a particular page from the hash table. - * - * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, - * int count) - * - * We assume that there is a hash table in use (Hash != 0). - */ -_GLOBAL(flush_hash_pages) - /* - * We disable interrupts here, even on UP, because we want - * the _PAGE_HASHPTE bit to be a reliable indication of - * whether the HPTE exists (or at least whether one did once). - * We also turn off the MMU for data accesses so that we - * we can't take a hash table miss (assuming the code is - * covered by a BAT). -- paulus - */ - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear MSR_DR */ - mtmsr r0 - SYNC_601 - isync - - /* First find a PTE in the range that has _PAGE_HASHPTE set */ -#ifndef CONFIG_PTE_64BIT - rlwimi r5,r4,22,20,29 -#else - rlwimi r5,r4,23,20,28 -#endif -1: lwz r0,PTE_FLAGS_OFFSET(r5) - cmpwi cr1,r6,1 - andi. r0,r0,_PAGE_HASHPTE - bne 2f - ble cr1,19f - addi r4,r4,0x1000 - addi r5,r5,PTE_SIZE - addi r6,r6,-1 - b 1b - - /* Convert context and va to VSID */ -2: mulli r3,r3,897*16 /* multiply context by context skew */ - rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ - mulli r0,r0,0x111 /* multiply by ESID skew */ - add r3,r3,r0 /* note code below trims to 24 bits */ - - /* Construct the high word of the PPC-style PTE (r11) */ - rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ - rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ - SET_V(r11) /* set V (valid) bit */ - -#ifdef CONFIG_SMP - lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l - lwz r8,TASK_CPU(r2) - oris r8,r8,9 -10: lwarx r0,0,r9 - cmpi 0,r0,0 - bne- 11f - stwcx. r8,0,r9 - beq+ 12f -11: lwz r0,0(r9) - cmpi 0,r0,0 - beq 10b - b 11b -12: isync -#endif - - /* - * Check the _PAGE_HASHPTE bit in the linux PTE. If it is - * already clear, we're done (for this pte). If not, - * clear it (atomically) and proceed. -- paulus. - */ -#if (PTE_FLAGS_OFFSET != 0) - addi r5,r5,PTE_FLAGS_OFFSET -#endif -33: lwarx r8,0,r5 /* fetch the pte flags word */ - andi. r0,r8,_PAGE_HASHPTE - beq 8f /* done if HASHPTE is already clear */ - rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ - stwcx. 
r8,0,r5 /* update the pte */ - bne- 33b - - patch_site 0f, patch__flush_hash_A0 - patch_site 1f, patch__flush_hash_A1 - patch_site 2f, patch__flush_hash_A2 - /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ -1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ -2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ - xor r8,r0,r8 /* make primary hash */ - - /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ - li r0,8 /* PTEs/group */ - mtctr r0 - addi r12,r8,-HPTE_SIZE -1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ - CMPPTE 0,r0,r11 - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ 3f - - patch_site 0f, patch__flush_hash_B - /* Search the secondary PTEG for a matching PTE */ - ori r11,r11,PTE_H /* set H (secondary hash) bit */ - li r0,8 /* PTEs/group */ -0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ - xori r12,r12,(-PTEG_SIZE & 0xffff) - addi r12,r12,-HPTE_SIZE - mtctr r0 -2: LDPTEu r0,HPTE_SIZE(r12) - CMPPTE 0,r0,r11 - bdnzf 2,2b - xori r11,r11,PTE_H /* clear H again */ - bne- 4f /* should rarely fail to find it */ - -3: li r0,0 - STPTE r0,0(r12) /* invalidate entry */ -4: sync - tlbie r4 /* in hw tlb too */ - sync - -8: ble cr1,9f /* if all ptes checked */ -81: addi r6,r6,-1 - addi r5,r5,PTE_SIZE - addi r4,r4,0x1000 - lwz r0,0(r5) /* check next pte */ - cmpwi cr1,r6,1 - andi. r0,r0,_PAGE_HASHPTE - bne 33b - bgt cr1,81b - -9: -#ifdef CONFIG_SMP - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ -#endif - -19: mtmsr r10 - SYNC_601 - isync - blr -EXPORT_SYMBOL(flush_hash_pages) - -/* - * Flush an entry from the TLB - */ -_GLOBAL(_tlbie) -#ifdef CONFIG_SMP - lwz r8,TASK_CPU(r2) - oris r8,r8,11 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. r8,0,r9 - bne- 10b - eieio - tlbie r3 - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - tlbie r3 - sync -#endif /* CONFIG_SMP */ - blr - -/* - * Flush the entire TLB. 603/603e only - */ -_GLOBAL(_tlbia) -#if defined(CONFIG_SMP) - lwz r8,TASK_CPU(r2) - oris r8,r8,10 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. r8,0,r9 - bne- 10b - sync - tlbia - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - sync - tlbia - sync -#endif /* CONFIG_SMP */ - blr diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c deleted file mode 100644 index 921c1e33e941..000000000000 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. This includes the 6xx, 7xx, 7xxx, - * and 8260 implementations but excludes the 8xx and 4xx. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include - -#include - -/* - * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs - * (virtual segment identifiers) for each context. Although the - * hardware supports 24-bit VSIDs, and thus >1 million contexts, - * we only use 32,768 of them. That is ample, since there can be - * at most around 30,000 tasks in the system anyway, and it means - * that we can use a bitmap to indicate which contexts are in use. - * Using a bitmap means that we entirely avoid all of the problems - * that we used to have when the context number overflowed, - * particularly on SMP systems. - * -- paulus. - */ -#define NO_CONTEXT ((unsigned long) -1) -#define LAST_CONTEXT 32767 -#define FIRST_CONTEXT 1 - -/* - * This function defines the mapping from contexts to VSIDs (virtual - * segment IDs). We use a skew on both the context and the high 4 bits - * of the 32-bit virtual address (the "effective segment ID") in order - * to spread out the entries in the MMU hash table. Note, if this - * function is changed then arch/ppc/mm/hashtable.S will have to be - * changed to correspond. - * - * - * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ - * & 0xffffff) - */ - -static unsigned long next_mmu_context; -static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; - -unsigned long __init_new_context(void) -{ - unsigned long ctx = next_mmu_context; - - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } - next_mmu_context = (ctx + 1) & LAST_CONTEXT; - - return ctx; -} -EXPORT_SYMBOL_GPL(__init_new_context); - -/* - * Set up the context for a new address space. - */ -int init_new_context(struct task_struct *t, struct mm_struct *mm) -{ - mm->context.id = __init_new_context(); - - return 0; -} - -/* - * Free a context ID. Make sure to call this with preempt disabled! - */ -void __destroy_context(unsigned long ctx) -{ - clear_bit(ctx, context_map); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -/* - * We're finished using the context for an address space. - */ -void destroy_context(struct mm_struct *mm) -{ - preempt_disable(); - if (mm->context.id != NO_CONTEXT) { - __destroy_context(mm->context.id); - mm->context.id = NO_CONTEXT; - } - preempt_enable(); -} - -/* - * Initialize the context management stuff. - */ -void __init mmu_context_init(void) -{ - /* Reserve context 0 for kernel use */ - context_map[0] = (1 << FIRST_CONTEXT) - 1; - next_mmu_context = FIRST_CONTEXT; -} diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c deleted file mode 100644 index 1db55159031c..000000000000 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ /dev/null @@ -1,419 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. 
This includes the 6xx, 7xx, 7xxx, - * and 8260 implementations but excludes the 8xx and 4xx. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -struct hash_pte *Hash, *Hash_end; -unsigned long Hash_size, Hash_mask; -unsigned long _SDR1; - -struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ - -struct batrange { /* stores address ranges mapped by BATs */ - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} bat_addrs[8]; - -/* - * Return PA for this VA if it is mapped by a BAT, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) - if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) - return bat_addrs[b].phys + (va - bat_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) - if (pa >= bat_addrs[b].phys - && pa < (bat_addrs[b].limit-bat_addrs[b].start) - +bat_addrs[b].phys) - return bat_addrs[b].start+(pa-bat_addrs[b].phys); - return 0; -} - -static int find_free_bat(void) -{ - int b; - - if (cpu_has_feature(CPU_FTR_601)) { - for (b = 0; b < 4; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[0].batl & 0x40)) - return b; - } - } else { - int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - - for (b = 0; b < n; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[1].batu & 3)) - return b; - } - } - return -1; -} - -static unsigned int block_size(unsigned long base, unsigned long top) -{ - unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; - unsigned int block_shift = (fls(top - base) - 1) & 31; - - return min3(max_size, 1U << base_shift, 1U << block_shift); -} - -/* - * Set up one of the IBAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 2 between 128k and 256M. - * Only for 603+ ... - */ -static void setibat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, pgprot_t prot) -{ - unsigned int bl = (size >> 17) - 1; - int wimgxpp; - struct ppc_bat *bat = BATS[index]; - unsigned long flags = pgprot_val(prot); - - if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) - flags &= ~_PAGE_COHERENT; - - wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? 
BPP_RX : BPP_XX); - bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[0].batu |= 1; /* Vp = 1 */ -} - -static void clearibat(int index) -{ - struct ppc_bat *bat = BATS[index]; - - bat[0].batu = 0; - bat[0].batl = 0; -} - -static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) -{ - int idx; - - while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); - - if (size < 128 << 10) - break; - setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); - base += size; - } - - return base; -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - int done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; - - if (__map_without_bats) { - pr_debug("RAM mapped without BATs\n"); - return base; - } - - if (!strict_kernel_rwx_enabled() || base >= border || top <= border) - return __mmu_mapin_ram(base, top); - - done = __mmu_mapin_ram(base, border); - if (done != border - base) - return done; - - return done + __mmu_mapin_ram(border, top); -} - -void mmu_mark_initmem_nx(void) -{ - int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - int i; - unsigned long base = (unsigned long)_stext - PAGE_OFFSET; - unsigned long top = (unsigned long)_etext - PAGE_OFFSET; - unsigned long size; - - if (cpu_has_feature(CPU_FTR_601)) - return; - - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { - size = block_size(base, top); - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); - base += size; - } - if (base < top) { - size = block_size(base, top); - size = max(size, 128UL << 10); - if ((top - base) > size) { - if (strict_kernel_rwx_enabled()) - pr_warn("Kernel _etext not properly aligned\n"); - size <<= 1; - } - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); - base += size; - } - for (; i < nb; i++) - clearibat(i); - - update_bats(); - - for (i = TASK_SIZE >> 28; i < 16; i++) { - /* Do not set NX on VM space for modules */ - if (IS_ENABLED(CONFIG_MODULES) && - (VMALLOC_START & 0xf0000000) == i << 28) - break; - mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); - } -} - -void mmu_mark_rodata_ro(void) -{ - int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - int i; - - if (cpu_has_feature(CPU_FTR_601)) - return; - - for (i = 0; i < nb; i++) { - struct ppc_bat *bat = BATS[i]; - - if (bat_addrs[i].start < (unsigned long)__init_begin) - bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; - } - - update_bats(); -} - -/* - * Set up one of the I/D BAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 2 between 128k and 256M. - * On 603+, only set IBAT when _PAGE_EXEC is set - */ -void __init setbat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, pgprot_t prot) -{ - unsigned int bl; - int wimgxpp; - struct ppc_bat *bat = BATS[index]; - unsigned long flags = pgprot_val(prot); - - if ((flags & _PAGE_NO_CACHE) || - (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) - flags &= ~_PAGE_COHERENT; - - bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { - /* 603, 604, etc. */ - /* Do DBAT first */ - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? 
BPP_RW: BPP_RX; - bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[1].batu |= 1; /* Vp = 1 */ - if (flags & _PAGE_GUARDED) { - /* G bit must be zero in IBATs */ - flags &= ~_PAGE_EXEC; - } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; - } else { - /* 601 cpu */ - if (bl > BL_8M) - bl = BL_8M; - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT); - wimgxpp |= (flags & _PAGE_RW)? - ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; - bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ - bat->batl = phys | bl | 0x40; /* V=1 */ - } - - bat_addrs[index].start = virt; - bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; - bat_addrs[index].phys = phys; -} - -/* - * Preload a translation in the hash table - */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) -{ - pmd_t *pmd; - - if (!Hash) - return; - pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); - if (!pmd_none(*pmd)) - add_hash_page(mm->context.id, ea, pmd_val(*pmd)); -} - -/* - * Initialize the hash table and patch the instructions in hashtable.S. - */ -void __init MMU_init_hw(void) -{ - unsigned int hmask, mb, mb2; - unsigned int n_hpteg, lg_n_hpteg; - - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; - - if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); - -#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ -#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) -#define MIN_N_HPTEG 1024 /* min 64kB hash table */ - - /* - * Allow 1 HPTE (1/8 HPTEG) for each page of memory. - * This is less than the recommended amount, but then - * Linux ain't AIX. - */ - n_hpteg = total_memory / (PAGE_SIZE * 8); - if (n_hpteg < MIN_N_HPTEG) - n_hpteg = MIN_N_HPTEG; - lg_n_hpteg = __ilog2(n_hpteg); - if (n_hpteg & (n_hpteg - 1)) { - ++lg_n_hpteg; /* round up if not power of 2 */ - n_hpteg = 1 << lg_n_hpteg; - } - Hash_size = n_hpteg << LG_HPTEG_SIZE; - - /* - * Find some memory for the hash table. 
- */ - if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); - _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - - Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); - - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", - (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - - - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); - Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; - if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; - - modify_instruction_site(&patch__hash_page_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); - modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); - modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); - - /* - * Patch up the instructions in hashtable.S:flush_hash_page - */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); - modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); - else /* Anything else has 256M mapped */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); -} - -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) -{ - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - if (cpu_has_feature(CPU_FTR_601)) - pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); - - if (disabled) - pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void __init setup_kuap(bool disabled) -{ - pr_info("Activating Kernel Userspace Access Protection\n"); - - if (disabled) - pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); -} -#endif diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c deleted file mode 100644 index 8d56f0417f87..000000000000 --- a/arch/powerpc/mm/tlb_hash32.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * This file contains the routines for TLB flushing. - * On machines where the MMU uses a hash table to store virtual to - * physical translations, these routines flush entries from the - * hash table also. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * Called when unmapping pages to flush entries from the TLB/hash table. - */ -void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) -{ - unsigned long ptephys; - - if (Hash) { - ptephys = __pa(ptep) & PAGE_MASK; - flush_hash_pages(mm->context.id, addr, ptephys, 1); - } -} -EXPORT_SYMBOL(flush_hash_entry); - -/* - * Called at the end of a mmu_gather operation to make sure the - * TLB flush is completely done. - */ -void tlb_flush(struct mmu_gather *tlb) -{ - if (!Hash) { - /* - * 603 needs to flush the whole TLB here since - * it doesn't use a hash table. - */ - _tlbia(); - } -} - -/* - * TLB flushing: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * since the hardware hash table functions as an extension of the - * tlb as far as the linux tables are concerned, flush it too. - * -- Cort - */ - -static void flush_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - pmd_t *pmd; - unsigned long pmd_end; - int count; - unsigned int ctx = mm->context.id; - - if (!Hash) { - _tlbia(); - return; - } - start &= PAGE_MASK; - if (start >= end) - return; - end = (end - 1) | ~PAGE_MASK; - pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); - for (;;) { - pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; - if (pmd_end > end) - pmd_end = end; - if (!pmd_none(*pmd)) { - count = ((pmd_end - start) >> PAGE_SHIFT) + 1; - flush_hash_pages(ctx, start, pmd_val(*pmd), count); - } - if (pmd_end == end) - break; - start = pmd_end + 1; - ++pmd; - } -} - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_range(&init_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Flush all the (user) entries for the address space described by mm. - */ -void flush_tlb_mm(struct mm_struct *mm) -{ - struct vm_area_struct *mp; - - if (!Hash) { - _tlbia(); - return; - } - - /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_sem. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. - */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) - flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct mm_struct *mm; - pmd_t *pmd; - - if (!Hash) { - _tlbie(vmaddr); - return; - } - mm = (vmaddr < TASK_SIZE)? 
vma->vm_mm: &init_mm; - pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); - if (!pmd_none(*pmd)) - flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); -} -EXPORT_SYMBOL(flush_tlb_page); - -/* - * For each address in the range, find the pte for the address - * and check _PAGE_HASHPTE bit; if it is set, find and destroy - * the corresponding HPTE. - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - flush_range(vma->vm_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_range); - -void __init early_init_mmu(void) -{ -} -- cgit v1.2.3-59-g8ed1b From 27e23b5f5f6f22292347901303aab2a1d458bcb5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:02 +0000 Subject: powerpc/mm: Move nohash specifics in subdirectory mm/nohash Many files in arch/powerpc/mm are only for nohash. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Shorten new filenames] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 159 ---- arch/powerpc/mm/44x_mmu.c | 246 ----- arch/powerpc/mm/8xx_mmu.c | 239 ----- arch/powerpc/mm/Makefile | 17 +- arch/powerpc/mm/fsl_booke_mmu.c | 326 ------- arch/powerpc/mm/hugetlbpage-book3e.c | 206 ----- arch/powerpc/mm/mmu_context_nohash.c | 497 ----------- arch/powerpc/mm/nohash/40x.c | 159 ++++ arch/powerpc/mm/nohash/44x.c | 246 +++++ arch/powerpc/mm/nohash/8xx.c | 239 +++++ arch/powerpc/mm/nohash/Makefile | 18 + arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 206 +++++ arch/powerpc/mm/nohash/book3e_pgtable.c | 120 +++ arch/powerpc/mm/nohash/fsl_booke.c | 326 +++++++ arch/powerpc/mm/nohash/mmu_context.c | 497 +++++++++++ arch/powerpc/mm/nohash/tlb.c | 810 +++++++++++++++++ arch/powerpc/mm/nohash/tlb_low.S | 491 ++++++++++ arch/powerpc/mm/nohash/tlb_low_64e.S | 1280 +++++++++++++++++++++++++++ arch/powerpc/mm/pgtable-book3e.c | 120 --- arch/powerpc/mm/tlb_low_64e.S | 1280 --------------------------- arch/powerpc/mm/tlb_nohash.c | 810 ----------------- arch/powerpc/mm/tlb_nohash_low.S | 491 ---------- 22 files changed, 4393 insertions(+), 4390 deletions(-) delete mode 100644 arch/powerpc/mm/40x_mmu.c delete mode 100644 arch/powerpc/mm/44x_mmu.c delete mode 100644 arch/powerpc/mm/8xx_mmu.c delete mode 100644 arch/powerpc/mm/fsl_booke_mmu.c delete mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c delete mode 100644 arch/powerpc/mm/mmu_context_nohash.c create mode 100644 arch/powerpc/mm/nohash/40x.c create mode 100644 arch/powerpc/mm/nohash/44x.c create mode 100644 arch/powerpc/mm/nohash/8xx.c create mode 100644 arch/powerpc/mm/nohash/Makefile create mode 100644 arch/powerpc/mm/nohash/book3e_hugetlbpage.c create mode 100644 arch/powerpc/mm/nohash/book3e_pgtable.c create mode 100644 arch/powerpc/mm/nohash/fsl_booke.c create mode 100644 arch/powerpc/mm/nohash/mmu_context.c create mode 100644 arch/powerpc/mm/nohash/tlb.c create mode 100644 arch/powerpc/mm/nohash/tlb_low.S create mode 100644 arch/powerpc/mm/nohash/tlb_low_64e.S delete mode 100644 arch/powerpc/mm/pgtable-book3e.c delete mode 100644 arch/powerpc/mm/tlb_low_64e.S delete mode 100644 arch/powerpc/mm/tlb_nohash.c delete mode 100644 arch/powerpc/mm/tlb_nohash_low.S (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c deleted file mode 100644 index 460459b6f53e..000000000000 --- a/arch/powerpc/mm/40x_mmu.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern int __map_without_ltlbs; -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - /* - * The Zone Protection Register (ZPR) defines how protection will - * be applied to every page which is a member of a given zone. At - * present, we utilize only two of the 4xx's zones. - * The zone index bits (of ZSEL) in the PTE are used for software - * indicators, except the LSB. For user access, zone 1 is used, - * for kernel access, zone 0 is used. We set all but zone 1 - * to zero, allowing only kernel access as indicated in the PTE. - * For zone 1, we set a 01 binary (a value of 10 will not work) - * to allow user access as indicated in the PTE. This also allows - * kernel access as indicated in the PTE. - */ - - mtspr(SPRN_ZPR, 0x10000000); - - flush_instruction_cache(); - - /* - * Set up the real-mode cache parameters for the exception vector - * handlers (which are run in real-mode). - */ - - mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ - - /* - * Cache instruction and data space where the exception - * vectors and the kernel live in real-mode. - */ - - mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ - mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ -} - -#define LARGE_PAGE_SIZE_16M (1<<24) -#define LARGE_PAGE_SIZE_4M (1<<22) - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long v, s, mapped; - phys_addr_t p; - - v = KERNELBASE; - p = 0; - s = total_lowmem; - - if (__map_without_ltlbs) - return 0; - - while (s >= LARGE_PAGE_SIZE_16M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; - - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - - v += LARGE_PAGE_SIZE_16M; - p += LARGE_PAGE_SIZE_16M; - s -= LARGE_PAGE_SIZE_16M; - } - - while (s >= LARGE_PAGE_SIZE_4M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; - - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp = __pmd(val); - - v += LARGE_PAGE_SIZE_4M; - p += LARGE_PAGE_SIZE_4M; - s -= LARGE_PAGE_SIZE_4M; - } - - mapped = total_lowmem - s; - - /* If the size of RAM is not an exact power of two, we may not - * have covered RAM in its entirety with 16 and 4 MiB - * pages. 
Consequently, restrict the top end of RAM currently
- * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
- * coverage with normal-sized pages (or other reasons) do not
- * attempt to allocate outside the allowed range.
- */
- memblock_set_current_limit(mapped);
-
- return mapped;
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 40x can only access 16MB at the moment (see head_40x.S) */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
-}
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
deleted file mode 100644
index c07983ebc02e..000000000000
--- a/arch/powerpc/mm/44x_mmu.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Modifications by Matt Porter (mporter@mvista.com) to support
- * PPC44x Book E processors.
- *
- * This file contains the routines for initializing the MMU
- * on the 4xx series of chips.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-
-/* Used by the 44x TLB replacement exception handler.
- * Just needed it declared someplace.
- */
-unsigned int tlb_44x_index; /* = 0 */
-unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
-int icache_44x_need_flush;
-
-unsigned long tlb_47x_boltmap[1024/8];
-
-static void ppc44x_update_tlb_hwater(void)
-{
- /* The TLB miss handlers hard code the watermark in a cmpli
- * instruction to improve performance rather than loading it
- * from the global variable.
Thus, we patch the instructions - * in the 2 TLB miss handlers when updating the value - */ - modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater); - modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater); -} - -/* - * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU - */ -static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) -{ - unsigned int entry = tlb_44x_hwater--; - - ppc44x_update_tlb_hwater(); - - mtspr(SPRN_MMUCR, 0); - - __asm__ __volatile__( - "tlbwe %2,%3,%4\n" - "tlbwe %1,%3,%5\n" - "tlbwe %0,%3,%6\n" - : - : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), - "r" (phys), - "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), - "r" (entry), - "i" (PPC44x_TLB_PAGEID), - "i" (PPC44x_TLB_XLAT), - "i" (PPC44x_TLB_ATTRIB)); -} - -static int __init ppc47x_find_free_bolted(void) -{ - unsigned int mmube0 = mfspr(SPRN_MMUBE0); - unsigned int mmube1 = mfspr(SPRN_MMUBE1); - - if (!(mmube0 & MMUBE0_VBE0)) - return 0; - if (!(mmube0 & MMUBE0_VBE1)) - return 1; - if (!(mmube0 & MMUBE0_VBE2)) - return 2; - if (!(mmube1 & MMUBE1_VBE3)) - return 3; - if (!(mmube1 & MMUBE1_VBE4)) - return 4; - if (!(mmube1 & MMUBE1_VBE5)) - return 5; - return -1; -} - -static void __init ppc47x_update_boltmap(void) -{ - unsigned int mmube0 = mfspr(SPRN_MMUBE0); - unsigned int mmube1 = mfspr(SPRN_MMUBE1); - - if (mmube0 & MMUBE0_VBE0) - __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube0 & MMUBE0_VBE1) - __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube0 & MMUBE0_VBE2) - __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE3) - __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE4) - __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE5) - __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, - tlb_47x_boltmap); -} - -/* - * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU - */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) -{ - unsigned int rA; - int bolted; - - /* Base rA is HW way select, way 0, bolted bit set */ - rA = 0x88000000; - - /* Look for a bolted entry slot */ - bolted = ppc47x_find_free_bolted(); - BUG_ON(bolted < 0); - - /* Insert bolted slot number */ - rA |= bolted << 24; - - pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", - virt, phys, bolted); - - mtspr(SPRN_MMUCR, 0); - - __asm__ __volatile__( - "tlbwe %2,%3,0\n" - "tlbwe %1,%3,1\n" - "tlbwe %0,%3,2\n" - : - : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | - PPC47x_TLB2_SX -#ifdef CONFIG_SMP - | PPC47x_TLB2_M -#endif - ), - "r" (phys), - "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), - "r" (rA)); -} - -void __init MMU_init_hw(void) -{ - /* This is not useful on 47x but won't hurt either */ - ppc44x_update_tlb_hwater(); - - flush_instruction_cache(); -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long addr; - unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); - - /* Pin in enough TLBs to cover any lowmem not covered by the - * initial 256M mapping established in head_44x.S */ - for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; - addr += PPC_PIN_SIZE) { - if (mmu_has_feature(MMU_FTR_TYPE_47x)) - ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); - else - ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); - } - if (mmu_has_feature(MMU_FTR_TYPE_47x)) { - ppc47x_update_boltmap(); - -#ifdef 
DEBUG
- {
- int i;
-
- printk(KERN_DEBUG "bolted entries: ");
- for (i = 0; i < 255; i++) {
- if (test_bit(i, tlb_47x_boltmap))
- printk("%d ", i);
- }
- printk("\n");
- }
-#endif /* DEBUG */
- }
- return total_lowmem;
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- u64 size;
-
-#ifndef CONFIG_NONSTATIC_KERNEL
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-#endif
-
- /* 44x has a 256M TLB entry pinned at boot */
- size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE));
- memblock_set_current_limit(first_memblock_base + size);
-}
-
-#ifdef CONFIG_SMP
-void __init mmu_init_secondary(int cpu)
-{
- unsigned long addr;
- unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
-
- /* Pin in enough TLBs to cover any lowmem not covered by the
- * initial 256M mapping established in head_44x.S
- *
- * WARNING: This is called with only the first 256M of the
- * linear mapping in the TLB and we can't take faults yet
- * so beware of what this code uses. It runs off a temporary
- * stack. current (r2) isn't initialized, smp_processor_id()
- * will not work, current thread info isn't accessible, ...
- */
- for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
- addr += PPC_PIN_SIZE) {
- if (mmu_has_feature(MMU_FTR_TYPE_47x))
- ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
- else
- ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
- }
-}
-#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
deleted file mode 100644
index 70d55b615b62..000000000000
--- a/arch/powerpc/mm/8xx_mmu.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * This file contains the routines for initializing the MMU
- * on the 8xx series of chips.
- * -- christophe
- *
- * Derived from arch/powerpc/mm/40x_mmu.c:
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include
-#include
-#include
-#include
-
-#include
-
-#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT)
-
-extern int __map_without_ltlbs;
-
-static unsigned long block_mapped_ram;
-
-/*
- * Return PA for this VA if it is in an area mapped with LTLBs.
- * Otherwise, returns 0
- */
-phys_addr_t v_block_mapped(unsigned long va)
-{
- unsigned long p = PHYS_IMMR_BASE;
-
- if (__map_without_ltlbs)
- return 0;
- if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
- return p + va - VIRT_IMMR_BASE;
- if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
- return __pa(va);
- return 0;
-}
-
-/*
- * Return VA for a given PA mapped with LTLBs or 0 if not mapped
- */
-unsigned long p_block_mapped(phys_addr_t pa)
-{
- unsigned long p = PHYS_IMMR_BASE;
-
- if (__map_without_ltlbs)
- return 0;
- if (pa >= p && pa < p + IMMR_SIZE)
- return VIRT_IMMR_BASE + pa - p;
- if (pa < block_mapped_ram)
- return (unsigned long)__va(pa);
- return 0;
-}
-
-#define LARGE_PAGE_SIZE_8M (1<<23)
-
-/*
- * MMU_init_hw does the chip-specific initialization of the MMU hardware.
- */
-void __init MMU_init_hw(void)
-{
- /* Pin up to the first three 8Mb pages after IMMR in the DTLB table */
- if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
- unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
- unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
- int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ?
29 : 28; - unsigned long addr = 0; - unsigned long mem = total_lowmem; - - for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { - mtspr(SPRN_MD_CTR, ctr | (i << 8)); - mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); - mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); - addr += LARGE_PAGE_SIZE_8M; - mem -= LARGE_PAGE_SIZE_8M; - } - } -} - -static void __init mmu_mapin_immr(void) -{ - unsigned long p = PHYS_IMMR_BASE; - unsigned long v = VIRT_IMMR_BASE; - int offset; - - for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); -} - -static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) -{ - modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); -} - -static void mmu_patch_addis(s32 *site, long simm) -{ - unsigned int instr = *(unsigned int *)patch_site_addr(site); - - instr &= 0xffff0000; - instr |= ((unsigned long)simm) >> 16; - patch_instruction_site(site, instr); -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long mapped; - - if (__map_without_ltlbs) { - mapped = 0; - mmu_mapin_immr(); - if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) - patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); - } else { - mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, - _ALIGN(__pa(_einittext), 8 << 20)); - } - - mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); - mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); - - /* If the size of RAM is not an exact power of two, we may not - * have covered RAM in its entirety with 8 MiB - * pages. Consequently, restrict the top end of RAM currently - * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" - * coverage with normal-sized pages (or other reasons) do not - * attempt to allocate outside the allowed range. - */ - if (mapped) - memblock_set_current_limit(mapped); - - block_mapped_ram = mapped; - - return mapped; -} - -void mmu_mark_initmem_nx(void) -{ - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) - mmu_patch_addis(&patch__itlbmiss_linmem_top8, - -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) -{ - if (CONFIG_DATA_SHIFT < 23) - mmu_patch_addis(&patch__dtlbmiss_romem_top8, - -__pa(((unsigned long)_sinittext) & - ~(LARGE_PAGE_SIZE_8M - 1))); - mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); -} -#endif - -void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 8xx can only access 32MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); -} - -/* - * Set up to use a given MMU context. - * id is context number, pgd is PGD pointer. - * - * We place the physical address of the new task page directory loaded - * into the MMU base register, and set the ASID compare register with - * the new "context." 
- */ -void set_context(unsigned long id, pgd_t *pgd) -{ - s16 offset = (s16)(__pa(swapper_pg_dir)); - - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is passed as second argument. - */ - if (IS_ENABLED(CONFIG_BDI_SWITCH)) - abatron_pteptrs[1] = pgd; - - /* Register M_TWB will contain base address of level 1 table minus the - * lower part of the kernel PGDIR base address, so that all accesses to - * level 1 table are done relative to lower part of kernel PGDIR base - * address. - */ - mtspr(SPRN_M_TWB, __pa(pgd) - offset); - - /* Update context */ - mtspr(SPRN_M_CASID, id - 1); - /* sync */ - mb(); -} - -void flush_instruction_cache(void) -{ - isync(); - mtspr(SPRN_IC_CST, IDC_INVALL); - isync(); -} - -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) -{ - if (disabled) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - mtspr(SPRN_MI_AP, MI_APG_KUEP); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void __init setup_kuap(bool disabled) -{ - pr_info("Activating Kernel Userspace Access Protection\n"); - - if (disabled) - pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); - - mtspr(SPRN_MD_AP, MD_APG_KUAP); -} -#endif diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 68cb1e840b5e..08557bae6fa1 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,30 +8,15 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o -obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ - tlb_nohash_low.o -obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_40x) += 40x_mmu.o -obj-$(CONFIG_44x) += 44x_mmu.o -obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o -ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o -endif obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ - -# Disable kcov instrumentation on sensitive code -# This is necessary for booting with kcov enabled on book3e machines -KCOV_INSTRUMENT_tlb_nohash.o := n -KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c deleted file mode 100644 index 71a1a36751dd..000000000000 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Modifications by Kumar Gala (galak@kernel.crashing.org) to support - * E500 Book E processors. - * - * Copyright 2004,2010 Freescale Semiconductor, Inc. - * - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int tlbcam_index; - -#define NUM_TLBCAMS (64) -struct tlbcam TLBCAM[NUM_TLBCAMS]; - -struct tlbcamrange { - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} tlbcam_addrs[NUM_TLBCAMS]; - -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - -#ifdef CONFIG_FSL_BOOKE -/* - * Return PA for this VA if it is mapped by a CAM, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) - return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) - +tlbcam_addrs[b].phys) - return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); - return 0; -} -#endif - -/* - * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; - * in particular size must be a power of 4 between 4k and the max supported by - * an implementation; max may further be limited by what can be represented in - * an unsigned long (for example, 32-bit implementations cannot support a 4GB - * size). - */ -static void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned long size, unsigned long flags, unsigned int pid) -{ - unsigned int tsize; - - tsize = __ilog2(size) - 10; - -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) - if ((flags & _PAGE_NO_CACHE) == 0) - flags |= _PAGE_COHERENT; -#endif - - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); - TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); - TLBCAM[index].MAS2 = virt & PAGE_MASK; - - TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - TLBCAM[index].MAS7 = (u64)phys >> 32; - - /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? 
MAS3_UW : 0); - } - - tlbcam_addrs[index].start = virt; - tlbcam_addrs[index].limit = virt + size - 1; - tlbcam_addrs[index].phys = phys; -} - -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) -{ - unsigned int camsize = __ilog2(ram); - unsigned int align = __ffs(virt | phys); - unsigned long max_cam; - - if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; - camsize &= ~1U; - align &= ~1U; - } else { - /* Convert (2^max) kB to (2^max) bytes */ - max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; - } - - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - return 1UL << camsize; -} - -static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, - unsigned long ram, int max_cam_idx, - bool dryrun) -{ - int i; - unsigned long amount_mapped = 0; - - /* Calculate CAM values */ - for (i = 0; ram && i < max_cam_idx; i++) { - unsigned long cam_sz; - - cam_sz = calc_cam_sz(ram, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, - pgprot_val(PAGE_KERNEL_X), 0); - - ram -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - - if (dryrun) - return amount_mapped; - - loadcam_multi(0, i, max_cam_idx); - tlbcam_index = i; - -#ifdef CONFIG_PPC64 - get_paca()->tcd.esel_next = i; - get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - get_paca()->tcd.esel_first = i; -#endif - - return amount_mapped; -} - -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) -{ - unsigned long virt = PAGE_OFFSET; - phys_addr_t phys = memstart_addr; - - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); -} - -#ifdef CONFIG_PPC32 - -#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) -#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" -#endif - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; -} - -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
- */ -void __init MMU_init_hw(void) -{ - flush_instruction_cache(); -} - -void __init adjust_total_lowmem(void) -{ - unsigned long ram; - int i; - - /* adjust lowmem size to __max_low_memory */ - ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - - i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); - restore_to_as0(i, 0, 0, 1); - - pr_info("Memory CAM mapping: "); - for (i = 0; i < tlbcam_index - 1; i++) - pr_cont("%lu/", tlbcam_sz(i) >> 20); - pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, - (unsigned int)((total_lowmem - __max_low_memory) >> 20)); - - memblock_set_current_limit(memstart_addr + __max_low_memory); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - phys_addr_t limit = first_memblock_base + first_memblock_size; - - /* 64M mapped initially according to head_fsl_booke.S */ - memblock_set_current_limit(min_t(u64, limit, 0x04000000)); -} - -#ifdef CONFIG_RELOCATABLE -int __initdata is_second_reloc; -notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) -{ - unsigned long base = KERNELBASE; - - kernstart_addr = start; - if (is_second_reloc) { - virt_phys_offset = PAGE_OFFSET - memstart_addr; - return; - } - - /* - * Relocatable kernel support based on processing of dynamic - * relocation entries. Before we get the real memstart_addr, - * We will compute the virt_phys_offset like this: - * virt_phys_offset = stext.run - kernstart_addr - * - * stext.run = (KERNELBASE & ~0x3ffffff) + - * (kernstart_addr & 0x3ffffff) - * When we relocate, we have : - * - * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) - * - * hence: - * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - - * (kernstart_addr & ~0x3ffffff) - * - */ - start &= ~0x3ffffff; - base &= ~0x3ffffff; - virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), NULL); - /* - * We now get the memstart_addr, then we should check if this - * address is the same as what the PAGE_OFFSET map to now. If - * not we have to change the map of PAGE_OFFSET to memstart_addr - * and do a second relocation. - */ - if (start != memstart_addr) { - int n; - long offset = start - memstart_addr; - - is_second_reloc = 1; - n = switch_to_as1(); - /* map a 64M area for the second relocation */ - if (memstart_addr > start) - map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); - else - map_mem_in_cams_addr(start, PAGE_OFFSET + offset, - 0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); - restore_to_as0(n, offset, __va(dt_ptr), 1); - /* We should never reach here */ - panic("Relocation error"); - } -} -#endif -#endif diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c deleted file mode 100644 index f84ec46cdb26..000000000000 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC Huge TLB Page Support for Book3E MMU - * - * Copyright (C) 2009 David Gibson, IBM Corporation. 
- * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
- *
- */
-#include
-#include
-
-#include
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-#ifdef CONFIG_PPC64
-static inline int tlb1_next(void)
-{
- struct paca_struct *paca = get_paca();
- struct tlb_core_data *tcd;
- int this, next;
-
- tcd = paca->tcd_ptr;
- this = tcd->esel_next;
-
- next = this + 1;
- if (next >= tcd->esel_max)
- next = tcd->esel_first;
-
- tcd->esel_next = next;
- return this;
-}
-#else
-static inline int tlb1_next(void)
-{
- int index, ncams;
-
- ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
- index = this_cpu_read(next_tlbcam_idx);
-
- /* Just round-robin the entries and wrap when we hit the end */
- if (unlikely(index == ncams - 1))
- __this_cpu_write(next_tlbcam_idx, tlbcam_index);
- else
- __this_cpu_inc(next_tlbcam_idx);
-
- return index;
-}
-#endif /* !PPC64 */
-#endif /* FSL */
-
-static inline int mmu_get_tsize(int psize)
-{
- return mmu_psize_defs[psize].enc;
-}
-
-#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64)
-#include
-
-static inline void book3e_tlb_lock(void)
-{
- struct paca_struct *paca = get_paca();
- unsigned long tmp;
- int token = smp_processor_id() + 1;
-
- /*
- * Besides being unnecessary in the absence of SMT, this
- * check prevents trying to do lbarx/stbcx. on e5500 which
- * doesn't implement either feature.
- */
- if (!cpu_has_feature(CPU_FTR_SMT))
- return;
-
- asm volatile("1: lbarx %0, 0, %1;"
- "cmpwi %0, 0;"
- "bne 2f;"
- "stbcx. %2, 0, %1;"
- "bne 1b;"
- "b 3f;"
- "2: lbzx %0, 0, %1;"
- "cmpwi %0, 0;"
- "bne 2b;"
- "b 1b;"
- "3:"
- : "=&r" (tmp)
- : "r" (&paca->tcd_ptr->lock), "r" (token)
- : "memory");
-}
-
-static inline void book3e_tlb_unlock(void)
-{
- struct paca_struct *paca = get_paca();
-
- if (!cpu_has_feature(CPU_FTR_SMT))
- return;
-
- isync();
- paca->tcd_ptr->lock = 0;
-}
-#else
-static inline void book3e_tlb_lock(void)
-{
-}
-
-static inline void book3e_tlb_unlock(void)
-{
-}
-#endif
-
-static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
-{
- int found = 0;
-
- mtspr(SPRN_MAS6, pid << 16);
- if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
- asm volatile(
- "li %0,0\n"
- "tlbsx. 0,%1\n"
- "bne 1f\n"
- "li %0,1\n"
- "1:\n"
- : "=&r"(found) : "r"(ea));
- } else {
- asm volatile(
- "tlbsx 0,%1\n"
- "mfspr %0,0x271\n"
- "srwi %0,%0,31\n"
- : "=&r"(found) : "r"(ea));
- }
-
- return found;
-}
-
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
- pte_t pte)
-{
- unsigned long mas1, mas2;
- u64 mas7_3;
- unsigned long psize, tsize, shift;
- unsigned long flags;
- struct mm_struct *mm;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- int index;
-#endif
-
- if (unlikely(is_kernel_addr(ea)))
- return;
-
- mm = vma->vm_mm;
-
- psize = vma_mmu_pagesize(vma);
- shift = __ilog2(psize);
- tsize = shift - 10;
- /*
- * We can't be interrupted while we're setting up the MAS
- * registers or after we've confirmed that no tlb exists.
- */ - local_irq_save(flags); - - book3e_tlb_lock(); - - if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { - book3e_tlb_unlock(); - local_irq_restore(flags); - return; - } - -#ifdef CONFIG_PPC_FSL_BOOK3E - /* We have to use the CAM(TLB1) on FSL parts for hugepages */ - index = tlb1_next(); - mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif - - mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); - mas2 = ea & ~((1UL << shift) - 1); - mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; - mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; - if (!pte_dirty(pte)) - mas7_3 &= ~(MAS3_SW|MAS3_UW); - - mtspr(SPRN_MAS1, mas1); - mtspr(SPRN_MAS2, mas2); - - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } - - asm volatile ("tlbwe"); - - book3e_tlb_unlock(); - local_irq_restore(flags); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct hstate *hstate = hstate_file(vma->vm_file); - unsigned long tsize = huge_page_shift(hstate) - 10; - - __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); -} diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c deleted file mode 100644 index ae4505d5b4b8..000000000000 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU is not using the hash - * table, such as 8xx, 4xx, BookE's etc... - * - * Copyright 2008 Ben Herrenschmidt - * IBM Corp. - * - * Derived from previous arch/powerpc/mm/mmu_context.c - * and arch/powerpc/include/asm/mmu_context.h - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * TODO: - * - * - The global context lock will not scale very well - * - The maps should be dynamically allocated to allow for processors - * that support more PID bits at runtime - * - Implement flush_tlb_mm() by making the context stale and picking - * a new one - * - More aggressively clear stale map bits and maybe find some way to - * also clear mm->cpu_vm_mask bits when processes are migrated - */ - -//#define DEBUG_MAP_CONSISTENCY -//#define DEBUG_CLAMP_LAST_CONTEXT 31 -//#define DEBUG_HARDER - -/* We don't use DEBUG because it tends to be compiled in always nowadays - * and this would generate way too much output - */ -#ifdef DEBUG_HARDER -#define pr_hard(args...) printk(KERN_DEBUG args) -#define pr_hardcont(args...) printk(KERN_CONT args) -#else -#define pr_hard(args...) do { } while(0) -#define pr_hardcont(args...) do { } while(0) -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * The MPC8xx has only 16 contexts. We rotate through them on each task switch. - * A better way would be to keep track of tasks that own contexts, and implement - * an LRU usage. That way very active tasks don't always have to pay the TLB - * reload overhead. The kernel pages are mapped shared, so the kernel can run on - * behalf of any task that makes a kernel entry. 
Shared does not mean they are
- * not protected, just that the ASID comparison is not performed. -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these as a way of
- * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison
- * is disabled, so we can use a TID of zero to represent all kernel pages as
- * shared among all contexts. -- Dan
- *
- * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should
- * normally never have to steal though the facility is present if needed.
- * -- BenH
- */
-#define FIRST_CONTEXT 1
-#ifdef DEBUG_CLAMP_LAST_CONTEXT
-#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT
-#elif defined(CONFIG_PPC_8xx)
-#define LAST_CONTEXT 16
-#elif defined(CONFIG_PPC_47x)
-#define LAST_CONTEXT 65535
-#else
-#define LAST_CONTEXT 255
-#endif
-
-static unsigned int next_context, nr_free_contexts;
-static unsigned long *context_map;
-#ifdef CONFIG_SMP
-static unsigned long *stale_map[NR_CPUS];
-#endif
-static struct mm_struct **context_mm;
-static DEFINE_RAW_SPINLOCK(context_lock);
-
-#define CTX_MAP_SIZE \
- (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1))
-
-
-/* Steal a context from a task that has one at the moment.
- *
- * This is used when we are running out of available PID numbers
- * on the processors.
- *
- * This isn't an LRU system, it just frees up each context in
- * turn (sort-of pseudo-random replacement :). This would be the
- * place to implement an LRU scheme if anyone was motivated to do it.
- * -- paulus
- *
- * For context stealing, we use a slightly different approach for
- * SMP and UP. Basically, the UP one is simpler and doesn't use
- * the stale map as we can just flush the local CPU
- * -- benh
- */
-#ifdef CONFIG_SMP
-static unsigned int steal_context_smp(unsigned int id)
-{
- struct mm_struct *mm;
- unsigned int cpu, max, i;
-
- max = LAST_CONTEXT - FIRST_CONTEXT;
-
- /* Attempt to free next_context first and then loop until we manage */
- while (max--) {
- /* Pick up the victim mm */
- mm = context_mm[id];
-
- /* We have a candidate victim, check if it's active, on SMP
- * we cannot steal active contexts
- */
- if (mm->context.active) {
- id++;
- if (id > LAST_CONTEXT)
- id = FIRST_CONTEXT;
- continue;
- }
- pr_hardcont(" | steal %d from 0x%p", id, mm);
-
- /* Mark this mm as having no context anymore */
- mm->context.id = MMU_NO_CONTEXT;
-
- /* Mark it stale on all CPUs that used this mm. For threaded
- * implementations, we set it on all threads on each core
- * represented in the mask. A future implementation will use
- * a core map instead but this will do for now.
- */
- for_each_cpu(cpu, mm_cpumask(mm)) {
- for (i = cpu_first_thread_sibling(cpu);
- i <= cpu_last_thread_sibling(cpu); i++) {
- if (stale_map[i])
- __set_bit(id, stale_map[i]);
- }
- cpu = i - 1;
- }
- return id;
- }
-
- /* This will happen if you have more CPUs than available contexts,
- * all we can do here is wait a bit and try again
- */
- raw_spin_unlock(&context_lock);
- cpu_relax();
- raw_spin_lock(&context_lock);
-
- /* This will cause the caller to try again */
- return MMU_NO_CONTEXT;
-}
-#endif /* CONFIG_SMP */
-
-static unsigned int steal_all_contexts(void)
-{
- struct mm_struct *mm;
-#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
-#endif
- unsigned int id;
-
- for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
- /* Pick up the victim mm */
- mm = context_mm[id];
-
- pr_hardcont(" | steal %d from 0x%p", id, mm);
-
- /* Mark this mm as having no context anymore */
- mm->context.id = MMU_NO_CONTEXT;
- if (id != FIRST_CONTEXT) {
- context_mm[id] = NULL;
- __clear_bit(id, context_map);
-#ifdef DEBUG_MAP_CONSISTENCY
- mm->context.active = 0;
-#endif
- }
-#ifdef CONFIG_SMP
- __clear_bit(id, stale_map[cpu]);
-#endif
- }
-
- /* Flush the TLB for all contexts (not to be used on SMP) */
- _tlbil_all();
-
- nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
-
- return FIRST_CONTEXT;
-}
-
-/* Note that this will also be called on SMP if all other CPUs are
- * offlined, which means that it may be called for cpu != 0. For
- * this to work, we somewhat assume that CPUs that are onlined
- * come up with a fully clean TLB (or are cleaned when offlined)
- */
-static unsigned int steal_context_up(unsigned int id)
-{
- struct mm_struct *mm;
-#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
-#endif
-
- /* Pick up the victim mm */
- mm = context_mm[id];
-
- pr_hardcont(" | steal %d from 0x%p", id, mm);
-
- /* Flush the TLB for that context */
- local_flush_tlb_mm(mm);
-
- /* Mark this mm as having no context anymore */
- mm->context.id = MMU_NO_CONTEXT;
-
- /* XXX This clear should ultimately be part of local_flush_tlb_mm */
-#ifdef CONFIG_SMP
- __clear_bit(id, stale_map[cpu]);
-#endif
-
- return id;
-}
-
-#ifdef DEBUG_MAP_CONSISTENCY
-static void context_check_map(void)
-{
- unsigned int id, nrf, nact;
-
- nrf = nact = 0;
- for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
- int used = test_bit(id, context_map);
- if (!used)
- nrf++;
- if (used != (context_mm[id] != NULL))
- pr_err("MMU: Context %d is %s and MM is %p !\n",
- id, used ? "used" : "free", context_mm[id]);
- if (context_mm[id] != NULL)
- nact += context_mm[id]->context.active;
- }
- if (nrf != nr_free_contexts) {
- pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
- nr_free_contexts, nrf);
- nr_free_contexts = nrf;
- }
- if (nact > num_online_cpus())
- pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
- nact, num_online_cpus());
- if (FIRST_CONTEXT > 0 && !test_bit(0, context_map))
- pr_err("MMU: Context 0 has been freed !!!\n");
-}
-#else
-static void context_check_map(void) { }
-#endif
-
-void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned int id;
-#ifdef CONFIG_SMP
- unsigned int i, cpu = smp_processor_id();
-#endif
- unsigned long *map;
-
- /* No lockless fast path ..
yet */ - raw_spin_lock(&context_lock); - - pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", - cpu, next, next->context.active, next->context.id); - -#ifdef CONFIG_SMP - /* Mark us active and the previous one not anymore */ - next->context.active++; - if (prev) { - pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); - WARN_ON(prev->context.active < 1); - prev->context.active--; - } - - again: -#endif /* CONFIG_SMP */ - - /* If we already have a valid assigned context, skip all that */ - id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) { -#ifdef DEBUG_MAP_CONSISTENCY - if (context_mm[id] != next) - pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", - next, id, id, context_mm[id]); -#endif - goto ctxt_ok; - } - - /* We really don't have a context, let's try to acquire one */ - id = next_context; - if (id > LAST_CONTEXT) - id = FIRST_CONTEXT; - map = context_map; - - /* No more free contexts, let's try to steal one */ - if (nr_free_contexts == 0) { -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - id = steal_context_smp(id); - if (id == MMU_NO_CONTEXT) - goto again; - goto stolen; - } -#endif /* CONFIG_SMP */ - if (IS_ENABLED(CONFIG_PPC_8xx)) - id = steal_all_contexts(); - else - id = steal_context_up(id); - goto stolen; - } - nr_free_contexts--; - - /* We know there's at least one free context, try to find it */ - while (__test_and_set_bit(id, map)) { - id = find_next_zero_bit(map, LAST_CONTEXT+1, id); - if (id > LAST_CONTEXT) - id = FIRST_CONTEXT; - } - stolen: - next_context = id + 1; - context_mm[id] = next; - next->context.id = id; - pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); - - context_check_map(); - ctxt_ok: - - /* If that context got marked stale on this CPU, then flush the - * local TLB for it and unmark it before we use it - */ -#ifdef CONFIG_SMP - if (test_bit(id, stale_map[cpu])) { - pr_hardcont(" | stale flush %d [%d..%d]", - id, cpu_first_thread_sibling(cpu), - cpu_last_thread_sibling(cpu)); - - local_flush_tlb_mm(next); - - /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - for (i = cpu_first_thread_sibling(cpu); - i <= cpu_last_thread_sibling(cpu); i++) { - if (stale_map[i]) - __clear_bit(id, stale_map[i]); - } - } -#endif - - /* Flick the MMU and release lock */ - pr_hardcont(" -> %d\n", id); - set_context(id, next->pgd); - raw_spin_unlock(&context_lock); -} - -/* - * Set up the context for a new address space. - */ -int init_new_context(struct task_struct *t, struct mm_struct *mm) -{ - pr_hard("initing context for mm @%p\n", mm); - - /* - * We have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - */ - if (mm->context.id == 0) - slice_init_new_context_exec(mm); - mm->context.id = MMU_NO_CONTEXT; - mm->context.active = 0; - pte_frag_set(&mm->context, NULL); - return 0; -} - -/* - * We're finished using the context for an address space. 
- */ -void destroy_context(struct mm_struct *mm) -{ - unsigned long flags; - unsigned int id; - - if (mm->context.id == MMU_NO_CONTEXT) - return; - - WARN_ON(mm->context.active != 0); - - raw_spin_lock_irqsave(&context_lock, flags); - id = mm->context.id; - if (id != MMU_NO_CONTEXT) { - __clear_bit(id, context_map); - mm->context.id = MMU_NO_CONTEXT; -#ifdef DEBUG_MAP_CONSISTENCY - mm->context.active = 0; -#endif - context_mm[id] = NULL; - nr_free_contexts++; - } - raw_spin_unlock_irqrestore(&context_lock, flags); -} - -#ifdef CONFIG_SMP -static int mmu_ctx_cpu_prepare(unsigned int cpu) -{ - /* We don't touch CPU 0 map, it's allocated at aboot and kept - * around forever - */ - if (cpu == boot_cpuid) - return 0; - - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - return 0; -} - -static int mmu_ctx_cpu_dead(unsigned int cpu) -{ -#ifdef CONFIG_HOTPLUG_CPU - if (cpu == boot_cpuid) - return 0; - - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); -#endif - return 0; -} - -#endif /* CONFIG_SMP */ - -/* - * Initialize the context management stuff. - */ -void __init mmu_context_init(void) -{ - /* Mark init_mm as being active on all possible CPUs since - * we'll get called with prev == init_mm the first time - * we schedule on a given CPU - */ - init_mm.context.active = NR_CPUS; - - /* - * Allocate the maps used by context management - */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), - SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); -#ifdef CONFIG_SMP - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - - cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, - "powerpc/mmu/ctx:prepare", - mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); -#endif - - printk(KERN_INFO - "MMU: Allocated %zu bytes of context maps for %d contexts\n", - 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)), - LAST_CONTEXT - FIRST_CONTEXT + 1); - - /* - * Some processors have too few contexts to reserve one for - * init_mm, and require using context 0 for a normal task. - * Other processors reserve the use of context zero for the kernel. - * This code assumes FIRST_CONTEXT < 32. - */ - context_map[0] = (1 << FIRST_CONTEXT) - 1; - next_context = FIRST_CONTEXT; - nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1; -} diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c new file mode 100644 index 000000000000..460459b6f53e --- /dev/null +++ b/arch/powerpc/mm/nohash/40x.c @@ -0,0 +1,159 @@ +/* + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. 
+ * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern int __map_without_ltlbs; +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* + * The Zone Protection Register (ZPR) defines how protection will + * be applied to every page which is a member of a given zone. At + * present, we utilize only two of the 4xx's zones. + * The zone index bits (of ZSEL) in the PTE are used for software + * indicators, except the LSB. For user access, zone 1 is used, + * for kernel access, zone 0 is used. We set all but zone 1 + * to zero, allowing only kernel access as indicated in the PTE. + * For zone 1, we set a 01 binary (a value of 10 will not work) + * to allow user access as indicated in the PTE. This also allows + * kernel access as indicated in the PTE. + */ + + mtspr(SPRN_ZPR, 0x10000000); + + flush_instruction_cache(); + + /* + * Set up the real-mode cache parameters for the exception vector + * handlers (which are run in real-mode). + */ + + mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ + + /* + * Cache instruction and data space where the exception + * vectors and the kernel live in real-mode. + */ + + mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ + mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ +} + +#define LARGE_PAGE_SIZE_16M (1<<24) +#define LARGE_PAGE_SIZE_4M (1<<22) + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long v, s, mapped; + phys_addr_t p; + + v = KERNELBASE; + p = 0; + s = total_lowmem; + + if (__map_without_ltlbs) + return 0; + + while (s >= LARGE_PAGE_SIZE_16M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + + v += LARGE_PAGE_SIZE_16M; + p += LARGE_PAGE_SIZE_16M; + s -= LARGE_PAGE_SIZE_16M; + } + + while (s >= LARGE_PAGE_SIZE_4M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + *pmdp = __pmd(val); + + v += LARGE_PAGE_SIZE_4M; + p += LARGE_PAGE_SIZE_4M; + s -= LARGE_PAGE_SIZE_4M; + } + + mapped = total_lowmem - s; + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 16 and 4 MiB + * pages. 
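+ * For example (illustrative figures, not from the original patch):
+ * with 70 MiB of lowmem, four 16 MiB pages map the first 64 MiB, one
+ * 4 MiB page brings the total to 68 MiB, and the remaining 2 MiB must
+ * be covered later with normal-sized pages.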
+ * Consequently, restrict the top end of RAM currently
+ * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
+ * coverage with normal-sized pages (or other reasons) do not
+ * attempt to allocate outside the allowed range.
+ */
+ memblock_set_current_limit(mapped);
+
+ return mapped;
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 40x can only access 16MB at the moment (see head_40x.S) */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
+}
diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
new file mode 100644
index 000000000000..c07983ebc02e
--- /dev/null
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -0,0 +1,246 @@
+/*
+ * Modifications by Matt Porter (mporter@mvista.com) to support
+ * PPC44x Book E processors.
+ *
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+/* Used by the 44x TLB replacement exception handler.
+ * It just needs to be declared somewhere.
+ */
+unsigned int tlb_44x_index; /* = 0 */
+unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
+int icache_44x_need_flush;
+
+unsigned long tlb_47x_boltmap[1024/8];
+
+static void ppc44x_update_tlb_hwater(void)
+{
+ /* The TLB miss handlers hard-code the watermark in a cmpli
+ * instruction to improve performance rather than loading it
+ * from the global variable.
Thus, we patch the instructions + * in the 2 TLB miss handlers when updating the value + */ + modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater); + modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU + */ +static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int entry = tlb_44x_hwater--; + + ppc44x_update_tlb_hwater(); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,%4\n" + "tlbwe %1,%3,%5\n" + "tlbwe %0,%3,%6\n" + : + : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), + "r" (phys), + "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), + "r" (entry), + "i" (PPC44x_TLB_PAGEID), + "i" (PPC44x_TLB_XLAT), + "i" (PPC44x_TLB_ATTRIB)); +} + +static int __init ppc47x_find_free_bolted(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (!(mmube0 & MMUBE0_VBE0)) + return 0; + if (!(mmube0 & MMUBE0_VBE1)) + return 1; + if (!(mmube0 & MMUBE0_VBE2)) + return 2; + if (!(mmube1 & MMUBE1_VBE3)) + return 3; + if (!(mmube1 & MMUBE1_VBE4)) + return 4; + if (!(mmube1 & MMUBE1_VBE5)) + return 5; + return -1; +} + +static void __init ppc47x_update_boltmap(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (mmube0 & MMUBE0_VBE0) + __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE1) + __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE2) + __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE3) + __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE4) + __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE5) + __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, + tlb_47x_boltmap); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU + */ +static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int rA; + int bolted; + + /* Base rA is HW way select, way 0, bolted bit set */ + rA = 0x88000000; + + /* Look for a bolted entry slot */ + bolted = ppc47x_find_free_bolted(); + BUG_ON(bolted < 0); + + /* Insert bolted slot number */ + rA |= bolted << 24; + + pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", + virt, phys, bolted); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,0\n" + "tlbwe %1,%3,1\n" + "tlbwe %0,%3,2\n" + : + : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | + PPC47x_TLB2_SX +#ifdef CONFIG_SMP + | PPC47x_TLB2_M +#endif + ), + "r" (phys), + "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), + "r" (rA)); +} + +void __init MMU_init_hw(void) +{ + /* This is not useful on 47x but won't hurt either */ + ppc44x_update_tlb_hwater(); + + flush_instruction_cache(); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } + if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + ppc47x_update_boltmap(); + +#ifdef 
DEBUG + { + int i; + + printk(KERN_DEBUG "bolted entries: "); + for (i = 0; i < 255; i++) { + if (test_bit(i, tlb_47x_boltmap)) + printk("%d ", i); + } + printk("\n"); + } +#endif /* DEBUG */ + } + return total_lowmem; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + u64 size; + +#ifndef CONFIG_NONSTATIC_KERNEL + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); +#endif + + /* 44x has a 256M TLB entry pinned at boot */ + size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE)); + memblock_set_current_limit(first_memblock_base + size); +} + +#ifdef CONFIG_SMP +void __init mmu_init_secondary(int cpu) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S + * + * WARNING: This is called with only the first 256M of the + * linear mapping in the TLB and we can't take faults yet + * so beware of what this code uses. It runs off a temporary + * stack. current (r2) isn't initialized, smp_processor_id() + * will not work, current thread info isn't accessible, ... + */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } +} +#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c new file mode 100644 index 000000000000..70d55b615b62 --- /dev/null +++ b/arch/powerpc/mm/nohash/8xx.c @@ -0,0 +1,239 @@ +/* + * This file contains the routines for initializing the MMU + * on the 8xx series of chips. + * -- christophe + * + * Derived from arch/powerpc/mm/40x_mmu.c: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include + +#include + +#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) + +extern int __map_without_ltlbs; + +static unsigned long block_mapped_ram; + +/* + * Return PA for this VA if it is in an area mapped with LTLBs. + * Otherwise, returns 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) + return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); + return 0; +} + +/* + * Return VA for a given PA mapped with LTLBs or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (pa >= p && pa < p + IMMR_SIZE) + return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); + return 0; +} + +#define LARGE_PAGE_SIZE_8M (1<<23) + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; + int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 
29 : 28; + unsigned long addr = 0; + unsigned long mem = total_lowmem; + + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } + } +} + +static void __init mmu_mapin_immr(void) +{ + unsigned long p = PHYS_IMMR_BASE; + unsigned long v = VIRT_IMMR_BASE; + int offset; + + for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) + map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); +} + +static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) +{ + modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); +} + +static void mmu_patch_addis(s32 *site, long simm) +{ + unsigned int instr = *(unsigned int *)patch_site_addr(site); + + instr &= 0xffff0000; + instr |= ((unsigned long)simm) >> 16; + patch_instruction_site(site, instr); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long mapped; + + if (__map_without_ltlbs) { + mapped = 0; + mmu_mapin_immr(); + if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); + } else { + mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, + _ALIGN(__pa(_einittext), 8 << 20)); + } + + mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); + mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 8 MiB + * pages. Consequently, restrict the top end of RAM currently + * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + if (mapped) + memblock_set_current_limit(mapped); + + block_mapped_ram = mapped; + + return mapped; +} + +void mmu_mark_initmem_nx(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) + mmu_patch_addis(&patch__itlbmiss_linmem_top8, + -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + if (CONFIG_DATA_SHIFT < 23) + mmu_patch_addis(&patch__dtlbmiss_romem_top8, + -__pa(((unsigned long)_sinittext) & + ~(LARGE_PAGE_SIZE_8M - 1))); + mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); +} +#endif + +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); +} + +/* + * Set up to use a given MMU context. + * id is context number, pgd is PGD pointer. + * + * We place the physical address of the new task page directory loaded + * into the MMU base register, and set the ASID compare register with + * the new "context." 
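+ *
+ * (Editorial note, our reading rather than anything stated in this
+ * patch: the context allocator hands out ids starting at
+ * FIRST_CONTEXT == 1, while the hardware CASID field is 0-based,
+ * which is presumably why the code below writes id - 1 to
+ * SPRN_M_CASID.)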
+ */ +void set_context(unsigned long id, pgd_t *pgd) +{ + s16 offset = (s16)(__pa(swapper_pg_dir)); + + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is passed as second argument. + */ + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = pgd; + + /* Register M_TWB will contain base address of level 1 table minus the + * lower part of the kernel PGDIR base address, so that all accesses to + * level 1 table are done relative to lower part of kernel PGDIR base + * address. + */ + mtspr(SPRN_M_TWB, __pa(pgd) - offset); + + /* Update context */ + mtspr(SPRN_M_CASID, id - 1); + /* sync */ + mb(); +} + +void flush_instruction_cache(void) +{ + isync(); + mtspr(SPRN_IC_CST, IDC_INVALL); + isync(); +} + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile new file mode 100644 index 000000000000..b2228ff81b8a --- /dev/null +++ b/arch/powerpc/mm/nohash/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +obj-y += mmu_context.o tlb.o tlb_low.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_40x) += 40x.o +obj-$(CONFIG_44x) += 44x.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +endif + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_fsl_booke.o := n diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c new file mode 100644 index 000000000000..f84ec46cdb26 --- /dev/null +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. 
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include
+#include
+
+#include
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+ struct paca_struct *paca = get_paca();
+ struct tlb_core_data *tcd;
+ int this, next;
+
+ tcd = paca->tcd_ptr;
+ this = tcd->esel_next;
+
+ next = this + 1;
+ if (next >= tcd->esel_max)
+ next = tcd->esel_first;
+
+ tcd->esel_next = next;
+ return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = this_cpu_read(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __this_cpu_write(next_tlbcam_idx, tlbcam_index);
+ else
+ __this_cpu_inc(next_tlbcam_idx);
+
+ return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].enc;
+}
+
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64)
+#include
+
+static inline void book3e_tlb_lock(void)
+{
+ struct paca_struct *paca = get_paca();
+ unsigned long tmp;
+ int token = smp_processor_id() + 1;
+
+ /*
+ * Besides being unnecessary in the absence of SMT, this
+ * check prevents trying to do lbarx/stbcx. on e5500 which
+ * doesn't implement either feature.
+ */
+ if (!cpu_has_feature(CPU_FTR_SMT))
+ return;
+
+ asm volatile("1: lbarx %0, 0, %1;"
+ "cmpwi %0, 0;"
+ "bne 2f;"
+ "stbcx. %2, 0, %1;"
+ "bne 1b;"
+ "b 3f;"
+ "2: lbzx %0, 0, %1;"
+ "cmpwi %0, 0;"
+ "bne 2b;"
+ "b 1b;"
+ "3:"
+ : "=&r" (tmp)
+ : "r" (&paca->tcd_ptr->lock), "r" (token)
+ : "memory");
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+ struct paca_struct *paca = get_paca();
+
+ if (!cpu_has_feature(CPU_FTR_SMT))
+ return;
+
+ isync();
+ paca->tcd_ptr->lock = 0;
+}
+#else
+static inline void book3e_tlb_lock(void)
+{
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+}
+#endif
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+ int found = 0;
+
+ mtspr(SPRN_MAS6, pid << 16);
+ if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+ asm volatile(
+ "li %0,0\n"
+ "tlbsx. 0,%1\n"
+ "bne 1f\n"
+ "li %0,1\n"
+ "1:\n"
+ : "=&r"(found) : "r"(ea));
+ } else {
+ asm volatile(
+ "tlbsx 0,%1\n"
+ "mfspr %0,0x271\n"
+ "srwi %0,%0,31\n"
+ : "=&r"(found) : "r"(ea));
+ }
+
+ return found;
+}
+
+void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
+ pte_t pte)
+{
+ unsigned long mas1, mas2;
+ u64 mas7_3;
+ unsigned long psize, tsize, shift;
+ unsigned long flags;
+ struct mm_struct *mm;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ int index;
+#endif
+
+ if (unlikely(is_kernel_addr(ea)))
+ return;
+
+ mm = vma->vm_mm;
+
+ psize = vma_mmu_pagesize(vma);
+ shift = __ilog2(psize);
+ tsize = shift - 10;
+ /*
+ * We can't be interrupted while we're setting up the MAS
+ * registers or after we've confirmed that no TLB entry exists.
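+ *
+ * (Editorial sketch, not code from this patch: the lbarx/stbcx.
+ * sequence in book3e_tlb_lock() above behaves roughly like
+ *
+ *	while (cmpxchg(&paca->tcd_ptr->lock, 0, token) != 0)
+ *		cpu_relax();
+ *
+ * i.e. a test-and-set spinlock on a byte-sized token, so that the
+ * sibling threads of an SMT core serialize on their shared
+ * tlb_core_data before touching the MAS registers.)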
+ */ + local_irq_save(flags); + + book3e_tlb_lock(); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + book3e_tlb_unlock(); + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); +#endif + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + book3e_tlb_unlock(); + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c new file mode 100644 index 000000000000..f296c2e88b09 --- /dev/null +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -0,0 +1,120 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. 
Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *ptr; + + ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT, + __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n", + __func__, size, size, __pa(MAX_DMA_ADDRESS)); + + return ptr; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c new file mode 100644 index 000000000000..71a1a36751dd --- /dev/null +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -0,0 +1,326 @@ +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004,2010 Freescale Semiconductor, Inc. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned int tlbcam_index; + +#define NUM_TLBCAMS (64) +struct tlbcam TLBCAM[NUM_TLBCAMS]; + +struct tlbcamrange { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + +#ifdef CONFIG_FSL_BOOKE +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} +#endif + +/* + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). + */ +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) +{ + unsigned int tsize; + + tsize = __ilog2(size) - 10; + +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; + + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(__pte(flags))) { + TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? 
MAS3_UW : 0); + } + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; +} + +unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; + + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx, + bool dryrun) +{ + int i; + unsigned long amount_mapped = 0; + + /* Calculate CAM values */ + for (i = 0; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + + cam_sz = calc_cam_sz(ram, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, + pgprot_val(PAGE_KERNEL_X), 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + + if (dryrun) + return amount_mapped; + + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
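+ * (Editorial note: on FSL Book3E this reduces to flushing the
+ * instruction cache; the CAM entries that actually map lowmem are
+ * programmed later, from adjust_total_lowmem() via
+ * map_mem_in_cams().)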
+ */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +void __init adjust_total_lowmem(void) +{ + unsigned long ram; + int i; + + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); + restore_to_as0(i, 0, 0, 1); + + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + phys_addr_t limit = first_memblock_base + first_memblock_size; + + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = KERNELBASE; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), NULL); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } +} +#endif +#endif diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c new file mode 100644 index 000000000000..ae4505d5b4b8 --- /dev/null +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -0,0 +1,497 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU is not using the hash + * table, such as 8xx, 4xx, BookE's etc... + * + * Copyright 2008 Ben Herrenschmidt + * IBM Corp. + * + * Derived from previous arch/powerpc/mm/mmu_context.c + * and arch/powerpc/include/asm/mmu_context.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * TODO: + * + * - The global context lock will not scale very well + * - The maps should be dynamically allocated to allow for processors + * that support more PID bits at runtime + * - Implement flush_tlb_mm() by making the context stale and picking + * a new one + * - More aggressively clear stale map bits and maybe find some way to + * also clear mm->cpu_vm_mask bits when processes are migrated + */ + +//#define DEBUG_MAP_CONSISTENCY +//#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * The MPC8xx has only 16 contexts. We rotate through them on each task switch. + * A better way would be to keep track of tasks that own contexts, and implement + * an LRU usage. That way very active tasks don't always have to pay the TLB + * reload overhead. The kernel pages are mapped shared, so the kernel can run on + * behalf of any task that makes a kernel entry. Shared does not mean they are + * not protected, just that the ASID comparison is not performed. -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these as a way of + * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison + * is disabled, so we can use a TID of zero to represent all kernel pages as + * shared among all contexts. -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should + * normally never have to steal though the facility is present if needed. + * -- BenH + */ +#define FIRST_CONTEXT 1 +#ifdef DEBUG_CLAMP_LAST_CONTEXT +#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT +#elif defined(CONFIG_PPC_8xx) +#define LAST_CONTEXT 16 +#elif defined(CONFIG_PPC_47x) +#define LAST_CONTEXT 65535 +#else +#define LAST_CONTEXT 255 +#endif + +static unsigned int next_context, nr_free_contexts; +static unsigned long *context_map; +#ifdef CONFIG_SMP +static unsigned long *stale_map[NR_CPUS]; +#endif +static struct mm_struct **context_mm; +static DEFINE_RAW_SPINLOCK(context_lock); + +#define CTX_MAP_SIZE \ + (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1)) + + +/* Steal a context from a task that has one at the moment. + * + * This is used when we are running out of available PID numbers + * on the processors. + * + * This isn't an LRU system, it just frees up each context in + * turn (sort-of pseudo-random replacement :). This would be the + * place to implement an LRU scheme if anyone was motivated to do it. + * -- paulus + * + * For context stealing, we use a slightly different approach for + * SMP and UP. 
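+ * (Editorial note: the "stale map" referred to below is a per-CPU
+ * bitmap of context ids whose TLB entries on that CPU may be out of
+ * date; switch_mmu_context() flushes the local TLB and clears the
+ * bit before reusing such an id on that CPU.)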
+ * Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ * -- benh
+ */
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
+{
+ struct mm_struct *mm;
+ unsigned int cpu, max, i;
+
+ max = LAST_CONTEXT - FIRST_CONTEXT;
+
+ /* Attempt to free next_context first and then loop until we manage */
+ while (max--) {
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ /* We have a candidate victim, check if it's active, on SMP
+ * we cannot steal active contexts
+ */
+ if (mm->context.active) {
+ id++;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
+ continue;
+ }
+ pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+ /* Mark this mm as having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+
+ /* Mark it stale on all CPUs that used this mm. For threaded
+ * implementations, we set it on all threads on each core
+ * represented in the mask. A future implementation will use
+ * a core map instead but this will do for now.
+ */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ for (i = cpu_first_thread_sibling(cpu);
+ i <= cpu_last_thread_sibling(cpu); i++) {
+ if (stale_map[i])
+ __set_bit(id, stale_map[i]);
+ }
+ cpu = i - 1;
+ }
+ return id;
+ }
+
+ /* This will happen if you have more CPUs than available contexts,
+ * all we can do here is wait a bit and try again
+ */
+ raw_spin_unlock(&context_lock);
+ cpu_relax();
+ raw_spin_lock(&context_lock);
+
+ /* This will cause the caller to try again */
+ return MMU_NO_CONTEXT;
+}
+#endif /* CONFIG_SMP */
+
+static unsigned int steal_all_contexts(void)
+{
+ struct mm_struct *mm;
+#ifdef CONFIG_SMP
+ int cpu = smp_processor_id();
+#endif
+ unsigned int id;
+
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+ /* Mark this mm as having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+ if (id != FIRST_CONTEXT) {
+ context_mm[id] = NULL;
+ __clear_bit(id, context_map);
+#ifdef DEBUG_MAP_CONSISTENCY
+ mm->context.active = 0;
+#endif
+ }
+#ifdef CONFIG_SMP
+ __clear_bit(id, stale_map[cpu]);
+#endif
+ }
+
+ /* Flush the TLB for all contexts (not to be used on SMP) */
+ _tlbil_all();
+
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
+
+ return FIRST_CONTEXT;
+}
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
+ */
+static unsigned int steal_context_up(unsigned int id)
+{
+ struct mm_struct *mm;
+#ifdef CONFIG_SMP
+ int cpu = smp_processor_id();
+#endif
+
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+ /* Flush the TLB for that context */
+ local_flush_tlb_mm(mm);
+
+ /* Mark this mm as having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+
+ /* XXX This clear should ultimately be part of local_flush_tlb_mm */
+#ifdef CONFIG_SMP
+ __clear_bit(id, stale_map[cpu]);
+#endif
+
+ return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+ unsigned int id, nrf, nact;
+
+ nrf = nact = 0;
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+ int used = test_bit(id, context_map);
+ if (!used)
+ nrf++;
+ if (used != (context_mm[id] != NULL))
+ pr_err("MMU: Context %d is %s and MM is %p !\n",
+ id, used ?
"used" : "free", context_mm[id]); + if (context_mm[id] != NULL) + nact += context_mm[id]->context.active; + } + if (nrf != nr_free_contexts) { + pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", + nr_free_contexts, nrf); + nr_free_contexts = nrf; + } + if (nact > num_online_cpus()) + pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", + nact, num_online_cpus()); + if (FIRST_CONTEXT > 0 && !test_bit(0, context_map)) + pr_err("MMU: Context 0 has been freed !!!\n"); +} +#else +static void context_check_map(void) { } +#endif + +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned int id; +#ifdef CONFIG_SMP + unsigned int i, cpu = smp_processor_id(); +#endif + unsigned long *map; + + /* No lockless fast path .. yet */ + raw_spin_lock(&context_lock); + + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); + +#ifdef CONFIG_SMP + /* Mark us active and the previous one not anymore */ + next->context.active++; + if (prev) { + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); + WARN_ON(prev->context.active < 1); + prev->context.active--; + } + + again: +#endif /* CONFIG_SMP */ + + /* If we already have a valid assigned context, skip all that */ + id = next->context.id; + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif + goto ctxt_ok; + } + + /* We really don't have a context, let's try to acquire one */ + id = next_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; + map = context_map; + + /* No more free contexts, let's try to steal one */ + if (nr_free_contexts == 0) { +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + id = steal_context_smp(id); + if (id == MMU_NO_CONTEXT) + goto again; + goto stolen; + } +#endif /* CONFIG_SMP */ + if (IS_ENABLED(CONFIG_PPC_8xx)) + id = steal_all_contexts(); + else + id = steal_context_up(id); + goto stolen; + } + nr_free_contexts--; + + /* We know there's at least one free context, try to find it */ + while (__test_and_set_bit(id, map)) { + id = find_next_zero_bit(map, LAST_CONTEXT+1, id); + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; + } + stolen: + next_context = id + 1; + context_mm[id] = next; + next->context.id = id; + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); + + context_check_map(); + ctxt_ok: + + /* If that context got marked stale on this CPU, then flush the + * local TLB for it and unmark it before we use it + */ +#ifdef CONFIG_SMP + if (test_bit(id, stale_map[cpu])) { + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_sibling(cpu), + cpu_last_thread_sibling(cpu)); + + local_flush_tlb_mm(next); + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __clear_bit(id, stale_map[i]); + } + } +#endif + + /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); + set_context(id, next->pgd); + raw_spin_unlock(&context_lock); +} + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + pr_hard("initing context for mm @%p\n", mm); + + /* + * We have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. 
+ * The check against zero ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter the context slice inherited via fork
+ * (which will have id != 0).
+ */
+ if (mm->context.id == 0)
+ slice_init_new_context_exec(mm);
+ mm->context.id = MMU_NO_CONTEXT;
+ mm->context.active = 0;
+ pte_frag_set(&mm->context, NULL);
+ return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ unsigned long flags;
+ unsigned int id;
+
+ if (mm->context.id == MMU_NO_CONTEXT)
+ return;
+
+ WARN_ON(mm->context.active != 0);
+
+ raw_spin_lock_irqsave(&context_lock, flags);
+ id = mm->context.id;
+ if (id != MMU_NO_CONTEXT) {
+ __clear_bit(id, context_map);
+ mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+ mm->context.active = 0;
+#endif
+ context_mm[id] = NULL;
+ nr_free_contexts++;
+ }
+ raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
+#ifdef CONFIG_SMP
+static int mmu_ctx_cpu_prepare(unsigned int cpu)
+{
+ /* We don't touch the CPU 0 map, it's allocated at boot and kept
+ * around forever
+ */
+ if (cpu == boot_cpuid)
+ return 0;
+
+ pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+ stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+ return 0;
+}
+
+static int mmu_ctx_cpu_dead(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (cpu == boot_cpuid)
+ return 0;
+
+ pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+ kfree(stale_map[cpu]);
+ stale_map[cpu] = NULL;
+
+ /* We also clear the cpu_vm_mask bits of CPUs going away */
+ clear_tasks_mm_cpumask(cpu);
+#endif
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Initialize the context management code.
+ */
+void __init mmu_context_init(void)
+{
+ /* Mark init_mm as being active on all possible CPUs since
+ * we'll get called with prev == init_mm the first time
+ * we schedule on a given CPU
+ */
+ init_mm.context.active = NR_CPUS;
+
+ /*
+ * Allocate the maps used by context management
+ */
+ context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ if (!context_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ CTX_MAP_SIZE);
+ context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+ SMP_CACHE_BYTES);
+ if (!context_mm)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(void *) * (LAST_CONTEXT + 1));
+#ifdef CONFIG_SMP
+ stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ if (!stale_map[boot_cpuid])
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ CTX_MAP_SIZE);
+
+ cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
+ "powerpc/mmu/ctx:prepare",
+ mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
+#endif
+
+ printk(KERN_INFO
+ "MMU: Allocated %zu bytes of context maps for %d contexts\n",
+ 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+ LAST_CONTEXT - FIRST_CONTEXT + 1);
+
+ /*
+ * Some processors have too few contexts to reserve one for
+ * init_mm, and require using context 0 for a normal task.
+ * Other processors reserve the use of context zero for the kernel.
+ * This code assumes FIRST_CONTEXT < 32.
+ */
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_context = FIRST_CONTEXT;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
+}
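As a quick sanity check of the map sizing in mmu_context_init() above
(the arithmetic is ours, not part of the patch), CTX_MAP_SIZE rounds
LAST_CONTEXT + 1 bits up to whole longs:

	/* Illustrative figures for a 32-bit kernel (BITS_PER_LONG == 32) */
	CTX_MAP_SIZE = sizeof(unsigned long) * (LAST_CONTEXT / 32 + 1);
	/* 47x: 4 * (65535/32 + 1) = 8192 bytes; 8xx: 4 * (16/32 + 1) = 4 bytes */

The bitmaps are therefore small even in the 47x worst case; what
dominates there is the context_mm pointer array, at
(LAST_CONTEXT + 1) * sizeof(void *) = 256 KiB.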
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
new file mode 100644
index 000000000000..704e613a0b14
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -0,0 +1,810 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (i.e. SW loaded TLBs or Book3E compliant processors;
+ * this does -not- include 603, however, which shares the implementation with
+ * hash based processors)
+ *
+ * -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * This struct lists the sw-supported page sizes. The hardware MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .enc = BOOK3E_PAGESZ_4K,
+ },
+ [MMU_PAGE_2M] = {
+ .shift = 21,
+ .enc = BOOK3E_PAGESZ_2M,
+ },
+ [MMU_PAGE_4M] = {
+ .shift = 22,
+ .enc = BOOK3E_PAGESZ_4M,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .enc = BOOK3E_PAGESZ_16M,
+ },
+ [MMU_PAGE_64M] = {
+ .shift = 26,
+ .enc = BOOK3E_PAGESZ_64M,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ .enc = BOOK3E_PAGESZ_256M,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ .enc = BOOK3E_PAGESZ_1GB,
+ },
+};
+#elif defined(CONFIG_PPC_8xx)
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ /* we only manage 4k and 16k pages as normal pages */
+#ifdef CONFIG_PPC_4K_PAGES
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ },
+#else
+ [MMU_PAGE_16K] = {
+ .shift = 14,
+ },
+#endif
+ [MMU_PAGE_512K] = {
+ .shift = 19,
+ },
+ [MMU_PAGE_8M] = {
+ .shift = 23,
+ },
+};
+#else
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .ind = 20,
+ .enc = BOOK3E_PAGESZ_4K,
+ },
+ [MMU_PAGE_16K] = {
+ .shift = 14,
+ .enc = BOOK3E_PAGESZ_16K,
+ },
+ [MMU_PAGE_64K] = {
+ .shift = 16,
+ .ind = 28,
+ .enc = BOOK3E_PAGESZ_64K,
+ },
+ [MMU_PAGE_1M] = {
+ .shift = 20,
+ .enc = BOOK3E_PAGESZ_1M,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .ind = 36,
+ .enc = BOOK3E_PAGESZ_16M,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ .enc = BOOK3E_PAGESZ_256M,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ .enc = BOOK3E_PAGESZ_1GB,
+ },
+};
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].enc;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+ /* This isn't used on !Book3E for now */
+ return 0;
+}
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize; /* Page size used for the linear mapping */
+int mmu_pte_psize; /* Page size used for PTE pages */
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize;		/* Page size used for the linear mapping */
+int mmu_pte_psize;		/* Page size used for PTE pages */
+int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
+int book3e_htw_mode;		/* HW tablewalk?  Value is PPC_HTW_* */
+unsigned long linear_map_top;	/* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions.  This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+			    int tsize, int ind)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm ? mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid, tsize, ind);
+	preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			       mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+	unsigned int tsize;
+	unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invalidating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	if (!mm_is_core_local(mm)) {
+		struct tlb_flush_param p = { .pid = pid };
+		/* Ignores smp_processor_id() even if set.
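+		 * (Editor's note: smp_call_function_many() never runs the
+		 * handler on the calling CPU, which is why the direct
+		 * _tlbil_pid() call below is needed in addition to the IPI.)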
*/ + smp_call_function_many(mm_cpumask(mm), + do_flush_tlb_mm_ipi, &p, 1); + } + _tlbil_pid(pid); + no_context: + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + struct cpumask *cpu_mask; + unsigned int pid; + + /* + * This function as well as __local_flush_tlb_page() must only be called + * for user contexts. + */ + if (WARN_ON(!mm)) + return; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + cpu_mask = mm_cpumask(mm); + if (!mm_is_core_local(mm)) { + /* If broadcast tlbivax is supported, use it */ + if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { + int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); + if (lock) + raw_spin_lock(&tlbivax_lock); + _tlbivax_bcast(vmaddr, pid, tsize, ind); + if (lock) + raw_spin_unlock(&tlbivax_lock); + goto bail; + } else { + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; + /* Ignores smp_processor_id() even if set in cpu_mask */ + smp_call_function_many(cpu_mask, + do_flush_tlb_page_ipi, &p, 1); + } + } + _tlbil_va(vmaddr, pid, tsize, ind); + bail: + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(flush_tlb_page); + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PPC_47x +void __init early_init_mmu_47x(void) +{ +#ifdef CONFIG_SMP + unsigned long root = of_get_flat_dt_root(); + if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); +#endif /* CONFIG_SMP */ +} +#endif /* CONFIG_PPC_47x */ + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_SMP + preempt_disable(); + smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); + _tlbil_pid(0); + preempt_enable(); +#else + _tlbil_pid(0); +#endif +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. 
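+ * (Editor's sketch of the threshold idea this comment goes on to
+ * suggest; not part of the patch, and the 32-page cutoff is invented:
+ *
+ *	if (((end - start) >> PAGE_SHIFT) <= 32) {
+ *		unsigned long addr;
+ *
+ *		for (addr = start; addr < end; addr += PAGE_SIZE)
+ *			flush_tlb_page(vma, addr);
+ *	} else {
+ *		flush_tlb_mm(vma->vm_mm);
+ *	}
+ * )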
This should + * be optimized based on a threshold on the size of the range, since + * some implementation can stack multiple tlbivax before a tlbsync but + * for now, we keep it that way + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK)) + flush_tlb_page(vma, start); + else + flush_tlb_mm(vma->vm_mm); +} +EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); +} + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int tlb0ps; + unsigned int eptcfg; + int i, psize; + +#ifdef CONFIG_PPC_FSL_BOOK3E + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. 
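+	 * (Editor's note on the decoding below: bit k of TLB1PS
+	 * advertises support for 2^k KB pages, hence the
+	 * (1U << (def->shift - 10)) test; e.g. 4K pages have
+	 * shift = 12, so 12 - 10 = 2 and they are reported by bit 2.)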
+ */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (!def->shift) + continue; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +#endif + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb0ps = mfspr(SPRN_TLB0PS); + eptcfg = mfspr(SPRN_EPTCFG); + + /* Look for supported direct sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb0ps & (1U << (def->shift - 10))) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + /* Indirect page sizes supported ? */ + if ((tlb0cfg & TLBnCFG_IND) == 0 || + (tlb0cfg & TLBnCFG_PT) == 0) + goto out; + + book3e_htw_mode = PPC_HTW_IBM; + + /* Now, we only deal with one IND page size for each + * direct size. Hopefully all implementations today are + * unambiguous, but we might want to be careful in the + * future. + */ + for (i = 0; i < 3; i++) { + unsigned int ps, sps; + + sps = eptcfg & 0x1f; + eptcfg >>= 5; + ps = eptcfg & 0x1f; + eptcfg >>= 5; + if (!ps || !sps) + continue; + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (ps == (def->shift - 10)) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + if (sps == (def->shift - 10)) + def->ind = ps + 10; + } + } + +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +static void setup_mmu_htw(void) +{ + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + + switch (book3e_htw_mode) { + case PPC_HTW_IBM: + patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); + break; +#ifdef CONFIG_PPC_FSL_BOOK3E + case PPC_HTW_E6500: + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; +#endif + } + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? 
"enabled" : "not supported"); +} + +/* + * Early initialization of the MMU TLB code + */ +static void early_init_this_mmu(void) +{ + unsigned int mas4; + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_IBM: + mas4 |= MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + break; + + case PPC_HTW_NONE: +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned int num_cams; + int __maybe_unused cpu = smp_processor_id(); + bool map = true; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + /* + * Only do the mapping once per core, or else the + * transient mapping would cause problems. + */ +#ifdef CONFIG_SMP + if (hweight32(get_tensr()) > 1) + map = false; +#endif + + if (map) + linear_map_top = map_mem_in_cams(linear_map_top, + num_cams, false); + } +#endif + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +static void __init early_init_mmu_global(void) +{ + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) + */ + mmu_linear_psize = MMU_PAGE_1G; + + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + * + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + mmu_vmemmap_psize = MMU_PAGE_4K; + else + mmu_vmemmap_psize = MMU_PAGE_16M; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* Look for HW tablewalk support */ + setup_mmu_htw(); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + if (book3e_htw_mode == PPC_HTW_NONE) { + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); + patch_exception(0x1e0, + exc_instruction_tlb_miss_bolted_book3e); + } + } +#endif + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); +} + +static void __init early_mmu_set_memory_limit(void) +{ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. 
+ */ + memblock_enforce_memory_limit(linear_map_top); + } +#endif + + memblock_set_current_limit(linear_map_top); +} + +/* boot cpu only */ +void __init early_init_mmu(void) +{ + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); +} + +void early_init_mmu_secondary(void) +{ + early_init_this_mmu(); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* On non-FSL Embedded 64-bit, we adjust the RMA size to match + * the bolted TLB entry. We know for now that only 1G + * entries are supported though that may eventually + * change. + * + * on FSL Embedded 64-bit, usually all RAM is bolted, but with + * unusual memory sizes it's possible for some RAM to not be mapped + * (such RAM is not used at all by Linux, since we don't support + * highmem on 64-bit). We limit ppc64_rma_size to what would be + * mappable if this memblock is the only one. Additional memblocks + * can only increase, not decrease, the amount that ends up getting + * mapped. We still limit max to 1G even if we'll eventually map + * more. This is due to what the early init code is set up to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned long linear_sz; + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + linear_sz = map_mem_in_cams(first_memblock_size, num_cams, + true); + + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + } else +#endif + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} +#else /* ! CONFIG_PPC64 */ +void __init early_init_mmu(void) +{ +#ifdef CONFIG_PPC_47x + early_init_mmu_47x(); +#endif + +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif +} +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S new file mode 100644 index 000000000000..e066a658acac --- /dev/null +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -0,0 +1,491 @@ +/* + * This file contains low-level functions for performing various + * types of TLB invalidations on various processors with no hash + * table. + * + * This file implements the following functions for all no-hash + * processors. Some aren't implemented for some variants. Some + * are inline in tlbflush.h + * + * - tlbil_va + * - tlbil_pid + * - tlbil_all + * - tlbivax_bcast + * + * Code mostly moved over from misc_32.S + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) + * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
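+ *
+ * (Editor's summary, for orientation: 40x implements only tlbil_va;
+ * 8xx has everything inline in headers; 44x/47x, FSL BookE and
+ * generic Book3E each provide their own set below, selected by the
+ * #if/#elif chain on CONFIG_40x / CONFIG_PPC_8xx / CONFIG_44x /
+ * CONFIG_FSL_BOOKE / CONFIG_PPC_BOOK3E.)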
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+#include <asm/bug.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(__tlbil_va)
+	/* We run the search with interrupts disabled because we have to change
+	 * the PID and I don't want to preempt when that happens.
+	 */
+	mfmsr	r5
+	mfspr	r6,SPRN_PID
+	wrteei	0
+	mtspr	SPRN_PID,r4
+	tlbsx.	r3, 0, r3
+	mtspr	SPRN_PID,r6
+	wrtee	r5
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+	 * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+	 * will invalidate the TLB entry. */
+	tlbwe	r3, r3, TLB_TAG
+	isync
+1:	blr
+
+#elif defined(CONFIG_PPC_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x) /* Includes 47x */
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(__tlbil_va)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr	r10
+
+	/*
+	 * We write 16 bits of STID since 47x supports that much, we
+	 * will never be passed out of bounds values on 440 (hopefully)
+	 */
+	rlwimi	r5,r4,0,16,31
+
+	/* We have to run the search with interrupts disabled, otherwise
+	 * an interrupt which causes a TLB miss can clobber the MMUCR
+	 * between the mtspr and the tlbsx.
+	 *
+	 * Critical and Machine Check interrupts take care of saving
+	 * and restoring MMUCR, so only normal interrupts have to be
+	 * taken care of.
+	 */
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	tlbsx.	r6,0,r3
+	bne	10f
+	sync
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	/* On 440 there are only 64 TLB entries, so r3 < 64, which means bit
+	 * 22 is clear. Since 22 is the V bit in the TLB_PAGEID, loading this
+	 * value will invalidate the TLB entry.
+	 */
+	tlbwe	r6,r6,PPC44x_TLB_PAGEID
+	isync
+10:	wrtee	r10
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	oris	r7,r6,0x8000	/* specify way explicitly */
+	clrrwi	r4,r3,12	/* get an EPN for the hashing with V = 0 */
+	ori	r4,r4,PPC47x_TLBE_SIZE
+	tlbwe	r4,r7,0		/* write it */
+	isync
+	wrtee	r10
+	blr
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+
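+/*
+ * (Editor's note, not part of the patch: generic code does not call
+ * __tlbil_va directly; mmu_decl.h wraps it so that, on 40x and 440,
+ * the tsize/ind arguments of the generic interface are simply
+ * dropped, roughly:
+ *
+ *	static inline void _tlbil_va(unsigned long ea, unsigned int pid,
+ *				     unsigned int tsize, unsigned int ind)
+ *	{
+ *		__tlbil_va(ea, pid);
+ *	}
+ *
+ * sketched from memory of mmu_decl.h, not re-checked against the tree.)
+ */
+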
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	li	r3,0
+	sync
+
+	/* Load high watermark */
+	lis	r4,tlb_44x_hwater@ha
+	lwz	r5,tlb_44x_hwater@l(r4)
+
+1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
+	addi	r3,r3,1
+	cmpw	0,r3,r5
+	ble	1b
+
+	isync
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	/* 476 variant. There's no simple way to do this, hopefully we'll
+	 * try to limit the amount of such full invalidates
+	 */
+	mfmsr	r11		/* Interrupts off */
+	wrteei	0
+	li	r3,-1		/* Current set */
+	lis	r10,tlb_47x_boltmap@h
+	ori	r10,r10,tlb_47x_boltmap@l
+	lis	r7,0x8000	/* Specify way explicitly */
+
+	b	9f		/* For each set */
+
+1:	li	r9,4		/* Number of ways */
+	li	r4,0		/* Current way */
+	li	r6,0		/* Default entry value 0 */
+	andi.	r0,r8,1		/* Check if way 0 is bolted */
+	mtctr	r9		/* Load way counter */
+	bne-	3f		/* Bolted, skip loading it */
+
+2:	/* For each way */
+	or	r5,r3,r4	/* Make way|index for tlbre */
+	rlwimi	r5,r5,16,8,15	/* Copy index into position */
+	tlbre	r6,r5,0		/* Read entry */
+3:	addis	r4,r4,0x2000	/* Next way */
+	andi.	r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */
+	beq	4f		/* Nope, skip it */
+	rlwimi	r7,r5,0,1,2	/* Insert way number */
+	rlwinm	r6,r6,0,21,19	/* Clear V */
+	tlbwe	r6,r7,0		/* Write it */
+4:	bdnz	2b		/* Loop for each way */
+	srwi	r8,r8,1		/* Next boltmap bit */
+9:	cmpwi	cr1,r3,255	/* Last set done ? */
+	addi	r3,r3,1		/* Next set */
+	beq	cr1,1f		/* End of loop */
+	andi.	r0,r3,0x1f	/* Need to load a new boltmap word ? */
+	bne	1b		/* No, loop */
+	lwz	r8,0(r10)	/* Load boltmap entry */
+	addi	r10,r10,4	/* Next word */
+	b	1b		/* Then loop */
+1:	isync			/* Sync shadows */
+	wrtee	r11
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+	blr
+
+#ifdef CONFIG_PPC_47x
+
+/*
+ * _tlbivax_bcast is only on 47x. We don't bother doing a runtime
+ * check though, it will blow up soon enough if we mistakenly try
+ * to use it on a 440.
+ */
+_GLOBAL(_tlbivax_bcast)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr	r10
+	rlwimi	r5,r4,0,16,31
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	isync
+	PPC_TLBIVAX(0, R3)
+	isync
+	eieio
+	tlbsync
+BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
+	sync
+	wrtee	r10
+	blr
+/*
+ * DD2 HW could hang if an instruction fetch happens before msync completes.
+ * Touch enough instruction cache lines to ensure cache hits
+ */
+1:	mflr	r9
+	bl	2f
+2:	mflr	r6
+	li	r7,32
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	sync
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	mtlr	r9
+	wrtee	r10
+	blr
+#endif /* CONFIG_PPC_47x */
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations.
+ *
+ * Since feature sections are using _SECTION_ELSE we need
+ * to have the larger code path before the _SECTION_ELSE
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_all)
+BEGIN_MMU_FTR_SECTION
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_ALL(0,R0)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	slwi	r3,r3,16
+	mfmsr	r10
+	wrteei	0
+	mfspr	r4,SPRN_MAS6	/* save MAS6 */
+	mtspr	SPRN_MAS6,r3
+	PPC_TLBILX_PID(0,R0)
+	mtspr	SPRN_MAS6,r4	/* restore MAS6 */
+	wrtee	r10
+MMU_FTR_SECTION_ELSE
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(__tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	slwi	r4,r4,16
+	ori	r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
+	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+BEGIN_MMU_FTR_SECTION
+	tlbsx	0,r3
+	mfspr	r4,SPRN_MAS1		/* check valid */
+	andis.
r3,r4,MAS1_VALID@h + beq 1f + rlwinm r4,r4,0,1,31 + mtspr SPRN_MAS1,r4 + tlbwe +MMU_FTR_SECTION_ELSE + PPC_TLBILX_VA(0,R3) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync +1: wrtee r10 + blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,R0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,R3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,R3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr +#else +#error Unsupported processor type ! +#endif + +#if defined(CONFIG_PPC_FSL_BOOK3E) +/* + * extern void loadcam_entry(unsigned int index) + * + * Load TLBCAM[index] entry in to the L2 CAM MMU + * Must preserve r7, r8, r9, and r10 + */ +_GLOBAL(loadcam_entry) + mflr r5 + LOAD_REG_ADDR_PIC(r4, TLBCAM) + mtlr r5 + mulli r5,r3,TLBCAM_SIZE + add r3,r5,r4 + lwz r4,TLBCAM_MAS0(r3) + mtspr SPRN_MAS0,r4 + lwz r4,TLBCAM_MAS1(r3) + mtspr SPRN_MAS1,r4 + PPC_LL r4,TLBCAM_MAS2(r3) + mtspr SPRN_MAS2,r4 + lwz r4,TLBCAM_MAS3(r3) + mtspr SPRN_MAS3,r4 +BEGIN_MMU_FTR_SECTION + lwz r4,TLBCAM_MAS7(r3) + mtspr SPRN_MAS7,r4 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) + isync + tlbwe + isync + blr + +/* + * Load multiple TLB entries at once, using an alternate-space + * trampoline so that we don't have to care about whether the same + * TLB entry maps us before and after. + * + * r3 = first entry to write + * r4 = number of entries to write + * r5 = temporary tlb entry + */ +_GLOBAL(loadcam_multi) + mflr r8 + + /* + * Set up temporary TLB entry that is the same as what we're + * running from, but in AS=1. + */ + bl 1f +1: mflr r6 + tlbsx 0,r8 + mfspr r6,SPRN_MAS1 + ori r6,r6,MAS1_TS + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS0 + rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + mr r7,r5 + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + /* Switch to AS=1 */ + mfmsr r6 + ori r6,r6,MSR_IS|MSR_DS + mtmsr r6 + isync + + mr r9,r3 + add r10,r3,r4 +2: bl loadcam_entry + addi r9,r9,1 + cmpw r9,r10 + mr r3,r9 + blt 2b + + /* Return to AS=0 and clear the temporary entry */ + mfmsr r6 + rlwinm. 
r6,r6,0,~(MSR_IS|MSR_DS)
+	mtmsr	r6
+	isync
+
+	li	r6,0
+	mtspr	SPRN_MAS1,r6
+	rlwinm	r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK
+	oris	r6,r6,MAS0_TLBSEL(1)@h
+	mtspr	SPRN_MAS0,r6
+	isync
+	tlbwe
+	isync
+
+	mtlr	r8
+	blr
+#endif
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
new file mode 100644
index 000000000000..9ed90064f542
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -0,0 +1,1280 @@
+/*
+ * Low level TLB miss handlers for Book3E
+ *
+ * Copyright (C) 2008-2009
+ *     Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/pgtable.h>
+#include <asm/exception-64e.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+#include <asm/feature-fixups.h>
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE+1)
+#else
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE)
+#endif
+#define VPTE_PUD_SHIFT	(VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
+#define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
+#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
+
+/**********************************************************************
+ *                                                                    *
+ * TLB miss handling for Book3E with a bolted linear mapping          *
+ * No virtual page table, no nested TLB misses                        *
+ *                                                                    *
+ **********************************************************************/
+
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
+.macro tlb_prolog_bolted intnum addr
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r12
+	mfspr	r12,SPRN_SPRG_TLB_EXFRAME
+	std	r13,EX_TLB_R13(r12)
+	std	r10,EX_TLB_R10(r12)
+	mfspr	r13,SPRN_SPRG_PACA
+
+	mfcr	r10
+	std	r11,EX_TLB_R11(r12)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+	DO_KVM	\intnum, SPRN_SRR1
+	std	r16,EX_TLB_R16(r12)
+	mfspr	r16,\addr		/* get faulting address */
+	std	r14,EX_TLB_R14(r12)
+	ld	r14,PACAPGD(r13)
+	std	r15,EX_TLB_R15(r12)
+	std	r10,EX_TLB_CR(r12)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+START_BTB_FLUSH_SECTION
+	mfspr	r11, SPRN_SRR1
+	andi.	r10,r11,MSR_PR
+	beq	1f
+	BTB_FLUSH(r10)
+1:
+END_BTB_FLUSH_SECTION
+	std	r7,EX_TLB_R7(r12)
+#endif
+	TLB_MISS_PROLOG_STATS
+.endm
+
+.macro tlb_epilog_bolted
+	ld	r14,EX_TLB_CR(r12)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	ld	r7,EX_TLB_R7(r12)
+#endif
+	ld	r10,EX_TLB_R10(r12)
+	ld	r11,EX_TLB_R11(r12)
+	ld	r13,EX_TLB_R13(r12)
+	mtcr	r14
+	ld	r14,EX_TLB_R14(r12)
+	ld	r15,EX_TLB_R15(r12)
+	TLB_MISS_RESTORE_STATS
+	ld	r16,EX_TLB_R16(r12)
+	mfspr	r12,SPRN_SPRG_GEN_SCRATCH
+.endm
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss_bolted)
+	tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+	/* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	/* We pre-test some combination of permissions to avoid double
+	 * faults:
+	 *
+	 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+	 * ESR_ST   is 0x00800000
+	 * _PAGE_BAP_SW is 0x00000010
+	 * So the shift is >> 19. This tests for supervisor writeability.
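+	 * (Editor's arithmetic check: 0x00800000 >> 19 = 0x00000010 =
+	 * _PAGE_BAP_SW, and 0x00800000 >> 11 = 0x00001000 = _PAGE_DIRTY,
+	 * so both relocated copies of ESR:ST land exactly on the intended
+	 * PTE bits.)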
+ * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + + mfspr r11,SPRN_ESR + + srdi r15,r16,60 /* get region */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ + + rlwinm r10,r11,32-19,27,27 + rlwimi r10,r11,32-16,19,19 + cmpwi r15,0 /* user vs kernel check */ + ori r10,r10,_PAGE_PRESENT + oris r11,r10,_PAGE_ACCESSED@h + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_bolted + +tlb_miss_common_bolted: +/* + * This is the guts of the TLB miss handler for bolted-linear. + * We are entered with: + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq tlb_miss_fault_bolted /* No PGDIR, bail */ + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ldx r14,r14,r15 /* grab pgd entry */ + beq tlb_miss_done_bolted /* tlb exists already, bail */ +MMU_FTR_SECTION_ELSE + ldx r14,r14,r15 /* grab pgd entry */ +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +#ifndef CONFIG_PPC_64K_PAGES + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ +#endif /* CONFIG_PPC_64K_PAGES */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab pmd entry */ + + rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ + + /* Check if required permissions are met */ + andc. r15,r11,r14 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + bne- tlb_miss_fault_bolted + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + clrldi r15,r15,12 /* Clear crap at the top */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r11 + andi. r11,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) 
*/ + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: + mtspr SPRN_MAS7_MAS3,r15 + tlbwe + +tlb_miss_done_bolted: + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +itlb_miss_kernel_bolted: + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h +tlb_miss_kernel_bolted: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ tlb_miss_common_bolted + +tlb_miss_fault_bolted: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX + bne itlb_miss_fault_bolted +dtlb_miss_fault_bolted: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_bolted: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + srdi r15,r16,60 /* get region */ + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne- itlb_miss_fault_bolted + + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + + cmpldi cr0,r15,0 /* Check for user region */ + oris r11,r11,_PAGE_ACCESSED@h + beq tlb_miss_common_bolted + b itlb_miss_kernel_bolted + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + * into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + * with MAS-damage caused by tlbsx + * 4K pages only + */ + + START_EXCEPTION(instruction_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + ori r16,r16,1 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user/kernel test */ + + b tlb_miss_common_e6500 + + START_EXCEPTION(data_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + rldicr r16,r16,0,62 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. + * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = crap (free to use) + * r7 = esel_next + */ +tlb_miss_common_e6500: + crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Search if we already have an indirect entry for that virtual + * address, and if we do, bail out. + * + * MAS6:IND should be already set based on MAS4 + */ + lhz r10,PACAPACAINDEX(r13) + addi r10,r10,1 + crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 + bne 2f + stbcx. r10,0,r11 + bne 1b +3: + .subsection 1 +2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? 
*/ + beq cr1,3b /* unlock will happen if cr1.eq = 0 */ +10: lbz r15,0(r11) + cmpdi r15,0 + bne 10b + b 1b + .previous +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + + lbz r7,TCD_ESEL_NEXT(r11) + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Erratum A-008139 says that we can't use tlbwe to change + * an indirect entry in any way (including replacing or + * invalidating) if the other thread could be in the process + * of a lookup. The workaround is to invalidate the entry + * with tlbilx before overwriting. + */ + + rlwinm r10,r7,16,0xff0000 + oris r10,r10,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r10 + isync + tlbre + mfspr r15,SPRN_MAS1 + andis. r15,r15,MAS1_VALID@h + beq 5f + +BEGIN_FTR_SECTION_NESTED(532) + mfspr r10,SPRN_MAS8 + rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + mfspr r10,SPRN_MAS1 + rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ + rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ + mfspr r10,SPRN_MAS6 + mtspr SPRN_MAS6,r15 + + mfspr r15,SPRN_MAS2 + isync + tlbilxva 0,r15 + isync + + mtspr SPRN_MAS6,r10 + +5: +BEGIN_FTR_SECTION_NESTED(532) + li r10,0 + mtspr SPRN_MAS8,r10 + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r15,r10,MAS1_VALID@h + bne tlb_miss_done_e6500 +FTR_SECTION_ELSE + mfspr r10,SPRN_MAS1 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) + + oris r10,r10,MAS1_VALID@h + beq cr2,4f + rlwinm r10,r10,0,16,1 /* Clear TID */ +4: mtspr SPRN_MAS1,r10 + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- tlb_miss_fault_e6500 + + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + ldx r14,r14,r15 /* Grab pmd entry */ + + mfspr r10,SPRN_MAS0 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + + /* Now we build the MAS for a 2M indirect page: + * + * MAS 0 : ESEL needs to be filled by software round-robin + * MAS 1 : Fully set up + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * - TID already cleared if necessary + * MAS 2 : Default not 2M-aligned, need to be redone + * MAS 3+7 : Needs to be done + */ + + ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) + mtspr SPRN_MAS7_MAS3,r14 + + clrrdi r15,r16,21 /* make EA 2M-aligned */ + mtspr SPRN_MAS2,r15 + +tlb_miss_huge_done_e6500: + lbz r16,TCD_ESEL_MAX(r11) + lbz r14,TCD_ESEL_FIRST(r11) + rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r7,r7,1 /* increment esel_next */ + mtspr SPRN_MAS0,r10 + cmpw r7,r16 + iseleq r7,r14,r7 /* if next == last use first */ + stb r7,TCD_ESEL_NEXT(r11) + + tlbwe + +tlb_miss_done_e6500: + .macro tlb_unlock_e6500 +BEGIN_FTR_SECTION + beq cr1,1f /* no unlock if lock was recursively grabbed */ + li r15,0 + isync + stb r15,0(r11) +1: +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + .endm + + tlb_unlock_e6500 + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +tlb_miss_huge_e6500: + beq tlb_miss_fault_e6500 + li r10,1 + andi. 
r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ + rldimi r14,r10,63,0 /* Set PD_HUGE */ + xor r14,r14,r15 /* Clear size bits */ + ldx r14,0,r14 + + /* + * Now we build the MAS for a huge page. + * + * MAS 0 : ESEL needs to be filled by software round-robin + * - can be handled by indirect code + * MAS 1 : Need to clear IND and set TSIZE + * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler + */ + + subi r15,r15,10 /* Convert psize to tsize */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,~MAS1_IND + rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK + mtspr SPRN_MAS1,r10 + + li r10,-0x400 + sld r15,r10,r15 /* Generate mask based on size */ + and r10,r16,r15 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r10 + andi. r10,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r10,MAS3_SW|MAS3_UW + andc r15,r15,r10 +1: + mtspr SPRN_MAS7_MAS3,r15 + + mfspr r10,SPRN_MAS0 + b tlb_miss_huge_done_e6500 + +tlb_miss_kernel_e6500: + ld r14,PACA_KERNELPGD(r13) + cmpldi cr1,r15,8 /* Check for vmalloc region */ + beq+ cr1,tlb_miss_common_e6500 + +tlb_miss_fault_e6500: + tlb_unlock_e6500 + /* We need to check if it was an instruction miss */ + andi. r16,r16,1 + bne itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_e6500: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e +#endif /* CONFIG_PPC_FSL_BOOK3E */ + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. 
+ * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. 
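+	 * (Editor's illustration, 4K case: the result is
+	 *	((region | 1) << 60) | ((ea >> PAGE_SHIFT) << 3)
+	 * i.e. the page index of the faulting EA, scaled by 8 bytes per
+	 * PTE, placed in the odd companion region; this is what the
+	 * ori/rldicl/sldi/clrrdi/or sequence below computes.)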
+ * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ld r14,0(r10) + beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. 
We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: +BEGIN_MMU_FTR_SECTION + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,R16) + beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +BEGIN_MMU_FTR_SECTION +virt_page_table_tlb_miss_done: + + /* We have overridden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. 
If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retrieve + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant information in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? 
*/ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,R16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. 
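The three-level descent just performed by htw_tlb_miss (and its twin in the virtual page table handler) boils down to a signed-compare walk. A rough C rendering, with a made-up index width standing in for the real PGD/PUD/PMD_INDEX_SIZE values:

    #include <stdint.h>

    #define IDX_BITS 9                          /* assumed, not the real sizes */
    #define IDX_MASK ((1ull << IDX_BITS) - 1)

    /* Each "ldx ; cmpdi ; bge ...fault" pair above reads an entry and
     * bails out if it is >= 0 as a signed 64-bit value: valid
     * next-level pointers are kernel virtual addresses with the top
     * bit set, so a non-negative entry is either empty or a huge-page
     * entry that this path does not handle. */
    static int64_t *next_level(int64_t *table, uint64_t ea, unsigned shift)
    {
            int64_t entry = table[(ea >> shift) & IDX_MASK];
            return entry >= 0 ? 0 : (int64_t *)entry;
    }

    static int64_t *walk(int64_t *pgd, uint64_t ea)
    {
            int64_t *pud = next_level(pgd, ea, 39);   /* ~PGDIR_SHIFT */
            int64_t *pmd = pud ? next_level(pud, ea, 30) : 0;
            return pmd ? next_level(pmd, ea, 21) : 0; /* PMD entry */
    }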
We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + tovirt(10,10) + cmpld cr0,r16,r10 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB< -#include -#include -#include -#include - -#include - -#ifdef CONFIG_SPARSEMEM_VMEMMAP -/* - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different - */ -int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ -} -#endif -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - -static __ref void *early_alloc_pgtable(unsigned long size) -{ - void *ptr; - - ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT, - __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n", - __func__, size, size, __pa(MAX_DMA_ADDRESS)); - - return ptr; -} - -/* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - } else { - pgdp = pgd_offset_k(ea); -#ifndef __PAGETABLE_PUD_FOLDED - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* !__PAGETABLE_PUD_FOLDED */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - } - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); - - smp_wmb(); - return 0; -} diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S deleted file mode 100644 index 9ed90064f542..000000000000 --- a/arch/powerpc/mm/tlb_low_64e.S +++ /dev/null @@ -1,1280 +0,0 @@ -/* - * Low level TLB miss handlers for Book3E - * - * Copyright (C) 2008-2009 - * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif -#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) -#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) -#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) - -/********************************************************************** - * * - * TLB miss handling for Book3E with a bolted linear mapping * - * No virtual page table, no nested TLB misses * - * * - **********************************************************************/ - -/* - * Note that, unlike non-bolted handlers, TLB_EXFRAME is not - * modified by the TLB miss handlers themselves, since the TLB miss - * handler code will not itself cause a recursive TLB miss. - * - * TLB_EXFRAME will be modified when crit/mc/debug exceptions are - * entered/exited. - */ -.macro tlb_prolog_bolted intnum addr - mtspr SPRN_SPRG_GEN_SCRATCH,r12 - mfspr r12,SPRN_SPRG_TLB_EXFRAME - std r13,EX_TLB_R13(r12) - std r10,EX_TLB_R10(r12) - mfspr r13,SPRN_SPRG_PACA - - mfcr r10 - std r11,EX_TLB_R11(r12) -#ifdef CONFIG_KVM_BOOKE_HV -BEGIN_FTR_SECTION - mfspr r11, SPRN_SRR1 -END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) -#endif - DO_KVM \intnum, SPRN_SRR1 - std r16,EX_TLB_R16(r12) - mfspr r16,\addr /* get faulting address */ - std r14,EX_TLB_R14(r12) - ld r14,PACAPGD(r13) - std r15,EX_TLB_R15(r12) - std r10,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E -START_BTB_FLUSH_SECTION - mfspr r11, SPRN_SRR1 - andi. r10,r11,MSR_PR - beq 1f - BTB_FLUSH(r10) -1: -END_BTB_FLUSH_SECTION - std r7,EX_TLB_R7(r12) -#endif - TLB_MISS_PROLOG_STATS -.endm - -.macro tlb_epilog_bolted - ld r14,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E - ld r7,EX_TLB_R7(r12) -#endif - ld r10,EX_TLB_R10(r12) - ld r11,EX_TLB_R11(r12) - ld r13,EX_TLB_R13(r12) - mtcr r14 - ld r14,EX_TLB_R14(r12) - ld r15,EX_TLB_R15(r12) - TLB_MISS_RESTORE_STATS - ld r16,EX_TLB_R16(r12) - mfspr r12,SPRN_SPRG_GEN_SCRATCH -.endm - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. - * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - - mfspr r11,SPRN_ESR - - srdi r15,r16,60 /* get region */ - rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ - - rlwinm r10,r11,32-19,27,27 - rlwimi r10,r11,32-16,19,19 - cmpwi r15,0 /* user vs kernel check */ - ori r10,r10,_PAGE_PRESENT - oris r11,r10,_PAGE_ACCESSED@h - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_bolted - -tlb_miss_common_bolted: -/* - * This is the guts of the TLB miss handler for bolted-linear. - * We are entered with: - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = page table base - * r13 = PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ - rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 - cmpldi cr0,r14,0 - clrrdi r15,r15,3 - beq tlb_miss_fault_bolted /* No PGDIR, bail */ - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE - ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -#ifndef CONFIG_PPC_64K_PAGES - rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ - ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ - - rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab pmd entry */ - - rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ - - /* Check if required permissions are met */ - andc. r15,r11,r14 - rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - bne- tlb_miss_fault_bolted - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - clrldi r15,r15,12 /* Clear crap at the top */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - mtspr SPRN_MAS2,r11 - andi. r11,r14,_PAGE_DIRTY - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: - mtspr SPRN_MAS7_MAS3,r15 - tlbwe - -tlb_miss_done_bolted: - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - tlb_epilog_bolted - rfi - -itlb_miss_kernel_bolted: - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h -tlb_miss_kernel_bolted: - mfspr r10,SPRN_MAS1 - ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted - -tlb_miss_fault_bolted: - /* We need to check if it was an instruction miss */ - andi. 
r10,r11,_PAGE_EXEC|_PAGE_BAP_SX - bne itlb_miss_fault_bolted -dtlb_miss_fault_bolted: - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_data_storage_book3e -itlb_miss_fault_bolted: - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_instruction_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 - - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - srdi r15,r16,60 /* get region */ - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne- itlb_miss_fault_bolted - - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - - cmpldi cr0,r15,0 /* Check for user region */ - oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted - b itlb_miss_kernel_bolted - -#ifdef CONFIG_PPC_FSL_BOOK3E -/* - * TLB miss handling for e6500 and derivatives, using hardware tablewalk. - * - * Linear mapping is bolted: no virtual page table or nested TLB misses - * Indirect entries in TLB1, hardware loads resulting direct entries - * into TLB0 - * No HES or NV hint on TLB1, so we need to do software round-robin - * No tlbsrx. so we need a spinlock, and we have to deal - * with MAS-damage caused by tlbsx - * 4K pages only - */ - - START_EXCEPTION(instruction_tlb_miss_e6500) - tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 - - ld r11,PACA_TCD_PTR(r13) - srdi. r15,r16,60 /* get region */ - ori r16,r16,1 - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_e6500 /* user/kernel test */ - - b tlb_miss_common_e6500 - - START_EXCEPTION(data_tlb_miss_e6500) - tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR - - ld r11,PACA_TCD_PTR(r13) - srdi. r15,r16,60 /* get region */ - rldicr r16,r16,0,62 - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_e6500 /* user vs kernel check */ - -/* - * This is the guts of the TLB miss handler for e6500 and derivatives. - * We are entered with: - * - * r16 = page of faulting address (low bit 0 if data, 1 if instruction) - * r15 = crap (free to use) - * r14 = page table base - * r13 = PACA - * r11 = tlb_per_core ptr - * r10 = crap (free to use) - * r7 = esel_next - */ -tlb_miss_common_e6500: - crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ - -BEGIN_FTR_SECTION /* CPU_FTR_SMT */ - /* - * Search if we already have an indirect entry for that virtual - * address, and if we do, bail out. - * - * MAS6:IND should be already set based on MAS4 - */ - lhz r10,PACAPACAINDEX(r13) - addi r10,r10,1 - crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ -1: lbarx r15,0,r11 - cmpdi r15,0 - bne 2f - stbcx. r10,0,r11 - bne 1b -3: - .subsection 1 -2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ - beq cr1,3b /* unlock will happen if cr1.eq = 0 */ -10: lbz r15,0(r11) - cmpdi r15,0 - bne 10b - b 1b - .previous -END_FTR_SECTION_IFSET(CPU_FTR_SMT) - - lbz r7,TCD_ESEL_NEXT(r11) - -BEGIN_FTR_SECTION /* CPU_FTR_SMT */ - /* - * Erratum A-008139 says that we can't use tlbwe to change - * an indirect entry in any way (including replacing or - * invalidating) if the other thread could be in the process - * of a lookup. The workaround is to invalidate the entry - * with tlbilx before overwriting. - */ - - rlwinm r10,r7,16,0xff0000 - oris r10,r10,MAS0_TLBSEL(1)@h - mtspr SPRN_MAS0,r10 - isync - tlbre - mfspr r15,SPRN_MAS1 - andis. 
r15,r15,MAS1_VALID@h - beq 5f - -BEGIN_FTR_SECTION_NESTED(532) - mfspr r10,SPRN_MAS8 - rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ - mtspr SPRN_MAS5,r10 -END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) - - mfspr r10,SPRN_MAS1 - rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ - rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ - mfspr r10,SPRN_MAS6 - mtspr SPRN_MAS6,r15 - - mfspr r15,SPRN_MAS2 - isync - tlbilxva 0,r15 - isync - - mtspr SPRN_MAS6,r10 - -5: -BEGIN_FTR_SECTION_NESTED(532) - li r10,0 - mtspr SPRN_MAS8,r10 - mtspr SPRN_MAS5,r10 -END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) - - tlbsx 0,r16 - mfspr r10,SPRN_MAS1 - andis. r15,r10,MAS1_VALID@h - bne tlb_miss_done_e6500 -FTR_SECTION_ELSE - mfspr r10,SPRN_MAS1 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) - - oris r10,r10,MAS1_VALID@h - beq cr2,4f - rlwinm r10,r10,0,16,1 /* Clear TID */ -4: mtspr SPRN_MAS1,r10 - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- tlb_miss_fault_e6500 - - rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 - cmpldi cr0,r14,0 - clrrdi r15,r15,3 - beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ - ldx r14,r14,r15 /* grab pgd entry */ - - rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ - ldx r14,r14,r15 /* grab pud entry */ - - rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 - ldx r14,r14,r15 /* Grab pmd entry */ - - mfspr r10,SPRN_MAS0 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 - - /* Now we build the MAS for a 2M indirect page: - * - * MAS 0 : ESEL needs to be filled by software round-robin - * MAS 1 : Fully set up - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * - TID already cleared if necessary - * MAS 2 : Default not 2M-aligned, need to be redone - * MAS 3+7 : Needs to be done - */ - - ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) - mtspr SPRN_MAS7_MAS3,r14 - - clrrdi r15,r16,21 /* make EA 2M-aligned */ - mtspr SPRN_MAS2,r15 - -tlb_miss_huge_done_e6500: - lbz r16,TCD_ESEL_MAX(r11) - lbz r14,TCD_ESEL_FIRST(r11) - rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ - addi r7,r7,1 /* increment esel_next */ - mtspr SPRN_MAS0,r10 - cmpw r7,r16 - iseleq r7,r14,r7 /* if next == last use first */ - stb r7,TCD_ESEL_NEXT(r11) - - tlbwe - -tlb_miss_done_e6500: - .macro tlb_unlock_e6500 -BEGIN_FTR_SECTION - beq cr1,1f /* no unlock if lock was recursively grabbed */ - li r15,0 - isync - stb r15,0(r11) -1: -END_FTR_SECTION_IFSET(CPU_FTR_SMT) - .endm - - tlb_unlock_e6500 - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - tlb_epilog_bolted - rfi - -tlb_miss_huge_e6500: - beq tlb_miss_fault_e6500 - li r10,1 - andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ - rldimi r14,r10,63,0 /* Set PD_HUGE */ - xor r14,r14,r15 /* Clear size bits */ - ldx r14,0,r14 - - /* - * Now we build the MAS for a huge page. 
- * - * MAS 0 : ESEL needs to be filled by software round-robin - * - can be handled by indirect code - * MAS 1 : Need to clear IND and set TSIZE - * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler - */ - - subi r15,r15,10 /* Convert psize to tsize */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,~MAS1_IND - rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK - mtspr SPRN_MAS1,r10 - - li r10,-0x400 - sld r15,r10,r15 /* Generate mask based on size */ - and r10,r16,r15 - rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ - clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - mtspr SPRN_MAS2,r10 - andi. r10,r14,_PAGE_DIRTY - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - bne 1f - li r10,MAS3_SW|MAS3_UW - andc r15,r15,r10 -1: - mtspr SPRN_MAS7_MAS3,r15 - - mfspr r10,SPRN_MAS0 - b tlb_miss_huge_done_e6500 - -tlb_miss_kernel_e6500: - ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ - beq+ cr1,tlb_miss_common_e6500 - -tlb_miss_fault_e6500: - tlb_unlock_e6500 - /* We need to check if it was an instruction miss */ - andi. r16,r16,1 - bne itlb_miss_fault_e6500 -dtlb_miss_fault_e6500: - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_data_storage_book3e -itlb_miss_fault_e6500: - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_instruction_storage_book3e -#endif /* CONFIG_PPC_FSL_BOOK3E */ - -/********************************************************************** - * * - * TLB miss handling for Book3E with TLB reservation and HES support * - * * - **********************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* The page tables are mapped virtually linear. At this point, though, - * we don't know whether we are trying to fault in a first level - * virtual address or a virtual page table address. We can get that - * from bit 0x1 of the region ID which we have set for a page table - */ - andi. r10,r15,0x1 - bne- virt_page_table_tlb_miss - - std r14,EX_TLB_ESR(r12); /* save ESR */ - std r16,EX_TLB_DEAR(r12); /* save DEAR */ - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - li r11,_PAGE_PRESENT - oris r11,r11,_PAGE_ACCESSED@h - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r15,0 /* Check for user region */ - - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. 
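The shift constants in this permission pre-test comment (the second one appears just below) are easy to sanity-check; the values are the ones quoted in the comment itself:

    #include <assert.h>

    #define ESR_ST       0x00800000  /* ESR store bit, as quoted above */
    #define PAGE_BAP_SW  0x00000010  /* _PAGE_BAP_SW, as quoted above  */
    #define PAGE_DIRTY   0x00001000  /* _PAGE_DIRTY, as quoted above   */

    int main(void)
    {
            /* 0x00800000 >> 19 == 0x10 and 0x00800000 >> 11 == 0x1000,
             * which is why the handler can fold the write-permission
             * pre-test into a couple of rlwimi instructions. */
            assert((ESR_ST >> 19) == PAGE_BAP_SW);
            assert((ESR_ST >> 11) == PAGE_DIRTY);
            return 0;
    }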
- * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - rlwimi r11,r14,32-19,27,27 - rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by writing a crazy value in ESR in our exception frame - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - - cmpldi cr0,r15,0 /* Check for user region */ - std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss - - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of the first-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = faulting address - * r15 = region ID - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ -normal_tlb_miss: - /* So we first construct the page table address. We do that by - * shifting the bottom of the address (not the region ID) by - * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and - * or'ing the fourth high bit. 
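In C, the 4K-page variant of the address construction described here looks roughly as follows (the 64K layout differs, as the NOTE that follows explains; PAGE_SHIFT 12 is assumed):

    #include <stdint.h>

    #define PAGE_SHIFT 12                       /* 4K case assumed */

    /* Rough equivalent of the rldicl/clrrdi/sldi sequence below for 4K
     * pages: scale the in-region offset from pages to 8-byte PTE slots
     * and tag the result with (region | 1) in the top nibble so it
     * lands in the virtual page table region. */
    static uint64_t vpte_address(uint64_t ea)
    {
            uint64_t region = ea >> 60;
            uint64_t slot = (ea & 0x0fffffffffffffffull) >> (PAGE_SHIFT - 3);

            slot &= ~7ull;                      /* 8-byte align the PTE ptr */
            return ((region | 1) << 60) | slot;
    }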
- * - * NOTE: For 64K pages, we do things slightly differently in - * order to handle the weird page table format used by linux - */ - ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else - rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif - sldi r15,r10,60 - clrrdi r14,r14,3 - or r10,r15,r14 - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE - ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -finish_normal_tlb_miss: - /* Check if required permissions are met */ - andc. r15,r11,r14 - bne- normal_tlb_miss_access_fault - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - * - * TODO: mix up code below for better scheduling - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 - - /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else - cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif - beq- 1f - mfspr r11,SPRN_MAS1 - rlwimi r11,r14,31,21,24 - rlwinm r11,r11,0,21,19 - mtspr SPRN_MAS1,r11 -1: - /* Move RPN in position */ - rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - clrldi r15,r11,12 /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - andi. r11,r14,_PAGE_DIRTY - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: -BEGIN_MMU_FTR_SECTION - srdi r16,r15,32 - mtspr SPRN_MAS3,r15 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -normal_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -normal_tlb_miss_access_fault: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC - bne 1f - ld r14,EX_TLB_DEAR(r12) - ld r15,EX_TLB_ESR(r12) - mtspr SPRN_DEAR,r14 - mtspr SPRN_ESR,r15 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. 
We are entered with: - * - * r16 = virtual page table faulting address - * r15 = region (top 4 bits of address) - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * Note that this should only ever be called as a second level handler - * with the current scheme when using SW load. - * That means we can always get the original fault DEAR at - * EX_TLB_DEAR-EX_TLB_SIZE(r12) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will restart the whole fault at level - * 0 so we don't care too much about clobbers - * - * XXX That code was written back when we couldn't clobber r14. We can now, - * so we could probably optimize things a bit - */ -virt_page_table_tlb_miss: - /* Are we hitting a kernel page table ? */ - andi. r10,r15,0x8 - - /* The cool thing now is that r10 contains 0 for user and 8 for kernel, - * and we happen to have the swapper_pg_dir at offset 8 from the user - * pgdir in the PACA :-). - */ - add r11,r10,r13 - - /* If kernel, we need to clear MAS1 TID */ - beq 1f - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 -1: -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 - bne- virt_page_table_tlb_miss_fault - - /* Get the PGD pointer */ - ld r15,PACAPGD(r11) - cmpldi cr0,r15,0 - beq- virt_page_table_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - - /* Ok, we're all right, we can now create a kernel translation for - * a 4K or 64K page from r16 -> r15. - */ - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - * - * So we only do MAS 2 and 3 for now... - */ - clrldi r11,r15,4 /* remove region ID from RPN */ - ori r10,r11,1 /* Or-in SR */ - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. 
If we do that we have to modify the
- * ITLB miss handler to also store SRR0 in the exception frame
- * as DEAR.
- *
- * However, one nasty thing we did is we cleared the reservation
- * (well, potentially we did). We do a trick here: if we
- * are not a level 0 exception (we interrupted the TLB miss) we
- * offset the return address by -4 in order to replay the tlbsrx
- * instruction there
- */
-	subf	r10,r13,r12
-	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
-	bne-	1f
-	ld	r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
-	addi	r10,r11,-4
-	std	r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
-1:
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
-	/* Return to caller, normal case */
-	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
-	TLB_MISS_EPILOG_SUCCESS
-	rfi
-
-virt_page_table_tlb_miss_fault:
-	/* If we fault here, things are a little bit tricky. We need to
-	 * branch to either the data or the instruction store fault
-	 * handler, and we need to retrieve the original fault address
-	 * and ESR (for data).
-	 *
-	 * The thing is, we know that in normal circumstances, this is
-	 * always called as a second level tlb miss for SW load or as a first
-	 * level TLB miss for HW load, so we should be able to peek at the
-	 * relevant information in the first exception frame in the PACA.
-	 *
-	 * However, we do need to double check that, because we may just hit
-	 * a stray kernel pointer or a userland attack trying to hit those
-	 * areas. If that is the case, we do a data fault. (We can't get here
-	 * from an instruction tlb miss anyway).
-	 *
-	 * Note also that when going to a fault, we must unwind the previous
-	 * level as well. Since we are doing that, we don't need to clear or
-	 * restore the TLB reservation either.
-	 */
-	subf	r10,r13,r12
-	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
-	bne-	virt_page_table_tlb_miss_whacko_fault
-
-	/* We dig the original DEAR and ESR from slot 0 */
-	ld	r15,EX_TLB_DEAR+PACA_EXTLB(r13)
-	ld	r16,EX_TLB_ESR+PACA_EXTLB(r13)
-
-	/* We check for the "special" ESR value for instruction faults */
-	cmpdi	cr0,r16,-1
-	beq	1f
-	mtspr	SPRN_DEAR,r15
-	mtspr	SPRN_ESR,r16
-	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
-	TLB_MISS_EPILOG_ERROR
-	b	exc_data_storage_book3e
-1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
-	TLB_MISS_EPILOG_ERROR
-	b	exc_instruction_storage_book3e
-
-virt_page_table_tlb_miss_whacko_fault:
-	/* The linear fault will restart everything so ESR and DEAR will
-	 * not have been clobbered, let's just fault with what we have
-	 */
-	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
-	TLB_MISS_EPILOG_ERROR
-	b	exc_data_storage_book3e
-
-
-/**************************************************************
- *                                                            *
- * TLB miss handling for Book3E with hw page table support    *
- *                                                            *
- **************************************************************/
-
-
-/* Data TLB miss */
-	START_EXCEPTION(data_tlb_miss_htw)
-	TLB_MISS_PROLOG
-
-	/* Now we handle the fault proper. We only save DEAR in the normal
-	 * fault case since that's the only interesting value here.
-	 * We could probably also optimize by not saving SRR0/1 in the
-	 * linear mapping case but I'll leave that for later
-	 */
-	mfspr	r14,SPRN_ESR
-	mfspr	r16,SPRN_DEAR		/* get faulting address */
-	srdi	r11,r16,60		/* get region */
-	cmpldi	cr0,r11,0xc		/* linear mapping ?
*/ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_htw) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by keeping a crazy value for ESR in r14 - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = virtual page table faulting address - * r15 = PGD pointer - * r14 = ESR - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will save/restore things for us - */ -htw_tlb_miss: - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - * - * MAS1:IND should be already set based on MAS4 - */ - PPC_TLBSRX_DOT(0,R16) - beq htw_tlb_miss_done - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- htw_tlb_miss_fault - - /* Get the PGD pointer */ - cmpldi cr0,r15,0 - beq- htw_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Ok, we're all right, we can now create an indirect entry for - * a 1M or 256M page. - * - * The last trick is now that because we use "half" pages for - * the HTW (1M IND is 2K and 256M IND is 32K) we need to account - * for an added LSB bit to the RPN. For 64K pages, there is no - * problem as we already use 32K arrays (half PTE pages), but for - * 4K page we need to extract a bit from the virtual address and - * insert it into the "PA52" bit of the RPN. - */ -#ifndef CONFIG_PPC_64K_PAGES - rlwimi r15,r16,32-9,20,20 -#endif - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else - ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -htw_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -htw_tlb_miss_fault: - /* We need to check if it was an instruction miss. We know this - * though because r14 would contain -1 - */ - cmpdi cr0,r14,-1 - beq 1f - mtspr SPRN_DEAR,r16 - mtspr SPRN_ESR,r14 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of "any" level TLB miss handler for kernel linear - * mapping misses. We are entered with: - * - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = ESR (data) or -1 (instruction) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * In addition we know that we will not re-enter, so in theory, we could - * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. - * - * We also need to be careful about MAS registers here & TLB reservation, - * as we know we'll have clobbered them if we interrupt the main TLB miss - * handlers in which case we probably want to do a full restart at level - * 0 rather than saving / restoring the MAS. - * - * Note: If we care about performance of that core, we can easily shuffle - * a few things around - */ -tlb_load_linear: - /* For now, we assume the linear mapping is contiguous and stops at - * linear_map_top. 
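The linear-map policy spelled out in this comment (and continued just below) reduces, in C, to a bounds check plus stripping the region bits. The helper below is an illustrative sketch, not the handler's exact register flow:

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t linear_map_top;     /* set at boot from memblock */

    /* Region 0xc with an offset below linear_map_top: the physical
     * address is the effective address with the top nibble cleared,
     * rounded down to the 1G page the handler would install. */
    static bool linear_translate(uint64_t ea, uint64_t *pa_1g_base)
    {
            uint64_t off = ea & 0x0fffffffffffffffull;

            if ((ea >> 60) != 0xc || off >= linear_map_top)
                    return false;               /* take the fault path */
            *pa_1g_base = off & ~((1ull << 30) - 1);
            return true;
    }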
We also assume the size is a multiple of 1G, thus
- * we only use 1G pages for now. That might have to be changed in a
- * final implementation, especially when dealing with hypervisors
- */
-	ld	r11,PACATOC(r13)
-	ld	r11,linear_map_top@got(r11)
-	ld	r10,0(r11)
-	tovirt(10,10)
-	cmpld	cr0,r16,r10
-	bge	tlb_load_linear_fault
-
-	/* MAS1 needs a whole new setup. */
-	li	r15,(BOOK3E_PAGESZ_1GB<
- *		IBM Corp.
- *
- * Derived from arch/ppc/mm/init.c:
- *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-
-/*
- * This struct lists the sw-supported page sizes. The hardware MMU may support
- * other sizes not listed here. The .ind field is only used on MMUs that have
- * indirect page table entries.
- */
-#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx)
-#ifdef CONFIG_PPC_FSL_BOOK3E
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_4K] = {
-		.shift	= 12,
-		.enc	= BOOK3E_PAGESZ_4K,
-	},
-	[MMU_PAGE_2M] = {
-		.shift	= 21,
-		.enc	= BOOK3E_PAGESZ_2M,
-	},
-	[MMU_PAGE_4M] = {
-		.shift	= 22,
-		.enc	= BOOK3E_PAGESZ_4M,
-	},
-	[MMU_PAGE_16M] = {
-		.shift	= 24,
-		.enc	= BOOK3E_PAGESZ_16M,
-	},
-	[MMU_PAGE_64M] = {
-		.shift	= 26,
-		.enc	= BOOK3E_PAGESZ_64M,
-	},
-	[MMU_PAGE_256M] = {
-		.shift	= 28,
-		.enc	= BOOK3E_PAGESZ_256M,
-	},
-	[MMU_PAGE_1G] = {
-		.shift	= 30,
-		.enc	= BOOK3E_PAGESZ_1GB,
-	},
-};
-#elif defined(CONFIG_PPC_8xx)
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-	/* we only manage 4k and 16k pages as normal pages */
-#ifdef CONFIG_PPC_4K_PAGES
-	[MMU_PAGE_4K] = {
-		.shift	= 12,
-	},
-#else
-	[MMU_PAGE_16K] = {
-		.shift	= 14,
-	},
-#endif
-	[MMU_PAGE_512K] = {
-		.shift	= 19,
-	},
-	[MMU_PAGE_8M] = {
-		.shift	= 23,
-	},
-};
-#else
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_4K] = {
-		.shift	= 12,
-		.ind	= 20,
-		.enc	= BOOK3E_PAGESZ_4K,
-	},
-	[MMU_PAGE_16K] = {
-		.shift	= 14,
-		.enc	= BOOK3E_PAGESZ_16K,
-	},
-	[MMU_PAGE_64K] = {
-		.shift	= 16,
-		.ind	= 28,
-		.enc	= BOOK3E_PAGESZ_64K,
-	},
-	[MMU_PAGE_1M] = {
-		.shift	= 20,
-		.enc	= BOOK3E_PAGESZ_1M,
-	},
-	[MMU_PAGE_16M] = {
-		.shift	= 24,
-		.ind	= 36,
-		.enc	= BOOK3E_PAGESZ_16M,
-	},
-	[MMU_PAGE_256M] = {
-		.shift	= 28,
-		.enc	= BOOK3E_PAGESZ_256M,
-	},
-	[MMU_PAGE_1G] = {
-		.shift	= 30,
-		.enc	= BOOK3E_PAGESZ_1GB,
-	},
-};
-#endif /* CONFIG_PPC_FSL_BOOK3E */
-
-static inline int mmu_get_tsize(int psize)
-{
-	return mmu_psize_defs[psize].enc;
-}
-#else
-static inline int mmu_get_tsize(int psize)
-{
-	/* This isn't used on !Book3E for now */
-	return 0;
-}
-#endif /* CONFIG_PPC_BOOK3E_MMU */
-
-/* The variables below are currently only used on 64-bit Book3E
- * though this will probably be made common with other nohash
- * implementations at some point
- */
-#ifdef CONFIG_PPC64
-
-int mmu_linear_psize;		/* Page size used for the linear mapping */
-int mmu_pte_psize;		/* Page size
used for PTE pages */
-int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
-int book3e_htw_mode;		/* HW tablewalk? Value is PPC_HTW_* */
-unsigned long linear_map_top;	/* Top of linear mapping */
-
-
-/*
- * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
- * exceptions. This is used for bolted and e6500 TLB miss handlers which
- * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
- * this is set to zero.
- */
-int extlb_level_exc;
-
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
-DEFINE_PER_CPU(int, next_tlbcam_idx);
-EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
-#endif
-
-/*
- * Base TLB flushing operations:
- *
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- *  - local_* variants of page and mm only apply to the current
- *    processor
- */
-
-/*
- * These are the base non-SMP variants of page and mm flushing
- */
-void local_flush_tlb_mm(struct mm_struct *mm)
-{
-	unsigned int pid;
-
-	preempt_disable();
-	pid = mm->context.id;
-	if (pid != MMU_NO_CONTEXT)
-		_tlbil_pid(pid);
-	preempt_enable();
-}
-EXPORT_SYMBOL(local_flush_tlb_mm);
-
-void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
-			    int tsize, int ind)
-{
-	unsigned int pid;
-
-	preempt_disable();
-	pid = mm ? mm->context.id : 0;
-	if (pid != MMU_NO_CONTEXT)
-		_tlbil_va(vmaddr, pid, tsize, ind);
-	preempt_enable();
-}
-
-void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
-			       mmu_get_tsize(mmu_virtual_psize), 0);
-}
-EXPORT_SYMBOL(local_flush_tlb_page);
-
-/*
- * And here are the SMP non-local implementations
- */
-#ifdef CONFIG_SMP
-
-static DEFINE_RAW_SPINLOCK(tlbivax_lock);
-
-struct tlb_flush_param {
-	unsigned long addr;
-	unsigned int pid;
-	unsigned int tsize;
-	unsigned int ind;
-};
-
-static void do_flush_tlb_mm_ipi(void *param)
-{
-	struct tlb_flush_param *p = param;
-
-	_tlbil_pid(p ? p->pid : 0);
-}
-
-static void do_flush_tlb_page_ipi(void *param)
-{
-	struct tlb_flush_param *p = param;
-
-	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
-}
-
-
-/* Note on invalidations and PID:
- *
- * We snapshot the PID with preempt disabled. At this point, it can still
- * change either because:
- * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
- * - we are invalidating some target that isn't currently running here
- *   and is concurrently acquiring a new PID on another CPU
- * - some other CPU is re-acquiring a lost PID for this mm
- * etc...
- *
- * However, this shouldn't be a problem as we only guarantee
- * invalidation of TLB entries present prior to this call, so we
- * don't care about the PID changing, and invalidating a stale PID
- * is generally harmless.
- */
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
-	unsigned int pid;
-
-	preempt_disable();
-	pid = mm->context.id;
-	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
-	if (!mm_is_core_local(mm)) {
-		struct tlb_flush_param p = { .pid = pid };
-		/* Ignores smp_processor_id() even if set.
*/ - smp_call_function_many(mm_cpumask(mm), - do_flush_tlb_mm_ipi, &p, 1); - } - _tlbil_pid(pid); - no_context: - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - int tsize, int ind) -{ - struct cpumask *cpu_mask; - unsigned int pid; - - /* - * This function as well as __local_flush_tlb_page() must only be called - * for user contexts. - */ - if (WARN_ON(!mm)) - return; - - preempt_disable(); - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; - cpu_mask = mm_cpumask(mm); - if (!mm_is_core_local(mm)) { - /* If broadcast tlbivax is supported, use it */ - if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { - int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); - if (lock) - raw_spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid, tsize, ind); - if (lock) - raw_spin_unlock(&tlbivax_lock); - goto bail; - } else { - struct tlb_flush_param p = { - .pid = pid, - .addr = vmaddr, - .tsize = tsize, - .ind = ind, - }; - /* Ignores smp_processor_id() even if set in cpu_mask */ - smp_call_function_many(cpu_mask, - do_flush_tlb_page_ipi, &p, 1); - } - } - _tlbil_va(vmaddr, pid, tsize, ind); - bail: - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (vma && is_vm_hugetlb_page(vma)) - flush_hugetlb_page(vma, vmaddr); -#endif - - __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_tsize(mmu_virtual_psize), 0); -} -EXPORT_SYMBOL(flush_tlb_page); - -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_47x -void __init early_init_mmu_47x(void) -{ -#ifdef CONFIG_SMP - unsigned long root = of_get_flat_dt_root(); - if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) - mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); -#endif /* CONFIG_SMP */ -} -#endif /* CONFIG_PPC_47x */ - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ -#ifdef CONFIG_SMP - preempt_disable(); - smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); - _tlbil_pid(0); - preempt_enable(); -#else - _tlbil_pid(0); -#endif -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Currently, for range flushing, we just do a full mm flush. 
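The comment spanning this point notes that flush_tlb_range() simply falls back to a full mm flush and suggests a size threshold instead. A minimal sketch of that suggestion, with forward declarations standing in for the kernel types and an arbitrary cutoff in place of real tuning:

    struct vm_area_struct;              /* kernel types, declared only */
    struct mm_struct;
    void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
    void flush_tlb_mm(struct mm_struct *mm);

    #define FLUSH_RANGE_PAGES 32        /* arbitrary illustrative cutoff */

    /* Flush page by page while per-page invalidations stay cheaper than
     * discarding every translation for the PID, else flush the mm.
     * The mm is passed separately only because vm_area_struct is
     * opaque in this sketch. */
    static void flush_tlb_range_sketch(struct vm_area_struct *vma,
                                       struct mm_struct *mm,
                                       unsigned long start, unsigned long end,
                                       unsigned long page_size)
    {
            if ((end - start) / page_size <= FLUSH_RANGE_PAGES) {
                    unsigned long va;

                    for (va = start; va < end; va += page_size)
                            flush_tlb_page(vma, va);
            } else {
                    flush_tlb_mm(mm);
            }
    }

The right cutoff depends on how many tlbivax operations an implementation can stack before a tlbsync, which is exactly the caveat the comment raises.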
This should - * be optimized based on a threshold on the size of the range, since - * some implementation can stack multiple tlbivax before a tlbsync but - * for now, we keep it that way - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ - if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK)) - flush_tlb_page(vma, start); - else - flush_tlb_mm(vma->vm_mm); -} -EXPORT_SYMBOL(flush_tlb_range); - -void tlb_flush(struct mmu_gather *tlb) -{ - flush_tlb_mm(tlb->mm); -} - -/* - * Below are functions specific to the 64-bit variant of Book3E though that - * may change in the future - */ - -#ifdef CONFIG_PPC64 - -/* - * Handling of virtual linear page tables or indirect TLB entries - * flushing when PTE pages are freed - */ -void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - int tsize = mmu_psize_defs[mmu_pte_psize].enc; - - if (book3e_htw_mode != PPC_HTW_NONE) { - unsigned long start = address & PMD_MASK; - unsigned long end = address + PMD_SIZE; - unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; - - /* This isn't the most optimal, ideally we would factor out the - * while preempt & CPU mask mucking around, or even the IPI but - * it will do for now - */ - while (start < end) { - __flush_tlb_page(tlb->mm, start, tsize, 1); - start += size; - } - } else { - unsigned long rmask = 0xf000000000000000ul; - unsigned long rid = (address & rmask) | 0x1000000000000000ul; - unsigned long vpte = address & ~rmask; - -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else - vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif - vpte |= rid; - __flush_tlb_page(tlb->mm, vpte, tsize, 0); - } -} - -static void setup_page_sizes(void) -{ - unsigned int tlb0cfg; - unsigned int tlb0ps; - unsigned int eptcfg; - int i, psize; - -#ifdef CONFIG_PPC_FSL_BOOK3E - unsigned int mmucfg = mfspr(SPRN_MMUCFG); - int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); - unsigned int min_pg, max_pg; - - min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; - max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def; - unsigned int shift; - - def = &mmu_psize_defs[psize]; - shift = def->shift; - - if (shift == 0 || shift & 1) - continue; - - /* adjust to be in terms of 4^shift Kb */ - shift = (shift - 10) >> 1; - - if ((shift >= min_pg) && (shift <= max_pg)) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - goto out; - } - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { - u32 tlb1cfg, tlb1ps; - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb1cfg = mfspr(SPRN_TLB1CFG); - tlb1ps = mfspr(SPRN_TLB1PS); - eptcfg = mfspr(SPRN_EPTCFG); - - if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) - book3e_htw_mode = PPC_HTW_E6500; - - /* - * We expect 4K subpage size and unrestricted indirect size. - * The lack of a restriction on indirect size is a Freescale - * extension, indicated by PSn = 0 but SPSn != 0. 
- */ - if (eptcfg != 2) - book3e_htw_mode = PPC_HTW_NONE; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (!def->shift) - continue; - - if (tlb1ps & (1U << (def->shift - 10))) { - def->flags |= MMU_PAGE_SIZE_DIRECT; - - if (book3e_htw_mode && psize == MMU_PAGE_2M) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - } - } - - goto out; - } -#endif - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb0ps = mfspr(SPRN_TLB0PS); - eptcfg = mfspr(SPRN_EPTCFG); - - /* Look for supported direct sizes */ - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb0ps & (1U << (def->shift - 10))) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - /* Indirect page sizes supported ? */ - if ((tlb0cfg & TLBnCFG_IND) == 0 || - (tlb0cfg & TLBnCFG_PT) == 0) - goto out; - - book3e_htw_mode = PPC_HTW_IBM; - - /* Now, we only deal with one IND page size for each - * direct size. Hopefully all implementations today are - * unambiguous, but we might want to be careful in the - * future. - */ - for (i = 0; i < 3; i++) { - unsigned int ps, sps; - - sps = eptcfg & 0x1f; - eptcfg >>= 5; - ps = eptcfg & 0x1f; - eptcfg >>= 5; - if (!ps || !sps) - continue; - for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (ps == (def->shift - 10)) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - if (sps == (def->shift - 10)) - def->ind = ps + 10; - } - } - -out: - /* Cleanup array and print summary */ - pr_info("MMU: Supported page sizes\n"); - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - const char *__page_type_names[] = { - "unsupported", - "direct", - "indirect", - "direct & indirect" - }; - if (def->flags == 0) { - def->shift = 0; - continue; - } - pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), - __page_type_names[def->flags & 0x3]); - } -} - -static void setup_mmu_htw(void) -{ - /* - * If we want to use HW tablewalk, enable it by patching the TLB miss - * handlers to branch to the one dedicated to it. - */ - - switch (book3e_htw_mode) { - case PPC_HTW_IBM: - patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); - break; -#ifdef CONFIG_PPC_FSL_BOOK3E - case PPC_HTW_E6500: - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); - break; -#endif - } - pr_info("MMU: Book3E HW tablewalk %s\n", - book3e_htw_mode != PPC_HTW_NONE ? 
"enabled" : "not supported"); -} - -/* - * Early initialization of the MMU TLB code - */ -static void early_init_this_mmu(void) -{ - unsigned int mas4; - - /* Set MAS4 based on page table setting */ - - mas4 = 0x4 << MAS4_WIMGED_SHIFT; - switch (book3e_htw_mode) { - case PPC_HTW_E6500: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; - mas4 |= MAS4_TLBSELD(1); - mmu_pte_psize = MMU_PAGE_2M; - break; - - case PPC_HTW_IBM: - mas4 |= MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else - mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_1M; -#endif - break; - - case PPC_HTW_NONE: -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else - mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif - mmu_pte_psize = mmu_virtual_psize; - break; - } - mtspr(SPRN_MAS4, mas4); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned int num_cams; - int __maybe_unused cpu = smp_processor_id(); - bool map = true; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - /* - * Only do the mapping once per core, or else the - * transient mapping would cause problems. - */ -#ifdef CONFIG_SMP - if (hweight32(get_tensr()) > 1) - map = false; -#endif - - if (map) - linear_map_top = map_mem_in_cams(linear_map_top, - num_cams, false); - } -#endif - - /* A sync won't hurt us after mucking around with - * the MMU configuration - */ - mb(); -} - -static void __init early_init_mmu_global(void) -{ - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - * - * Freescale booke only supports 4K pages in TLB0, so use that. - */ - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - mmu_vmemmap_psize = MMU_PAGE_4K; - else - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. - */ - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - if (book3e_htw_mode == PPC_HTW_NONE) { - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); - patch_exception(0x1e0, - exc_instruction_tlb_miss_bolted_book3e); - } - } -#endif - - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); -} - -static void __init early_mmu_set_memory_limit(void) -{ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - /* - * Limit memory so we dont have linear faults. - * Unlike memblock_set_current_limit, which limits - * memory available during early boot, this permanently - * reduces the memory available to Linux. We need to - * do this because highmem is not supported on 64-bit. 
- */ - memblock_enforce_memory_limit(linear_map_top); - } -#endif - - memblock_set_current_limit(linear_map_top); -} - -/* boot cpu only */ -void __init early_init_mmu(void) -{ - early_init_mmu_global(); - early_init_this_mmu(); - early_mmu_set_memory_limit(); -} - -void early_init_mmu_secondary(void) -{ - early_init_this_mmu(); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* On non-FSL Embedded 64-bit, we adjust the RMA size to match - * the bolted TLB entry. We know for now that only 1G - * entries are supported though that may eventually - * change. - * - * on FSL Embedded 64-bit, usually all RAM is bolted, but with - * unusual memory sizes it's possible for some RAM to not be mapped - * (such RAM is not used at all by Linux, since we don't support - * highmem on 64-bit). We limit ppc64_rma_size to what would be - * mappable if this memblock is the only one. Additional memblocks - * can only increase, not decrease, the amount that ends up getting - * mapped. We still limit max to 1G even if we'll eventually map - * more. This is due to what the early init code is set up to do. - * - * We crop it to the size of the first MEMBLOCK to - * avoid going over total available memory just in case... - */ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned long linear_sz; - unsigned int num_cams; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - linear_sz = map_mem_in_cams(first_memblock_size, num_cams, - true); - - ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); - } else -#endif - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(first_memblock_base + ppc64_rma_size); -} -#else /* ! CONFIG_PPC64 */ -void __init early_init_mmu(void) -{ -#ifdef CONFIG_PPC_47x - early_init_mmu_47x(); -#endif - -#ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif -#endif -} -#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S deleted file mode 100644 index e066a658acac..000000000000 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ /dev/null @@ -1,491 +0,0 @@ -/* - * This file contains low-level functions for performing various - * types of TLB invalidations on various processors with no hash - * table. - * - * This file implements the following functions for all no-hash - * processors. Some aren't implemented for some variants. Some - * are inline in tlbflush.h - * - * - tlbil_va - * - tlbil_pid - * - tlbil_all - * - tlbivax_bcast - * - * Code mostly moved over from misc_32.S - * - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) - * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_40x) - -/* - * 40x implementation needs only tlbil_va - */ -_GLOBAL(__tlbil_va) - /* We run the search with interrupts disabled because we have to change - * the PID and I don't want to preempt when that happens. - */ - mfmsr r5 - mfspr r6,SPRN_PID - wrteei 0 - mtspr SPRN_PID,r4 - tlbsx. r3, 0, r3 - mtspr SPRN_PID,r6 - wrtee r5 - bne 1f - sync - /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is - * clear. Since 25 is the V bit in the TLB_TAG, loading this value - * will invalidate the TLB entry. */ - tlbwe r3, r3, TLB_TAG - isync -1: blr - -#elif defined(CONFIG_PPC_8xx) - -/* - * Nothing to do for 8xx, everything is inline - */ - -#elif defined(CONFIG_44x) /* Includes 47x */ - -/* - * 440 implementation uses tlbsx/we for tlbil_va and a full sweep - * of the TLB for everything else. - */ -_GLOBAL(__tlbil_va) - mfspr r5,SPRN_MMUCR - mfmsr r10 - - /* - * We write 16 bits of STID since 47x supports that much, we - * will never be passed out of bounds values on 440 (hopefully) - */ - rlwimi r5,r4,0,16,31 - - /* We have to run the search with interrupts disabled, otherwise - * an interrupt which causes a TLB miss can clobber the MMUCR - * between the mtspr and the tlbsx. - * - * Critical and Machine Check interrupts take care of saving - * and restoring MMUCR, so only normal interrupts have to be - * taken care of. - */ - wrteei 0 - mtspr SPRN_MMUCR,r5 - tlbsx. r6,0,r3 - bne 10f - sync -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) - /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit - * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this - * value will invalidate the TLB entry. - */ - tlbwe r6,r6,PPC44x_TLB_PAGEID - isync -10: wrtee r10 - blr -2: -#ifdef CONFIG_PPC_47x - oris r7,r6,0x8000 /* specify way explicitly */ - clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */ - ori r4,r4,PPC47x_TLBE_SIZE - tlbwe r4,r7,0 /* write it */ - isync - wrtee r10 - blr -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ - -_GLOBAL(_tlbil_all) -_GLOBAL(_tlbil_pid) -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) - li r3,0 - sync - - /* Load high watermark */ - lis r4,tlb_44x_hwater@ha - lwz r5,tlb_44x_hwater@l(r4) - -1: tlbwe r3,r3,PPC44x_TLB_PAGEID - addi r3,r3,1 - cmpw 0,r3,r5 - ble 1b - - isync - blr -2: -#ifdef CONFIG_PPC_47x - /* 476 variant. There's not simple way to do this, hopefully we'll - * try to limit the amount of such full invalidates - */ - mfmsr r11 /* Interrupts off */ - wrteei 0 - li r3,-1 /* Current set */ - lis r10,tlb_47x_boltmap@h - ori r10,r10,tlb_47x_boltmap@l - lis r7,0x8000 /* Specify way explicitly */ - - b 9f /* For each set */ - -1: li r9,4 /* Number of ways */ - li r4,0 /* Current way */ - li r6,0 /* Default entry value 0 */ - andi. r0,r8,1 /* Check if way 0 is bolted */ - mtctr r9 /* Load way counter */ - bne- 3f /* Bolted, skip loading it */ - -2: /* For each way */ - or r5,r3,r4 /* Make way|index for tlbre */ - rlwimi r5,r5,16,8,15 /* Copy index into position */ - tlbre r6,r5,0 /* Read entry */ -3: addis r4,r4,0x2000 /* Next way */ - andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? 
*/ - beq 4f /* Nope, skip it */ - rlwimi r7,r5,0,1,2 /* Insert way number */ - rlwinm r6,r6,0,21,19 /* Clear V */ - tlbwe r6,r7,0 /* Write it */ -4: bdnz 2b /* Loop for each way */ - srwi r8,r8,1 /* Next boltmap bit */ -9: cmpwi cr1,r3,255 /* Last set done ? */ - addi r3,r3,1 /* Next set */ - beq cr1,1f /* End of loop */ - andi. r0,r3,0x1f /* Need to load a new boltmap word ? */ - bne 1b /* No, loop */ - lwz r8,0(r10) /* Load boltmap entry */ - addi r10,r10,4 /* Next word */ - b 1b /* Then loop */ -1: isync /* Sync shadows */ - wrtee r11 -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ - blr - -#ifdef CONFIG_PPC_47x - -/* - * _tlbivax_bcast is only on 47x. We don't bother doing a runtime - * check though, it will blow up soon enough if we mistakenly try - * to use it on a 440. - */ -_GLOBAL(_tlbivax_bcast) - mfspr r5,SPRN_MMUCR - mfmsr r10 - rlwimi r5,r4,0,16,31 - wrteei 0 - mtspr SPRN_MMUCR,r5 - isync - PPC_TLBIVAX(0, R3) - isync - eieio - tlbsync -BEGIN_FTR_SECTION - b 1f -END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) - sync - wrtee r10 - blr -/* - * DD2 HW could hang if in instruction fetch happens before msync completes. - * Touch enough instruction cache lines to ensure cache hits - */ -1: mflr r9 - bl 2f -2: mflr r6 - li r7,32 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - add r6,r6,r7 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - add r6,r6,r7 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - sync - nop - nop - nop - nop - nop - nop - nop - nop - mtlr r9 - wrtee r10 - blr -#endif /* CONFIG_PPC_47x */ - -#elif defined(CONFIG_FSL_BOOKE) -/* - * FSL BookE implementations. - * - * Since feature sections are using _SECTION_ELSE we need - * to have the larger code path before the _SECTION_ELSE - */ - -/* - * Flush MMU TLB on the local processor - */ -_GLOBAL(_tlbil_all) -BEGIN_MMU_FTR_SECTION - li r3,(MMUCSR0_TLBFI)@l - mtspr SPRN_MMUCSR0, r3 -1: - mfspr r3,SPRN_MMUCSR0 - andi. r3,r3,MMUCSR0_TLBFI@l - bne 1b -MMU_FTR_SECTION_ELSE - PPC_TLBILX_ALL(0,R0) -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) - msync - isync - blr - -_GLOBAL(_tlbil_pid) -BEGIN_MMU_FTR_SECTION - slwi r3,r3,16 - mfmsr r10 - wrteei 0 - mfspr r4,SPRN_MAS6 /* save MAS6 */ - mtspr SPRN_MAS6,r3 - PPC_TLBILX_PID(0,R0) - mtspr SPRN_MAS6,r4 /* restore MAS6 */ - wrtee r10 -MMU_FTR_SECTION_ELSE - li r3,(MMUCSR0_TLBFI)@l - mtspr SPRN_MMUCSR0, r3 -1: - mfspr r3,SPRN_MMUCSR0 - andi. r3,r3,MMUCSR0_TLBFI@l - bne 1b -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) - msync - isync - blr - -/* - * Flush MMU TLB for a particular address, but only on the local processor - * (no broadcast) - */ -_GLOBAL(__tlbil_va) - mfmsr r10 - wrteei 0 - slwi r4,r4,16 - ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l - mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ -BEGIN_MMU_FTR_SECTION - tlbsx 0,r3 - mfspr r4,SPRN_MAS1 /* check valid */ - andis. 
r3,r4,MAS1_VALID@h - beq 1f - rlwinm r4,r4,0,1,31 - mtspr SPRN_MAS1,r4 - tlbwe -MMU_FTR_SECTION_ELSE - PPC_TLBILX_VA(0,R3) -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) - msync - isync -1: wrtee r10 - blr -#elif defined(CONFIG_PPC_BOOK3E) -/* - * New Book3E (>= 2.06) implementation - * - * Note: We may be able to get away without the interrupt masking stuff - * if we save/restore MAS6 on exceptions that might modify it - */ -_GLOBAL(_tlbil_pid) - slwi r4,r3,MAS6_SPID_SHIFT - mfmsr r10 - wrteei 0 - mtspr SPRN_MAS6,r4 - PPC_TLBILX_PID(0,R0) - wrtee r10 - msync - isync - blr - -_GLOBAL(_tlbil_pid_noind) - slwi r4,r3,MAS6_SPID_SHIFT - mfmsr r10 - ori r4,r4,MAS6_SIND - wrteei 0 - mtspr SPRN_MAS6,r4 - PPC_TLBILX_PID(0,R0) - wrtee r10 - msync - isync - blr - -_GLOBAL(_tlbil_all) - PPC_TLBILX_ALL(0,R0) - msync - isync - blr - -_GLOBAL(_tlbil_va) - mfmsr r10 - wrteei 0 - cmpwi cr0,r6,0 - slwi r4,r4,MAS6_SPID_SHIFT - rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK - beq 1f - rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND -1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ - PPC_TLBILX_VA(0,R3) - msync - isync - wrtee r10 - blr - -_GLOBAL(_tlbivax_bcast) - mfmsr r10 - wrteei 0 - cmpwi cr0,r6,0 - slwi r4,r4,MAS6_SPID_SHIFT - rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK - beq 1f - rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND -1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ - PPC_TLBIVAX(0,R3) - eieio - tlbsync - sync - wrtee r10 - blr - -_GLOBAL(set_context) -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - stw r4, 0x4(r5) -#endif - mtspr SPRN_PID,r3 - isync /* Force context change */ - blr -#else -#error Unsupported processor type ! -#endif - -#if defined(CONFIG_PPC_FSL_BOOK3E) -/* - * extern void loadcam_entry(unsigned int index) - * - * Load TLBCAM[index] entry in to the L2 CAM MMU - * Must preserve r7, r8, r9, and r10 - */ -_GLOBAL(loadcam_entry) - mflr r5 - LOAD_REG_ADDR_PIC(r4, TLBCAM) - mtlr r5 - mulli r5,r3,TLBCAM_SIZE - add r3,r5,r4 - lwz r4,TLBCAM_MAS0(r3) - mtspr SPRN_MAS0,r4 - lwz r4,TLBCAM_MAS1(r3) - mtspr SPRN_MAS1,r4 - PPC_LL r4,TLBCAM_MAS2(r3) - mtspr SPRN_MAS2,r4 - lwz r4,TLBCAM_MAS3(r3) - mtspr SPRN_MAS3,r4 -BEGIN_MMU_FTR_SECTION - lwz r4,TLBCAM_MAS7(r3) - mtspr SPRN_MAS7,r4 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) - isync - tlbwe - isync - blr - -/* - * Load multiple TLB entries at once, using an alternate-space - * trampoline so that we don't have to care about whether the same - * TLB entry maps us before and after. - * - * r3 = first entry to write - * r4 = number of entries to write - * r5 = temporary tlb entry - */ -_GLOBAL(loadcam_multi) - mflr r8 - - /* - * Set up temporary TLB entry that is the same as what we're - * running from, but in AS=1. - */ - bl 1f -1: mflr r6 - tlbsx 0,r8 - mfspr r6,SPRN_MAS1 - ori r6,r6,MAS1_TS - mtspr SPRN_MAS1,r6 - mfspr r6,SPRN_MAS0 - rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK - mr r7,r5 - mtspr SPRN_MAS0,r6 - isync - tlbwe - isync - - /* Switch to AS=1 */ - mfmsr r6 - ori r6,r6,MSR_IS|MSR_DS - mtmsr r6 - isync - - mr r9,r3 - add r10,r3,r4 -2: bl loadcam_entry - addi r9,r9,1 - cmpw r9,r10 - mr r3,r9 - blt 2b - - /* Return to AS=0 and clear the temporary entry */ - mfmsr r6 - rlwinm. 
r6,r6,0,~(MSR_IS|MSR_DS) - mtmsr r6 - isync - - li r6,0 - mtspr SPRN_MAS1,r6 - rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK - oris r6,r6,MAS0_TLBSEL(1)@h - mtspr SPRN_MAS0,r6 - isync - tlbwe - isync - - mtlr r8 - blr -#endif -- cgit v1.2.3-59-g8ed1b From 6f60cc98df2be7f082bd786aa824ceabd24d24cb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:29 +0000 Subject: powerpc/mm: hand a context_t over to slice_mask_for_size() instead of mm_struct slice_mask_for_size() only uses mm->context, so hand it a pointer to the context directly. This will help move the function into the subarch mmu.h in the next patch, by avoiding the need to include the definition of struct mm_struct. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 35b278082391..8eb7e8b09c75 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -151,32 +151,32 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, } #ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(&mm->context); + return mm_ctx_slice_mask_64k(ctx); #endif if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(&mm->context); + return mm_ctx_slice_mask_4k(ctx); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(&mm->context); + return mm_ctx_slice_mask_16m(ctx); if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(&mm->context); + return mm_ctx_slice_mask_16g(ctx); #endif BUG(); } #elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { if (psize == mmu_virtual_psize) - return &mm->context.mask_base_psize; + return &ctx->mask_base_psize; #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_512K) - return &mm->context.mask_512k; + return &ctx->mask_512k; if (psize == MMU_PAGE_8M) - return &mm->context.mask_8m; + return &ctx->mask_8m; #endif BUG(); } @@ -246,7 +246,7 @@ static void slice_convert(struct mm_struct *mm, slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); slice_print_mask(" mask", mask); - psize_mask = slice_mask_for_size(mm, psize); + psize_mask = slice_mask_for_size(&mm->context, psize); /* We need to use a spinlock here to protect against * concurrent 64k -> 4k demotion ...
@@ -263,7 +263,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); old_mask->low_slices &= ~(1u << i); psize_mask->low_slices |= 1u << i; @@ -282,7 +282,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); __clear_bit(i, old_mask->high_slices); __set_bit(i, psize_mask->high_slices); @@ -538,7 +538,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); /* * Here "good" means slices that are already the right page size, @@ -565,7 +565,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * a pointer to good mask for the next code to use. */ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); if (fixed) slice_or_mask(&good_mask, maskp, compat_maskp); else @@ -760,7 +760,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) /* * Slice mask cache starts zeroed, fill the default size cache. */ - mask = slice_mask_for_size(mm, psize); + mask = slice_mask_for_size(&mm->context, psize); mask->low_slices = ~0UL; if (SLICE_NUM_HIGH) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); @@ -819,14 +819,14 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -- cgit v1.2.3-59-g8ed1b From fca5c1e9eb5e263c1b4def0b5ae4ce5b2e1a9877 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:30 +0000 Subject: powerpc/mm: move slice_mask_for_size() into mmu.h Move slice_mask_for_size() into subarch mmu.h Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V [mpe: Retain the BUG_ON()s, rather than converting to VM_BUG_ON()] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 17 +++++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 42 +++++++++++++++++++--------- arch/powerpc/mm/slice.c | 34 ---------------------- 3 files changed, 46 insertions(+), 47 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 230a9dec7677..a6d5b5ed1170 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -203,6 +203,23 @@ static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) } #endif +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + return 
mm_ctx_slice_mask_64k(ctx); +#endif +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_16M) + return mm_ctx_slice_mask_16m(ctx); + if (psize == MMU_PAGE_16G) + return mm_ctx_slice_mask_16g(ctx); +#endif + BUG_ON(psize != MMU_PAGE_4K); + + return mm_ctx_slice_mask_4k(ctx); +} + #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index c503e2f05e61..114f50d995dc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -184,7 +184,23 @@ #define LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#elif defined(CONFIG_PPC_16K_PAGES) +#define mmu_virtual_psize MMU_PAGE_16K +#define PTE_FRAG_NR 4 +#define PTE_FRAG_SIZE_SHIFT 12 +#define PTE_FRAG_SIZE (1UL << 12) +#else +#error "Unsupported PAGE_SIZE" +#endif + +#define mmu_linear_psize MMU_PAGE_8M + #ifndef __ASSEMBLY__ + +#include + struct slice_mask { u64 low_slices; DECLARE_BITMAP(high_slices, 0); @@ -255,6 +271,19 @@ static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) { return &ctx->mask_8m; } #endif + +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_512K) + return &ctx->mask_512k; + if (psize == MMU_PAGE_8M) + return &ctx->mask_8m; +#endif + BUG_ON(psize != mmu_virtual_psize); + + return &ctx->mask_base_psize; +} #endif /* CONFIG_PPC_MM_SLICE */ #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) @@ -306,17 +335,4 @@ extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; #endif /* !__ASSEMBLY__ */ -#if defined(CONFIG_PPC_4K_PAGES) -#define mmu_virtual_psize MMU_PAGE_4K -#elif defined(CONFIG_PPC_16K_PAGES) -#define mmu_virtual_psize MMU_PAGE_16K -#define PTE_FRAG_NR 4 -#define PTE_FRAG_SIZE_SHIFT 12 -#define PTE_FRAG_SIZE (1UL << 12) -#else -#error "Unsupported PAGE_SIZE" -#endif - -#define mmu_linear_psize MMU_PAGE_8M - #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 8eb7e8b09c75..31de91b65a64 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -150,40 +150,6 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, __set_bit(i, ret->high_slices); } -#ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(ctx); -#endif - if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(ctx); -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(ctx); - if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(ctx); -#endif - BUG(); -} -#elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ - if (psize == mmu_virtual_psize) - return &ctx->mask_base_psize; -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_512K) - return &ctx->mask_512k; - if (psize == MMU_PAGE_8M) - return &ctx->mask_8m; -#endif - BUG(); -} -#else -#error "Must define the slice masks for page sizes supported by the platform" -#endif - static bool slice_check_range_fits(struct mm_struct *mm, const struct slice_mask *available, unsigned long start, unsigned long len) -- cgit v1.2.3-59-g8ed1b From b4baad0b2712471740c58a1bc9578ab057af7514 Mon Sep 17 00:00:00 2001
From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:32 +0000 Subject: powerpc/mm: remove unnecessary #ifdef CONFIG_PPC64 For PPC32 that's a noop, gcc should be smart enough to ignore it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 31de91b65a64..840c4118a185 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -118,13 +118,11 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) unsigned long start = slice << SLICE_HIGH_SHIFT; unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); -#ifdef CONFIG_PPC64 /* Hack, so that each addresses is controlled by exactly one * of the high or low area bitmaps, the first high area starts * at 4GB, not 0 */ if (start == 0) - start = SLICE_LOW_TOP; -#endif + start = (unsigned long)SLICE_LOW_TOP; return !slice_area_is_free(mm, start, end - start); } -- cgit v1.2.3-59-g8ed1b From 203a1fa6286671900698485ddffbb435901aa75b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:33 +0000 Subject: powerpc/mm: remove a couple of #ifdef CONFIG_PPC_64K_PAGES in mm/slice.c This patch replaces a couple of #ifdef CONFIG_PPC_64K_PAGES by IS_ENABLED(CONFIG_PPC_64K_PAGES) to improve code maintainability. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 840c4118a185..ace97d953040 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -606,14 +606,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); -#ifdef CONFIG_PPC_64K_PAGES - if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ slice_or_mask(&potential_mask, &potential_mask, compat_maskp); newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); } -#endif if (newaddr == -ENOMEM) return -ENOMEM; @@ -784,9 +783,9 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); maskp = slice_mask_for_size(&mm->context, psize); -#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ - if (psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; @@ -794,7 +793,6 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -#endif return !slice_check_range_fits(mm, maskp, addr, len); } -- cgit v1.2.3-59-g8ed1b From 43ed7909d70a61c621cadb5d808dc392ad537e5a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:35 +0000 Subject: powerpc/mm: define get_slice_psize() all the time get_slice_psize() can be defined regardless of CONFIG_PPC_MM_SLICES to avoid ifdefs Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/slice.h | 5 +++++ arch/powerpc/mm/hugetlbpage.c | 4 +--- 2 files 
changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index be8af667098f..c6f466f4c241 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -36,6 +36,11 @@ void slice_setup_new_exec(void); static inline void slice_init_new_context_exec(struct mm_struct *mm) {} +static inline unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9e732bb2c84a..5f67e7a4d1cc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -578,14 +578,12 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { -#ifdef CONFIG_PPC_MM_SLICES /* With radix we don't use slice, so derive it from vma*/ - if (!radix_enabled()) { + if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); } -#endif return vma_kernel_pagesize(vma); } -- cgit v1.2.3-59-g8ed1b From 5953fb4f4671d7d755a81017a76766c00922d059 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:36 +0000 Subject: powerpc/mm: define subarch SLB_ADDR_LIMIT_DEFAULT This patch defines a subarch specific SLB_ADDR_LIMIT_DEFAULT to remove the #ifdefs around the setup of mm->context.slb_addr_limit It also generalises the use of mm_ctx_set_slb_addr_limit() helper. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/slice.h | 2 ++ arch/powerpc/include/asm/nohash/32/slice.h | 2 ++ arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 +--- arch/powerpc/mm/slice.c | 6 +----- 5 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 062e11136e9c..f0d3194ba41b 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -11,4 +11,6 @@ #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW_USER64 + #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/slice.h b/arch/powerpc/include/asm/nohash/32/slice.h index 777d62e40ac0..39eb0154ae2d 100644 --- a/arch/powerpc/include/asm/nohash/32/slice.h +++ b/arch/powerpc/include/asm/nohash/32/slice.h @@ -13,6 +13,8 @@ #define SLICE_NUM_HIGH 0ul #define GET_HIGH_SLICE_INDEX(addr) (addr & 0) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* _ASM_POWERPC_NOHASH_32_SLICE_H */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b21a81d42f15..23ed8db645ad 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1058,7 +1058,7 @@ void __init hash__early_init_mmu(void) htab_initialize(); init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB 
management */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 704e613a0b14..c2494b838008 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -802,9 +802,7 @@ void __init early_init_mmu(void) #endif #ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); #endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ace97d953040..97fbf7b54422 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -704,11 +704,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) * case of fork it is just inherited from the mm being * duplicated. */ -#ifdef CONFIG_PPC64 - mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); -#else - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); mm_ctx_set_user_psize(&mm->context, psize); /* -- cgit v1.2.3-59-g8ed1b From a521c44c3ded9fe184c5de3eed3a442af2d26f00 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:38 +0000 Subject: powerpc/book3e: drop mmu_get_tsize() This function is not used anymore, drop it. Fixes: b42279f0165c ("powerpc/mm/nohash: MM_SLICE is only used by book3s 64") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index f84ec46cdb26..c911fe9bfa0e 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -49,11 +49,6 @@ static inline int tlb1_next(void) #endif /* !PPC64 */ #endif /* FSL */ -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} - #if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) #include -- cgit v1.2.3-59-g8ed1b From 5874cabe29079b72b192a28d266adf1a460fc5d6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:39 +0000 Subject: powerpc/64: only book3s/64 supports CONFIG_PPC_64K_PAGES CONFIG_PPC_64K_PAGES cannot be selected by nohash/64. 
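An illustrative consequence (a sketch for this log, not code from the patch): with 64K base pages impossible on nohash/64, that platform always runs with 4K base pages and a full four-level page table, which is what lets the 64K-only branches be deleted wholesale below. Using the generic page-table accessors of this kernel generation, a walk on nohash/64 is now unconditionally:

/*
 * Illustrative only: example_walk() is made up for this note. The point
 * is that the pud level is always present on nohash/64 now, so nothing
 * here needs a CONFIG_PPC_64K_PAGES guard.
 */
static pte_t *example_walk(pgd_t *pgd_base, unsigned long ea)
{
	pgd_t *pgdp = pgd_base + pgd_index(ea);
	pud_t *pudp = pud_offset(pgdp, ea);
	pmd_t *pmdp = pmd_offset(pudp, ea);

	return pte_offset_kernel(pmdp, ea);
}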
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/nohash/64/pgalloc.h | 3 --- arch/powerpc/include/asm/nohash/64/pgtable.h | 4 ---- arch/powerpc/include/asm/nohash/pte-book3e.h | 5 ----- arch/powerpc/include/asm/pgtable-be-types.h | 9 ++------ arch/powerpc/include/asm/pgtable-types.h | 9 ++------ arch/powerpc/include/asm/task_size_64.h | 2 +- arch/powerpc/mm/nohash/tlb.c | 13 ------------ arch/powerpc/mm/nohash/tlb_low_64e.S | 31 ---------------------------- 9 files changed, 5 insertions(+), 72 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c3061..5d8e692d6470 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -375,7 +375,6 @@ config ZONE_DMA config PGTABLE_LEVELS int default 2 if !PPC64 - default 3 if PPC_64K_PAGES && !PPC_BOOK3S_64 default 4 source "arch/powerpc/sysdev/Kconfig" diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 66d086f85bd5..ded453f9b5a8 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -171,12 +171,9 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* CONFIG_PPC_64K_PAGES */ - #define check_pgt_cache() do { } while (0) #endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index c8e6a9a5bc33..b9f66cf15c31 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -10,10 +10,6 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#error "Page size not supported" -#endif - #define FIRST_USER_ADDRESS 0UL /* diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index dd40d200f274..813918f40765 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -60,13 +60,8 @@ #define _PAGE_SPECIAL _PAGE_SW0 /* Base page size */ -#ifdef CONFIG_PPC_64K_PAGES -#define _PAGE_PSIZE _PAGE_PSIZE_64K -#define PTE_RPN_SHIFT (28) -#else #define _PAGE_PSIZE _PAGE_PSIZE_4K #define PTE_RPN_SHIFT (24) -#endif #define PTE_WIMGE_SHIFT (19) #define PTE_BAP_SHIFT (2) diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index a89c67b62680..b169bbf95fcb 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -33,11 +33,7 @@ static inline __be64 pmd_raw(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. 
*/ typedef struct { __be64 pud; } pud_t; #define __pud(x) ((pud_t) { cpu_to_be64(x) }) #define __pud_raw(x) ((pud_t) { (x) }) @@ -51,7 +47,6 @@ static inline __be64 pud_raw(pud_t x) return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -77,7 +72,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 3b0edf041b2e..d11b4c61d686 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -23,18 +23,13 @@ static inline unsigned long pmd_val(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. */ typedef struct { unsigned long pud; } pud_t; #define __pud(x) ((pud_t) { (x) }) static inline unsigned long pud_val(pud_t x) { return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -54,7 +49,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h index eab4779f6b84..c993482237ed 100644 --- a/arch/powerpc/include/asm/task_size_64.h +++ b/arch/powerpc/include/asm/task_size_64.h @@ -20,7 +20,7 @@ /* * For now 512TB is only supported with book3s and 64K linux page size. 
*/ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +#ifdef CONFIG_PPC_64K_PAGES /* * Max value currently used: */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index c2494b838008..24f88efb05bf 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -433,11 +433,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) unsigned long rid = (address & rmask) | 0x1000000000000000ul; unsigned long vpte = address & ~rmask; -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif vpte |= rid; __flush_tlb_page(tlb->mm, vpte, tsize, 0); } @@ -625,21 +621,12 @@ static void early_init_this_mmu(void) case PPC_HTW_IBM: mas4 |= MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; mmu_pte_psize = MMU_PAGE_1M; -#endif break; case PPC_HTW_NONE: -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif mmu_pte_psize = mmu_virtual_psize; break; } diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9ed90064f542..58959ce15415 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -24,11 +24,7 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif #define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) #define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) #define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) @@ -167,13 +163,11 @@ MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) -#ifndef CONFIG_PPC_64K_PAGES rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -682,18 +676,7 @@ normal_tlb_miss: * order to handle the weird page table format used by linux */ ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif sldi r15,r10,60 clrrdi r14,r14,3 or r10,r15,r14 @@ -732,11 +715,7 @@ finish_normal_tlb_miss: /* Check page size, if not standard, update MAS1 */ rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif beq- 1f mfspr r11,SPRN_MAS1 rlwimi r11,r14,31,21,24 @@ -857,14 +836,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 @@ -1106,14 +1083,12 @@ htw_tlb_miss: cmpdi cr0,r15,0 bge 
htw_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 @@ -1132,9 +1107,7 @@ htw_tlb_miss: * 4K page we need to extract a bit from the virtual address and * insert it into the "PA52" bit of the RPN. */ -#ifndef CONFIG_PPC_64K_PAGES rlwimi r15,r16,32-9,20,20 -#endif /* Now we build the MAS: * * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG @@ -1144,11 +1117,7 @@ htw_tlb_miss: * MAS 2 : Use defaults * MAS 3+7 : Needs to be done */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif BEGIN_MMU_FTR_SECTION srdi r16,r10,32 -- cgit v1.2.3-59-g8ed1b From 3dea7332ccac1f701a6f8cd4fe44faa9be2e6014 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:40 +0000 Subject: powerpc/book3e: hugetlbpage is only for CONFIG_PPC_FSL_BOOK3E As per Kconfig.cputype, only CONFIG_PPC_FSL_BOOK3E gets to select SYS_SUPPORTS_HUGETLBFS so simplify accordingly. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/Makefile | 2 +- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 47 ++++++++++++----------------- 2 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b2228ff81b8a..33b6f6f29d3f 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index c911fe9bfa0e..61915f4d3c7f 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -11,8 +11,9 @@ #include -#ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 +#include + static inline int tlb1_next(void) { struct paca_struct *paca = get_paca(); @@ -29,28 +30,6 @@ static inline int tlb1_next(void) tcd->esel_next = next; return this; } -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} -#endif /* !PPC64 */ -#endif /* FSL */ - -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) -#include static inline void book3e_tlb_lock(void) { @@ -93,6 +72,23 @@ static inline void book3e_tlb_unlock(void) paca->tcd_ptr->lock = 0; } #else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + static inline void book3e_tlb_lock(void) { } @@ -134,10 +130,7 
@@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, unsigned long psize, tsize, shift; unsigned long flags; struct mm_struct *mm; - -#ifdef CONFIG_PPC_FSL_BOOK3E int index; -#endif if (unlikely(is_kernel_addr(ea))) return; @@ -161,11 +154,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, return; } -#ifdef CONFIG_PPC_FSL_BOOK3E /* We have to use the CAM(TLB1) on FSL parts for hugepages */ index = tlb1_next(); mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); mas2 = ea & ~((1UL << shift) - 1); -- cgit v1.2.3-59-g8ed1b From 0caed4de502c7699b7faeaea0a93b39e4f19e11a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:41 +0000 Subject: powerpc/mm: move __find_linux_pte() out of hugetlbpage.c __find_linux_pte() is the only function in hugetlbpage.c which is compiled in regardless on CONFIG_HUGETLBPAGE This patch moves it in pgtable.c. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 103 ----------------------------------------- arch/powerpc/mm/pgtable.c | 104 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 103 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5f67e7a4d1cc..17915fc389ff 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -756,109 +756,6 @@ void flush_dcache_icache_hugepage(struct page *page) #endif /* CONFIG_HUGETLB_PAGE */ -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table - * - * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. - * This function need to be called with interrupts disabled. We use this variant - * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED - */ -pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *hpage_shift) -{ - pgd_t pgd, *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (hpage_shift) - *hpage_shift = 0; - - if (is_thp) - *is_thp = false; - - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); - /* - * Always operate on the local stack value. This make sure the - * value don't get updated by a parallel THP split/collapse, - * page fault or a page unmap. The return pte_t * is still not - * stable. So should be checked there for above conditions. 
- */ - if (pgd_none(pgd)) - return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; - goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) - hpdp = (hugepd_t *)&pgd; - else { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - - if (pud_none(pud)) - return NULL; - else if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) - hpdp = (hugepd_t *)&pud; - else { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *) pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent. - */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) - hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (hpage_shift) - *hpage_shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(__find_linux_pte); - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d3d61d29b4f1..9f4ccd15849f 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,7 @@ #include #include #include +#include static inline int is_exec_fault(void) { @@ -299,3 +300,106 @@ unsigned long vmalloc_to_phys(void *va) return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); } EXPORT_SYMBOL_GPL(vmalloc_to_phys); + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page _PAGE_PTE set + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED + */ +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) +{ + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (hpage_shift) + *hpage_shift = 0; + + if (is_thp) + *is_thp = false; + + pgdp = pgdir + pgd_index(ea); + pgd = READ_ONCE(*pgdp); + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. 
+ */ + if (pgd_none(pgd)) + return NULL; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; + goto out; + } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + hpdp = (hugepd_t *)&pgd; + else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; + goto out; + } else if (is_hugepd(__hugepd(pud_val(pud)))) + hpdp = (hugepd_t *)&pud; + else { + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *) pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; + goto out; + } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + hpdp = (hugepd_t *)&pmd; + else + return pte_offset_kernel(&pmd, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(*hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (hpage_shift) + *hpage_shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte); -- cgit v1.2.3-59-g8ed1b From b7dcf96ce03e2cab7eb6cda2ca8c66e1529e9bc3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:42 +0000 Subject: powerpc/mm: make hugetlbpage.c depend on CONFIG_HUGETLB_PAGE The only function in hugetlbpage.c which doesn't depend on CONFIG_HUGETLB_PAGE is gup_hugepte(), and this function is only called from gup_huge_pd() which depends on CONFIG_HUGETLB_PAGE so all the content of hugetlbpage.c depends on CONFIG_HUGETLB_PAGE. This patch modifies Makefile to only compile hugetlbpage.c when CONFIG_HUGETLB_PAGE is set. 
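A note on the build-system side (a sketch using a made-up CONFIG_FOO symbol, not code from this patch): dropping a file from the build this way is only safe when every symbol it provided is either unused without the feature or stubbed in a header, the way get_slice_psize() was given a static inline fallback earlier in this series. The usual shape of such a stub is:

/*
 * Hypothetical example: foo.c is compiled only when CONFIG_FOO=y, so
 * the header supplies a trivial fallback for the CONFIG_FOO=n case.
 */
#ifdef CONFIG_FOO
unsigned int foo_psize(struct mm_struct *mm, unsigned long addr);
#else
static inline unsigned int foo_psize(struct mm_struct *mm, unsigned long addr)
{
	return 0;	/* callers guarded by IS_ENABLED(CONFIG_FOO) fold this away */
}
#endif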
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/hugetlbpage.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 08557bae6fa1..3daea8da0c7f 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-y += hugetlbpage.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 17915fc389ff..9f69594f5d09 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,9 +26,6 @@ #include #include - -#ifdef CONFIG_HUGETLB_PAGE - #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_512K 19 #define PAGE_SHIFT_8M 23 @@ -754,8 +751,6 @@ void flush_dcache_icache_hugepage(struct page *page) } } -#endif /* CONFIG_HUGETLB_PAGE */ - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { -- cgit v1.2.3-59-g8ed1b From 0001e5aa5c028c11570f2e641f0198287f4808ba Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:43 +0000 Subject: powerpc/mm: make gup_hugepte() static gup_huge_pd() is the only user of gup_hugepte() and it is located in the same file. This patch moves gup_huge_pd() after gup_hugepte() and makes gup_hugepte() static. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pgtable.h | 3 --- arch/powerpc/mm/hugetlbpage.c | 38 +++++++++++++++++++------------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 505550fb2935..c51846da41a7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -89,9 +89,6 @@ extern void paging_init(void); */ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); -extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, - struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9f69594f5d09..95cc9f3d97e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -539,23 +539,6 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, return (__boundary - 1 < end - 1) ? 
__boundary : end;
}

-int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- pte_t *ptep;
- unsigned long sz = 1UL << hugepd_shift(hugepd);
- unsigned long next;
-
- ptep = hugepte_offset(hugepd, addr, pdshift);
- do {
- next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
- return 0;
- } while (ptep++, addr = next, addr != end);
-
- return 1;
-}
-
 #ifdef CONFIG_PPC_MM_SLICES
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 unsigned long len, unsigned long pgoff,
@@ -751,8 +734,8 @@ void flush_dcache_icache_hugepage(struct page *page)
 }
 }

-int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
 {
 unsigned long pte_end;
 struct page *head, *page;
@@ -798,3 +781,20 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 return 1;
 }
+
+int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
-- cgit v1.2.3-59-g8ed1b

From 5fb84fec46015758271fcd2a746633fd4d48e619 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:45 +0000
Subject: powerpc/mm: add a helper to populate hugepd

This patch adds a subarch helper to populate hugepd.
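Illustrative aside (not from the patch): a hugepd entry packs the address of the huge-PTE table together with the page-size information in its low bits, which is why each subarch needs its own populate helper. A self-contained userspace model of the idea, with a made-up bit layout:

#include <stdint.h>
#include <stdio.h>

#define HPD_SHIFT_MASK 0x3fULL /* hypothetical: low 6 bits hold the page shift */

static uint64_t hugepd_encode(uint64_t table_pa, unsigned int pshift)
{
        /* table_pa is assumed aligned, so its low bits are free for metadata */
        return (table_pa & ~HPD_SHIFT_MASK) | pshift;
}

int main(void)
{
        uint64_t hpd = hugepd_encode(0x10000000ULL, 23); /* e.g. an 8M page */

        printf("table at %#llx, shift %llu\n",
               (unsigned long long)(hpd & ~HPD_SHIFT_MASK),
               (unsigned long long)(hpd & HPD_SHIFT_MASK));
        return 0;
}

The real encodings differ per subarch, as the hunks below show: book3s/64 stores an MMU page-size index, 8xx stores _PMD_PAGE_* flags, and book3e stores the raw shift.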
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 +++++ arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 8 ++++++++ arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 6 ++++++ arch/powerpc/mm/hugetlbpage.c | 20 +------------------- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index cbc8153d6e0e..def77a45e905 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -100,6 +100,11 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, return hugepd_page(hpd) + idx; } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 997f5b3d6b99..75676885bec2 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H #define _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H +#define PAGE_SHIFT_8M 23 + static inline pte_t *hugepd_page(hugepd_t hpd) { BUG_ON(!hugepd_ok(hpd)); @@ -28,4 +30,10 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, flush_tlb_page(vma, vmaddr); } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | + (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M : _PMD_PAGE_512K)); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index e94f1cd048ee..51439bcfe313 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -28,4 +28,10 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + /* We use the old format for PPC_FSL_BOOK3E */ + *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 95cc9f3d97e2..036f408cfcb0 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,12 +26,6 @@ #include #include -#define PAGE_SHIFT_64K 16 -#define PAGE_SHIFT_512K 19 -#define PAGE_SHIFT_8M 23 -#define PAGE_SHIFT_16M 24 -#define PAGE_SHIFT_16G 34 - bool hugetlb_disabled = false; unsigned int HPAGE_SHIFT; @@ -95,19 +89,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = 0; i < num_hugepd; i++, hpdp++) { if (unlikely(!hugepd_none(*hpdp))) break; - else { -#ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | - (shift_to_mmu_psize(pshift) << 2)); -#elif defined(CONFIG_PPC_8xx) - *hpdp = __hugepd(__pa(new) | _PMD_USER | - (pshift == PAGE_SHIFT_8M ? 
_PMD_PAGE_8M : - _PMD_PAGE_512K) | _PMD_PRESENT); -#else - /* We use the old format for PPC_FSL_BOOK3E */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -#endif - } + hugepd_populate(hpdp, new, pshift); } /* If we bailed from the for loop early, an error occurred, clean up */ if (i < num_hugepd) { -- cgit v1.2.3-59-g8ed1b From 723f268f19daddba56a987b934f3e34a04b6499d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:46 +0000 Subject: powerpc/mm: cleanup ifdef mess in add_huge_page_size() Introduce a subarch specific helper check_and_get_huge_psize() to check the huge page sizes and cleanup the ifdef mess in add_huge_page_size() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 27 +++++++++++++++++ arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 5 ++++ arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 8 +++++ arch/powerpc/mm/hugetlbpage.c | 37 ++---------------------- 4 files changed, 43 insertions(+), 34 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index def77a45e905..56140d19c85f 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -107,4 +107,31 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline int check_and_get_huge_psize(int shift) +{ + int mmu_psize; + + if (shift > SLICE_HIGH_SHIFT) + return -EINVAL; + + mmu_psize = shift_to_mmu_psize(shift); + + /* + * We need to make sure that for different page sizes reported by + * firmware we only add hugetlb support for page sizes that can be + * supported by linux page table layout. + * For now we have + * Radix: 2M and 1G + * Hash: 16M and 16G + */ + if (radix_enabled()) { + if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) + return -EINVAL; + } else { + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) + return -EINVAL; + } + return mmu_psize; +} + #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 75676885bec2..a46616937d20 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -36,4 +36,9 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi (pshift == PAGE_SHIFT_8M ? 
_PMD_PAGE_8M : _PMD_PAGE_512K)); } +static inline int check_and_get_huge_psize(int shift) +{ + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index 51439bcfe313..ecd8694cb229 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -34,4 +34,12 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); } +static inline int check_and_get_huge_psize(int shift) +{ + if (shift & 1) /* Not a power of 4 */ + return -EINVAL; + + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 036f408cfcb0..7b7027aae73f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -549,13 +549,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return vma_kernel_pagesize(vma); } -static inline bool is_power_of_4(unsigned long x) -{ - if (is_power_of_2(x)) - return (__ilog2(x) % 2) ? false : true; - return false; -} - static int __init add_huge_page_size(unsigned long long size) { int shift = __ffs(size); @@ -563,37 +556,13 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ - if (size <= PAGE_SIZE) - return -EINVAL; -#if defined(CONFIG_PPC_FSL_BOOK3E) - if (!is_power_of_4(size)) + if (size <= PAGE_SIZE || !is_power_of_2(size)) return -EINVAL; -#elif !defined(CONFIG_PPC_8xx) - if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT)) - return -EINVAL; -#endif - if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + mmu_psize = check_and_get_huge_psize(size); + if (mmu_psize < 0) return -EINVAL; -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * We need to make sure that for different page sizes reported by - * firmware we only add hugetlb support for page sizes that can be - * supported by linux page table layout. 
- * For now we have
- * Radix: 2M and 1G
- * Hash: 16M and 16G
- */
- if (radix_enabled()) {
- if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
- return -EINVAL;
- } else {
- if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
- return -EINVAL;
- }
-#endif
-
 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

 /* Return if huge page size has already been setup */
-- cgit v1.2.3-59-g8ed1b

From 45d0ba527b575d47b2be75dd517b57cceda04bfe Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:47 +0000
Subject: powerpc/mm: move hugetlb_disabled into asm/hugetlb.h

No need to have this in asm/page.h, move it into asm/hugetlb.h.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/hugetlb.h | 2 ++
 arch/powerpc/include/asm/page.h | 1 -
 arch/powerpc/kernel/fadump.c | 1 +
 arch/powerpc/mm/book3s64/hash_utils.c | 1 +
 4 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index fd5c0873a57d..84598c6b0959 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -13,6 +13,8 @@
 #include
 #endif /* CONFIG_PPC_BOOK3S_64 */

+extern bool hugetlb_disabled;
+
 void flush_dcache_icache_hugepage(struct page *page);

 int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 748f5db2e2b7..6b508420d92b 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -29,7 +29,6 @@
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_HUGETLB_PAGE
-extern bool hugetlb_disabled;
 extern unsigned int HPAGE_SHIFT;
 #else
 #define HPAGE_SHIFT PAGE_SHIFT
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 45a8d0be1c96..25f063f56ec5 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 23ed8db645ad..f0ce860a69ac 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
-- cgit v1.2.3-59-g8ed1b

From c5710cd20735037ba9be0e95530f0d3795ce07e6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:48 +0000
Subject: powerpc/mm: cleanup HPAGE_SHIFT setup

Only book3s/64 may select the default among several HPAGE_SHIFT values at
runtime. 8xx always defines 512K pages as the default, and FSL_BOOK3E
always defines 4M pages as the default. This patch limits
HUGETLB_PAGE_SIZE_VARIABLE to book3s/64 and moves the definitions into
the subarch files.
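Illustrative aside: the patch turns HPAGE_SHIFT from a global variable into either a compile-time constant or a runtime value, depending on the subarch. A paraphrase of the resulting asm/page.h shape, with the 8xx value used as the fixed example:

#ifndef CONFIG_HUGETLB_PAGE
#define HPAGE_SHIFT PAGE_SHIFT /* no huge pages */
#elif defined(CONFIG_PPC_BOOK3S_64)
extern unsigned int hpage_shift; /* chosen at boot */
#define HPAGE_SHIFT hpage_shift
#else
#define HPAGE_SHIFT 19 /* fixed, e.g. 512k pages on 8xx */
#endif
#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)

Because HPAGE_SIZE is derived through the macro, the same expression works whether HPAGE_SHIFT is a constant or a variable; only the variable form needs HUGETLB_PAGE_SIZE_VARIABLE.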
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hugetlb.h | 2 ++ arch/powerpc/include/asm/page.h | 11 ++++++++--- arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 16 ++++++++++++++++ arch/powerpc/mm/hugetlbpage.c | 23 +++-------------------- 5 files changed, 30 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5d8e692d6470..7815eb0cc2a5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -390,7 +390,7 @@ source "kernel/Kconfig.hz" config HUGETLB_PAGE_SIZE_VARIABLE bool - depends on HUGETLB_PAGE + depends on HUGETLB_PAGE && PPC_BOOK3S_64 default y config MATH_EMULATION diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 84598c6b0959..20a101046cff 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -15,6 +15,8 @@ extern bool hugetlb_disabled; +void hugetlbpage_init_default(void); + void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 6b508420d92b..dbc8c0679480 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -28,10 +28,15 @@ #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -extern unsigned int HPAGE_SHIFT; -#else +#ifndef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PAGE_SHIFT +#elif defined(CONFIG_PPC_BOOK3S_64) +extern unsigned int hpage_shift; +#define HPAGE_SHIFT hpage_shift +#elif defined(CONFIG_PPC_8xx) +#define HPAGE_SHIFT 19 /* 512k pages */ +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#define HPAGE_SHIFT 22 /* 4M pages */ #endif #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c index 2d4e02aa15a3..eefa89c6117b 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -15,6 +15,9 @@ #include #include +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa, unsigned long rlags, unsigned long vflags, int psize, int ssize); @@ -150,3 +153,16 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr old_pte, pte); set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } + +void hugetlbpage_init_default(void) +{ + /* Set default large page size. 
Currently, we pick 16M or 1M
+ * depending on what is available
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_2M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift;
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7b7027aae73f..847fb495a628 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,9 +28,6 @@
 bool hugetlb_disabled = false;

-unsigned int HPAGE_SHIFT;
-EXPORT_SYMBOL(HPAGE_SHIFT);
-
 #define hugepd_none(hpd) (hpd_val(hpd) == 0)

 #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))
@@ -645,23 +642,9 @@ static int __init hugetlbpage_init(void)
 #endif
 }

-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
- /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
- if (mmu_psize_defs[MMU_PAGE_4M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
- else if (mmu_psize_defs[MMU_PAGE_512K].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
-#else
- /* Set default large page size. Currently, we pick 16M or 1M
- * depending on what is available
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
- else if (mmu_psize_defs[MMU_PAGE_2M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
-#endif
+ if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE))
+ hugetlbpage_init_default();
+
 return 0;
 }
-- cgit v1.2.3-59-g8ed1b

From 4df4b27585227c8ba66fdf0dd7531d1e23a37194 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:49 +0000
Subject: powerpc/mm: cleanup remaining ifdef mess in hugetlbpage.c

Only three subarches support huge pages, so when the code is built for
either of two of them, it is not the third one.
And mmu_has_feature() is known by all subarches, so IS_ENABLED() can be
used instead of #ifdef.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/hugetlbpage.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 847fb495a628..98db5ec6a1dd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -226,7 +226,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h)
 return __alloc_bootmem_huge_page(h);
 }

-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
+#ifndef CONFIG_PPC_BOOK3S_64
 #define HUGEPD_FREELIST_SIZE \
 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -595,10 +595,10 @@ static int __init hugetlbpage_init(void)
 return 0;
 }

-#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
- if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
+ !mmu_has_feature(MMU_FTR_16M_PAGE))
 return -ENODEV;
-#endif
+
 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 unsigned shift;
 unsigned pdshift;
@@ -636,10 +636,8 @@ static int __init hugetlbpage_init(void)
 pgtable_cache_add(PTE_INDEX_SIZE);
 else if (pdshift > shift)
 pgtable_cache_add(pdshift - shift);
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
- else
+ else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
 pgtable_cache_add(PTE_T_ORDER);
-#endif
 }

 if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE))
-- cgit v1.2.3-59-g8ed1b

From fab9a1165bcda99682e3319d1c83980fd4e72365 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:51 +0000
Subject: powerpc/mm: flatten function __find_linux_pte() step 1

__find_linux_pte() is full of if/else which is hard to follow although
the handling is pretty simple.

This patch flattens the function by getting rid of as much if/else as
possible. In order to ease the review, this is done in three steps.
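Illustrative aside: the flattening the three steps apply is the standard replacement of else chains with early exits. A self-contained sketch of the transformation:

/* before: each case adds another nesting level */
static int classify_nested(int a, int b)
{
        if (a)
                return 1;
        else {
                if (b)
                        return 2;
                else
                        return 3;
        }
}

/* after: identical behaviour, flat control flow */
static int classify_flat(int a, int b)
{
        if (a)
                return 1;
        if (b)
                return 2;
        return 3;
}

__find_linux_pte() additionally routes its hugepd cases to a shared out_huge label, as the hunks below show.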
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9f4ccd15849f..d332abeedf0a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -339,12 +339,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 */
 if (pgd_none(pgd))
 return NULL;
- else if (pgd_huge(pgd)) {
- ret_pte = (pte_t *) pgdp;
+
+ if (pgd_huge(pgd)) {
+ ret_pte = (pte_t *)pgdp;
 goto out;
- } else if (is_hugepd(__hugepd(pgd_val(pgd))))
+ }
+ if (is_hugepd(__hugepd(pgd_val(pgd)))) {
 hpdp = (hugepd_t *)&pgd;
- else {
+ goto out_huge;
+ }
+ {
 /*
 * Even if we end up with an unmap, the pgtable will not
 * be freed, because we do an rcu free and here we are
@@ -356,12 +360,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,

 if (pud_none(pud))
 return NULL;
- else if (pud_huge(pud)) {
+
+ if (pud_huge(pud)) {
 ret_pte = (pte_t *) pudp;
 goto out;
- } else if (is_hugepd(__hugepd(pud_val(pud))))
+ }
+ if (is_hugepd(__hugepd(pud_val(pud)))) {
 hpdp = (hugepd_t *)&pud;
- else {
+ goto out_huge;
+ }
+ {
 pdshift = PMD_SHIFT;
 pmdp = pmd_offset(&pud, ea);
 pmd = READ_ONCE(*pmdp);
@@ -386,12 +394,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 if (pmd_huge(pmd) || pmd_large(pmd)) {
 ret_pte = (pte_t *) pmdp;
 goto out;
- } else if (is_hugepd(__hugepd(pmd_val(pmd))))
+ }
+ if (is_hugepd(__hugepd(pmd_val(pmd)))) {
 hpdp = (hugepd_t *)&pmd;
- else
- return pte_offset_kernel(&pmd, ea);
+ goto out_huge;
+ }
+
+ return pte_offset_kernel(&pmd, ea);
 }
 }
+out_huge:
 if (!hpdp)
 return NULL;
-- cgit v1.2.3-59-g8ed1b

From e2fb2511888b3f7768835de0768c24d1e0d74590 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:52 +0000
Subject: powerpc/mm: flatten function __find_linux_pte() step 2

__find_linux_pte() is full of if/else which is hard to follow although
the handling is pretty simple.

The previous patch left { } blocks. This patch removes the first one by
shifting its content to the left.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable.c | 62 +++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index d332abeedf0a..c1c6d0b79baa 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -369,39 +369,37 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 hpdp = (hugepd_t *)&pud;
 goto out_huge;
 }
- {
- pdshift = PMD_SHIFT;
- pmdp = pmd_offset(&pud, ea);
- pmd = READ_ONCE(*pmdp);
- /*
- * A hugepage collapse is captured by pmd_none, because
- * it mark the pmd none and do a hpte invalidate.
- */
- if (pmd_none(pmd))
- return NULL;
-
- if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
- if (is_thp)
- *is_thp = true;
- ret_pte = (pte_t *) pmdp;
- goto out;
- }
- /*
- * pmd_large check below will handle the swap pmd pte
- * we need to do both the check because they are config
- * dependent.
- */
- if (pmd_huge(pmd) || pmd_large(pmd)) {
- ret_pte = (pte_t *) pmdp;
- goto out;
- }
- if (is_hugepd(__hugepd(pmd_val(pmd)))) {
- hpdp = (hugepd_t *)&pmd;
- goto out_huge;
- }
-
- return pte_offset_kernel(&pmd, ea);
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = READ_ONCE(*pmdp);
+ /*
+ * A hugepage collapse is captured by pmd_none, because
+ * it mark the pmd none and do a hpte invalidate.
+ */
+ if (pmd_none(pmd))
+ return NULL;
+
+ if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+ if (is_thp)
+ *is_thp = true;
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+ /*
+ * pmd_large check below will handle the swap pmd pte
+ * we need to do both the check because they are config
+ * dependent.
+ */
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ ret_pte = (pte_t *)pmdp;
+ goto out;
 }
+ if (is_hugepd(__hugepd(pmd_val(pmd)))) {
+ hpdp = (hugepd_t *)&pmd;
+ goto out_huge;
+ }
+
+ return pte_offset_kernel(&pmd, ea);
 }
 out_huge:
 if (!hpdp)
-- cgit v1.2.3-59-g8ed1b

From 26e66b08c3376b6fb4ad4508d48a4e74d61f0b9b Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 05:59:53 +0000
Subject: powerpc/mm: flatten function __find_linux_pte() step 3

__find_linux_pte() is full of if/else which is hard to follow although
the handling is pretty simple.

The previous patches left a { } block. This patch removes it.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable.c | 98 +++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 49 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index c1c6d0b79baa..db4a6253df92 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -348,59 +348,59 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 hpdp = (hugepd_t *)&pgd;
 goto out_huge;
 }
- {
- /*
- * Even if we end up with an unmap, the pgtable will not
- * be freed, because we do an rcu free and here we are
- * irq disabled
- */
- pdshift = PUD_SHIFT;
- pudp = pud_offset(&pgd, ea);
- pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return NULL;
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an rcu free and here we are
+ * irq disabled
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&pgd, ea);
+ pud = READ_ONCE(*pudp);

- if (pud_huge(pud)) {
- ret_pte = (pte_t *) pudp;
- goto out;
- }
- if (is_hugepd(__hugepd(pud_val(pud)))) {
- hpdp = (hugepd_t *)&pud;
- goto out_huge;
- }
- pdshift = PMD_SHIFT;
- pmdp = pmd_offset(&pud, ea);
- pmd = READ_ONCE(*pmdp);
- /*
- * A hugepage collapse is captured by pmd_none, because
- * it mark the pmd none and do a hpte invalidate.
- */
- if (pmd_none(pmd))
- return NULL;
-
- if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
- if (is_thp)
- *is_thp = true;
- ret_pte = (pte_t *)pmdp;
- goto out;
- }
- /*
- * pmd_large check below will handle the swap pmd pte
- * we need to do both the check because they are config
- * dependent.
- */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *)pmdp; - goto out; - } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } + if (pud_none(pud)) + return NULL; - return pte_offset_kernel(&pmd, ea); + if (pud_huge(pud)) { + ret_pte = (pte_t *)pudp; + goto out; } + if (is_hugepd(__hugepd(pud_val(pud)))) { + hpdp = (hugepd_t *)&pud; + goto out_huge; + } + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; + } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); + out_huge: if (!hpdp) return NULL; -- cgit v1.2.3-59-g8ed1b From 737b434d3d55c0b3c23df4eab1ea5b33f8850f30 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:01 +0000 Subject: powerpc/mm: convert Book3E 64 to pte_fragment Book3E 64 is the only subarch not using pte_fragment. In order to allow refactorisation, this patch converts it to pte_fragment. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 6 ----- arch/powerpc/include/asm/nohash/64/mmu.h | 4 +++- arch/powerpc/include/asm/nohash/64/pgalloc.h | 33 ++++++++++------------------ arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_context.c | 2 +- 5 files changed, 17 insertions(+), 30 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 6ee8195a2ffb..66a3805dc935 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -228,13 +228,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } -#ifdef CONFIG_PPC_BOOK3E_64 -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} -#else extern void arch_exit_mmap(struct mm_struct *mm); -#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index 81cf30c370e5..26e05ce8f5aa 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -4,11 +4,13 @@ #define MAX_PHYSMEM_BITS 44 +#include + /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include #ifndef __ASSEMBLY__ -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #endif #endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index ded453f9b5a8..7fb87235f845 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -76,10 +76,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, (unsigned long)page_address(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } -#define pmd_pgtable(pmd) pmd_page(pmd) +#define 
pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -92,44 +92,35 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + return (pte_t *)pte_fragment_alloc(mm, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page; - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; + return (pgtable_t)pte_fragment_alloc(mm, 0); } +void pte_frag_destroy(void *pte_frag); +void pte_fragment_free(unsigned long *table, int kernel); + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long)pte); + pte_fragment_free((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - pgtable_page_dtor(ptepage); - __free_page(ptepage); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void pgtable_free(void *table, int shift) { if (!shift) { - pgtable_page_dtor(virt_to_page(table)); - free_page((unsigned long)table); + pte_fragment_free((unsigned long *)table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -166,7 +157,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, page_address(table), 0); + pgtable_free_tlb(tlb, table, 0); } #define __pmd_free_tlb(tlb, pmd, addr) \ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3daea8da0c7f..c7d5f37f7c52 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,12 +7,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ + pgtable-frag.o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o -obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index bb52320b7369..6b049d82b98a 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -98,7 +98,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_mmu_context(prev, next, tsk); } -#ifdef CONFIG_PPC32 +#ifndef CONFIG_PPC_BOOK3S_64 void arch_exit_mmap(struct mm_struct *mm) { void *frag = pte_frag_get(&mm->context); -- cgit v1.2.3-59-g8ed1b From 627f06c6f51e6af6ca3f7d1e82154b59583abc15 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:05 +0000 Subject: powerpc/book3e: move early_alloc_pgtable() to init section early_alloc_pgtable() is only used during init. 
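Illustrative aside: __init code is discarded after boot, so a normal function that calls it would trigger a modpost section-mismatch warning; marking the caller __ref records that the reference is deliberate and only taken before init memory is freed. A userspace sketch with the kernel attributes stubbed out and a simplified signature, so it compiles standalone:

#include <stdlib.h>

#define __init /* in the kernel: places the function in .init.text */
#define __ref  /* in the kernel: suppresses the section-mismatch warning */

static void __init *early_alloc_pgtable(unsigned long size)
{
        return calloc(1, size); /* stand-in for the memblock allocator */
}

int __ref map_kernel_page(unsigned long va)
{
        void *pg = early_alloc_pgtable(4096);

        (void)va; /* the real page-table wiring is elided */
        return pg ? 0 : -1;
}

This mirrors why the patch pairs the __init on early_alloc_pgtable() with an __ref on map_kernel_page() in the hunk below.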
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index f296c2e88b09..75e9e2c35fe2 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -55,7 +55,7 @@ void vmemmap_remove_mapping(unsigned long start, #endif #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static __ref void *early_alloc_pgtable(unsigned long size) +static void __init *early_alloc_pgtable(unsigned long size) { void *ptr; @@ -74,7 +74,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; -- cgit v1.2.3-59-g8ed1b From 4a6d8cf90017019f3b2829b38157cd1a74c64856 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:06 +0000 Subject: powerpc/mm: don't use pte_alloc_kernel() until slab is available on PPC32 In the same way as PPC64, implement early allocation functions and avoid calling pte_alloc_kernel() before slab is available. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index c9cdbb84d31f..e54b612cbc98 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,11 +43,8 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - if (!slab_is_available()) - return memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - return (pte_t *)pte_fragment_alloc(mm, 1); } @@ -205,7 +202,29 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) +static void __init *early_alloc_pgtable(unsigned long size) +{ + void *ptr = memblock_alloc(size, size); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, size); + + return ptr; +} + +static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) +{ + if (pmd_none(*pmdp)) { + pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + return pte_offset_kernel(pmdp, va); +} + + +int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -214,7 +233,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); + if (likely(slab_is_available())) + pg = pte_alloc_kernel(pd, va); + else + pg = early_pte_alloc_kernel(pd, va); if (pg != 0) { err = 0; /* The PTE should never be already set nor present in the -- cgit v1.2.3-59-g8ed1b From b0124ff57e9405725b4dfeffbdfa929bb973ad2c Mon Sep 17 00:00:00 
2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:07 +0000 Subject: powerpc/mm: inline pte_alloc_one_kernel() and pte_alloc_one() on PPC32 pte_alloc_one_kernel() and pte_alloc_one() are simple calls to pte_fragment_alloc(), so they are good candidates for inlining as already done on PPC64. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/include/asm/nohash/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/mm/pgtable_32.c | 10 ---------- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 645af86cd072..0ed856068bb8 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -59,10 +59,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index ea265a578eb0..1d41508f0676 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -77,10 +77,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index e54b612cbc98..2e67f9a1430b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,16 +43,6 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - void __iomem * ioremap(phys_addr_t addr, unsigned long size) { -- cgit v1.2.3-59-g8ed1b From b4abe38fd698ace6942edeeb79a5b8a60a7af4fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:32 +0000 Subject: powerpc/32: prepare shadow area for KASAN This patch prepares a shadow area for KASAN. 
The shadow area will be at the top of the kernel virtual memory space above the fixmap area and will occupy one eighth of the total kernel virtual memory space. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 5 +++++ arch/powerpc/include/asm/fixmap.h | 5 +++++ arch/powerpc/include/asm/kasan.h | 16 ++++++++++++++++ arch/powerpc/mm/mem.c | 4 ++++ arch/powerpc/mm/ptdump/ptdump.c | 8 ++++++++ 5 files changed, 38 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 4e00cb0a5464..61febbbdd02b 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -366,3 +366,8 @@ config PPC_FAST_ENDIAN_SWITCH depends on DEBUG_KERNEL && PPC_BOOK3S_64 help If you're unsure what this is, say N. + +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0xe0000000 diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index b9fbed84ddca..0cfc365d814b 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -22,7 +22,12 @@ #include #endif +#ifdef CONFIG_KASAN +#include +#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) +#else #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +#endif /* * Here we define all the compile-time 'special' virtual diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 2c179a39d4ba..05274dea3109 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -12,4 +12,20 @@ #define EXPORT_SYMBOL_KASAN(fn) #endif +#ifndef __ASSEMBLY__ + +#include + +#define KASAN_SHADOW_SCALE_SHIFT 3 + +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT)) + +#define KASAN_SHADOW_OFFSET ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET) + +#define KASAN_SHADOW_END 0UL + +#define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START) + +#endif /* __ASSEMBLY */ #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 105c58f8900a..cd525d709072 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -310,6 +310,10 @@ void __init mem_init(void) mem_init_print_info(NULL); #ifdef CONFIG_PPC32 pr_info("Kernel virtual memory layout:\n"); +#ifdef CONFIG_KASAN + pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n", + KASAN_SHADOW_START, KASAN_SHADOW_END); +#endif pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); #ifdef CONFIG_HIGHMEM pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 63fc56feea15..48135ba6fa74 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -100,6 +100,10 @@ static struct addr_marker address_markers[] = { #endif { 0, "Fixmap start" }, { 0, "Fixmap end" }, +#endif +#ifdef CONFIG_KASAN + { 0, "kasan shadow mem start" }, + { 0, "kasan shadow mem end" }, #endif { -1, NULL }, }; @@ -323,6 +327,10 @@ static void populate_markers(void) #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#ifdef CONFIG_KASAN + address_markers[i++].start_address = KASAN_SHADOW_START; + address_markers[i++].start_address = KASAN_SHADOW_END; +#endif #endif /* CONFIG_PPC64 */ } -- cgit v1.2.3-59-g8ed1b From f072015c7b742c42aa5649d22f43163cd0eb7024 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:33 +0000 Subject: powerpc: disable KASAN instrumentation on early/critical files. 
All files containing functions run before kasan_early_init() is called
must have KASAN instrumentation disabled.

For those files, branch profiling also has to be disabled; otherwise
each if () generates a call to ftrace_likely_update().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/Makefile | 12 ++++++++++++
 arch/powerpc/lib/Makefile | 8 ++++++++
 arch/powerpc/mm/Makefile | 6 ++++++
 arch/powerpc/platforms/powermac/Makefile | 6 ++++++
 arch/powerpc/purgatory/Makefile | 3 +++
 arch/powerpc/xmon/Makefile | 1 +
 6 files changed, 36 insertions(+)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 45e47752b692..0ea6c4aa3a20 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -31,6 +31,18 @@ CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE)
 endif

+KASAN_SANITIZE_early_32.o := n
+KASAN_SANITIZE_cputable.o := n
+KASAN_SANITIZE_prom_init.o := n
+KASAN_SANITIZE_btext.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-y := cputable.o ptrace.o syscalls.o \
 irq.o align.o signal_32.o pmc.o vdso.o \
 process.o systbl.o idle.o \
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 47a4de434c22..c55f9c27bf79 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -8,6 +8,14 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
 CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)

+KASAN_SANITIZE_code-patching.o := n
+KASAN_SANITIZE_feature-fixups.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-y += alloc.o code-patching.o feature-fixups.o

 ifndef CONFIG_KASAN
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index c7d5f37f7c52..62735f335bce 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,6 +5,12 @@

 ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)

+KASAN_SANITIZE_ppc_mmu_32.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-y := fault.o mem.o pgtable.o mmap.o \
 init_$(BITS).o pgtable_$(BITS).o \
 pgtable-frag.o \
 init-common.o mmu_context.o drmem.o
diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile
index 20ebf35d7913..f4247ade71ca 100644
--- a/arch/powerpc/platforms/powermac/Makefile
+++ b/arch/powerpc/platforms/powermac/Makefile
@@ -2,6 +2,12 @@

 CFLAGS_bootx_init.o += -fPIC
 CFLAGS_bootx_init.o += $(call cc-option, -fno-stack-protector)

+KASAN_SANITIZE_bootx_init.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_bootx_init.o += -DDISABLE_BRANCH_PROFILING
+endif
+
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace early boot code
 CFLAGS_REMOVE_bootx_init.o = $(CC_FLAGS_FTRACE)
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index 4314ba5baf43..7c6d8b14f440 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE := n
+
 targets += trampoline.o purgatory.ro kexec-purgatory.c

 LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index 3050f9323254..f142570ad860 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -7,6 +7,7 @@
 subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header)

 GCOV_PROFILE := n
 KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
+KASAN_SANITIZE := n

 # Disable ftrace for the entire directory
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
-- cgit v1.2.3-59-g8ed1b

From 2edb16efc899f9c232e2d880930b855e4cf55df4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:34 +0000
Subject: powerpc/32: Add KASAN support

This patch adds KASAN support for PPC32. The following patch
will add an early activation of the hash table for book3s. Until
then, a warning will be raised if trying to use KASAN on a hash 6xx.

To support KASAN, this patch initialises the MMU mappings for accessing
the KASAN shadow area defined in a previous patch.

An early mapping is set as soon as the kernel code has been relocated to
its definitive place. Then the definitive mapping is set once paging is
initialised.

For modules, the shadow area is allocated at module_alloc().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/Kconfig | 1 +
 arch/powerpc/include/asm/kasan.h | 9 ++
 arch/powerpc/kernel/head_32.S | 3 +
 arch/powerpc/kernel/head_40x.S | 3 +
 arch/powerpc/kernel/head_44x.S | 3 +
 arch/powerpc/kernel/head_8xx.S | 3 +
 arch/powerpc/kernel/head_fsl_booke.S | 3 +
 arch/powerpc/kernel/setup-common.c | 3 +
 arch/powerpc/mm/Makefile | 1 +
 arch/powerpc/mm/init_32.c | 3 +
 arch/powerpc/mm/kasan/kasan_init_32.c | 156 ++++++++++++++++++++++++++++++++++
 11 files changed, 188 insertions(+)
 create mode 100644 arch/powerpc/mm/kasan/kasan_init_32.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 515bd10d32f6..2711aac24621 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -173,6 +173,7 @@ config PPC
 select GENERIC_TIME_VSYSCALL
 select HAVE_ARCH_AUDITSYSCALL
 select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_KASAN if PPC32
 select HAVE_ARCH_KGDB
 select HAVE_ARCH_MMAP_RND_BITS
 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 05274dea3109..296e51c2f066 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -27,5 +27,14 @@

 #define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START)

+#ifdef CONFIG_KASAN
+void kasan_early_init(void);
+void kasan_mmu_init(void);
+void kasan_init(void);
+#else
+static inline void kasan_init(void) { }
+static inline void kasan_mmu_init(void) { }
+#endif
+
 #endif /* __ASSEMBLY */
 #endif
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 40aec3f00a05..6e85171e513c 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -969,6 +969,9 @@ start_here:
 * Do early platform-specific initialization,
 * and set up the MMU.
 */
+#ifdef CONFIG_KASAN
+ bl kasan_early_init
+#endif
 li r3,0
 mr r4,r31
 bl machine_init
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index a9c934f2319b..efa219d2136e 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -848,6 +848,9 @@ start_here:
 /*
 * Decide what sort of machine this is and initialize the MMU.
*/ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 37117ab11584..34a5df827b38 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -203,6 +203,9 @@ _ENTRY(_start); /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 03c73b4c6435..d25adb6ef235 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -853,6 +853,9 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 32332e24e421..567e0ed45ca8 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -268,6 +268,9 @@ set_ivor: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif mr r3,r30 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 19d68a9b5f37..91d2c6970bdb 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -67,6 +67,7 @@ #include #include #include +#include #include "setup.h" @@ -871,6 +872,8 @@ static void smp_setup_pacas(void) */ void __init setup_arch(char **cmdline_p) { + kasan_init(); + *cmdline_p = boot_command_line; /* Set a half-reasonable default so udelay does something sensible */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 62735f335bce..d8c0ce9b2557 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3eb4cb09749c..c3121b6c8cbd 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -179,6 +180,8 @@ void __init MMU_init(void) btext_unmap(); #endif + kasan_mmu_init(); + setup_kup(); /* Shortly after that, the entire linear mapping will be available */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c new file mode 100644 index 000000000000..42617fcad828 --- /dev/null +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include + +static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); +} + +static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + + pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + 
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + new = pte_alloc_one_kernel(&init_mm); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, PAGE_KERNEL_RO); + pmd_populate_kernel(&init_mm, pmd, new); + } + return 0; +} + +static void __ref *kasan_get_one_page(void) +{ + if (slab_is_available()) + return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + return memblock_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static int __ref kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_cur; + int ret; + void *block = NULL; + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + if (!slab_is_available()) + block = memblock_alloc(k_end - k_start, PAGE_SIZE); + + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + void *va = block ? block + k_cur - k_start : kasan_get_one_page(); + pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + + if (!va) + return -ENOMEM; + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } + flush_tlb_kernel_range(k_start, k_end); + return 0; +} + +static void __init kasan_remap_early_shadow_ro(void) +{ + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); +} + +void __init kasan_mmu_init(void) +{ + int ret; + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + phys_addr_t base = reg->base; + phys_addr_t top = min(base + reg->size, total_lowmem); + + if (base >= top) + continue; + + ret = kasan_init_region(__va(base), top - base); + if (ret) + panic("kasan: kasan_init_region() failed"); + } +} + +void __init kasan_init(void) +{ + kasan_remap_early_shadow_ro(); + + clear_page(kasan_early_shadow_page); + + /* At this point kasan is fully initialized. Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +#ifdef CONFIG_MODULES +void *module_alloc(unsigned long size) +{ + void *base = vmalloc_exec(size); + + if (!base) + return NULL; + + if (!kasan_init_region(base, size)) + return base; + + vfree(base); + + return NULL; +} +#endif + +void __init kasan_early_init(void) +{ + unsigned long addr = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr); + + BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); + + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL); + + do { + next = pgd_addr_end(addr, end); + pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); + } while (pmd++, addr = next, addr != end); + + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + WARN(true, "KASAN not supported on hash 6xx"); +} -- cgit v1.2.3-59-g8ed1b From 72f208c6a8f7bc78ef5248babd9e6ed6302bd2a0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:35 +0000 Subject: powerpc/32s: move hash code patching out of MMU_init_hw() For KASAN, hash table handling will be activated early for accessing to KASAN shadow areas. In order to avoid any modification of the hash functions while they are still used with the early hash table, the code patching is moved out of MMU_init_hw() and put close to the big-bang switch to the final hash table. 
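Illustrative aside: the modify_instruction_site() calls in the hunk below rewrite an immediate field inside an already-emitted instruction, selected by a mask. A self-contained model of that masked patching:

#include <stdint.h>
#include <stdio.h>

static void patch_field(uint32_t *insn, uint32_t mask, uint32_t value)
{
        *insn = (*insn & ~mask) | (value & mask);
}

int main(void)
{
        uint32_t insn = 0x3c600000; /* ppc "lis r3,0" */

        patch_field(&insn, 0xffff, 0xc000); /* becomes "lis r3,0xc000" */
        printf("%#x\n", insn); /* prints 0x3c60c000 */
        return 0;
}

The kernel helpers additionally flush the instruction cache for the patched site, and the hunk below adds a warning comment that nothing may trigger a KASAN check once this patching has begun.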
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 3 +++ arch/powerpc/mm/book3s32/mmu.c | 36 ++++++++++++++++++++++-------------- arch/powerpc/mm/mmu_decl.h | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 6e85171e513c..5958ea685968 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -977,6 +977,9 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +BEGIN_MMU_FTR_SECTION + bl MMU_init_hw_patch +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) /* * Go back to running unmapped so we can load up new values diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 1db55159031c..165529cc9087 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -39,6 +39,7 @@ struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; unsigned long _SDR1; +static unsigned int hash_mb, hash_mb2; struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ @@ -308,7 +309,6 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, */ void __init MMU_init_hw(void) { - unsigned int hmask, mb, mb2; unsigned int n_hpteg, lg_n_hpteg; if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) @@ -351,20 +351,30 @@ void __init MMU_init_hw(void) (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; + hash_mb2 = 16 - LG_HPTEG_SIZE; +} + +void __init MMU_init_hw_patch(void) +{ + unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + if (ppc_md.progress) + ppc_md.progress("hash:patch", 0x345); + if (ppc_md.progress) + ppc_md.progress("hash:done", 0x205); + + /* WARNING: Make sure nothing can trigger a KASAN check past this point */ + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ modify_instruction_site(&patch__hash_page_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); @@ -373,11 +383,9 @@ void __init MMU_init_hw(void) */ modify_instruction_site(&patch__flush_hash_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); } void setup_initial_memory_limit(phys_addr_t first_memblock_base, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 74ff61dabcb1..d726ff776054 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -130,6 +130,7 @@ extern void wii_memory_fixups(void); 
*/ #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); +void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif -- cgit v1.2.3-59-g8ed1b From 215b823707ce4e8e52b106915f70357fa474c669 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:36 +0000 Subject: powerpc/32s: set up an early static hash table for KASAN. KASAN requires early activation of the hash table, before memblock() functions are available. This patch implements an early hash table statically defined in __initdata. During early boot, a single page table is used. For hash32, when doing the final init, one page table is allocated for each PGD entry because of the _PAGE_HASHPTE flag which can't be shared between several virtual pages. This is done after memblock becomes available but before switching to the final hash table, otherwise there are issues with TLB flushing due to the shared entries. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 70 ++++++++++++++++++++++------------- arch/powerpc/mm/kasan/kasan_init_32.c | 23 +++++++++++- arch/powerpc/mm/mmu_decl.h | 1 + 3 files changed, 68 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 5958ea685968..73288df1c5d6 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -160,6 +160,10 @@ __after_mmu_off: bl flush_tlbs bl initial_bats + bl load_segment_registers +#ifdef CONFIG_KASAN + bl early_hash_table +#endif #if defined(CONFIG_BOOTX_TEXT) bl setup_disp_bat #endif @@ -205,7 +209,7 @@ __after_mmu_off: */ turn_on_mmu: mfmsr r0 - ori r0,r0,MSR_DR|MSR_IR + ori r0,r0,MSR_DR|MSR_IR|MSR_RI mtspr SPRN_SRR1,r0 lis r0,start_here@h ori r0,r0,start_here@l @@ -884,11 +888,24 @@ _ENTRY(__restore_cpu_setup) blr #endif /* !defined(CONFIG_PPC_BOOK3S_32) */ - /* * Load stuff into the MMU. Intended to be called with * IR=0 and DR=0. */ +#ifdef CONFIG_KASAN +early_hash_table: + sync /* Force all PTE updates to finish */ + isync + tlbia /* Clear all TLB entries */ + sync /* wait for tlbia/tlbie to finish */ + TLBSYNC /* ... on all CPUs */ + /* Load the SDR1 register (hash table base & size) */ + lis r6, early_hash - PAGE_OFFSET@h + ori r6, r6, 3 /* 256kB table */ + mtspr SPRN_SDR1, r6 + blr +#endif + load_up_mmu: sync /* Force all PTE updates to finish */ isync @@ -900,29 +917,6 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up user segment register values */ - mtctr r0 /* for context 0 */ - li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ -#ifdef CONFIG_PPC_KUEP - oris r3, r3, SR_NX@h /* Set Nx */ -#endif -#ifdef CONFIG_PPC_KUAP - oris r3, r3, SR_KS@h /* Set Ks */ -#endif - li r4,0 -3: mtsrin r3,r4 - addi r3,r3,0x111 /* increment VSID */ - addis r4,r4,0x1000 /* address of next segment */ - bdnz 3b - li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ - mtctr r0 /* for context 0 */ - rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ - rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ - oris r3, r3, SR_KP@h /* Kp = 1 */ -3: mtsrin r3, r4 - addi r3, r3, 0x111 /* increment VSID */ - addis r4, r4, 0x1000 /* address of next segment */ - bdnz 3b /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not.
*/ @@ -944,6 +938,32 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +load_segment_registers: + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ + mtctr r0 /* for context 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ +#endif + li r4, 0 +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + blr + /* * This is where the main kernel code starts. */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 42617fcad828..ba8361487075 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -94,6 +94,13 @@ void __init kasan_mmu_init(void) int ret; struct memblock_region *reg; + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + for_each_memblock(memory, reg) { phys_addr_t base = reg->base; phys_addr_t top = min(base + reg->size, total_lowmem); @@ -135,6 +142,20 @@ void *module_alloc(unsigned long size) } #endif +#ifdef CONFIG_PPC_BOOK3S_32 +u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; + +static void __init kasan_early_hash_table(void) +{ + modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16); + + Hash = (struct hash_pte *)early_hash; +} +#else +static void __init kasan_early_hash_table(void) {} +#endif + void __init kasan_early_init(void) { unsigned long addr = KASAN_SHADOW_START; @@ -152,5 +173,5 @@ void __init kasan_early_init(void) } while (pmd++, addr = next, addr != end); if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - WARN(true, "KASAN not supported on hash 6xx"); + kasan_early_hash_table(); } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d726ff776054..31fce3914ddc 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -106,6 +106,7 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; +extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -- cgit v1.2.3-59-g8ed1b From da3a3b0a0e38377c98946420acdc7d4ca38cff47 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:37 +0000 Subject: powerpc/32s: map kasan zero shadow with PAGE_READONLY instead of PAGE_KERNEL_RO For hash32, the zero shadow page gets mapped with PAGE_READONLY instead of PAGE_KERNEL_RO, because the PP bits don't provide a RO kernel, so PAGE_KERNEL_RO is equivalent to PAGE_KERNEL. By using PAGE_READONLY, the page is RO for both kernel and user, but this is not a security issue as it contains only zeroes. 
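The resulting rule can be summarised by this sketch (kasan_prot_ro() is a hypothetical helper name, not part of the patch):

	/* Sketch: pick the RO protection for the kasan zero shadow */
	static pgprot_t kasan_prot_ro(void)
	{
		/* hash32 PP bits can't express a kernel-only RO mapping, so
		 * PAGE_KERNEL_RO would silently behave like PAGE_KERNEL there
		 */
		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
			return PAGE_READONLY;	/* RO for both kernel and user */
		return PAGE_KERNEL_RO;
	}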
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/kasan/kasan_init_32.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index ba8361487075..0d62be3cba47 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -39,7 +39,10 @@ static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_ if (!new) return -ENOMEM; - kasan_populate_pte(new, PAGE_KERNEL_RO); + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + kasan_populate_pte(new, PAGE_READONLY); + else + kasan_populate_pte(new, PAGE_KERNEL_RO); pmd_populate_kernel(&init_mm, pmd, new); } return 0; @@ -84,7 +87,10 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { - kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY); + else + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } -- cgit v1.2.3-59-g8ed1b From 57e0491b58fa2a217029b696511499008852a642 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:36:36 +0000 Subject: powerpc/32s: drop Hash_end Hash_end has never been used, drop it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/book3s32/mmu.c | 4 +--- arch/powerpc/mm/mmu_decl.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 165529cc9087..03265de05637 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -36,7 +36,7 @@ #include -struct hash_pte *Hash, *Hash_end; +struct hash_pte *Hash; unsigned long Hash_size, Hash_mask; unsigned long _SDR1; static unsigned int hash_mb, hash_mb2; @@ -345,8 +345,6 @@ void __init MMU_init_hw(void) __func__, Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 31fce3914ddc..7b8833d695d1 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -104,7 +104,7 @@ extern int __map_without_bats; extern unsigned int rtas_data, rtas_size; struct hash_pte; -extern struct hash_pte *Hash, *Hash_end; +extern struct hash_pte *Hash; extern unsigned long Hash_size, Hash_mask; extern u8 early_hash[]; -- cgit v1.2.3-59-g8ed1b From 8f156c23f4c04ca51961cd1f6a0edbc80caa2683 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:36:37 +0000 Subject: powerpc/32s: don't try to print hash table address. Due to %p, "(ptrval)" is printed in lieu of the hash table address. Showing the hash table address isn't an operational need, so just don't print it.
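For reference, %p hashes pointer values and prints "(ptrval)" until the hashing key is ready; printing the raw address would require something like the line below, which is exactly the leak being avoided (illustrative only, not part of the patch):

	pr_info("using %ldkB for hash table (at %px)\n", Hash_size >> 10, Hash);	/* %px would leak the raw address */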
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/book3s32/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 03265de05637..131cd3acb6b8 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -345,8 +345,8 @@ void __init MMU_init_hw(void) __func__, Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", - (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + pr_info("Total memory = %lldMB; using %ldkB for hash table\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10); Hash_mask = n_hpteg - 1; -- cgit v1.2.3-59-g8ed1b From e4dccf9092ab48a6f902003b9558c0e45d0e849a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:36:39 +0000 Subject: powerpc/mm: print hash info in a helper Reduce #ifdef mess by defining a helper to print hash info at startup. At the same time, remove the display of the hash table address to avoid leaking unnecessary information. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 22 +--------------------- arch/powerpc/mm/book3s32/mmu.c | 9 ++++++++- arch/powerpc/mm/book3s64/hash_utils.c | 13 +++++++++++++ arch/powerpc/mm/mmu_decl.h | 5 ++++- 4 files changed, 26 insertions(+), 23 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 91d2c6970bdb..3f8805d3c0c9 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -800,12 +800,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_BOOK3S_64 - pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); -#endif -#ifdef CONFIG_PPC_BOOK3S_32 - pr_info("Hash_size = 0x%lx\n", Hash_size); -#endif pr_info("phys_mem_size = 0x%llx\n", (unsigned long long)memblock_phys_mem_size()); @@ -827,21 +821,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_BOOK3S_64 - if (htab_address) - pr_info("htab_address = 0x%p\n", htab_address); - if (htab_hash_mask) - pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); - pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); - pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); - pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); -#endif -#ifdef CONFIG_PPC_BOOK3S_32 - if (Hash) - pr_info("Hash = 0x%p\n", Hash); - if (Hash_mask) - pr_info("Hash_mask = 0x%lx\n", Hash_mask); -#endif + print_system_hash_info(); if (PHYSICAL_START > 0) pr_info("physical_start = 0x%llx\n", diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 131cd3acb6b8..615f78d35536 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -37,7 +37,7 @@ #include struct hash_pte *Hash; -unsigned long Hash_size, Hash_mask; +static unsigned long Hash_size, Hash_mask; unsigned long _SDR1; static unsigned int hash_mb, hash_mb2; @@ -401,6 +401,13 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); } +void __init print_system_hash_info(void) +{ + pr_info("Hash_size = 0x%lx\n",
Hash_size); + if (Hash_mask) + pr_info("Hash_mask = 0x%lx\n", Hash_mask); +} + #ifdef CONFIG_PPC_KUEP void __init setup_kuep(bool disabled) { diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index f0ce860a69ac..919a861a8ec0 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -66,6 +66,8 @@ #include #include +#include + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -1945,3 +1947,14 @@ static int __init hash64_debugfs(void) } machine_device_initcall(pseries, hash64_debugfs); #endif /* CONFIG_DEBUG_FS */ + +void __init print_system_hash_info(void) +{ + pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + + if (htab_hash_mask) + pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); + pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); + pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); +} diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7b8833d695d1..7bac0aa2026a 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -83,6 +83,8 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, } #endif +static inline void print_system_hash_info(void) {} + #else /* CONFIG_PPC_MMU_NOHASH */ extern void hash_preload(struct mm_struct *mm, unsigned long ea, @@ -92,6 +94,8 @@ extern void hash_preload(struct mm_struct *mm, unsigned long ea, extern void _tlbie(unsigned long address); extern void _tlbia(void); +void print_system_hash_info(void); + #endif /* CONFIG_PPC_MMU_NOHASH */ #ifdef CONFIG_PPC32 @@ -105,7 +109,6 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash; -extern unsigned long Hash_size, Hash_mask; extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -- cgit v1.2.3-59-g8ed1b From c9e0fc33b8be52a7134ed0ee79b6a1e332e1b9d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 30 Apr 2019 14:27:39 -0400 Subject: powerpc: remove the __kernel_io_end export This export was added in this merge window, but without any actual user, or justification for a modular user. Fixes: a35a3c6f6065 ("powerpc/mm/hash64: Add a variable to track the end of IO mapping") Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ed76519411..4c6a73782b19 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -97,7 +97,6 @@ EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); unsigned long __kernel_io_end; -EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; -- cgit v1.2.3-59-g8ed1b From 5f18cbdbdd42b050c51eb9859f8ce43db3f51846 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:46 +1000 Subject: powerpc/mm/ptdump: Wrap seq_printf() to handle NULL pointers Lovingly borrowed from the arch/arm64 ptdump code. This doesn't seem to be an issue in practice, but is necessary for my upcoming commit. 
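With the wrappers in place, a walk can be run purely for its side effects by handing it a NULL seq_file, roughly as below (sketch anticipating the next commit, which adds the W+X check):

	struct pg_state st = {
		.seq = NULL,		/* pt_dump_seq_printf(NULL, ...) is a no-op */
		.marker = address_markers,
	};

	walk_pagetables(&st);		/* walk and collect state, produce no output */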
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/ptdump/ptdump.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 48135ba6fa74..e249a56c07bf 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -108,6 +108,18 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_putc(m, c) \ +({ \ + if (m) \ + seq_putc(m, c); \ +}) + static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -125,19 +137,19 @@ static void dump_flag_info(struct pg_state *st, const struct flag_info val = pte & flag->val; if (flag->shift) val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); + pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val); } else { if ((pte & flag->mask) == flag->val) s = flag->set; else s = flag->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } st->current_flags &= ~flag->mask; } if (st->current_flags != 0) - seq_printf(st->seq, " unknown flags:%llx", st->current_flags); + pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags); } static void dump_addr(struct pg_state *st, unsigned long addr) @@ -152,12 +164,12 @@ static void dump_addr(struct pg_state *st, unsigned long addr) #define REG "0x%08lx" #endif - seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { - seq_printf(st->seq, "[" REG "]", st->start_pa); + pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa); delta = PAGE_SIZE >> 10; } else { - seq_printf(st->seq, " " REG " ", st->start_pa); + pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); delta = (addr - st->start_address) >> 10; } /* Work out what appropriate unit to use */ @@ -165,7 +177,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c", delta, *unit); + pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit); } @@ -182,7 +194,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->start_address = addr; st->start_pa = pa; st->last_pa = pa; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -206,7 +218,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_putc(st->seq, '\n'); + pt_dump_seq_putc(st->seq, '\n'); } /* @@ -215,7 +227,7 @@ static void note_page(struct pg_state *st, unsigned long addr, */ while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; st->start_pa = pa; -- cgit v1.2.3-59-g8ed1b From 453d87f6a8aed827f5ebb1708a4cea458fd68d23 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:47 +1000 Subject: powerpc/mm: Warn if W+X pages found on boot Implement code to walk all pages and warn if any are found to be both writable and executable. 
This depends on STRICT_KERNEL_RWX being enabled, and is behind the PPC_DEBUG_WX config option. This only runs on boot and has no runtime performance implications. Very heavily influenced (and in some cases copied verbatim) from the ARM64 code written by Laura Abbott (thanks!), since our ptdump infrastructure is similar. Signed-off-by: Russell Currey [mpe: Fixup build error when disabled] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 19 +++++++++++++++++ arch/powerpc/include/asm/pgtable.h | 6 ++++++ arch/powerpc/mm/pgtable_32.c | 3 +++ arch/powerpc/mm/pgtable_64.c | 3 +++ arch/powerpc/mm/ptdump/ptdump.c | 43 +++++++++++++++++++++++++++++++++++++- 5 files changed, 73 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 61febbbdd02b..e9ae650c8e93 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -361,6 +361,25 @@ config PPC_PTDUMP If you are unsure, say N. +config PPC_DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on PPC_PTDUMP + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c51846da41a7..3f53be60fb01 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -105,6 +105,12 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void); +#else +static inline void ptdump_check_wx(void) { } +#endif + /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here.
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 2e67f9a1430b..16ada373b32b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -396,6 +396,9 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)__start_rodata); change_page_attr(page, numpages, PAGE_KERNEL_RO); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4c6a73782b19..d2d976ff8a0e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -332,6 +332,9 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index e249a56c07bf..646876d9da64 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -31,7 +31,7 @@ #include "ptdump.h" #ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 +#define KERN_VIRT_START PAGE_OFFSET #endif /* @@ -68,6 +68,8 @@ struct pg_state { unsigned long last_pa; unsigned int level; u64 current_flags; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -181,6 +183,20 @@ static void dump_addr(struct pg_state *st, unsigned long addr) } +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) + return; + + WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned int level, u64 val) { @@ -210,6 +226,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* Check the PTE flags */ if (st->current_flags) { + note_prot_wx(st, addr); dump_addr(st, addr); /* Dump all the flags */ @@ -387,6 +404,30 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = address_markers, + .check_wx = true, + }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + + walk_pagetables(&st); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} +#endif + static int ptdump_init(void) { struct dentry *debugfs_file; -- cgit v1.2.3-59-g8ed1b From 305d60012304684bd59ea1f67703e51662e4906a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:21:00 +0000 Subject: powerpc/kasan: add missing/lost Makefile For unknown reason (aka. mpe is a doofus), the new Makefile added via the KASAN support patch didn't land into arch/powerpc/mm/kasan/. This patch restores it.
Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/kasan/Makefile | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 arch/powerpc/mm/kasan/Makefile (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile new file mode 100644 index 000000000000..6577897673dd --- /dev/null +++ b/arch/powerpc/mm/kasan/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n + +obj-$(CONFIG_PPC32) += kasan_init_32.o -- cgit v1.2.3-59-g8ed1b From 471e475c69a1689e059b5e57e893a7da75d2831a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:21:01 +0000 Subject: powerpc/mm: Fix makefile for KASAN In commit 17312f258cf6 ("powerpc/mm: Move book3s32 specifics in subdirectory mm/book3s64"), ppc_mmu_32.c was moved and renamed. This patch fixes Makefiles to disable KASAN instrumentation on the new name and location. Fixes: f072015c7b74 ("powerpc: disable KASAN instrumentation on early/critical files.") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 6 ------ arch/powerpc/mm/book3s32/Makefile | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d8c0ce9b2557..7a7527116c3a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,12 +5,6 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -KASAN_SANITIZE_ppc_mmu_32.o := n - -ifdef CONFIG_KASAN -CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING -endif - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o \ diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index a4e217d0f3b7..1732eaa740a9 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -1,3 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 +KASAN_SANITIZE_mmu.o := n + +ifdef CONFIG_KASAN +CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += mmu.o hash_low.o mmu_context.o tlb.o -- cgit v1.2.3-59-g8ed1b From c4e31847a5490d52ddd44440a524e8355be11ec1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:47:55 +0000 Subject: powerpc/mm: fix redundant inclusion of pgtable-frag.o in Makefile The patch identified below added pgtable-frag.o to obj-y but some merge witchery kept it also for obj-$(CONFIG_PPC_BOOK3S_64). This patch clears the duplication.
Fixes: 737b434d3d55 ("powerpc/mm: convert Book3E 64 to pte_fragment") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 7a7527116c3a..0f499db315d6 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -12,7 +12,6 @@ obj-y := fault.o mem.o pgtable.o mmap.o \ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -- cgit v1.2.3-59-g8ed1b From 67d53f30e23ec66aa7bbdd1592d5e64d46876190 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 08:10:43 +0000 Subject: powerpc/mm: fix section mismatch for setup_kup() commit b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs") moved setup_kup() out of the __init section. As stated in that commit, "this is only for 64-bit". But this function is also used on PPC32, where the two functions called by setup_kup() are in the __init section, so setup_kup() has to either be kept in the __init section on PPC32 or marked __ref. This patch marks it __ref; that fixes the build warnings below. MODPOST vmlinux.o WARNING: vmlinux.o(.text+0x169ec): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuep() The function setup_kup() references the function __init setup_kuep(). This is often because setup_kup lacks a __init annotation or the annotation of setup_kuep is wrong. WARNING: vmlinux.o(.text+0x16a04): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuap() The function setup_kup() references the function __init setup_kuap(). This is often because setup_kup lacks a __init annotation or the annotation of setup_kuap is wrong. Fixes: b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 6ea5607fc564..3bcae9e5e954 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); -void setup_kup(void) +void __ref setup_kup(void) { setup_kuep(disable_kuep); setup_kuap(disable_kuap); -- cgit v1.2.3-59-g8ed1b From 04a1942933ced67d2b73c156017bf13476b7146b Mon Sep 17 00:00:00 2001 From: Sachin Sant Date: Mon, 6 May 2019 17:33:33 +0530 Subject: powerpc/mm: Fix hugetlb page initialization This patch fixes a regression by using the correct kernel config variable for HUGETLB_PAGE_SIZE_VARIABLE. Without this, huge pages are disabled during kernel boot.
[0.309496] hugetlbfs: disabling because there are no supported hugepage sizes Fixes: c5710cd20735 ("powerpc/mm: cleanup HPAGE_SHIFT setup") Reported-by: Sachin Sant Signed-off-by: Michael Ellerman Tested-by: Sachin Sant Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 98db5ec6a1dd..c5c9ff2d7afc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -640,7 +640,7 @@ static int __init hugetlbpage_init(void) pgtable_cache_add(PTE_T_ORDER); } - if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE)) + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) hugetlbpage_init_default(); return 0; -- cgit v1.2.3-59-g8ed1b
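A closing note on the fix above: IS_ENABLED() silently evaluates to 0 when handed an undefined macro, so forgetting the CONFIG_ prefix is not a build error, just a test that can never pass. A minimal sketch:

	if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE))		/* bug: undefined symbol, always false */
		hugetlbpage_init_default();

	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))	/* correct: tests the Kconfig symbol */
		hugetlbpage_init_default();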