diff options
Diffstat (limited to '')
70 files changed, 2194 insertions, 2290 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index df8172da2301..503a6e249940 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,7 +5,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y := fault.o mem.o pgtable.o mmap.o maccess.o pageattr.o \ +obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o \ @@ -14,7 +14,6 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index 15f4773643d2..50dd8f6bdf46 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -9,5 +9,4 @@ endif obj-y += mmu.o mmu_context.o obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o -obj-$(CONFIG_PPC_KUEP) += kuep.o obj-$(CONFIG_PPC_KUAP) += kuap.o diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c index 0f920f09af57..28676cabb005 100644 --- a/arch/powerpc/mm/book3s32/kuap.c +++ b/arch/powerpc/mm/book3s32/kuap.c @@ -20,8 +20,11 @@ EXPORT_SYMBOL(kuap_unlock_all_ool); void setup_kuap(bool disabled) { - if (!disabled) + if (!disabled) { kuap_lock_all_ool(); + init_mm.context.sr0 |= SR_KS; + current->thread.sr0 |= SR_KS; + } if (smp_processor_id() != boot_cpuid) return; diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c deleted file mode 100644 index c20733d6e02c..000000000000 --- a/arch/powerpc/mm/book3s32/kuep.c +++ /dev/null @@ -1,20 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <asm/kup.h> -#include <asm/smp.h> - -struct static_key_false disable_kuep_key; - -void setup_kuep(bool disabled) -{ - if (!disabled) - kuep_lock(); - - if (smp_processor_id() != boot_cpuid) - return; - - if (disabled) - static_branch_enable(&disable_kuep_key); - else - pr_info("Activating Kernel Userspace Execution Prevention\n"); -} diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 27061583a010..850783cfa9c7 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -23,7 +23,6 @@ #include <linux/highmem.h> #include <linux/memblock.h> -#include <asm/prom.h> #include <asm/mmu.h> #include <asm/machdep.h> #include <asm/code-patching.h> @@ -76,7 +75,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static int find_free_bat(void) +int __init find_free_bat(void) { int b; int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; @@ -100,7 +99,7 @@ static int find_free_bat(void) * - block size has to be a power of two. This is calculated by finding the * highest bit set to 1. */ -static unsigned int block_size(unsigned long base, unsigned long top) +unsigned int bat_block_size(unsigned long base, unsigned long top) { unsigned int max_size = SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; @@ -145,7 +144,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to int idx; while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); + unsigned int size = bat_block_size(base, top); if (size < 128 << 10) break; @@ -159,10 +158,13 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET; + unsigned long size; + size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET); + setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X); - if (debug_pagealloc_enabled_or_kfence() || __map_without_bats) { + if (debug_pagealloc_enabled_or_kfence()) { pr_debug_once("Read-Write memory mapped without BATs\n"); if (base >= border) return base; @@ -196,18 +198,17 @@ void mmu_mark_initmem_nx(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; unsigned long base = (unsigned long)_stext - PAGE_OFFSET; - unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K); unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; unsigned long size; - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { - size = block_size(base, top); + for (i = 0; i < nb - 1 && base < top;) { + size = bat_block_size(base, top); setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); base += size; } if (base < top) { - size = block_size(base, top); - size = max(size, 128UL << 10); + size = bat_block_size(base, top); if ((top - base) > size) { size <<= 1; if (strict_kernel_rwx_enabled() && base + size > border) @@ -239,7 +240,7 @@ void mmu_mark_rodata_ro(void) for (i = 0; i < nb; i++) { struct ppc_bat *bat = BATS[i]; - if (bat_addrs[i].start < (unsigned long)__init_begin) + if (bat_addrs[i].start < (unsigned long)__end_rodata) bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; } @@ -247,10 +248,9 @@ void mmu_mark_rodata_ro(void) } /* - * Set up one of the I/D BAT (block address translation) register pairs. + * Set up one of the D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. - * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -286,10 +286,6 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* G bit must be zero in IBATs */ flags &= ~_PAGE_EXEC; } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; bat_addrs[index].start = virt; bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; @@ -318,11 +314,9 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index e2708e387dc3..269a3eb25a73 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -69,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context); int init_new_context(struct task_struct *t, struct mm_struct *mm) { mm->context.id = __init_new_context(); + mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0); + + if (!kuep_is_disabled()) + mm->context.sr0 |= SR_NX; + if (!kuap_is_disabled()) + mm->context.sr0 |= SR_KS; return 0; } @@ -108,20 +114,13 @@ void __init mmu_context_init(void) void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { long id = next->context.id; - unsigned long val; if (id < 0) panic("mm_struct %p has no context ID", next); isync(); - val = CTX_TO_VSID(id, 0); - if (!kuep_is_disabled()) - val |= SR_NX; - if (!kuap_is_disabled()) - val |= SR_KS; - - update_user_segments(val); + update_user_segments(next->context.sr0); if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d77..9ad6b56bfec9 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 1b56d3af47d4..cad2abc1730f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -2,22 +2,34 @@ ccflags-y := $(NO_MINIMAL_TOC) +obj-y += mmu_context.o pgtable.o trace.o +ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - -obj-y += hash_pgtable.o hash_utils.o slb.o \ - mmu_context.o pgtable.o hash_tlb.o -obj-$(CONFIG_PPC_NATIVE) += hash_native.o -obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o +obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +endif + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o obj-$(CONFIG_PPC_PKEY) += pkeys.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n + +# Parts of these can run in real mode and therefore are +# not safe with the current outline KASAN implementation +KASAN_SANITIZE_mmu_context.o := n +KASAN_SANITIZE_pgtable.o := n +KASAN_SANITIZE_radix_pgtable.o := n +KASAN_SANITIZE_radix_tlb.o := n +KASAN_SANITIZE_slb.o := n +KASAN_SANITIZE_pkeys.o := n diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c deleted file mode 100644 index a688e1324ae5..000000000000 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) - * - * Copyright (C) 2003 David Gibson, IBM Corporation. - * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> - */ - -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <asm/cacheflush.h> -#include <asm/machdep.h> - -unsigned int hpage_shift; -EXPORT_SYMBOL(hpage_shift); - -int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, unsigned int shift, unsigned int mmu_psize) -{ - real_pte_t rpte; - unsigned long vpn; - unsigned long old_pte, new_pte; - unsigned long rflags, pa; - long slot, offset; - - BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); - - /* Search the Linux page table for a match with va */ - vpn = hpt_vpn(ea, vsid, ssize); - - /* - * At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - do { - old_pte = pte_val(*ptep); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* Make sure this is a hugetlb entry */ - if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) - return 0; - - rflags = htab_convert_pte_flags(new_pte, flags); - if (unlikely(mmu_psize == MMU_PAGE_16G)) - offset = PTRS_PER_PUD; - else - offset = PTRS_PER_PMD; - rpte = __real_pte(__pte(old_pte), ptep, offset); - - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long gslot; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, - mmu_psize, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, shift, ssize); - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - - /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - - slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, - mmu_psize, ssize); - - /* - * Hypervisor failure. Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - mmu_psize, mmu_psize, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - unsigned long pte_val; - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. - */ - pte_val = pte_update(vma->vm_mm, addr, ptep, - _PAGE_PRESENT, _PAGE_INVALID, 1); - - return __pte(pte_val); -} - -void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - - if (radix_enabled()) - return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, - old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} - -void hugetlbpage_init_default(void) -{ - /* Set default large page size. Currently, we pick 16M or 1M - * depending on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; - else if (mmu_psize_defs[MMU_PAGE_2M].shift) - hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; -} diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index d8279bfe68ea..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,109 +43,28 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static void acquire_hpte_lock(void) { - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) - : "memory"); + lock_map_acquire(&hpte_lock_map); } - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +static void release_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - ppc_after_tlbiel_barrier(); + lock_map_release(&hpte_lock_map); } - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +#else +static void acquire_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the partition table cache if this is HV mode. - */ - if (early_cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - - /* - * Now invalidate the process table cache. UPRT=0 HPT modes (what - * current hardware implements) do not use the process table, but - * add the flushes anyway. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - /* - * Then flush the sets of the TLB proper. Hash mode uses - * partition scoped TLB translations, which may be flushed - * in !HV mode. - */ - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - ppc_after_tlbiel_barrier(); - - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } -void hash__tlbiel_all(unsigned int action) +static void release_hpte_lock(void) { - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); } +#endif static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) @@ -267,7 +186,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) va |= ssize << 8; sllp = get_sllp_encoding(apsize); va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -286,7 +205,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) */ va |= (vpn & 0xfe); va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -324,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -338,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -347,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -367,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -390,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -431,6 +360,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -443,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -486,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -549,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -567,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -580,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -597,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -621,10 +567,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -684,10 +631,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -869,8 +814,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index ad5eff097d31..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -13,10 +13,10 @@ #include <asm/sections.h> #include <asm/mmu.h> #include <asm/tlb.h> +#include <asm/firmware.h> #include <mm/mmu_decl.h> -#define CREATE_TRACE_POINTS #include <trace/events/thp.h> #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) @@ -256,7 +256,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * the __collapse_huge_page_copy can result in copying * the old content. */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -378,7 +378,7 @@ int hash__has_transparent_hugepage(void) if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) return 0; /* - * We need to make sure that we support 16MB hugepage in a segement + * We need to make sure that we support 16MB hugepage in a segment * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE * of 64K. */ @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); @@ -541,7 +543,7 @@ void hash__mark_rodata_ro(void) unsigned long start, end, pp; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index eb0bccaf221e..a64ea0a7ef96 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index cfd45245d009..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -37,6 +37,9 @@ #include <linux/cpu.h> #include <linux/pgtable.h> #include <linux/debugfs.h> +#include <linux/random.h> +#include <linux/elf-randomize.h> +#include <linux/of_fdt.h> #include <asm/interrupt.h> #include <asm/processor.h> @@ -46,7 +49,6 @@ #include <asm/types.h> #include <linux/uaccess.h> #include <asm/machdep.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/eeh.h> #include <asm/tlb.h> @@ -99,8 +101,6 @@ */ static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); u8 hpte_page_sizes[1 << LP_BITS]; EXPORT_SYMBOL_GPL(hpte_page_sizes); @@ -114,9 +114,6 @@ EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; EXPORT_SYMBOL_GPL(mmu_kernel_ssize); @@ -126,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -#ifdef CONFIG_DEBUG_PAGEALLOC static u8 *linear_map_hash_slots; static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); @@ -175,6 +169,110 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + ppc_after_tlbiel_barrier(); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the partition table cache if this is HV mode. + */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + + /* + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. + */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + ppc_after_tlbiel_barrier(); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + /* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* @@ -307,7 +405,7 @@ repeat: ssize); if (ret == -1) { /* - * Try to to keep bolted entries in primary. + * Try to keep bolted entries in primary. * Remove non bolted entries and try insert again */ ret = mmu_hash_ops.hpte_remove(hpteg); @@ -326,11 +424,9 @@ repeat: break; cond_resched(); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled_or_kfence() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ } return ret < 0 ? ret : 0; } @@ -563,7 +659,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, } #endif /* CONFIG_HUGETLB_PAGE */ -static void mmu_psize_set_default_penc(void) +static void __init mmu_psize_set_default_penc(void) { int bpsize, apsize; for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) @@ -573,7 +669,7 @@ static void mmu_psize_set_default_penc(void) #ifdef CONFIG_PPC_64K_PAGES -static bool might_have_hea(void) +static bool __init might_have_hea(void) { /* * The HEA ethernet adapter requires awareness of the @@ -644,7 +740,7 @@ static void __init htab_scan_page_sizes(void) * low-order N bits as the encoding for the 2^(12+N) byte page size * (if it exists). */ -static void init_hpte_page_sizes(void) +static void __init init_hpte_page_sizes(void) { long int ap, bp; long int shift, penc; @@ -677,7 +773,7 @@ static void __init htab_init_page_sizes(void) bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled()) { + if (!debug_pagealloc_enabled_or_kfence()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default @@ -965,8 +1061,7 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { + if (debug_pagealloc_enabled_or_kfence()) { linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; linear_map_hash_slots = memblock_alloc_try_nid( linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, @@ -975,7 +1070,6 @@ static void __init htab_initialize(void) panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", __func__, linear_map_hash_count, &ppc64_rma_size); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ /* create bolted the linear mapping in the hash table */ for_each_mem_range(i, &base, &end) { @@ -1091,7 +1185,7 @@ void __init hash__early_init_mmu(void) ps3_early_mm_init(); else if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE)) hpte_init_native(); if (!mmu_hash_ops.hpte_insert) @@ -1165,7 +1259,6 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) return pp; } -#ifdef CONFIG_PPC_MM_SLICES static unsigned int get_paca_psize(unsigned long addr) { unsigned char *psizes; @@ -1182,12 +1275,6 @@ static unsigned int get_paca_psize(unsigned long addr) return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; } -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif /* * Demote a segment to using 4k pages. @@ -1244,7 +1331,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) spp >>= 30 - 2 * ((ea >> 12) & 0xf); /* - * 0 -> full premission + * 0 -> full permission * 1 -> Read only * 2 -> no access. * We return the flag that need to be cleared. @@ -1522,8 +1609,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -DECLARE_INTERRUPT_HANDLER(__do_hash_fault); -DEFINE_INTERRUPT_HANDLER(__do_hash_fault) +DEFINE_INTERRUPT_HANDLER(do_hash_fault) { unsigned long ea = regs->dar; unsigned long dsisr = regs->dsisr; @@ -1566,7 +1652,7 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault) err = hash_page_mm(mm, ea, access, TRAP(regs), flags); if (unlikely(err < 0)) { - // failed to instert a hash PTE due to an hypervisor error + // failed to insert a hash PTE due to an hypervisor error if (user_mode(regs)) { if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2) _exception(SIGSEGV, regs, SEGV_ACCERR, ea); @@ -1582,36 +1668,6 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault) } } -/* - * The _RAW interrupt entry checks for the in_nmi() case before - * running the full handler. - */ -DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) -{ - /* - * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then - * don't call hash_page, just fail the fault. This is required to - * prevent re-entrancy problems in the hash code, namely perf - * interrupts hitting while something holds H_PAGE_BUSY, and taking a - * hash fault. See the comment in hash_preload(). - * - * We come here as a result of a DSI at a point where we don't want - * to call hash_page, such as when we are accessing memory (possibly - * user memory) inside a PMU interrupt that occurred while interrupts - * were soft-disabled. We want to invoke the exception handler for - * the access, or panic if there isn't a handler. - */ - if (unlikely(in_nmi())) { - do_bad_page_fault_segv(regs); - return 0; - } - - __do_hash_fault(regs); - - return 0; -} - -#ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { int psize = get_slice_psize(mm, ea); @@ -1628,12 +1684,6 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) return true; } -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) @@ -1677,26 +1727,18 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* - * __hash_page_* must run with interrupts off, as it sets the - * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any - * time and may take a hash fault reading the user stack, see - * read_user_stack_slow() in the powerpc/perf code. - * - * If that takes a hash fault on the same page as we lock here, it - * will bail out when seeing H_PAGE_BUSY set, and retry the access - * leading to an infinite loop. + * __hash_page_* must run with interrupts off, including PMI interrupts + * off, as it sets the H_PAGE_BUSY bit. * - * Disabling interrupts here does not prevent perf interrupts, but it - * will prevent them taking hash faults (see the NMI test in - * do_hash_page), then read_user_stack's copy_from_user_nofault will - * fail and perf will fall back to read_user_stack_slow(), which - * walks the Linux page tables. + * It's otherwise possible for perf interrupts to hit at any time and + * may take a hash fault reading the user stack, which could take a + * hash miss and deadlock on the same H_PAGE_BUSY bit. * * Interrupts must also be off for the duration of the * mm_is_thread_local test and update, to prevent preempt running the * mm on another CPU (XXX: this may be racy vs kthread_use_mm). */ - local_irq_save(flags); + powerpc_local_irq_pmu_save(flags); /* Is that local to this CPU ? */ if (mm_is_thread_local(mm)) @@ -1721,7 +1763,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, mm_ctx_user_psize(&mm->context), pte_val(*ptep)); - local_irq_restore(flags); + powerpc_local_irq_pmu_restore(flags); } /* @@ -1732,7 +1774,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* @@ -1742,9 +1784,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) - return; - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; @@ -1941,7 +1980,9 @@ repeat: return slot; } -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); + static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash; @@ -1956,15 +1997,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) if (!vsid) return; + if (linear_map_hash_slots[lmi] & 0x80) + return; + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -1974,11 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + raw_spin_lock(&linear_map_hash_lock); + if (!(linear_map_hash_slots[lmi] & 0x80)) { + raw_spin_unlock(&linear_map_hash_lock); + return; + } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -2006,7 +2053,7 @@ void hash__kernel_map_pages(struct page *page, int numpages, int enable) } local_irq_restore(flags); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) @@ -2086,3 +2133,20 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* + * If we are using 1TB segments and we are allowed to randomise + * the heap, we can put it above 1TB so it is backed by a 1TB + * segment. Otherwise the heap will be in the bottom 1TB + * which always uses 256MB segments and this may result in a + * performance penalty. + */ + if (is_32bit_task()) + return randomize_page(mm->brk, SZ_32M); + else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T) + return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G); + else + return randomize_page(mm->brk, SZ_1G); +} diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c new file mode 100644 index 000000000000..3bc0eb21b2a0 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + +#ifdef CONFIG_PPC_64S_HASH_MMU +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, unsigned int shift, unsigned int mmu_psize) +{ + real_pte_t rpte; + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa; + long slot, offset; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + + rflags = htab_convert_pte_flags(new_pte, flags); + if (unlikely(mmu_psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; + rpte = __real_pte(__pte(old_pte), ptep, offset); + + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long gslot; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} +#endif + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} + +void __init hugetlbpage_init_defaultsize(void) +{ + /* Set default large page size. Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; +} diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index cd18e94d0843..7fcfba162e0d 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -305,24 +305,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(mm_iommu_lookup); -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -369,56 +351,6 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, unsigned int pageshift, unsigned long *size) { diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c10fc8a72fb3..c766e4c26e42 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -31,7 +31,8 @@ static int alloc_context_id(int min_id, int max_id) return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); } -void hash__reserve_context_id(int id) +#ifdef CONFIG_PPC_64S_HASH_MMU +void __init hash__reserve_context_id(int id) { int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); @@ -50,7 +51,9 @@ int hash__alloc_context_id(void) return alloc_context_id(MIN_USER_CONTEXT, max); } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +#endif +#ifdef CONFIG_PPC_64S_HASH_MMU static int realloc_context_ids(mm_context_t *ctx) { int i, id; @@ -150,6 +153,13 @@ void hash__setup_new_exec(void) slb_setup_new_exec(); } +#else +static inline int hash__init_new_context(struct mm_struct *mm) +{ + BUILD_BUG(); + return 0; +} +#endif static int radix__init_new_context(struct mm_struct *mm) { @@ -175,7 +185,9 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); +#ifdef CONFIG_PPC_64S_HASH_MMU mm->context.hash_context = NULL; +#endif return index; } @@ -213,14 +225,22 @@ EXPORT_SYMBOL_GPL(__destroy_context); static void destroy_contexts(mm_context_t *ctx) { - int index, context_id; + if (radix_enabled()) { + ida_free(&mmu_context_ida, ctx->id); + } else { +#ifdef CONFIG_PPC_64S_HASH_MMU + int index, context_id; - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +#else + BUILD_BUG(); // radix_enabled() should be constant true +#endif } - kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9e16c7b1a6c5..f6151a589298 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,6 +6,8 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/memblock.h> +#include <linux/memremap.h> +#include <linux/pkeys.h> #include <linux/debugfs.h> #include <misc/cxl-base.h> @@ -22,6 +24,13 @@ #include "internal.h" +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif + unsigned long __pmd_frag_nr; EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; @@ -207,17 +216,12 @@ void __init mmu_partition_table_init(void) unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; unsigned long ptcr; - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); /* Initialize the Partition Table with no entries */ partition_tb = memblock_alloc(patb_size, patb_size); if (!partition_tb) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, patb_size, patb_size); - /* - * update partition table control register, - * 64 K size. - */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); @@ -328,7 +332,7 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* * If we find pgtable_page set, we return - * the allocated page with single fragement + * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { @@ -526,3 +530,46 @@ static int __init pgtable_debugfs_setup(void) return 0; } arch_initcall(pgtable_debugfs_setup); + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN) +/* + * Override the generic version in mm/memremap.c. + * + * With hash translation, the direct-map range is mapped with just one + * page size selected by htab_init_page_sizes(). Consult + * mmu_psize_defs[] to determine the minimum page size alignment. +*/ +unsigned long memremap_compat_align(void) +{ + if (!radix_enabled()) { + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; + return max(SUBSECTION_SIZE, 1UL << shift); + } + + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) { + prot = pgprot_val(PAGE_EXECONLY); + } else { + prot = pgprot_val(protection_map[vm_flags & + (VM_ACCESS_FLAGS | VM_SHARED)]); + } + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index a2d9ad138709..1d2675ab6711 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -10,6 +10,7 @@ #include <asm/mmu.h> #include <asm/setup.h> #include <asm/smp.h> +#include <asm/firmware.h> #include <linux/pkeys.h> #include <linux/of_fdt.h> @@ -66,7 +67,7 @@ static int __init dt_scan_storage_keys(unsigned long node, return 1; } -static int scan_pkey_feature(void) +static int __init scan_pkey_feature(void) { int ret; int pkeys_total = 0; diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 23d3e08911d3..5e3195568525 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -41,61 +41,6 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); } -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. - */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); -} - void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) @@ -103,11 +48,13 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, pte); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 3a600bd7fbc6..cac727b01799 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -30,10 +30,12 @@ #include <asm/trace.h> #include <asm/uaccess.h> #include <asm/ultravisor.h> +#include <asm/set_memory.h> #include <trace/events/thp.h> -unsigned int mmu_pid_bits; +#include <mm/mmu_decl.h> + unsigned int mmu_base_pid; unsigned long radix_mem_block_size __ro_after_init; @@ -229,7 +231,7 @@ void radix__mark_rodata_ro(void) unsigned long start, end; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); } @@ -260,21 +262,24 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); #endif return end; } static int __meminit create_physical_mapping(unsigned long start, unsigned long end, - unsigned long max_mapping_size, int nid, pgprot_t _prot) { unsigned long vaddr, addr, mapping_size = 0; bool prev_exec, exec = false; pgprot_t prot; int psize; + unsigned long max_mapping_size = radix_mem_block_size; + + if (debug_pagealloc_enabled_or_kfence()) + max_mapping_size = PAGE_SIZE; start = ALIGN(start, PAGE_SIZE); end = ALIGN_DOWN(end, PAGE_SIZE); @@ -335,7 +340,7 @@ static void __init radix_init_pgtable(void) u64 i; /* We don't support slb for radix */ - mmu_slb_size = 0; + slb_set_size(0); /* * Create the linear mapping @@ -353,22 +358,16 @@ static void __init radix_init_pgtable(void) } WARN_ON(create_physical_mapping(start, end, - radix_mem_block_size, -1, PAGE_KERNEL)); } - /* Find out how many PID bits are supported */ if (!cpu_has_feature(CPU_FTR_HVMODE) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { /* - * Older versions of KVM on these machines perfer if the + * Older versions of KVM on these machines prefer if the * guest only uses the low 19 PID bits. */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - } else { - if (!mmu_pid_bits) - mmu_pid_bits = 20; + mmu_pid_bits = 19; } mmu_base_pid = 1; @@ -449,11 +448,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - /* Grab page size encodings */ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); if (!prop) @@ -510,7 +504,7 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname, return 1; } -static unsigned long radix_memory_block_size(void) +static unsigned long __init radix_memory_block_size(void) { unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE; @@ -528,7 +522,7 @@ static unsigned long radix_memory_block_size(void) #else /* CONFIG_MEMORY_HOTPLUG */ -static unsigned long radix_memory_block_size(void) +static unsigned long __init radix_memory_block_size(void) { return 1UL * 1024 * 1024 * 1024; } @@ -572,22 +566,11 @@ void __init radix__early_init_devtree(void) return; } -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - void __init radix__early_init_mmu(void) { unsigned long lpcr; +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; @@ -605,6 +588,7 @@ void __init radix__early_init_mmu(void) } else mmu_vmemmap_psize = mmu_virtual_psize; #endif +#endif /* * initialize page table size */ @@ -644,7 +628,6 @@ void __init radix__early_init_mmu(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); - radix_init_amor(); } else { radix_init_pseries(); } @@ -668,8 +651,6 @@ void radix__early_init_mmu_secondary(void) set_ptcr_when_no_uv(__pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - - radix_init_amor(); } radix__switch_mmu_context(NULL, &init_mm); @@ -874,7 +855,7 @@ int __meminit radix__create_section_mapping(unsigned long start, } return create_physical_mapping(__pa(start), __pa(end), - radix_mem_block_size, nid, prot); + nid, prot); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -920,10 +901,17 @@ void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long #endif #endif -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) void radix__kernel_map_pages(struct page *page, int numpages, int enable) { - pr_warn_once("DEBUG_PAGEALLOC not supported in radix mode\n"); + unsigned long addr; + + addr = (unsigned long)page_address(page); + + if (enable) + set_memory_p(addr, numpages); + else + set_memory_np(addr, numpages); } #endif @@ -961,15 +949,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /* - * pmdp collapse_flush need to ensure that there are no parallel gup - * walk after this call. This is needed so that we can have stable - * page ref count when collapsing a page. We don't allow a collapse page - * if we have gup taken on the page. We can ensure that by sending IPI - * because gup walk happens with IRQ disabled. - */ - serialize_against_pte_lookup(vma->vm_mm); - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; @@ -1042,16 +1021,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, unsigned long change = pte_val(entry) ^ pte_val(*ptep); /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. + * On POWER9, the NMMU is not able to relax PTE access permissions + * for a translation with a TLB. The PTE must be invalidated, TLB + * flushed before the new PTE is installed. + * + * This only needs to be done for radix, because hash translation does + * flush when updating the linux pte (and we don't support NMMU + * accelerators on HPT on POWER9 anyway XXX: do we?). + * + * POWER10 (and P9P) NMMU does behave as per ISA. */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && + atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ new_pte = old_pte | set; radix__flush_tlb_page_psize(mm, address, psize); __radix_pte_update(ptep, _PAGE_INVALID, new_pte); @@ -1059,9 +1043,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, __radix_pte_update(ptep, 0, set); /* * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architecture. + * restrictions when the address space (modulo the POWER9 nest + * MMU issue above) because the MMU will reload the PTE after + * taking an access fault, as defined by the architecture. See + * "Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates)" in + * Power ISA Version 3.1B. */ } /* See ptesync comment in radix__set_pte_at */ @@ -1074,11 +1061,12 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && (atomic_read(&mm->context.copros) > 0)) radix__flush_tlb_page(vma, addr); @@ -1100,7 +1088,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (pud_huge(*pud)) { + if (pud_is_leaf(*pud)) { pud_clear(pud); return 1; } @@ -1147,7 +1135,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 7724af19ed7e..4e29b619578c 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -397,7 +397,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) /* * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint + * must be a compile-time constraint to match the "i" constraint * in the asm statement. */ switch (ric) { @@ -755,10 +755,18 @@ EXPORT_SYMBOL(radix__local_flush_tlb_page); static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; if (atomic_read(&mm->context.copros) > 0) return true; return false; diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index f0037bcc47a0..6956f637a38c 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -9,7 +9,6 @@ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM */ -#include <asm/asm-prototypes.h> #include <asm/interrupt.h> #include <asm/mmu.h> #include <asm/mmu_context.h> @@ -348,7 +347,7 @@ void slb_setup_new_exec(void) /* * We have no good place to clear the slb preload cache on exec, * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. + * after we switch to the mm and have already preloaded the SLBEs. * * For the most part that's probably okay to use entries from the * previous exec, they will age out if unused. It may turn out to @@ -616,7 +615,7 @@ static void slb_cache_update(unsigned long esid_data) } else { /* * Our cache is full and the current cache content strictly - * doesn't indicate the active SLB conents. Bump the ptr + * doesn't indicate the active SLB contents. Bump the ptr * so that switch_slb() will ignore the cache. */ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; @@ -868,19 +867,3 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) return err; } } - -DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault) -{ - int err = regs->result; - - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); - else - bad_page_fault(regs, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/book3s64/slice.c index 82b45b1cb973..c0b58afb9a47 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -276,20 +276,18 @@ static bool slice_scan_available(unsigned long addr, } static unsigned long slice_find_area_bottomup(struct mm_struct *mm, - unsigned long len, + unsigned long addr, unsigned long len, const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, next_end; + unsigned long found, next_end; struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); info.align_offset = 0; - - addr = TASK_UNMAPPED_BASE; /* * Check till the allow max value for this mmap request */ @@ -322,12 +320,12 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, } static unsigned long slice_find_area_topdown(struct mm_struct *mm, - unsigned long len, + unsigned long addr, unsigned long len, const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, prev; + unsigned long found, prev; struct vm_unmapped_area_info info; unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); @@ -335,8 +333,6 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, info.length = len; info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); info.align_offset = 0; - - addr = mm->mmap_base; /* * If we are trying to allocate above DEFAULT_MAP_WINDOW * Add the different to the mmap_base. @@ -377,7 +373,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. */ - return slice_find_area_bottomup(mm, len, available, psize, high_limit); + return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit); } @@ -386,9 +382,9 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, int topdown, unsigned long high_limit) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize, high_limit); + return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit); else - return slice_find_area_bottomup(mm, len, mask, psize, high_limit); + return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit); } static inline void slice_copy_mask(struct slice_mask *dst, @@ -645,6 +641,9 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long pgoff, unsigned long flags) { + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + return slice_get_unmapped_area(addr, len, flags, mm_ctx_user_psize(¤t->mm->context), 0); } @@ -655,6 +654,9 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long pgoff, const unsigned long flags) { + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + return slice_get_unmapped_area(addr0, len, flags, mm_ctx_user_psize(¤t->mm->context), 1); } @@ -712,7 +714,6 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } -#ifdef CONFIG_PPC_BOOK3S_64 void slice_setup_new_exec(void) { struct mm_struct *mm = current->mm; @@ -724,7 +725,6 @@ void slice_setup_new_exec(void) mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } -#endif void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) @@ -779,4 +779,29 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, return !slice_check_range_fits(mm, maskp, addr, len); } + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + /* With radix we don't use slice, so derive it from vma*/ + if (radix_enabled()) + return vma_kernel_pagesize(vma); + + return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); +} + +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + return shift_to_mmu_psize(huge_page_shift(hstate)); +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + if (radix_enabled()) + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + + return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); +} #endif diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972..d73b3b4176e8 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; + for_each_vma_range(vmi, vma, addr + len) { vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c new file mode 100644 index 000000000000..ccd64b5e6cac --- /dev/null +++ b/arch/powerpc/mm/book3s64/trace.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file is for defining trace points and trace related helpers. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include <trace/events/thp.h> +#endif diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 63363787e000..0e9b4879c0f9 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -12,7 +12,7 @@ static inline bool flush_coherent_icache(void) /* * For a snooping icache, we still need a dummy icbi to purge all the * prefetched instructions from the ifetch buffers. We also need a sync - * before the icbi to order the the actual stores to memory that might + * before the icbi to order the actual stores to memory that might * have modified instructions with the icbi. */ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 8acd00178956..7c507fb48182 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, ret = 0; *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); + + /* The fault is fully completed (including releasing mmap lock) */ + if (*flt & VM_FAULT_COMPLETED) + return 0; + if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; @@ -82,6 +87,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(copro_handle_mm_fault); +#ifdef CONFIG_PPC_64S_HASH_MMU int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) { u64 vsid, vsidkey; @@ -146,3 +152,4 @@ void copro_flush_all_slbs(struct mm_struct *mm) cxl_slbia(mm); } EXPORT_SYMBOL_GPL(copro_flush_all_slbs); +#endif diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 22197b18d85e..2369d1bf2411 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -11,7 +11,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/memblock.h> -#include <asm/prom.h> +#include <linux/slab.h> #include <asm/drmem.h> static int n_root_addr_cells, n_root_size_cells; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a8d0ce85d39a..2bef19cc1b98 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -270,7 +270,11 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma return false; } - if (unlikely(!vma_is_accessible(vma))) + /* + * Check for a read fault. This could be caused by a read on an + * inaccessible page (i.e. PROT_NONE), or a Radix MMU execute-only page. + */ + if (unlikely(!(vma->vm_flags & VM_READ))) return true; /* * We should ideally do the vma pkey access check here. But in the @@ -367,7 +371,22 @@ static void sanity_check_fault(bool is_write, bool is_user, #elif defined(CONFIG_PPC_8xx) #define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) #elif defined(CONFIG_PPC64) -#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +static int page_fault_is_bad(unsigned long err) +{ + unsigned long flag = DSISR_BAD_FAULT_64S; + + /* + * PAPR+ v2.11 § 14.15.3.4.1 (unreleased) + * If byte 0, bit 3 of pi-attribute-specifier-type in + * ibm,pi-features property is defined, ignore the DSI error + * which is caused by the paste instruction on the + * suspended NX window. + */ + if (mmu_has_feature(MMU_FTR_NX_DSI)) + flag &= ~DSISR_BAD_COPYPASTE; + + return err & flag; +} #else #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) #endif @@ -511,15 +530,17 @@ retry: if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto out; + /* * Handle the retry right now, the mmap_lock has been released in that * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(current->mm); @@ -527,6 +548,7 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); +out: /* * Major/minor page fault accounting. */ @@ -568,18 +590,24 @@ NOKPROBE_SYMBOL(hash__do_page_fault); static void __bad_page_fault(struct pt_regs *regs, int sig) { int is_write = page_fault_is_write(regs->dsisr); + const char *msg; /* kernel has accessed a bad area */ + if (regs->dar < PAGE_SIZE) + msg = "Kernel NULL pointer dereference"; + else + msg = "Unable to handle kernel data access"; + switch (TRAP(regs)) { case INTERRUPT_DATA_STORAGE: - case INTERRUPT_DATA_SEGMENT: case INTERRUPT_H_DATA_STORAGE: - pr_alert("BUG: %s on %s at 0x%08lx\n", - regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : - "Unable to handle kernel data access", + pr_alert("BUG: %s on %s at 0x%08lx\n", msg, is_write ? "write" : "read", regs->dar); break; + case INTERRUPT_DATA_SEGMENT: + pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar); + break; case INTERRUPT_INST_STORAGE: case INTERRUPT_INST_SEGMENT: pr_alert("BUG: Unable to handle kernel instruction fetch%s", @@ -620,4 +648,27 @@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv) { bad_page_fault(regs, SIGSEGV); } + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 82d8b368ca6d..5852a86d990d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,7 @@ #include <asm/setup.h> #include <asm/hugetlb.h> #include <asm/pte-walk.h> +#include <asm/firmware.h> bool hugetlb_disabled = false; @@ -391,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, * single hugepage, but all of them point to * the same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd)); if (more > next) next = more; @@ -433,7 +434,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, * single hugepage, but all of them point to * the same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud)); if (more > next) next = more; @@ -495,7 +496,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, * for a single hugepage, but all of them point to the * same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pgd)); if (more > next) next = more; @@ -542,34 +543,6 @@ retry: return page; } -#ifdef CONFIG_PPC_MM_SLICES -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct hstate *hstate = hstate_file(file); - int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - -#ifdef CONFIG_PPC_RADIX_MMU - if (radix_enabled()) - return radix__hugetlb_get_unmapped_area(file, addr, len, - pgoff, flags); -#endif - return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); -} -#endif - -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - /* With radix we don't use slice, so derive it from vma*/ - if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { - unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); - } - return vma_kernel_pagesize(vma); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); @@ -650,7 +623,7 @@ static int __init hugetlbpage_init(void) if (pdshift > shift) { if (!IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(pdshift - shift); - } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || + } else if (IS_ENABLED(CONFIG_PPC_E500) || IS_ENABLED(CONFIG_PPC_8xx)) { pgtable_cache_add(PTE_T_ORDER); } @@ -658,10 +631,7 @@ static int __init hugetlbpage_init(void) configured = true; } - if (configured) { - if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) - hugetlbpage_init_default(); - } else + if (!configured) pr_info("Failed to initialize. Disabling HugeTLB"); return 0; diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 3a82f89827a5..119ef491f797 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -20,6 +20,7 @@ #include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/kup.h> +#include <asm/smp.h> phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; EXPORT_SYMBOL_GPL(memstart_addr); @@ -33,6 +34,9 @@ bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); static int __init parse_nosmep(char *p) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return 0; + disable_kuep = true; pr_warn("Disabling Kernel Userspace Execution Prevention\n"); return 0; @@ -47,6 +51,23 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); +void __weak setup_kuep(bool disabled) +{ + if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled) + return; + + if (smp_processor_id() != boot_cpuid) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); +} + +void setup_kup(void) +{ + setup_kuap(disable_kuap); + setup_kuep(disable_kuep); +} + #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ memset(addr, 0, sizeof(void *) << (shift)); \ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3d690be48e84..d4cc3749e621 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -29,7 +29,6 @@ #include <linux/slab.h> #include <linux/hugetlb.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/mmu.h> #include <asm/smp.h> @@ -70,44 +69,10 @@ EXPORT_SYMBOL(agp_special_page); void MMU_init(void); -/* - * this tells the system to map all of ram with the segregs - * (i.e. page tables) instead of the bats. - * -- Cort - */ -int __map_without_bats; -int __map_without_ltlbs; - /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; /* - * Check for command-line options that affect what MMU_init will do. - */ -static void __init MMU_setup(void) -{ - /* Check for nobats option (used in mapin_ram). */ - if (strstr(boot_command_line, "nobats")) { - __map_without_bats = 1; - } - - if (strstr(boot_command_line, "noltlbs")) { - __map_without_ltlbs = 1; - } - if (IS_ENABLED(CONFIG_PPC_8xx)) - return; - - if (IS_ENABLED(CONFIG_KFENCE)) - __map_without_ltlbs = 1; - - if (debug_pagealloc_enabled()) - __map_without_ltlbs = 1; - - if (strict_kernel_rwx_enabled()) - __map_without_ltlbs = 1; -} - -/* * MMU_init sets up the basic memory mappings for the kernel, * including both RAM and possibly some I/O regions, * and sets up the page tables and the MMU hardware ready to go. @@ -117,31 +82,15 @@ void __init MMU_init(void) if (ppc_md.progress) ppc_md.progress("MMU:enter", 0x111); - /* parse args from command line */ - MMU_setup(); - - /* - * Reserve gigantic pages for hugetlb. This MUST occur before - * lowmem_end_addr is initialized below. - */ - if (memblock.memory.cnt > 1) { -#ifndef CONFIG_WII - memblock_enforce_memory_limit(memblock.memory.regions[0].size); - pr_warn("Only using first contiguous memory region\n"); -#else - wii_memory_fixups(); -#endif - } - total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; lowmem_end_addr = memstart_addr + total_lowmem; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB * entries, so we need to adjust lowmem to match the amount we can map * in the fixed entries */ adjust_total_lowmem(); -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 386be136026e..05b0d584e50b 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -59,6 +59,7 @@ #include <asm/sections.h> #include <asm/iommu.h> #include <asm/vdso.h> +#include <asm/hugetlb.h> #include <mm/mmu_decl.h> @@ -110,7 +111,7 @@ static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_m } /* - * vmemmap virtual address space management does not have a traditonal page + * vmemmap virtual address space management does not have a traditional page * table to track which virtual struct pages are backed by physical mapping. * The virtual to physical mappings are tracked in a simple linked list * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at @@ -127,7 +128,7 @@ static struct vmemmap_backing *next; /* * The same pointer 'next' tracks individual chunks inside the allocated - * full page during the boot time and again tracks the freeed nodes during + * full page during the boot time and again tracks the freed nodes during * runtime. It is racy but it does not happen as they are separated by the * boot process. Will create problem if some how we have memory hotplug * operation during boot !! @@ -370,6 +371,12 @@ void register_page_bootmem_memmap(unsigned long section_nr, #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 +unsigned int mmu_lpid_bits; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(mmu_lpid_bits); +#endif +unsigned int mmu_pid_bits; + static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); static int __init parse_disable_radix(char *p) @@ -437,11 +444,56 @@ static void __init early_check_vec5(void) } } +static int __init dt_scan_mmu_pid_width(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU LPID, PID register size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size); + if (prop && size == 4) + mmu_lpid_bits = be32_to_cpup(prop); + + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + if (!mmu_pid_bits && !mmu_lpid_bits) + return 0; + + return 1; +} + void __init mmu_early_init_devtree(void) { + bool hvmode = !!(mfmsr() & MSR_HV); + /* Disable radix mode based on kernel command line. */ - if (disable_radix) - cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + if (disable_radix) { + if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + else + pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); + } + + of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); + if (hvmode && !mmu_lpid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + mmu_lpid_bits = 12; /* POWER8-10 */ + else + mmu_lpid_bits = 10; /* POWER7 */ + } + if (!mmu_pid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + mmu_pid_bits = 20; /* POWER9-10 */ + } /* * Check /chosen/ibm,architecture-vec-5 if running as a guest. @@ -449,11 +501,12 @@ void __init mmu_early_init_devtree(void) * even though the ibm,architecture-vec-5 property created by * skiboot doesn't have the necessary bits set. */ - if (!(mfmsr() & MSR_HV)) + if (!hvmode) early_check_vec5(); if (early_radix_enabled()) { radix__early_init_devtree(); + /* * We have finalized the translation we are going to use by now. * Radix mode is not limited by RMA / VRMA addressing. @@ -463,5 +516,12 @@ void __init mmu_early_init_devtree(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } else hash__early_init_devtree(); + + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_defaultsize(); + + if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && + !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) + panic("kernel does not support any MMU type offered by platform"); } #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 57342154d2b0..4f12504fb405 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -98,23 +98,3 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, return NULL; } - -#ifdef CONFIG_ZONE_DEVICE -/* - * Override the generic version in mm/memremap.c. - * - * With hash translation, the direct-map range is mapped with just one - * page size selected by htab_init_page_sizes(). Consult - * mmu_psize_defs[] to determine the minimum page size alignment. -*/ -unsigned long memremap_compat_align(void) -{ - unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; - - if (radix_enabled()) - return SUBSECTION_SIZE; - return max(SUBSECTION_SIZE, 1UL << shift); - -} -EXPORT_SYMBOL_GPL(memremap_compat_align); -#endif diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index bb1a5408b86b..699eeffd9f55 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -2,6 +2,8 @@ KASAN_SANITIZE := n -obj-$(CONFIG_PPC32) += kasan_init_32.o +obj-$(CONFIG_PPC32) += init_32.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o +obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c index 202bd260a009..450a67ef0bbe 100644 --- a/arch/powerpc/mm/kasan/book3s_32.c +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -10,47 +10,51 @@ int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); - unsigned long k_cur = k_start; - int k_size = k_end - k_start; - int k_size_base = 1 << (ffs(k_size) - 1); + unsigned long k_nobat = k_start; + unsigned long k_cur; + phys_addr_t phys; int ret; - void *block; - block = memblock_alloc(k_size, k_size_base); - - if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) { - int k_size_more = 1 << (ffs(k_size - k_size_base) - 1); - - setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL); - if (k_size_more >= SZ_128K) - setbat(-1, k_start + k_size_base, __pa(block) + k_size_base, - k_size_more, PAGE_KERNEL); - if (v_block_mapped(k_start)) - k_cur = k_start + k_size_base; - if (v_block_mapped(k_start + k_size_base)) - k_cur = k_start + k_size_base + k_size_more; - - update_bats(); + while (k_nobat < k_end) { + unsigned int k_size = bat_block_size(k_nobat, k_end); + int idx = find_free_bat(); + + if (idx == -1) + break; + if (k_size < SZ_128K) + break; + phys = memblock_phys_alloc_range(k_size, k_size, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + break; + + setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL); + k_nobat += k_size; } + if (k_nobat != k_start) + update_bats(); - if (!block) - block = memblock_alloc(k_size, PAGE_SIZE); - if (!block) - return -ENOMEM; + if (k_nobat < k_end) { + phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + return -ENOMEM; + } ret = kasan_init_shadow_page_tables(k_start, k_end); if (ret) return ret; - kasan_update_early_region(k_start, k_cur, __pte(0)); + kasan_update_early_region(k_start, k_nobat, __pte(0)); - for (; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_off_k(k_cur); - void *va = block + k_cur - k_start; - pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL); __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); } flush_tlb_kernel_range(k_start, k_end); + memset(kasan_mem_to_shadow(start), 0, k_end - k_start); + return 0; } diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c new file mode 100644 index 000000000000..a70828a6d935 --- /dev/null +++ b/arch/powerpc/mm/kasan/init_32.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/sched/task.h> +#include <asm/pgalloc.h> +#include <asm/code-patching.h> +#include <mm/mmu_decl.h> + +static pgprot_t __init kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + +static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1); +} + +int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + + pmd = pmd_off_k(k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, PAGE_KERNEL); + pmd_populate_kernel(&init_mm, pmd, new); + } + return 0; +} + +int __init __weak kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_cur; + int ret; + void *block; + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + block = memblock_alloc(k_end - k_start, PAGE_SIZE); + if (!block) + return -ENOMEM; + + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + void *va = block + k_cur - k_start; + pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } + flush_tlb_kernel_range(k_start, k_end); + return 0; +} + +void __init +kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte) +{ + unsigned long k_cur; + + for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if (pte_page(*ptep) != virt_to_page(lm_alias(kasan_early_shadow_page))) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, pte, 0); + } + + flush_tlb_kernel_range(k_start, k_end); +} + +static void __init kasan_remap_early_shadow_ro(void) +{ + pgprot_t prot = kasan_prot_ro(); + phys_addr_t pa = __pa(kasan_early_shadow_page); + + kasan_populate_pte(kasan_early_shadow_pte, prot); + + kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END, + pfn_pte(PHYS_PFN(pa), prot)); +} + +static void __init kasan_unmap_early_shadow_vmalloc(void) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END); + + kasan_update_early_region(k_start, k_end, __pte(0)); + +#ifdef MODULES_VADDR + k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR); + k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END); + kasan_update_early_region(k_start, k_end, __pte(0)); +#endif +} + +void __init kasan_mmu_init(void) +{ + int ret; + + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } +} + +void __init kasan_init(void) +{ + phys_addr_t base, end; + u64 i; + int ret; + + for_each_mem_range(i, &base, &end) { + phys_addr_t top = min(end, total_lowmem); + + if (base >= top) + continue; + + ret = kasan_init_region(__va(base), top - base); + if (ret) + panic("kasan: kasan_init_region() failed"); + } + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + + kasan_remap_early_shadow_ro(); + + clear_page(kasan_early_shadow_page); + + /* At this point kasan is fully initialized. Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_late_init(void) +{ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_unmap_early_shadow_vmalloc(); +} + +void __init kasan_early_init(void) +{ + unsigned long addr = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + pmd_t *pmd = pmd_off_k(addr); + + BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); + + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL); + + do { + next = pgd_addr_end(addr, end); + pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); + } while (pmd++, addr = next, addr != end); +} diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c new file mode 100644 index 000000000000..11519e88dc6b --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3e powerpc + * + * Copyright 2022, Christophe Leroy, CS GROUP France + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/set_memory.h> + +#include <asm/pgalloc.h> + +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} + +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (kasan_pud_table(*p4dp)) { + pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (kasan_pmd_table(*pudp)) { + pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (kasan_pte_table(*pmdp)) { + ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0); + + return 0; +} + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_early_init(void) +{ + int i; + unsigned long addr; + pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START); + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE) + p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud); +} + +void __init kasan_init(void) +{ + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region((void *)start, (void *)end); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c new file mode 100644 index 000000000000..9300d641cf9a --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3S powerpc + * + * Copyright 2019-2022, Daniel Axtens, IBM Corporation. + */ + +/* + * ppc64 turns on virtual memory late in boot, after calling into generic code + * like the device-tree parser, so it uses this in conjunction with a hook in + * outline mode to avoid invalid access early in boot. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/sched/task.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> + +DEFINE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_init(void) +{ + /* + * We want to do the following things: + * 1) Map real memory into the shadow for all physical memblocks + * This takes us from c000... to c008... + * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC + * will manage this for us. + * This takes us from c008... to c00a... + * 3) Map the 'early shadow'/zero page over iomap and vmemmap space. + * This takes us up to where we start at c00e... + */ + + void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END); + void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END); + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + if (!early_radix_enabled()) { + pr_warn("KASAN not enabled as it requires radix!"); + return; + } + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region((void *)start, (void *)end); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + /* map the early shadow over the iomap and vmemmap space */ + kasan_populate_early_shadow(k_start, k_end); + + /* mark early shadow region as RO and wipe it */ + zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + /* + * clear_page relies on some cache info that hasn't been set up yet. + * It ends up looping ~forever and blows up other data. + * Use memset instead. + */ + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + static_branch_inc(&powerpc_kasan_enabled_key); + + /* Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_early_init(void) { } + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c deleted file mode 100644 index cf8770b1a692..000000000000 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define DISABLE_BRANCH_PROFILING - -#include <linux/kasan.h> -#include <linux/printk.h> -#include <linux/memblock.h> -#include <linux/sched/task.h> -#include <asm/pgalloc.h> -#include <asm/code-patching.h> -#include <mm/mmu_decl.h> - -static pgprot_t __init kasan_prot_ro(void) -{ - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return PAGE_READONLY; - - return PAGE_KERNEL_RO; -} - -static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) -{ - unsigned long va = (unsigned long)kasan_early_shadow_page; - phys_addr_t pa = __pa(kasan_early_shadow_page); - int i; - - for (i = 0; i < PTRS_PER_PTE; i++, ptep++) - __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); -} - -int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) -{ - pmd_t *pmd; - unsigned long k_cur, k_next; - - pmd = pmd_off_k(k_start); - - for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { - pte_t *new; - - k_next = pgd_addr_end(k_cur, k_end); - if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) - continue; - - new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - - if (!new) - return -ENOMEM; - kasan_populate_pte(new, PAGE_KERNEL); - pmd_populate_kernel(&init_mm, pmd, new); - } - return 0; -} - -int __init __weak kasan_init_region(void *start, size_t size) -{ - unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); - unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); - unsigned long k_cur; - int ret; - void *block; - - ret = kasan_init_shadow_page_tables(k_start, k_end); - if (ret) - return ret; - - block = memblock_alloc(k_end - k_start, PAGE_SIZE); - if (!block) - return -ENOMEM; - - for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_off_k(k_cur); - void *va = block + k_cur - k_start; - pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); - - __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); - } - flush_tlb_kernel_range(k_start, k_end); - return 0; -} - -void __init -kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte) -{ - unsigned long k_cur; - phys_addr_t pa = __pa(kasan_early_shadow_page); - - for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_off_k(k_cur); - pte_t *ptep = pte_offset_kernel(pmd, k_cur); - - if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) - continue; - - __set_pte_at(&init_mm, k_cur, ptep, pte, 0); - } - - flush_tlb_kernel_range(k_start, k_end); -} - -static void __init kasan_remap_early_shadow_ro(void) -{ - pgprot_t prot = kasan_prot_ro(); - phys_addr_t pa = __pa(kasan_early_shadow_page); - - kasan_populate_pte(kasan_early_shadow_pte, prot); - - kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END, - pfn_pte(PHYS_PFN(pa), prot)); -} - -static void __init kasan_unmap_early_shadow_vmalloc(void) -{ - unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START); - unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END); - - kasan_update_early_region(k_start, k_end, __pte(0)); - -#ifdef MODULES_VADDR - k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR); - k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END); - kasan_update_early_region(k_start, k_end, __pte(0)); -#endif -} - -void __init kasan_mmu_init(void) -{ - int ret; - - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { - ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); - - if (ret) - panic("kasan: kasan_init_shadow_page_tables() failed"); - } -} - -void __init kasan_init(void) -{ - phys_addr_t base, end; - u64 i; - int ret; - - for_each_mem_range(i, &base, &end) { - phys_addr_t top = min(end, total_lowmem); - - if (base >= top) - continue; - - ret = kasan_init_region(__va(base), top - base); - if (ret) - panic("kasan: kasan_init_region() failed"); - } - - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); - - if (ret) - panic("kasan: kasan_init_shadow_page_tables() failed"); - } - - kasan_remap_early_shadow_ro(); - - clear_page(kasan_early_shadow_page); - - /* At this point kasan is fully initialized. Enable error messages */ - init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); -} - -void __init kasan_late_init(void) -{ - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) - kasan_unmap_early_shadow_vmalloc(); -} - -void __init kasan_early_init(void) -{ - unsigned long addr = KASAN_SHADOW_START; - unsigned long end = KASAN_SHADOW_END; - unsigned long next; - pmd_t *pmd = pmd_off_k(addr); - - BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); - - kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL); - - do { - next = pgd_addr_end(addr, end); - pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); - } while (pmd++, addr = next, addr != end); -} diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c index aad7c47e0030..ea821d0ffe16 100644 --- a/arch/powerpc/mm/maccess.c +++ b/arch/powerpc/mm/maccess.c @@ -11,20 +11,3 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return is_kernel_addr((unsigned long)unsafe_src); } - -int copy_inst_from_kernel_nofault(struct ppc_inst *inst, u32 *src) -{ - unsigned int val, suffix; - int err; - - err = copy_from_kernel_nofault(&val, src, sizeof(val)); - if (err) - return err; - if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) { - err = copy_from_kernel_nofault(&suffix, src + 1, sizeof(suffix)); - *inst = ppc_inst_prefix(val, suffix); - } else { - *inst = ppc_inst(val); - } - return err; -} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index bd5d91a31183..84d171953ba4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -17,16 +17,19 @@ #include <linux/suspend.h> #include <linux/dma-direct.h> +#include <asm/swiotlb.h> #include <asm/machdep.h> #include <asm/rtas.h> #include <asm/kasan.h> #include <asm/svm.h> #include <asm/mmzone.h> +#include <asm/ftrace.h> +#include <asm/code-patching.h> +#include <asm/setup.h> #include <mm/mmu_decl.h> unsigned long long memory_limit; -bool init_mem_is_free; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -52,6 +55,7 @@ int memory_add_physaddr_to_nid(u64 start) { return hot_add_scn_to_nid(start); } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif int __weak create_section_mapping(unsigned long start, unsigned long end, @@ -103,6 +107,37 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size) vm_unmap_aliases(); } +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +int __ref add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct mhp_params *params) +{ + int ret; + + ret = __add_pages(nid, start_pfn, nr_pages, params); + if (ret) + return ret; + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); + + return ret; +} + int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) { @@ -113,7 +148,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, rc = arch_create_linear_mapping(nid, start, size, params); if (rc) return rc; - rc = __add_pages(nid, start_pfn, nr_pages, params); + rc = add_pages(nid, start_pfn, nr_pages, params); if (rc) arch_remove_linear_mapping(start, size); return rc; @@ -249,14 +284,11 @@ void __init mem_init(void) * back to to-down. */ memblock_set_bottom_up(true); - if (is_secure_guest()) - svm_swiotlb_init(); - else - swiotlb_init(0); + swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_low_pfn); + set_max_mapnr(max_pfn); kasan_late_init(); @@ -270,13 +302,13 @@ void __init mem_init(void) for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; struct page *page = pfn_to_page(pfn); - if (!memblock_is_reserved(paddr)) + if (memblock_is_memory(paddr) && !memblock_is_reserved(paddr)) free_highmem_page(page); } } #endif /* CONFIG_HIGHMEM */ -#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) +#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up * functions.... do it here for the non-smp case. @@ -312,8 +344,9 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); - init_mem_is_free = true; + static_branch_enable(&init_mem_is_free); free_initmem_default(POISON_FREE_INITMEM); + ftrace_free_init_tramp(); } /* diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c deleted file mode 100644 index ae683fdc716c..000000000000 --- a/arch/powerpc/mm/mmap.c +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * flexible mmap layout support - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched/signal.h> -#include <linux/sched/mm.h> -#include <linux/elf-randomize.h> -#include <linux/security.h> -#include <linux/mman.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long shift, rnd; - - shift = mmap_rnd_bits; -#ifdef CONFIG_COMPAT - if (is_32bit_task()) - shift = mmap_rnd_compat_bits; -#endif - rnd = get_random_long() % (1ul << shift); - - return rnd << PAGE_SHIFT; -} - -static inline unsigned long stack_maxrandom_size(void) -{ - if (!(current->flags & PF_RANDOMIZE)) - return 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - return (1<<23); - else - return (1<<30); -} - -static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); -} - -#ifdef CONFIG_PPC_RADIX_MMU -/* - * Same function as generic code used only for radix, because we don't need to overload - * the generic one. But we will have to duplicate, because hash select - * HAVE_ARCH_UNMAPPED_AREA - */ -static unsigned long -radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = 0; - info.length = len; - info.low_limit = mm->mmap_base; - info.high_limit = high_limit; - info.align_mask = 0; - - return vm_unmapped_area(&info); -} - -static unsigned long -radix__arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = 0; - - addr = vm_unmapped_area(&info); - if (!(addr & ~PAGE_MASK)) - return addr; - VM_BUG_ON(addr != -ENOMEM); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -} - -static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - struct rlimit *rlim_stack) -{ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = radix__arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; - } -} -#else -/* dummy */ -extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - struct rlimit *rlim_stack); -#endif -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor, - rlim_stack); - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 74246536b832..1fb9c99f8679 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -18,6 +18,12 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, { /* 32-bit keeps track of the current PGDIR in the thread struct */ tsk->thread.pgdir = mm->pgd; +#ifdef CONFIG_PPC_BOOK3S_32 + tsk->thread.sr0 = mm->context.sr0; +#endif +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = mm->context.id; +#endif } #elif defined(CONFIG_PPC_BOOK3E_64) static inline void switch_mm_pgdir(struct task_struct *tsk, @@ -25,6 +31,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, { /* 64-bit Book3E keeps track of current PGD in the PACA */ get_paca()->pgd = mm->pgd; +#ifdef CONFIG_PPC_KUAP + tsk->thread.pid = mm->context.id; +#endif } #else static inline void switch_mm_pgdir(struct task_struct *tsk, @@ -81,7 +90,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * context */ if (cpu_has_feature(CPU_FTR_ALTIVEC)) - asm volatile ("dssall"); + asm volatile (PPC_DSSALL); if (!new_on_cpu) membarrier_arch_switch_mm(prev, next, tsk); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 0dd4c18f8363..bd9784f77f2e 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) @@ -55,7 +55,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); trace_tlbie(0, 0, address, pid, 0, 0, 0); } -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) extern void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, } #endif /* CONFIG_PPC_8xx */ -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -92,29 +92,16 @@ extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); -extern int __map_without_bats; -extern unsigned int rtas_data, rtas_size; - -struct hash_pte; extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ extern unsigned long __max_low_memory; -extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; extern phys_addr_t total_lowmem; extern phys_addr_t memstart_addr; extern phys_addr_t lowmem_end_addr; -#ifdef CONFIG_WII -extern unsigned long wii_hole_start; -extern unsigned long wii_hole_size; - -extern unsigned long wii_mmu_mapin_mem2(unsigned long top); -extern void wii_memory_fixups(void); -#endif - /* ...and now those things that may be slightly different between processor * architectures. -- Dan */ @@ -124,11 +111,9 @@ void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init); -extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys); #ifdef CONFIG_PPC32 extern void adjust_total_lowmem(void); extern int switch_to_as1(void); @@ -155,11 +140,15 @@ struct tlbcam { u32 MAS3; u32 MAS7; }; + +#define NUM_TLBCAMS 64 + +extern struct tlbcam TLBCAM[NUM_TLBCAMS]; #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx) /* 6xx have BATS */ -/* FSL_BOOKE have TLBCAM */ +/* PPC_85xx have TLBCAM */ /* 8xx have LTLB */ phys_addr_t v_block_mapped(unsigned long va); unsigned long p_block_mapped(phys_addr_t pa); @@ -168,7 +157,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) void mmu_mark_initmem_nx(void); void mmu_mark_rodata_ro(void); #else diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c index 95751c322f6c..3684d6e570fb 100644 --- a/arch/powerpc/mm/nohash/40x.c +++ b/arch/powerpc/mm/nohash/40x.c @@ -32,7 +32,6 @@ #include <linux/highmem.h> #include <linux/memblock.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/mmu.h> @@ -44,7 +43,6 @@ #include <mm/mmu_decl.h> -extern int __map_without_ltlbs; /* * MMU_init_hw does the chip-specific initialization of the MMU hardware. */ @@ -95,7 +93,13 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) p = 0; s = total_lowmem; - if (__map_without_ltlbs) + if (IS_ENABLED(CONFIG_KFENCE)) + return 0; + + if (debug_pagealloc_enabled()) + return 0; + + if (strict_kernel_rwx_enabled()) return 0; while (s >= LARGE_PAGE_SIZE_16M) { diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index e079f26b267e..1beae802bb1c 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -38,7 +38,7 @@ int icache_44x_need_flush; unsigned long tlb_47x_boltmap[1024/8]; -static void ppc44x_update_tlb_hwater(void) +static void __init ppc44x_update_tlb_hwater(void) { /* The TLB miss handlers hard codes the watermark in a cmpli * instruction to improve performances rather than loading it @@ -122,7 +122,7 @@ static void __init ppc47x_update_boltmap(void) /* * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys) { unsigned int rA; int bolted; @@ -240,19 +240,3 @@ void __init mmu_init_secondary(int cpu) } } #endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (smp_processor_id() != boot_cpuid) - return; - - if (disabled) - patch_instruction_site(&patch__tlb_44x_kuep, ppc_inst(PPC_RAW_NOP())); - else - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - if (IS_ENABLED(CONFIG_PPC_47x) && disabled) - patch_instruction_site(&patch__tlb_47x_kuep, ppc_inst(PPC_RAW_NOP())); -} -#endif diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 0df9fe29dd56..dbbfe897455d 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -8,18 +8,12 @@ */ #include <linux/memblock.h> -#include <linux/mmu_context.h> #include <linux/hugetlb.h> -#include <asm/fixmap.h> -#include <asm/code-patching.h> -#include <asm/inst.h> #include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) -extern int __map_without_ltlbs; - static unsigned long block_mapped_ram; /* @@ -32,8 +26,6 @@ phys_addr_t v_block_mapped(unsigned long va) if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) return __pa(va); return 0; @@ -49,8 +41,6 @@ unsigned long p_block_mapped(phys_addr_t pa) if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; - if (__map_without_ltlbs) - return 0; if (pa < block_mapped_ram) return (unsigned long)__va(pa); return 0; @@ -157,9 +147,6 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) mmu_mapin_immr(); - if (__map_without_ltlbs) - return 0; - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true); if (debug_pagealloc_enabled_or_kfence()) { top = boundary; @@ -183,8 +170,8 @@ void mmu_mark_initmem_nx(void) unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8; unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, false); - mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); + if (!debug_pagealloc_enabled_or_kfence()) + mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); mmu_pin_tlb(block_mapped_ram, false); } @@ -212,35 +199,6 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M)); } -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) -{ - if (disabled) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - mtspr(SPRN_MI_AP, MI_APG_KUEP); -} -#endif - -#ifdef CONFIG_PPC_KUAP -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); - -void __init setup_kuap(bool disabled) -{ - if (disabled) { - static_branch_enable(&disable_kuap_key); - return; - } - - pr_info("Activating Kernel Userspace Access Protection\n"); - - mtspr(SPRN_MD_AP, MD_APG_KUAP); -} -#endif - int pud_clear_huge(pud_t *pud) { return 0; diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b1f630d423d8..f3894e79d5f7 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -2,18 +2,18 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y += mmu_context.o tlb.o tlb_low.o +obj-y += mmu_context.o tlb.o tlb_low.o kup.o obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_book3e.o +obj-$(CONFIG_PPC_E500) += e500.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_fsl_book3e.o := n +KCOV_INSTRUMENT_e500.o := n diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c deleted file mode 100644 index 8b88be91b622..000000000000 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ /dev/null @@ -1,204 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC Huge TLB Page Support for Book3E MMU - * - * Copyright (C) 2009 David Gibson, IBM Corporation. - * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor - * - */ -#include <linux/mm.h> -#include <linux/hugetlb.h> - -#include <asm/mmu.h> - -#ifdef CONFIG_PPC64 -#include <asm/paca.h> - -static inline int tlb1_next(void) -{ - struct paca_struct *paca = get_paca(); - struct tlb_core_data *tcd; - int this, next; - - tcd = paca->tcd_ptr; - this = tcd->esel_next; - - next = this + 1; - if (next >= tcd->esel_max) - next = tcd->esel_first; - - tcd->esel_next = next; - return this; -} - -static inline void book3e_tlb_lock(void) -{ - struct paca_struct *paca = get_paca(); - unsigned long tmp; - int token = smp_processor_id() + 1; - - /* - * Besides being unnecessary in the absence of SMT, this - * check prevents trying to do lbarx/stbcx. on e5500 which - * doesn't implement either feature. - */ - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - asm volatile("1: lbarx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2f;" - "stbcx. %2, 0, %1;" - "bne 1b;" - "b 3f;" - "2: lbzx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2b;" - "b 1b;" - "3:" - : "=&r" (tmp) - : "r" (&paca->tcd_ptr->lock), "r" (token) - : "memory"); -} - -static inline void book3e_tlb_unlock(void) -{ - struct paca_struct *paca = get_paca(); - - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - isync(); - paca->tcd_ptr->lock = 0; -} -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} - -static inline void book3e_tlb_lock(void) -{ -} - -static inline void book3e_tlb_unlock(void) -{ -} -#endif - -static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) -{ - int found = 0; - - mtspr(SPRN_MAS6, pid << 16); - if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { - asm volatile( - "li %0,0\n" - "tlbsx. 0,%1\n" - "bne 1f\n" - "li %0,1\n" - "1:\n" - : "=&r"(found) : "r"(ea)); - } else { - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - } - - return found; -} - -static void -book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) -{ - unsigned long mas1, mas2; - u64 mas7_3; - unsigned long psize, tsize, shift; - unsigned long flags; - struct mm_struct *mm; - int index; - - if (unlikely(is_kernel_addr(ea))) - return; - - mm = vma->vm_mm; - - psize = vma_mmu_pagesize(vma); - shift = __ilog2(psize); - tsize = shift - 10; - /* - * We can't be interrupted while we're setting up the MAS - * regusters or after we've confirmed that no tlb exists. - */ - local_irq_save(flags); - - book3e_tlb_lock(); - - if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { - book3e_tlb_unlock(); - local_irq_restore(flags); - return; - } - - /* We have to use the CAM(TLB1) on FSL parts for hugepages */ - index = tlb1_next(); - mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); - - mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); - mas2 = ea & ~((1UL << shift) - 1); - mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; - mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; - if (!pte_dirty(pte)) - mas7_3 &= ~(MAS3_SW|MAS3_UW); - - mtspr(SPRN_MAS1, mas1); - mtspr(SPRN_MAS2, mas2); - - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } - - asm volatile ("tlbwe"); - - book3e_tlb_unlock(); - local_irq_restore(flags); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * - * This must always be called with the pte lock held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -{ - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct hstate *hstate = hstate_file(vma->vm_file); - unsigned long tsize = huge_page_shift(hstate) - 10; - - __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); -} diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 77884e24281d..b80fc4a91a53 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -10,6 +10,7 @@ #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/dma.h> +#include <asm/code-patching.h> #include <mm/mmu_decl.h> @@ -95,8 +96,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) pgdp = pgd_offset_k(ea); p4dp = p4d_offset(pgdp, ea); if (p4d_none(*p4dp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - p4d_populate(&init_mm, p4dp, pmdp); + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); } pudp = pud_offset(p4dp, ea); if (pud_none(*pudp)) { @@ -105,7 +106,7 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); + ptep = early_alloc_pgtable(PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); @@ -115,3 +116,17 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) smp_wmb(); return 0; } + +void __patch_exception(int exc, unsigned long addr) +{ + unsigned int *ibase = &interrupt_base_book3e; + + /* + * Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one. + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} diff --git a/arch/powerpc/mm/nohash/e500.c b/arch/powerpc/mm/nohash/e500.c new file mode 100644 index 000000000000..40a4e69ae1a9 --- /dev/null +++ b/arch/powerpc/mm/nohash/e500.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004,2010 Freescale Semiconductor, Inc. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> + +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/mmu.h> +#include <linux/uaccess.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/setup.h> +#include <asm/paca.h> + +#include <mm/mmu_decl.h> + +unsigned int tlbcam_index; + +struct tlbcam TLBCAM[NUM_TLBCAMS]; + +static struct { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +#ifdef CONFIG_PPC_85xx +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} +#endif + +/* + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). + */ +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) +{ + unsigned int tsize; + + tsize = __ilog2(size) - 10; + +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; + TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; + + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(__pte(flags))) { + TLBCAM[index].MAS3 |= MAS3_UR; + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; + } else { + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; + } + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; +} + +static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; + + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx, + bool dryrun, bool init) +{ + int i; + unsigned long amount_mapped = 0; + unsigned long boundary; + + if (strict_kernel_rwx_enabled()) + boundary = (unsigned long)(_sinittext - _stext); + else + boundary = ram; + + /* Calculate CAM values */ + for (i = 0; boundary && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; + + cam_sz = calc_cam_sz(boundary, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + boundary -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; + + cam_sz = calc_cam_sz(ram, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + + if (dryrun) + return amount_mapped; + + if (init) { + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + } else { + loadcam_multi(0, i, 0); + WARN_ON(i > tlbcam_index); + } + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; +} + +void flush_instruction_cache(void) +{ + unsigned long tmp; + + tmp = mfspr(SPRN_L1CSR1); + tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; + mtspr(SPRN_L1CSR1, tmp); + isync(); +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +static unsigned long __init tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + +void __init adjust_total_lowmem(void) +{ + unsigned long ram; + int i; + + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); + restore_to_as0(i, 0, NULL, 1); + + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + unsigned long remapped; + + remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); + + WARN_ON(__max_low_memory != remapped); +} +#endif + +void mmu_mark_initmem_nx(void) +{ + /* Everything is done in mmu_mark_rodata_ro() */ +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + phys_addr_t limit = first_memblock_base + first_memblock_size; + + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = kernstart_virt_addr; + phys_addr_t size; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + kaslr_late_init(); + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), &size); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } + + kaslr_early_init(__va(dt_ptr), size); +} +#endif +#endif diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c new file mode 100644 index 000000000000..c7d4b317a823 --- /dev/null +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#include <asm/mmu.h> + +#ifdef CONFIG_PPC64 +#include <asm/paca.h> + +static inline int tlb1_next(void) +{ + struct paca_struct *paca = get_paca(); + struct tlb_core_data *tcd; + int this, next; + + tcd = paca->tcd_ptr; + this = tcd->esel_next; + + next = this + 1; + if (next >= tcd->esel_max) + next = tcd->esel_first; + + tcd->esel_next = next; + return this; +} + +static inline void book3e_tlb_lock(void) +{ + struct paca_struct *paca = get_paca(); + unsigned long tmp; + int token = smp_processor_id() + 1; + + /* + * Besides being unnecessary in the absence of SMT, this + * check prevents trying to do lbarx/stbcx. on e5500 which + * doesn't implement either feature. + */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + asm volatile("1: lbarx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2f;" + "stbcx. %2, 0, %1;" + "bne 1b;" + "b 3f;" + "2: lbzx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2b;" + "b 1b;" + "3:" + : "=&r" (tmp) + : "r" (&paca->tcd_ptr->lock), "r" (token) + : "memory"); +} + +static inline void book3e_tlb_unlock(void) +{ + struct paca_struct *paca = get_paca(); + + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + isync(); + paca->tcd_ptr->lock = 0; +} +#else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + +static inline void book3e_tlb_lock(void) +{ +} + +static inline void book3e_tlb_unlock(void) +{ +} +#endif + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + + return found; +} + +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + struct mm_struct *mm; + int index; + + if (unlikely(is_kernel_addr(ea))) + return; + + mm = vma->vm_mm; + + psize = vma_mmu_pagesize(vma); + shift = __ilog2(psize); + tsize = shift - 10; + /* + * We can't be interrupted while we're setting up the MAS + * registers or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + book3e_tlb_lock(); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + book3e_tlb_unlock(); + local_irq_restore(flags); + return; + } + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + + asm volatile ("tlbwe"); + + book3e_tlb_unlock(); + local_irq_restore(flags); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c deleted file mode 100644 index b231a54f540c..000000000000 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ /dev/null @@ -1,379 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Modifications by Kumar Gala (galak@kernel.crashing.org) to support - * E500 Book E processors. - * - * Copyright 2004,2010 Freescale Semiconductor, Inc. - * - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/stddef.h> -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/highmem.h> -#include <linux/memblock.h> - -#include <asm/prom.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#include <asm/mmu.h> -#include <linux/uaccess.h> -#include <asm/smp.h> -#include <asm/machdep.h> -#include <asm/setup.h> -#include <asm/paca.h> - -#include <mm/mmu_decl.h> - -unsigned int tlbcam_index; - -#define NUM_TLBCAMS (64) -struct tlbcam TLBCAM[NUM_TLBCAMS]; - -struct tlbcamrange { - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} tlbcam_addrs[NUM_TLBCAMS]; - -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - -#ifdef CONFIG_FSL_BOOKE -/* - * Return PA for this VA if it is mapped by a CAM, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) - return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) - +tlbcam_addrs[b].phys) - return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); - return 0; -} -#endif - -/* - * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; - * in particular size must be a power of 4 between 4k and the max supported by - * an implementation; max may further be limited by what can be represented in - * an unsigned long (for example, 32-bit implementations cannot support a 4GB - * size). - */ -static void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned long size, unsigned long flags, unsigned int pid) -{ - unsigned int tsize; - - tsize = __ilog2(size) - 10; - -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) - if ((flags & _PAGE_NO_CACHE) == 0) - flags |= _PAGE_COHERENT; -#endif - - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); - TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); - TLBCAM[index].MAS2 = virt & PAGE_MASK; - - TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - TLBCAM[index].MAS7 = (u64)phys >> 32; - - /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { - TLBCAM[index].MAS3 |= MAS3_UR; - TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; - } else { - TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; - } - - tlbcam_addrs[index].start = virt; - tlbcam_addrs[index].limit = virt + size - 1; - tlbcam_addrs[index].phys = phys; -} - -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) -{ - unsigned int camsize = __ilog2(ram); - unsigned int align = __ffs(virt | phys); - unsigned long max_cam; - - if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; - camsize &= ~1U; - align &= ~1U; - } else { - /* Convert (2^max) kB to (2^max) bytes */ - max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; - } - - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - return 1UL << camsize; -} - -static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, - unsigned long ram, int max_cam_idx, - bool dryrun, bool init) -{ - int i; - unsigned long amount_mapped = 0; - unsigned long boundary; - - if (strict_kernel_rwx_enabled()) - boundary = (unsigned long)(_sinittext - _stext); - else - boundary = ram; - - /* Calculate CAM values */ - for (i = 0; boundary && i < max_cam_idx; i++) { - unsigned long cam_sz; - pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; - - cam_sz = calc_cam_sz(boundary, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); - - boundary -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { - unsigned long cam_sz; - pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; - - cam_sz = calc_cam_sz(ram, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); - - ram -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - - if (dryrun) - return amount_mapped; - - if (init) { - loadcam_multi(0, i, max_cam_idx); - tlbcam_index = i; - } else { - loadcam_multi(0, i, 0); - WARN_ON(i > tlbcam_index); - } - -#ifdef CONFIG_PPC64 - get_paca()->tcd.esel_next = i; - get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - get_paca()->tcd.esel_first = i; -#endif - - return amount_mapped; -} - -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) -{ - unsigned long virt = PAGE_OFFSET; - phys_addr_t phys = memstart_addr; - - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); -} - -#ifdef CONFIG_PPC32 - -#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) -#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" -#endif - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; -} - -void flush_instruction_cache(void) -{ - unsigned long tmp; - - tmp = mfspr(SPRN_L1CSR1); - tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; - mtspr(SPRN_L1CSR1, tmp); - isync(); -} - -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - flush_instruction_cache(); -} - -void __init adjust_total_lowmem(void) -{ - unsigned long ram; - int i; - - /* adjust lowmem size to __max_low_memory */ - ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - - i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); - restore_to_as0(i, 0, 0, 1); - - pr_info("Memory CAM mapping: "); - for (i = 0; i < tlbcam_index - 1; i++) - pr_cont("%lu/", tlbcam_sz(i) >> 20); - pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, - (unsigned int)((total_lowmem - __max_low_memory) >> 20)); - - memblock_set_current_limit(memstart_addr + __max_low_memory); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) -{ - /* Everything is done in mmu_mark_initmem_nx() */ -} -#endif - -void mmu_mark_initmem_nx(void) -{ - unsigned long remapped; - - if (!strict_kernel_rwx_enabled()) - return; - - remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); - - WARN_ON(__max_low_memory != remapped); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - phys_addr_t limit = first_memblock_base + first_memblock_size; - - /* 64M mapped initially according to head_fsl_booke.S */ - memblock_set_current_limit(min_t(u64, limit, 0x04000000)); -} - -#ifdef CONFIG_RELOCATABLE -int __initdata is_second_reloc; -notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) -{ - unsigned long base = kernstart_virt_addr; - phys_addr_t size; - - kernstart_addr = start; - if (is_second_reloc) { - virt_phys_offset = PAGE_OFFSET - memstart_addr; - kaslr_late_init(); - return; - } - - /* - * Relocatable kernel support based on processing of dynamic - * relocation entries. Before we get the real memstart_addr, - * We will compute the virt_phys_offset like this: - * virt_phys_offset = stext.run - kernstart_addr - * - * stext.run = (KERNELBASE & ~0x3ffffff) + - * (kernstart_addr & 0x3ffffff) - * When we relocate, we have : - * - * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) - * - * hence: - * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - - * (kernstart_addr & ~0x3ffffff) - * - */ - start &= ~0x3ffffff; - base &= ~0x3ffffff; - virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), &size); - /* - * We now get the memstart_addr, then we should check if this - * address is the same as what the PAGE_OFFSET map to now. If - * not we have to change the map of PAGE_OFFSET to memstart_addr - * and do a second relocation. - */ - if (start != memstart_addr) { - int n; - long offset = start - memstart_addr; - - is_second_reloc = 1; - n = switch_to_as1(); - /* map a 64M area for the second relocation */ - if (memstart_addr > start) - map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, - false, true); - else - map_mem_in_cams_addr(start, PAGE_OFFSET + offset, - 0x4000000, CONFIG_LOWMEM_CAM_NUM, - false, true); - restore_to_as0(n, offset, __va(dt_ptr), 1); - /* We should never reach here */ - panic("Relocation error"); - } - - kaslr_early_init(__va(dt_ptr), size); -} -#endif -#endif diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 6ec978967da0..0d04f9d5da8d 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -14,11 +14,11 @@ #include <linux/memblock.h> #include <linux/libfdt.h> #include <linux/crash_core.h> +#include <linux/of.h> +#include <linux/of_fdt.h> #include <asm/cacheflush.h> -#include <asm/prom.h> #include <asm/kdump.h> #include <mm/mmu_decl.h> -#include <generated/compile.h> #include <generated/utsrelease.h> struct regions { @@ -36,17 +36,11 @@ struct regions { int reserved_mem_size_cells; }; -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - struct regions __initdata regions; static __init void kaslr_get_cmdline(void *fdt) { - int node = fdt_path_offset(fdt, "/chosen"); - - early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line); + early_init_dt_scan_chosen(boot_command_line); } static unsigned long __init rotate_xor(unsigned long hash, const void *area, @@ -72,7 +66,8 @@ static unsigned long __init get_boot_seed(void *fdt) { unsigned long hash = 0; - hash = rotate_xor(hash, build_str, sizeof(build_str)); + /* build-specific string for starting entropy. */ + hash = rotate_xor(hash, linux_banner, strlen(linux_banner)); hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); return hash; @@ -317,7 +312,7 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true); linear_sz = min_t(unsigned long, ram, SZ_512M); - /* If the linear size is smaller than 64M, do not randmize */ + /* If the linear size is smaller than 64M, do not randomize */ if (linear_sz < SZ_64M) return 0; diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c new file mode 100644 index 000000000000..552becf90e97 --- /dev/null +++ b/arch/powerpc/mm/nohash/kup.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing kernel userspace protection + */ + +#include <linux/export.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <linux/printk.h> +#include <linux/smp.h> + +#include <asm/kup.h> +#include <asm/smp.h> + +#ifdef CONFIG_PPC_KUAP +struct static_key_false disable_kuap_key; +EXPORT_SYMBOL(disable_kuap_key); + +void setup_kuap(bool disabled) +{ + if (disabled) { + if (IS_ENABLED(CONFIG_40x)) + disable_kuep = true; + if (smp_processor_id() == boot_cpuid) + static_branch_enable(&disable_kuap_key); + return; + } + + pr_info("Activating Kernel Userspace Access Protection\n"); + + __prevent_user_access(KUAP_READ_WRITE); +} +#endif diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 44b2b5e7cabe..ccd5819b1bd9 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -33,6 +33,7 @@ #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/smp.h> +#include <asm/kup.h> #include <mm/mmu_decl.h> @@ -217,7 +218,7 @@ static void set_context(unsigned long id, pgd_t *pgd) /* sync */ mb(); - } else { + } else if (kuap_is_disabled()) { if (IS_ENABLED(CONFIG_40x)) mb(); /* sync */ @@ -305,6 +306,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; set_context(id, next->pgd); +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = id; +#endif raw_spin_unlock(&context_lock); } @@ -313,15 +317,6 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { - /* - * We have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - */ - if (mm->context.id == 0) - slice_init_new_context_exec(mm); mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; pte_frag_set(&mm->context, NULL); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 647bf454a0fa..2c15c86c7015 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -49,8 +49,7 @@ * other sizes not listed here. The .ind field is only used on MMUs that have * indirect page table entries. */ -#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -81,7 +80,20 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; -#elif defined(CONFIG_PPC_8xx) + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +#ifdef CONFIG_PPC_8xx struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -96,53 +108,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .shift = 23, }, }; -#else -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .ind = 20, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_16K] = { - .shift = 14, - .enc = BOOK3E_PAGESZ_16K, - }, - [MMU_PAGE_64K] = { - .shift = 16, - .ind = 28, - .enc = BOOK3E_PAGESZ_64K, - }, - [MMU_PAGE_1M] = { - .shift = 20, - .enc = BOOK3E_PAGESZ_1M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .ind = 36, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#endif /* CONFIG_FSL_BOOKE */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} -#else -static inline int mmu_get_tsize(int psize) -{ - /* This isn't used on !Book3E for now */ - return 0; -} -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash @@ -150,7 +116,6 @@ static inline int mmu_get_tsize(int psize) */ #ifdef CONFIG_PPC64 -int mmu_linear_psize; /* Page size used for the linear mapping */ int mmu_pte_psize; /* Page size used for PTE pages */ int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ @@ -167,7 +132,7 @@ int extlb_level_exc; #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 /* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ DEFINE_PER_CPU(int, next_tlbcam_idx); EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); @@ -359,6 +324,7 @@ void __init early_init_mmu_47x(void) /* * Flush kernel TLB entries in the given range */ +#ifndef CONFIG_PPC_8xx void flush_tlb_kernel_range(unsigned long start, unsigned long end) { #ifdef CONFIG_SMP @@ -371,6 +337,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) #endif } EXPORT_SYMBOL(flush_tlb_kernel_range); +#endif /* * Currently, for range flushing, we just do a full mm flush. This should @@ -433,14 +400,14 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) } } -static void setup_page_sizes(void) +static void __init setup_page_sizes(void) { unsigned int tlb0cfg; unsigned int tlb0ps; unsigned int eptcfg; int i, psize; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 unsigned int mmucfg = mfspr(SPRN_MMUCFG); int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); @@ -571,7 +538,7 @@ out: } } -static void setup_mmu_htw(void) +static void __init setup_mmu_htw(void) { /* * If we want to use HW tablewalk, enable it by patching the TLB miss @@ -583,7 +550,7 @@ static void setup_mmu_htw(void) patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); break; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 case PPC_HTW_E6500: extlb_level_exc = EX_TLB_SIZE; patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); @@ -626,7 +593,7 @@ static void early_init_this_mmu(void) } mtspr(SPRN_MAS4, mas4); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; bool map = true; @@ -657,14 +624,6 @@ static void early_init_this_mmu(void) static void __init early_init_mmu_global(void) { - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - /* XXX This should be decided at runtime based on supported * page sizes in the TLB, but for now let's assume 16M is * always there and a good fit (which it probably is) @@ -687,7 +646,7 @@ static void __init early_init_mmu_global(void) /* Look for HW tablewalk support */ setup_mmu_htw(); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { if (book3e_htw_mode == PPC_HTW_NONE) { extlb_level_exc = EX_TLB_SIZE; @@ -708,7 +667,7 @@ static void __init early_init_mmu_global(void) static void __init early_mmu_set_memory_limit(void) { -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { /* * Limit memory so we dont have linear faults. @@ -757,7 +716,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * We crop it to the size of the first MEMBLOCK to * avoid going over total available memory just in case... */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; unsigned int num_cams; @@ -782,9 +741,5 @@ void __init early_init_mmu(void) #ifdef CONFIG_PPC_47x early_init_mmu_47x(); #endif - -#ifdef CONFIG_PPC_MM_SLICES - mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); -#endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index dd39074de9af..e1199608ff4d 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -186,7 +186,7 @@ _GLOBAL(_tlbivax_bcast) isync PPC_TLBIVAX(0, R3) isync - eieio + mbar tlbsync BEGIN_FTR_SECTION b 1f @@ -221,7 +221,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) blr #endif /* CONFIG_PPC_47x */ -#elif defined(CONFIG_FSL_BOOKE) +#elif defined(CONFIG_PPC_85xx) /* * FSL BookE implementations. * @@ -294,7 +294,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) /* * New Book3E (>= 2.06) implementation * @@ -355,7 +355,7 @@ _GLOBAL(_tlbivax_bcast) rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND 1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ PPC_TLBIVAX(0,R3) - eieio + mbar tlbsync sync wrtee r10 @@ -364,7 +364,7 @@ _GLOBAL(_tlbivax_bcast) #error Unsupported processor type ! #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) /* * extern void loadcam_entry(unsigned int index) * diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9235e720e357..76cf456d7976 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -61,7 +61,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) ld r14,PACAPGD(r13) std r15,EX_TLB_R15(r12) std r10,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION mfspr r11, SPRN_SRR1 andi. r10,r11,MSR_PR @@ -70,14 +69,11 @@ START_BTB_FLUSH_SECTION 1: END_BTB_FLUSH_SECTION std r7,EX_TLB_R7(r12) -#endif .endm .macro tlb_epilog_bolted ld r14,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E ld r7,EX_TLB_R7(r12) -#endif ld r10,EX_TLB_R10(r12) ld r11,EX_TLB_R11(r12) ld r13,EX_TLB_R13(r12) @@ -128,6 +124,13 @@ END_BTB_FLUSH_SECTION bne tlb_miss_kernel_bolted +tlb_miss_user_bolted: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- tlb_miss_fault_bolted /* KUAP fault */ +#endif + tlb_miss_common_bolted: /* * This is the guts of the TLB miss handler for bolted-linear. @@ -145,16 +148,7 @@ tlb_miss_common_bolted: clrrdi r15,r15,3 beq tlb_miss_fault_bolted /* No PGDIR, bail */ -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -215,10 +209,11 @@ itlb_miss_kernel_bolted: tlb_miss_kernel_bolted: mfspr r10,SPRN_MAS1 ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + andi. r15,r15,1 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted + bne+ tlb_miss_common_bolted tlb_miss_fault_bolted: /* We need to check if it was an instruction miss */ @@ -246,10 +241,9 @@ itlb_miss_fault_bolted: cmpldi cr0,r15,0 /* Check for user region */ oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted + beq tlb_miss_user_bolted b itlb_miss_kernel_bolted -#ifdef CONFIG_PPC_FSL_BOOK3E /* * TLB miss handling for e6500 and derivatives, using hardware tablewalk. * @@ -500,7 +494,9 @@ tlb_miss_huge_e6500: tlb_miss_kernel_e6500: ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + xoris r15,r15,0xc /* Check for vmalloc region */ + cmplwi cr1,r15,1 beq+ cr1,tlb_miss_common_e6500 tlb_miss_fault_e6500: @@ -514,7 +510,6 @@ dtlb_miss_fault_e6500: itlb_miss_fault_e6500: tlb_epilog_bolted b exc_instruction_storage_book3e -#endif /* CONFIG_PPC_FSL_BOOK3E */ /********************************************************************** * * @@ -534,16 +529,18 @@ itlb_miss_fault_e6500: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* The page tables are mapped virtually linear. At this point, though, * we don't know whether we are trying to fault in a first level * virtual address or a virtual page table address. We can get that * from bit 0x1 of the region ID which we have set for a page table */ - andi. r10,r15,0x1 + andis. r10,r15,0x1 bne- virt_page_table_tlb_miss std r14,EX_TLB_ESR(r12); /* save ESR */ @@ -555,7 +552,7 @@ itlb_miss_fault_e6500: /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ /* We pre-test some combination of permissions to avoid double * faults: @@ -576,13 +573,12 @@ itlb_miss_fault_e6500: */ rlwimi r11,r14,32-19,27,27 rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss + beq normal_tlb_miss_user /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -608,27 +604,28 @@ itlb_miss_fault_e6500: * * Faulting address is SRR0 which is already in r16 */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss + beq normal_tlb_miss_user li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h /* XXX replace the RMW cycles with immediate loads + writes */ mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault */ TLB_MISS_EPILOG_ERROR @@ -646,6 +643,12 @@ itlb_miss_fault_e6500: * r11 = PTE permission mask * r10 = crap (free to use) */ +normal_tlb_miss_user: +#ifdef CONFIG_PPC_KUAP + mfspr r14,SPRN_MAS1 + rlwinm. r14,r14,0,0x3fff0000 + beq- normal_tlb_miss_access_fault /* KUAP fault */ +#endif normal_tlb_miss: /* So we first construct the page table address. We do that by * shifting the bottom of the address (not the region ID) by @@ -655,22 +658,14 @@ normal_tlb_miss: * NOTE: For 64K pages, we do things slightly differently in * order to handle the weird page table format used by linux */ - ori r10,r15,0x1 + srdi r15,r16,44 + oris r10,r15,0x1 rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 - sldi r15,r10,60 - clrrdi r14,r14,3 + sldi r15,r10,44 + clrrdi r14,r14,19 or r10,r15,r14 -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ @@ -689,13 +684,13 @@ finish_normal_tlb_miss: * * TODO: mix up code below for better scheduling */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 + clrrdi r10,r16,12 /* Clear low crap in EA */ + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r10 /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 - cmpldi cr0,r11,BOOK3E_PAGESZ_4K + rldicl r10,r14,64-8,64-8 + cmpldi cr0,r10,BOOK3E_PAGESZ_4K beq- 1f mfspr r11,SPRN_MAS1 rlwimi r11,r14,31,21,24 @@ -714,13 +709,9 @@ finish_normal_tlb_miss: li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 1: -BEGIN_MMU_FTR_SECTION srdi r16,r15,32 mtspr SPRN_MAS3,r15 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -772,6 +763,7 @@ normal_tlb_miss_access_fault: */ virt_page_table_tlb_miss: /* Are we hitting a kernel page table ? */ + srdi r15,r16,60 andi. r10,r15,0x8 /* The cool thing now is that r10 contains 0 for user and 8 for kernel, @@ -786,19 +778,22 @@ virt_page_table_tlb_miss: mfspr r10,SPRN_MAS1 rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 +#ifdef CONFIG_PPC_KUAP + b 2f 1: -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- virt_page_table_tlb_miss_fault /* KUAP fault */ +2: +#else +1: +#endif /* Now, we need to walk the page tables. First check if we are in * range. */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + cmpldi r10,0x80 bne- virt_page_table_tlb_miss_fault /* Get the PGD pointer */ @@ -844,41 +839,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_EPILOG_SUCCESS rfi @@ -946,23 +912,24 @@ virt_page_table_tlb_miss_whacko_fault: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss + beq+ cr1,htw_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -988,19 +955,20 @@ virt_page_table_tlb_miss_whacko_fault: * * Faulting address is SRR0 which is already in r16 */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ @@ -1027,6 +995,11 @@ virt_page_table_tlb_miss_whacko_fault: * avoid too much complication, it will save/restore things for us */ htw_tlb_miss: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- htw_tlb_miss_fault /* KUAP fault */ +#endif /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. * @@ -1088,13 +1061,9 @@ htw_tlb_miss: */ ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -1149,8 +1118,8 @@ tlb_load_linear: * we only use 1G pages for now. That might have to be changed in a * final implementation, especially when dealing with hypervisors */ - ld r11,PACATOC(r13) - ld r11,linear_map_top@got(r11) + __LOAD_PACA_TOC(r11) + LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top) ld r10,0(r11) tovirt(10,10) cmpld cr0,r16,r10 @@ -1175,13 +1144,9 @@ tlb_load_linear: clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 59d3cfcd7887..b44ce71917d7 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <asm/cputhreads.h> #include <asm/sparsemem.h> -#include <asm/prom.h> #include <asm/smp.h> #include <asm/topology.h> #include <asm/firmware.h> @@ -134,7 +133,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, return 0; } -static void reset_numa_cpu_lookup_table(void) +static void __init reset_numa_cpu_lookup_table(void) { unsigned int cpu; @@ -372,7 +371,7 @@ void update_numa_distance(struct device_node *node) * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} */ -static void initialize_form2_numa_distance_lookup_table(void) +static void __init initialize_form2_numa_distance_lookup_table(void) { int i, j; struct device_node *root; @@ -581,7 +580,7 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } -static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) { struct assoc_arrays aa = { .arrays = NULL }; int default_nid = NUMA_NO_NODE; @@ -956,7 +955,9 @@ static int __init parse_numa_properties(void) of_node_put(cpu); } - node_set_online(nid); + /* node_set_online() is an UB if 'nid' is negative */ + if (likely(nid >= 0)) + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); @@ -1159,6 +1160,9 @@ void __init mem_topology_setup(void) { int cpu; + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; + /* * Linux/mm assumes node 0 to be online at boot. However this is not * true on PowerPC, where node 0 is similar to any other node, it @@ -1203,9 +1207,6 @@ void __init initmem_init(void) { int nid; - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; - memblock_dump_all(); for_each_online_node(nid) { @@ -1421,43 +1422,26 @@ out: return rc; } -int find_and_online_cpu_nid(int cpu) +void find_and_update_cpu_nid(int cpu) { __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; int new_nid; /* Use associativity from first thread for all siblings */ if (vphn_get_associativity(cpu, associativity)) - return cpu_to_node(cpu); + return; + /* Do not have previous associativity, so find it now. */ new_nid = associativity_to_nid(associativity); - if (new_nid < 0 || !node_possible(new_nid)) - new_nid = first_online_node; - if (NODE_DATA(new_nid) == NULL) { -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Need to ensure that NODE_DATA is initialized for a node from - * available memory (see memblock_alloc_try_nid). If unable to - * init the node, then default to nearest node that has memory - * installed. Skip onlining a node if the subsystems are not - * yet initialized. - */ - if (!topology_inited || try_online_node(new_nid)) - new_nid = first_online_node; -#else - /* - * Default to using the nearest node that has memory installed. - * Otherwise, it would be necessary to patch the kernel MM code - * to deal with more memoryless-node error conditions. - */ + if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; -#endif - } + else + // Associate node <-> cpu, so cpu_up() calls + // try_online_node() on the right node. + set_cpu_numa_node(cpu, new_nid); - pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__, - cpu, new_nid); - return new_nid; + pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); } int cpu_to_coregroup_id(int cpu) diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c index edea388e9d3f..6163e484bc6d 100644 --- a/arch/powerpc/mm/pageattr.c +++ b/arch/powerpc/mm/pageattr.c @@ -15,12 +15,14 @@ #include <asm/pgtable.h> +static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr, + unsigned long old, unsigned long new) +{ + return pte_update(&init_mm, addr, ptep, old & ~new, new & ~old, 0); +} + /* - * Updates the attributes of a page in three steps: - * - * 1. take the page_table_lock - * 2. install the new entry with the updated attributes - * 3. flush the TLB + * Updates the attributes of a page atomically. * * This sequence is safe against concurrent updates, and also allows updating the * attributes of a page currently being executed or accessed. @@ -28,41 +30,40 @@ static int change_page_attr(pte_t *ptep, unsigned long addr, void *data) { long action = (long)data; - pte_t pte; - spin_lock(&init_mm.page_table_lock); - - pte = ptep_get(ptep); - - /* modify the PTE bits as desired, then apply */ + addr &= PAGE_MASK; + /* modify the PTE bits as desired */ switch (action) { case SET_MEMORY_RO: - pte = pte_wrprotect(pte); + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO); break; case SET_MEMORY_RW: - pte = pte_mkwrite(pte_mkdirty(pte)); + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW); break; case SET_MEMORY_NX: - pte = pte_exprotect(pte); + pte_update_delta(ptep, addr, _PAGE_KERNEL_ROX, _PAGE_KERNEL_RO); break; case SET_MEMORY_X: - pte = pte_mkexec(pte); + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX); + break; + case SET_MEMORY_NP: + pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0); + break; + case SET_MEMORY_P: + pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0); break; default: WARN_ON_ONCE(1); break; } - pte_update(&init_mm, addr, ptep, ~0UL, pte_val(pte), 0); - /* See ptesync comment in radix__set_pte_at() */ if (radix_enabled()) asm volatile("ptesync": : :"memory"); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - spin_unlock(&init_mm.page_table_lock); - return 0; } @@ -96,36 +97,3 @@ int change_memory_attr(unsigned long addr, int numpages, long action) return apply_to_existing_page_range(&init_mm, start, size, change_page_attr, (void *)action); } - -/* - * Set the attributes of a page: - * - * This function is used by PPC32 at the end of init to set final kernel memory - * protection. It includes changing the maping of the page it is executing from - * and data pages it is using. - */ -static int set_page_attr(pte_t *ptep, unsigned long addr, void *data) -{ - pgprot_t prot = __pgprot((unsigned long)data); - - spin_lock(&init_mm.page_table_lock); - - set_pte_at(&init_mm, addr, ptep, pte_modify(*ptep, prot)); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - spin_unlock(&init_mm.page_table_lock); - - return 0; -} - -int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot) -{ - unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); - unsigned long sz = numpages * PAGE_SIZE; - - if (numpages <= 0) - return 0; - - return apply_to_existing_page_range(&init_mm, start, sz, set_page_attr, - (void *)pgprot_val(prot)); -} diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 97ae4935da79..20652daa1d7e 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -83,7 +83,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) spin_lock(&mm->page_table_lock); /* * If we find pgtable_page set, we return - * the allocated page with single fragement + * the allocated page with single fragment * count. */ if (likely(!pte_frag_get(&mm->context))) { diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ce9482383144..cb2dcdb18f8e 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -81,9 +81,6 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter_hash(pte_t pte) { - if (radix_enabled()) - return pte; - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { @@ -112,6 +109,9 @@ static inline pte_t set_pte_filter(pte_t pte) { struct page *pg; + if (radix_enabled()) + return pte; + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) return set_pte_filter_hash(pte); @@ -144,6 +144,9 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, { struct page *pg; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return pte; + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) return pte; @@ -203,6 +206,15 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, __set_pte_at(mm, addr, ptep, pte, 0); } +void unmap_kernel_page(unsigned long va) +{ + pmd_t *pmdp = pmd_off_k(va); + pte_t *ptep = pte_offset_kernel(pmdp, va); + + pte_clear(&init_mm, va, ptep); + flush_tlb_kernel_range(va, va + PAGE_SIZE); +} + /* * This is called when relaxing access to a PTE. It's also called in the page * fault path when we don't hit any of the major fault cases, ie, a minor @@ -339,7 +351,7 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys); * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table * * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. + * we can follow the address down to the page and take a ref on it. * This function need to be called with interrupts disabled. We use this variant * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED */ @@ -460,3 +472,27 @@ out: return ret_pte; } EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 906e4e4328b2..5c02fd08d61e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -135,10 +135,12 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - if (v_block_mapped((unsigned long)_sinittext)) - mmu_mark_initmem_nx(); - else - set_memory_attr((unsigned long)_sinittext, numpages, PAGE_KERNEL); + mmu_mark_initmem_nx(); + + if (!v_block_mapped((unsigned long)_sinittext)) { + set_memory_nx((unsigned long)_sinittext, numpages); + set_memory_rw((unsigned long)_sinittext, numpages); + } } #ifdef CONFIG_STRICT_KERNEL_RWX @@ -146,24 +148,24 @@ void mark_rodata_ro(void) { unsigned long numpages; + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE)) + pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n"); + if (v_block_mapped((unsigned long)_stext + 1)) { mmu_mark_rodata_ro(); ptdump_check_wx(); return; } - numpages = PFN_UP((unsigned long)_etext) - - PFN_DOWN((unsigned long)_stext); - - set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX); /* - * mark .rodata as read only. Use __init_begin rather than __end_rodata - * to cover NOTES and EXCEPTION_TABLE. + * mark text and rodata as read only. __end_rodata is set by + * powerpc's linker script and includes tables and data + * requiring relocation which are not put in RO_DATA. */ - numpages = PFN_UP((unsigned long)__init_begin) - - PFN_DOWN((unsigned long)__start_rodata); + numpages = PFN_UP((unsigned long)__end_rodata) - + PFN_DOWN((unsigned long)_stext); - set_memory_attr((unsigned long)__start_rodata, numpages, PAGE_KERNEL_RO); + set_memory_ro((unsigned long)_stext, numpages); // mark_initmem_nx() should have already run by now ptdump_check_wx(); @@ -179,8 +181,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) return; if (enable) - set_memory_attr(addr, numpages, PAGE_KERNEL); + set_memory_p(addr, numpages); else - set_memory_attr(addr, numpages, __pgprot(0)); + set_memory_np(addr, numpages); } #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 78c8cf01db5f..5ac1fd30341b 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -32,7 +32,6 @@ #include <linux/hugetlb.h> #include <asm/page.h> -#include <asm/prom.h> #include <asm/mmu_context.h> #include <asm/mmu.h> #include <asm/smp.h> @@ -102,7 +101,8 @@ EXPORT_SYMBOL(__pte_frag_size_shift); struct page *p4d_page(p4d_t p4d) { if (p4d_is_leaf(p4d)) { - VM_WARN_ON(!p4d_huge(p4d)); + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!p4d_huge(p4d)); return pte_page(p4d_pte(p4d)); } return virt_to_page(p4d_pgtable(p4d)); @@ -112,7 +112,8 @@ struct page *p4d_page(p4d_t p4d) struct page *pud_page(pud_t pud) { if (pud_is_leaf(pud)) { - VM_WARN_ON(!pud_huge(pud)); + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!pud_huge(pud)); return pte_page(pud_pte(pud)); } return virt_to_page(pud_pgtable(pud)); @@ -125,7 +126,13 @@ struct page *pud_page(pud_t pud) struct page *pmd_page(pmd_t pmd) { if (pmd_is_leaf(pmd)) { - VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); + /* + * vmalloc_to_page may be called on any vmap address (not only + * vmalloc), and it uses pmd_page() etc., when huge vmap is + * enabled so these checks can't be used. + */ + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); return pte_page(pmd_pte(pmd)); } return virt_to_page(pmd_page_vaddr(pmd)); diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 4050cbb55acf..dc896d2874f3 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -4,11 +4,11 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_E500) += shared.o obj-$(CONFIG_PPC_BOOK3S_32) += shared.o obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o ifdef CONFIG_PTDUMP_DEBUGFS obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o endif diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index c7f824d294b2..9a601587836b 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -238,7 +238,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { - struct hash_pte ptes[4]; + struct { + unsigned long v; + unsigned long r; + } ptes[4]; unsigned long vsid, vpn, hash, hpte_group, want_v; int i, j, ssize = mmu_kernel_ssize; long lpar_rc = 0; diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index bf251191e78d..2313053fe679 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <asm/fixmap.h> #include <linux/const.h> +#include <linux/kasan.h> #include <asm/page.h> #include <asm/hugetlb.h> @@ -123,7 +124,7 @@ static struct ptdump_range ptdump_range[] __ro_after_init = { void pt_dump_size(struct seq_file *m, unsigned long size) { - static const char units[] = "KMGTPE"; + static const char units[] = " KMGTPE"; const char *unit = units; /* Work out what appropriate unit to use */ @@ -176,14 +177,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr) pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); - pt_dump_size(st->seq, (addr - st->start_address) >> 10); + pt_dump_size(st->seq, addr - st->start_address); } static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) + if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) @@ -289,11 +290,11 @@ static void populate_markers(void) #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ #ifdef CONFIG_KASAN address_markers[i++].start_address = KASAN_SHADOW_START; address_markers[i++].start_address = KASAN_SHADOW_END; #endif -#endif /* CONFIG_PPC64 */ } static int ptdump_show(struct seq_file *m, void *v) @@ -315,7 +316,7 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); -static void build_pgtable_complete_mask(void) +static void __init build_pgtable_complete_mask(void) { unsigned int i, j; diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index 03607ab90c66..f884760ca5cf 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -17,9 +17,9 @@ static const struct flag_info flag_array[] = { .clear = " ", }, { .mask = _PAGE_RW, - .val = _PAGE_RW, - .set = "rw", - .clear = "r ", + .val = 0, + .set = "r ", + .clear = "rw", }, { .mask = _PAGE_EXEC, .val = _PAGE_EXEC, |