diff options
Diffstat (limited to 'arch/powerpc/mm')
62 files changed, 2104 insertions, 1146 deletions
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 6925ce998557..4ed0efd03db5 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -14,6 +14,7 @@ * hash table, so this file is not used on them.) */ +#include <linux/export.h> #include <linux/pgtable.h> #include <linux/init.h> #include <asm/reg.h> @@ -22,7 +23,6 @@ #include <asm/ppc_asm.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> -#include <asm/export.h> #include <asm/feature-fixups.h> #include <asm/code-patching-asm.h> @@ -36,8 +36,9 @@ /* * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. + * The address is in r4, and r3 contains required access flags: + * - For ISI: _PAGE_PRESENT | _PAGE_EXEC + * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. * SPRG_THREAD contains the physical address of the current task's thread. * @@ -67,12 +68,16 @@ _GLOBAL(hash_page) lis r0, TASK_SIZE@h /* check if kernel address */ cmplw 0,r4,r0 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ + andi. r0,r9,MSR_PR /* Check usermode */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +#ifdef CONFIG_SMP + bne- .Lhash_page_out /* return if usermode */ +#else + bnelr- +#endif 112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ @@ -113,15 +118,15 @@ _GLOBAL(hash_page) lwarx r6,0,r8 /* get linux-style pte, flag word */ #ifdef CONFIG_PPC_KUAP mfsrin r5,r4 - rlwinm r0,r9,28,_PAGE_RW /* MSR[PR] => _PAGE_RW */ - rlwinm r5,r5,12,_PAGE_RW /* Ks => _PAGE_RW */ + rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */ + rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */ andc r5,r5,r0 /* Ks & ~MSR[PR] */ - andc r5,r6,r5 /* Clear _PAGE_RW when Ks = 1 && MSR[PR] = 0 */ + andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */ andc. r5,r3,r5 /* check access & ~permission */ #else andc. r5,r3,r6 /* check access & ~permission */ #endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE #ifdef CONFIG_SMP bne- .Lhash_page_out /* return if access not permitted */ @@ -199,12 +204,12 @@ _GLOBAL(add_hash_page) lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r6 beq+ 12f 11: lwz r0,0(r6) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync @@ -307,12 +312,15 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) __REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + lis r0, TASK_SIZE@h + rlwinm r5,r5,0,~3 /* Clear PP bits */ + cmplw r4,r0 + rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ + bge- 1f /* Kernelspace ? Skip */ + ori r5,r5,3 /* Userspace ? PP = 3 */ +1: ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ @@ -512,12 +520,12 @@ _GLOBAL(flush_hash_pages) lwz r8, TASK_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r9 beq+ 12f 11: lwz r0,0(r9) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c index 28676cabb005..3a8815555a48 100644 --- a/arch/powerpc/mm/book3s32/kuap.c +++ b/arch/powerpc/mm/book3s32/kuap.c @@ -3,25 +3,11 @@ #include <asm/kup.h> #include <asm/smp.h> -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); - -void kuap_lock_all_ool(void) -{ - kuap_lock_all(); -} -EXPORT_SYMBOL(kuap_lock_all_ool); - -void kuap_unlock_all_ool(void) -{ - kuap_unlock_all(); -} -EXPORT_SYMBOL(kuap_unlock_all_ool); - void setup_kuap(bool disabled) { if (!disabled) { - kuap_lock_all_ool(); + update_user_segments(mfsr(0) | SR_KS); + isync(); /* Context sync required after mtsr() */ init_mm.context.sr0 |= SR_KS; current->thread.sr0 |= SR_KS; } @@ -30,7 +16,7 @@ void setup_kuap(bool disabled) return; if (disabled) - static_branch_enable(&disable_kuap_key); + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; else pr_info("Activating Kernel Userspace Access Protection\n"); } diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 49a737fbbd18..5445587bfe84 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -127,7 +127,7 @@ static void setibat(int index, unsigned long virt, phys_addr_t phys, wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[0].batu |= 1; /* Vp = 1 */ } @@ -158,10 +158,13 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET; + unsigned long size; + size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET); + setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X); - if (debug_pagealloc_enabled_or_kfence() || __map_without_bats) { + if (debug_pagealloc_enabled_or_kfence()) { pr_debug_once("Read-Write memory mapped without BATs\n"); if (base >= border) return base; @@ -237,7 +240,7 @@ void mmu_mark_rodata_ro(void) for (i = 0; i < nb; i++) { struct ppc_bat *bat = BATS[i]; - if (bat_addrs[i].start < (unsigned long)__init_begin) + if (bat_addrs[i].start < (unsigned long)__end_rodata) bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; } @@ -245,10 +248,9 @@ void mmu_mark_rodata_ro(void) } /* - * Set up one of the I/D BAT (block address translation) register pairs. + * Set up one of the D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. - * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -275,19 +277,15 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + wimgxpp |= (flags & _PAGE_WRITE) ? BPP_RW : BPP_RX; bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ flags &= ~_PAGE_EXEC; } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; bat_addrs[index].start = virt; bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; @@ -316,11 +314,9 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index 269a3eb25a73..1922f9a6b058 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -71,7 +71,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = __init_new_context(); mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0); - if (!kuep_is_disabled()) + if (IS_ENABLED(CONFIG_PPC_KUEP)) mm->context.sr0 |= SR_NX; if (!kuap_is_disabled()) mm->context.sr0 |= SR_KS; diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d77..9ad6b56bfec9 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c index 7de1a8a0c62a..02acbfd05b46 100644 --- a/arch/powerpc/mm/book3s64/hash_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, int subpg_prot) @@ -118,6 +120,9 @@ repeat: } new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c index 998c6817ed47..954af420f358 100644 --- a/arch/powerpc/mm/book3s64/hash_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + /* * Return true, if the entry has a slot value which * the software considers as invalid. @@ -216,6 +218,9 @@ repeat: new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); new_pte |= H_PAGE_HASHPTE; + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -327,7 +332,12 @@ repeat: new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c index c0fabe6c5a12..15d6f3ea7178 100644 --- a/arch/powerpc/mm/book3s64/hash_hugepage.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -59,16 +59,13 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, rflags = htab_convert_pte_flags(new_pmd, flags); -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + /* + * THPs are only supported on platforms that can do mixed page size + * segments (MPSS) and all such platforms have coherent icache. Hence we + * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on + * noexecute fault. + */ - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif /* * Find the slot index details for this ea, using base page size. */ diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..430d1d935a7c 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -243,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -263,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -286,19 +316,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } static long native_hpte_remove(unsigned long hpte_group) { + unsigned long hpte_v, flags; struct hash_pte *hptep; int i; int slot_offset; - unsigned long hpte_v; + + local_irq_save(flags); DBG_LOW(" remove(group=%lx)\n", hpte_group); @@ -323,12 +358,16 @@ static long native_hpte_remove(unsigned long hpte_group) slot_offset &= 0x7; } - if (i == HPTES_PER_GROUP) - return -1; + if (i == HPTES_PER_GROUP) { + i = -1; + goto out; + } /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - +out: + local_irq_restore(flags); return i; } @@ -339,6 +378,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -382,6 +424,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -445,6 +489,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -463,6 +510,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -476,6 +525,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -493,6 +545,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -517,10 +572,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +636,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +819,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 2e0cad5817ba..988948d69bc1 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -13,6 +13,7 @@ #include <asm/sections.h> #include <asm/mmu.h> #include <asm/tlb.h> +#include <asm/firmware.h> #include <mm/mmu_decl.h> @@ -213,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; @@ -255,7 +256,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * the __collapse_huge_page_copy can result in copying * the old content. */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -403,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -477,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -515,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); @@ -540,7 +543,7 @@ void hash__mark_rodata_ro(void) unsigned long start, end, pp; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index eb0bccaf221e..21fcad97ae80 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; @@ -239,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) local_irq_save(flags); arch_enter_lazy_mmu_mode(); start_pte = pte_offset_map(pmd, addr); + if (!start_pte) + goto out; for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } + pte_unmap(start_pte); +out: arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index fc92613dc2bf..01c3b4b65241 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -123,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -#ifdef CONFIG_DEBUG_PAGEALLOC static u8 *linear_map_hash_slots; static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); @@ -313,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags else rflags |= 0x3; } + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); } else { if (pteflags & _PAGE_RWX) rflags |= 0x2; + /* + * We should never hit this in normal fault handling because + * a permission check (check_pte_access()) will bubble this + * to higher level linux handler even for PAGE_NONE. + */ + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } @@ -408,7 +412,7 @@ repeat: ssize); if (ret == -1) { /* - * Try to to keep bolted entries in primary. + * Try to keep bolted entries in primary. * Remove non bolted entries and try insert again */ ret = mmu_hash_ops.hpte_remove(hpteg); @@ -427,11 +431,9 @@ repeat: break; cond_resched(); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled_or_kfence() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ } return ret < 0 ? ret : 0; } @@ -476,7 +478,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } -static bool disable_1tb_segments = false; +static bool disable_1tb_segments __ro_after_init; static int __init parse_disable_1tb_segments(char *p) { @@ -485,6 +487,40 @@ static int __init parse_disable_1tb_segments(char *p) } early_param("disable_1tb_segments", parse_disable_1tb_segments); +bool stress_hpt_enabled __initdata; + +static int __init parse_stress_hpt(char *p) +{ + stress_hpt_enabled = true; + return 0; +} +early_param("stress_hpt", parse_stress_hpt); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key); + +/* + * per-CPU array allocated if we enable stress_hpt. + */ +#define STRESS_MAX_GROUPS 16 +struct stress_hpt_struct { + unsigned long last_group[STRESS_MAX_GROUPS]; +}; + +static inline int stress_nr_groups(void) +{ + /* + * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries + * to allow practical forward progress. Bare metal returns 1, which + * seems to help uncover more bugs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return STRESS_MAX_GROUPS; + else + return 1; +} + +static struct stress_hpt_struct *stress_hpt_struct; + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -778,7 +814,7 @@ static void __init htab_init_page_sizes(void) bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled()) { + if (!debug_pagealloc_enabled_or_kfence()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default @@ -981,6 +1017,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, pr_info("Partition table %p\n", partition_tb); } +void hpt_clear_stress(void); +static struct timer_list stress_hpt_timer; +static void stress_hpt_timer_fn(struct timer_list *timer) +{ + int next_cpu; + + hpt_clear_stress(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + tlbiel_all(); + + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer_on(&stress_hpt_timer, next_cpu); +} + static void __init htab_initialize(void) { unsigned long table; @@ -1000,6 +1053,21 @@ static void __init htab_initialize(void) if (stress_slb_enabled) static_branch_enable(&stress_slb_key); + if (stress_hpt_enabled) { + unsigned long tmp; + static_branch_enable(&stress_hpt_key); + // Too early to use nr_cpu_ids, so use NR_CPUS + tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, + __alignof__(struct stress_hpt_struct), + 0, MEMBLOCK_ALLOC_ANYWHERE); + memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); + stress_hpt_struct = __va(tmp); + + timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&stress_hpt_timer); + } + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. @@ -1066,8 +1134,7 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { + if (debug_pagealloc_enabled_or_kfence()) { linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; linear_map_hash_slots = memblock_alloc_try_nid( linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, @@ -1076,7 +1143,6 @@ static void __init htab_initialize(void) panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", __func__, linear_map_hash_count, &ppc64_rma_size); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ /* create bolted the linear mapping in the hash table */ for_each_mem_range(i, &base, &end) { @@ -1248,18 +1314,19 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { + if (!test_bit(PG_dcache_clean, &folio->flags) && + !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } else pp |= HPTE_R_N; } @@ -1781,7 +1848,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* @@ -1791,9 +1858,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) - return; - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; @@ -1990,7 +2054,72 @@ repeat: return slot; } -#ifdef CONFIG_DEBUG_PAGEALLOC +void hpt_clear_stress(void) +{ + int cpu = raw_smp_processor_id(); + int g; + + for (g = 0; g < stress_nr_groups(); g++) { + unsigned long last_group; + last_group = stress_hpt_struct[cpu].last_group[g]; + + if (last_group != -1UL) { + int i; + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[g] = -1; + } + } +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group) +{ + unsigned long last_group; + int cpu = raw_smp_processor_id(); + + last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; + if (hpte_group == last_group) + return; + + if (last_group != -1UL) { + int i; + /* + * Concurrent CPUs might be inserting into this group, so + * give up after a number of iterations, to prevent a live + * lock. + */ + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1; + } + + if (ea >= PAGE_OFFSET) { + /* + * We would really like to prefetch to get the TLB loaded, then + * remove the PTE before returning from fault interrupt, to + * increase the hash fault rate. + * + * Unfortunately QEMU TCG does not model the TLB in a way that + * makes this possible, and systemsim (mambo) emulator does not + * bring in TLBs with prefetches (although loads/stores do + * work for non-CI PTEs). + * + * So remember this PTE and clear it on the next hash fault. + */ + memmove(&stress_hpt_struct[cpu].last_group[1], + &stress_hpt_struct[cpu].last_group[0], + (stress_nr_groups() - 1) * sizeof(unsigned long)); + stress_hpt_struct[cpu].last_group[0] = hpte_group; + } +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); + static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash; @@ -2005,15 +2134,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) if (!vsid) return; + if (linear_map_hash_slots[lmi] & 0x80) + return; + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2023,11 +2155,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + raw_spin_lock(&linear_map_hash_lock); + if (!(linear_map_hash_slots[lmi] & 0x80)) { + raw_spin_unlock(&linear_map_hash_lock); + return; + } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -2037,7 +2172,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) mmu_kernel_ssize, 0); } -void hash__kernel_map_pages(struct page *page, int numpages, int enable) +int hash__kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long flags, vaddr, lmi; int i; @@ -2054,8 +2189,9 @@ void hash__kernel_map_pages(struct page *page, int numpages, int enable) kernel_unmap_linear_page(vaddr, lmi); } local_irq_restore(flags); + return 0; } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index 3bc0eb21b2a0..5a2e512e96db 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -143,11 +143,14 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { + unsigned long psize; if (radix_enabled()) return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } void __init hugetlbpage_init_defaultsize(void) diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index 5045048ce244..a57a25f06a21 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -13,6 +13,17 @@ static inline bool stress_slb(void) return static_branch_unlikely(&stress_slb_key); } +extern bool stress_hpt_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_hpt_key); + +static inline bool stress_hpt(void) +{ + return static_branch_unlikely(&stress_hpt_key); +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group); + void slb_setup_new_exec(void); void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 7fcfba162e0d..c0e8d597e4cb 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { @@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c766e4c26e42..1715b07c630c 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx) static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 7b9966402b25..83823db3488b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -9,6 +9,7 @@ #include <linux/memremap.h> #include <linux/pkeys.h> #include <linux/debugfs.h> +#include <linux/proc_fs.h> #include <misc/cxl-base.h> #include <asm/pgalloc.h> @@ -64,11 +65,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -84,12 +113,29 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd))); + WARN_ON(!(pmd_leaf(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_leaf(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -100,14 +146,14 @@ static void do_serialize(void *arg) } /* - * Serialize against find_current_mm_pte which does lock-less + * Serialize against __find_linux_pte() which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. + * __find_linux_pte() to finish. */ void serialize_against_pte_lookup(struct mm_struct *mm) { @@ -147,11 +193,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -166,6 +236,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) { return pfn_pmd(page_to_pfn(page), pgprot); @@ -306,22 +385,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -331,12 +410,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -357,15 +436,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -463,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, set_pte_at(vma->vm_mm, addr, ptep, pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd @@ -484,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } +#endif /* * Does the CPU support tlbie? @@ -553,8 +634,13 @@ EXPORT_SYMBOL_GPL(memremap_compat_align); pgprot_t vm_get_page_prot(unsigned long vm_flags) { - unsigned long prot = pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) + vm_flags |= VM_READ; + + prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]); if (vm_flags & VM_SAO) prot |= _PAGE_SAO; diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 753e62ba67af..a974baf8f327 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -10,6 +10,7 @@ #include <asm/mmu.h> #include <asm/setup.h> #include <asm/smp.h> +#include <asm/firmware.h> #include <linux/pkeys.h> #include <linux/of_fdt.h> @@ -88,7 +89,8 @@ static int __init scan_pkey_feature(void) unsigned long pvr = mfspr(SPRN_PVR); if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || - PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9) + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 || + PVR_VER(pvr) == PVR_HX_C2000) pkeys_total = 32; } } @@ -290,7 +292,7 @@ void setup_kuap(bool disabled) if (smp_processor_id() == boot_cpuid) { pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUAP; + cur_cpu_spec->mmu_features |= MMU_FTR_KUAP; } /* diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index d2fb776febb4..35fd2a95be24 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, @@ -46,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, pte_t old_pte, pte_t pte) { struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index db2f3d193448..15e88f1439ec 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -30,11 +30,13 @@ #include <asm/trace.h> #include <asm/uaccess.h> #include <asm/ultravisor.h> +#include <asm/set_memory.h> #include <trace/events/thp.h> +#include <mm/mmu_decl.h> + unsigned int mmu_base_pid; -unsigned long radix_mem_block_size __ro_after_init; static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) @@ -202,14 +204,14 @@ static void radix__change_memory_range(unsigned long start, unsigned long end, pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; - if (pud_is_leaf(*pudp)) { + if (pud_leaf(*pudp)) { ptep = (pte_t *)pudp; goto update_the_pte; } pmdp = pmd_alloc(&init_mm, pudp, idx); if (!pmdp) continue; - if (pmd_is_leaf(*pmdp)) { + if (pmd_leaf(*pmdp)) { ptep = pmdp_ptep(pmdp); goto update_the_pte; } @@ -228,9 +230,17 @@ void radix__mark_rodata_ro(void) unsigned long start, end; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -259,21 +269,40 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); #endif return end; } static int __meminit create_physical_mapping(unsigned long start, unsigned long end, - unsigned long max_mapping_size, int nid, pgprot_t _prot) { unsigned long vaddr, addr, mapping_size = 0; bool prev_exec, exec = false; pgprot_t prot; int psize; + unsigned long max_mapping_size = memory_block_size; + + if (debug_pagealloc_enabled_or_kfence()) + max_mapping_size = PAGE_SIZE; start = ALIGN(start, PAGE_SIZE); end = ALIGN_DOWN(end, PAGE_SIZE); @@ -352,7 +381,6 @@ static void __init radix_init_pgtable(void) } WARN_ON(create_physical_mapping(start, end, - radix_mem_block_size, -1, PAGE_KERNEL)); } @@ -473,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, return 1; } -#ifdef CONFIG_MEMORY_HOTPLUG -static int __init probe_memory_block_size(unsigned long node, const char *uname, int - depth, void *data) -{ - unsigned long *mem_block_size = (unsigned long *)data; - const __be32 *prop; - int len; - - if (depth != 1) - return 0; - - if (strcmp(uname, "ibm,dynamic-reconfiguration-memory")) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); - - if (!prop || len < dt_root_size_cells * sizeof(__be32)) - /* - * Nothing in the device tree - */ - *mem_block_size = MIN_MEMORY_BLOCK_SIZE; - else - *mem_block_size = of_read_number(prop, dt_root_size_cells); - return 1; -} - -static unsigned long __init radix_memory_block_size(void) -{ - unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE; - - /* - * OPAL firmware feature is set by now. Hence we are ok - * to test OPAL feature. - */ - if (firmware_has_feature(FW_FEATURE_OPAL)) - mem_block_size = 1UL * 1024 * 1024 * 1024; - else - of_scan_flat_dt(probe_memory_block_size, &mem_block_size); - - return mem_block_size; -} - -#else /* CONFIG_MEMORY_HOTPLUG */ - -static unsigned long __init radix_memory_block_size(void) -{ - return 1UL * 1024 * 1024 * 1024; -} - -#endif /* CONFIG_MEMORY_HOTPLUG */ - - void __init radix__early_init_devtree(void) { int rc; @@ -548,16 +524,6 @@ void __init radix__early_init_devtree(void) mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize = psize_to_rpti_pgsize(MMU_PAGE_64K); } - - /* - * Max mapping size used when mapping pages. We don't use - * ppc_md.memory_block_size() here because this get called - * early and we don't have machine probe called yet. Also - * the pseries implementation only check for ibm,lmb-size. - * All hypervisor supporting radix do expose that device - * tree node. - */ - radix_mem_block_size = radix_memory_block_size(); return; } @@ -572,17 +538,6 @@ void __init radix__early_init_mmu(void) #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; -#endif #endif /* * initialize page table size @@ -715,10 +670,60 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) { - unsigned long next; + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. + */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; pte_t *pte; pte = pte_start + pte_index(addr); @@ -730,23 +735,28 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; } - - pte_clear(&init_mm, addr, pte); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif } + if (direct) + update_page_count(mmu_virtual_psize, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pte_t *pte_base; pmd_t *pmd; @@ -757,26 +767,36 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, if (!pmd_present(*pmd)) continue; - if (pmd_is_leaf(*pmd)) { - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) { - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (pmd_leaf(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; } - pte_clear(&init_mm, addr, (pte_t *)pmd); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } + if (direct) + update_page_count(MMU_PAGE_2M, -pages); } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pmd_t *pmd_base; pud_t *pud; @@ -787,23 +807,28 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, if (!pud_present(*pud)) continue; - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { if (!IS_ALIGNED(addr, PUD_SIZE) || !IS_ALIGNED(next, PUD_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); continue; } pte_clear(&init_mm, addr, (pte_t *)pud); + pages++; continue; } pmd_base = pud_pgtable(*pud); - remove_pmd_table(pmd_base, addr, next); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } + if (direct) + update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; @@ -820,7 +845,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) if (!p4d_present(*p4d)) continue; - if (p4d_is_leaf(*p4d)) { + if (p4d_leaf(*p4d)) { if (!IS_ALIGNED(addr, P4D_SIZE) || !IS_ALIGNED(next, P4D_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); @@ -832,7 +857,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) } pud_base = p4d_pgtable(*p4d); - remove_pud_table(pud_base, addr, next); + remove_pud_table(pud_base, addr, next, direct, altmap); free_pud_table(pud_base, p4d); } @@ -850,12 +875,12 @@ int __meminit radix__create_section_mapping(unsigned long start, } return create_physical_mapping(__pa(start), __pa(end), - radix_mem_block_size, nid, prot); + nid, prot); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end); + remove_pagetable(start, end, true, NULL); return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -873,7 +898,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys) { /* Create a PTE encoding */ - unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; @@ -882,26 +906,438 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return -1; } - ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid); BUG_ON(ret); return 0; } + +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_leaf(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); + continue; + } else if (altmap) { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size); + remove_pagetable(start, start + page_size, true, NULL); } -#endif -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -void radix__kernel_map_pages(struct page *page, int numpages, int enable) +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { - pr_warn_once("DEBUG_PAGEALLOC not supported in radix mode\n"); + remove_pagetable(start, end, false, altmap); } #endif +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -916,8 +1352,25 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif - old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); + trace_hugepage_update_pmd(addr, old, clr, set); + + return old; +} + +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); return old; } @@ -937,15 +1390,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /* - * pmdp collapse_flush need to ensure that there are no parallel gup - * walk after this call. This is needed so that we can have stable - * page ref count when collapsing a page. We don't allow a collapse page - * if we have gup taken on the page. We can ensure that by sending IPI - * because gup walk happens with IRQ disabled. - */ - serialize_against_pte_lookup(vma->vm_mm); - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; @@ -1007,27 +1451,43 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, unsigned long address, int psize) { struct mm_struct *mm = vma->vm_mm; - unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | - _PAGE_RW | _PAGE_EXEC); + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY | + _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); unsigned long change = pte_val(entry) ^ pte_val(*ptep); /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. + * On POWER9, the NMMU is not able to relax PTE access permissions + * for a translation with a TLB. The PTE must be invalidated, TLB + * flushed before the new PTE is installed. + * + * This only needs to be done for radix, because hash translation does + * flush when updating the linux pte (and we don't support NMMU + * accelerators on HPT on POWER9 anyway XXX: do we?). + * + * POWER10 (and P9P) NMMU does behave as per ISA. */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && + atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ new_pte = old_pte | set; radix__flush_tlb_page_psize(mm, address, psize); __radix_pte_update(ptep, _PAGE_INVALID, new_pte); @@ -1035,9 +1495,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, __radix_pte_update(ptep, 0, set); /* * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architecture. + * restrictions when the address space (modulo the POWER9 nest + * MMU issue above) because the MMU will reload the PTE after + * taking an access fault, as defined by the architecture. See + * "Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates)" in + * Power ISA Version 3.1B. */ } /* See ptesync comment in radix__set_pte_at */ @@ -1050,11 +1513,12 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && (atomic_read(&mm->context.copros) > 0)) radix__flush_tlb_page(vma, addr); @@ -1076,7 +1540,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } @@ -1123,7 +1587,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (pmd_is_leaf(*pmd)) { + if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index dda51fef2d2e..9e1f6558d026 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -127,21 +127,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbie_pid_lpid(unsigned long pid, - unsigned long lpid, - unsigned long ric) -{ - unsigned long rb, rs, prs, r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -202,23 +187,6 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, - unsigned long lpid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb, rs, prs, r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap, unsigned long ric) { @@ -264,22 +232,6 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, } } -static inline void fixup_tlbie_va_range_lpid(unsigned long va, - unsigned long pid, - unsigned long lpid, - unsigned long ap) -{ - if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); - } - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); - } -} - static inline void fixup_tlbie_pid(unsigned long pid) { /* @@ -299,26 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid) } } -static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) -{ - /* - * We can use any address for the invalidation, pick one which is - * probably unused as an optimisation. - */ - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); - } - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), - RIC_FLUSH_TLB); - } -} - static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap) { @@ -416,31 +348,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, - unsigned long ric) -{ - asm volatile("ptesync" : : : "memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); - fixup_tlbie_pid_lpid(pid, lpid); - break; - case RIC_FLUSH_PWC: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); - fixup_tlbie_pid_lpid(pid, lpid); - } - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} struct tlbiel_pid { unsigned long pid; unsigned long ric; @@ -566,20 +473,6 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, fixup_tlbie_va_range(addr - page_size, pid, ap); } -static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, - unsigned long pid, unsigned long lpid, - unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); - - fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); -} - static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -660,18 +553,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, - unsigned long pid, unsigned long lpid, - unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync" : : : "memory"); - if (also_pwc) - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); - __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} - static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, @@ -700,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); @@ -713,12 +595,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP void radix__local_flush_all_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_all_mm); @@ -732,12 +615,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -755,10 +639,18 @@ EXPORT_SYMBOL(radix__local_flush_tlb_page); static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; if (atomic_read(&mm->context.copros) > 0) return true; return false; @@ -784,12 +676,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) goto out; if (current->active_mm == mm) { + unsigned long flags; + WARN_ON_ONCE(current->mm != NULL); - /* Is a kernel thread and is using mm as the lazy tlb */ - mmgrab(&init_mm); + /* + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. + */ + local_irq_save(flags); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); - mmdrop(mm); + mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } /* @@ -801,7 +701,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) * that's what the caller expects. */ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { - atomic_dec(&mm->context.active_cpus); + dec_mm_active_cpus(mm); cpumask_clear_cpu(cpu, mm_cpumask(mm)); always_flush = true; } @@ -937,7 +837,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -968,6 +868,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -977,7 +878,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1001,6 +902,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1016,7 +918,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1096,6 +998,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +/* + * Doesn't appear to be used anywhere. Remove. + */ #define TLB_FLUSH_ALL -1UL /* @@ -1117,23 +1022,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid, flush_pwc = false; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1171,15 +1075,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } } } else { - bool hflush = false; + bool hflush; unsigned long hstart, hend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart < hend) - hflush = true; - } + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend; if (type == FLUSH_TYPE_LOCAL) { asm volatile("ptesync": : :"memory"); @@ -1210,6 +1111,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1294,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm || tlb->need_flush_all) { - __flush_all_mm(mm, true); + if (tlb->fullmm) { + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * Shootdown based lazy tlb mm refcounting means we + * have to IPI everyone in the mm_cpumask anyway soon + * when the mm goes away, so might as well do it as + * part of the final flush now. + * + * If lazy shootdown was improved to reduce IPIs (e.g., + * by batching), then it may end up being better to use + * tlbies here instead. + */ + preempt_disable(); + + smp_mb(); /* see radix__flush_tlb_mm */ + exit_flush_lazy_tlbs(mm); + __flush_all_mm(mm, true); + + preempt_enable(); + } else { + __flush_all_mm(mm, true); + } + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) radix__flush_tlb_mm(mm); @@ -1317,25 +1240,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; - fullmm = (end == TLB_FLUSH_ALL); + WARN_ON_ONCE(end == TLB_FLUSH_ALL); preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1377,6 +1297,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, @@ -1398,7 +1319,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; /* 4k page size, just blow the world */ @@ -1446,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; @@ -1471,6 +1399,127 @@ void radix__flush_tlb_all(void) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static __always_inline void __tlbie_pid_lpid(unsigned long pid, + unsigned long lpid, + unsigned long ric) +{ + unsigned long rb, rs, prs, r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, + unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb, rs, prs, r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), + RIC_FLUSH_TLB); + } +} + +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, + unsigned long ric) +{ + asm volatile("ptesync" : : : "memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + fixup_tlbie_pid_lpid(pid, lpid); + break; + case RIC_FLUSH_PWC: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + fixup_tlbie_pid_lpid(pid, lpid); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +static inline void fixup_tlbie_va_range_lpid(unsigned long va, + unsigned long pid, + unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); + } +} + +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); +} + +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync" : : : "memory"); + if (also_pwc) + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + /* * Performs process-scoped invalidations for a given LPID * as part of H_RPT_INVALIDATE hcall. diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 6956f637a38c..f2708c8629a5 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -13,6 +13,7 @@ #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> +#include <asm/lppaca.h> #include <asm/ppc-opcode.h> #include <asm/cputable.h> #include <asm/cacheflush.h> diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972..ec98e526167e 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -71,6 +71,8 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, if (pmd_none(*pmd)) return; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return; arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); @@ -143,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops subpage_walk_ops = { .pmd_entry = subpage_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma_range(vmi, vma, addr + len) { + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 0e9b4879c0f9..15189592da09 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p) invalidate_icache_range(addr, addr + PAGE_SIZE); } -static void flush_dcache_icache_hugepage(struct page *page) +void flush_dcache_icache_folio(struct folio *folio) { - int i; - int nr = compound_nr(page); + unsigned int i, nr = folio_nr_pages(folio); - if (!PageHighMem(page)) { + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); for (i = 0; i < nr; i++) - __flush_dcache_icache(lowmem_page_address(page + i)); - } else { + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { for (i = 0; i < nr; i++) { - void *start = kmap_local_page(page + i); + void *start = kmap_local_folio(folio, i * PAGE_SIZE); __flush_dcache_icache(start); kunmap_local(start); } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - if (flush_coherent_icache()) - return; - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - - if (!PageHighMem(page)) { - __flush_dcache_icache(lowmem_page_address(page)); - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_local_page(page); - - __flush_dcache_icache(start); - kunmap_local(start); } else { - flush_dcache_icache_phys(page_to_phys(page)); + unsigned long pfn = folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); } } -EXPORT_SYMBOL(flush_dcache_icache_page); +EXPORT_SYMBOL(flush_dcache_icache_folio); void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c1cb21a00884..f49fd873df8d 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (mm->pgd == NULL) return -EFAULT; - mmap_read_lock(mm); - ret = -EFAULT; - vma = find_vma(mm, ea); + vma = lock_mm_and_find_vma(mm, ea, NULL); if (!vma) - goto out_unlock; - - if (ea < vma->vm_start) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_unlock; - if (expand_stack(vma, ea)) - goto out_unlock; - } + return -EFAULT; + ret = -EFAULT; is_write = dsisr & DSISR_ISSTORE; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) @@ -65,6 +57,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, ret = 0; *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); + + /* The fault is fully completed (including releasing mmap lock) */ + if (*flt & VM_FAULT_COMPLETED) + return 0; + if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 2369d1bf2411..c110ab8fa8a3 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -67,7 +67,7 @@ static int drmem_update_dt_v1(struct device_node *memory, struct property *new_prop; struct of_drconf_cell_v1 *dr_cell; struct drmem_lmb *lmb; - u32 *p; + __be32 *p; new_prop = clone_property(prop, prop->length); if (!new_prop) @@ -393,17 +393,17 @@ static const __be32 *of_get_usable_memory(struct device_node *dn) int walk_drmem_lmbs(struct device_node *dn, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)) { + struct device_node *root = of_find_node_by_path("/"); const __be32 *prop, *usm; int ret = -ENODEV; - if (!of_root) + if (!root) return ret; /* Get the address & size cells */ - of_node_get(of_root); - n_root_addr_cells = of_n_addr_cells(of_root); - n_root_size_cells = of_n_size_cells(of_root); - of_node_put(of_root); + n_root_addr_cells = of_n_addr_cells(root); + n_root_size_cells = of_n_size_cells(root); + of_node_put(root); if (init_drmem_lmb_size(dn)) return ret; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d53fed4eccbd..53335ae21a40 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) return __bad_area_nosemaphore(regs, address, si_code); } -static noinline int bad_area(struct pt_regs *regs, unsigned long address) -{ - return __bad_area(regs, address, SEGV_MAPERR); -} - static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma) { @@ -270,8 +265,18 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma return false; } + /* + * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as + * defined in protection_map[]. In that case Read faults can only be + * caused by a PROT_NONE mapping. However a non exec access on a + * VM_EXEC only mapping is invalid anyway, so report it as such. + */ if (unlikely(!vma_is_accessible(vma))) return true; + + if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC) + return true; + /* * We should ideally do the vma pkey access check here. But in the * fault path, handle_mm_fault() also does the same check. To avoid @@ -367,7 +372,22 @@ static void sanity_check_fault(bool is_write, bool is_user, #elif defined(CONFIG_PPC_8xx) #define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) #elif defined(CONFIG_PPC64) -#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +static int page_fault_is_bad(unsigned long err) +{ + unsigned long flag = DSISR_BAD_FAULT_64S; + + /* + * PAPR+ v2.11 § 14.15.3.4.1 (unreleased) + * If byte 0, bit 3 of pi-attribute-specifier-type in + * ibm,pi-features property is defined, ignore the DSI error + * which is caused by the paste instruction on the + * suspended NX window. + */ + if (mmu_has_feature(MMU_FTR_NX_DSI)) + flag &= ~DSISR_BAD_COPYPASTE; + + return err & flag; +} #else #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) #endif @@ -450,6 +470,41 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) { + vma_end_read(vma); + goto lock_mmap; + } + + if (unlikely(access_error(is_write, is_exec, vma))) { + vma_end_read(vma); + goto lock_mmap; + } + + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + + if (fault_signal_pending(fault, regs)) + return user_mode(regs) ? 0 : SIGBUS; + +lock_mmap: + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -457,40 +512,12 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * exceptions table. lock_mm_and_find_vma() handles that logic. */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!is_user && !search_exception_tables(regs->nip)) - return bad_area_nosemaphore(regs, address); - retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) - return bad_area(regs, address); - - if (unlikely(vma->vm_start > address)) { - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - return bad_area(regs, address); - - if (unlikely(expand_stack(vma, address))) - return bad_area(regs, address); - } + return bad_area_nosemaphore(regs, address); if (unlikely(access_pkey_error(is_write, is_exec, (error_code & DSISR_KEYFAULT), vma))) @@ -511,6 +538,10 @@ retry: if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto out; + /* * Handle the retry right now, the mmap_lock has been released in that * case. @@ -522,9 +553,11 @@ retry: mmap_read_unlock(current->mm); +done: if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); +out: /* * Major/minor page fault accounting. */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b282af39fcf6..594a4b7b2ca2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,7 @@ #include <asm/setup.h> #include <asm/hugetlb.h> #include <asm/pte-walk.h> +#include <asm/firmware.h> bool hugetlb_disabled = false; @@ -182,7 +183,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) - return pte_alloc_map(mm, (pmd_t *)hpdp, addr); + return pte_alloc_huge(mm, (pmd_t *)hpdp, addr); BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); @@ -225,7 +226,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; return 1; } @@ -391,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, * single hugepage, but all of them point to * the same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd)); if (more > next) next = more; @@ -433,7 +434,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, * single hugepage, but all of them point to * the same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud)); if (more > next) next = more; @@ -495,7 +496,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, * for a single hugepage, but all of them point to the * same kmem cache that holds the hugepte. */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); + more = addr + (1UL << hugepd_shift(*(hugepd_t *)pgd)); if (more > next) next = more; @@ -505,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift) -{ - pte_t *ptep; - spinlock_t *ptl; - struct page *page = NULL; - unsigned long mask; - int shift = hugepd_shift(hpd); - struct mm_struct *mm = vma->vm_mm; - -retry: - /* - * hugepage directory entries are protected by mm->page_table_lock - * Use this instead of huge_pte_lockptr - */ - ptl = &mm->page_table_lock; - spin_lock(ptl); - - ptep = hugepte_offset(hpd, address, pdshift); - if (pte_present(*ptep)) { - mask = (1UL << shift) - 1; - page = pte_page(*ptep); - page += ((address & mask) >> PAGE_SHIFT); - if (flags & FOLL_GET) - get_page(page); - } else { - if (is_hugetlb_entry_migration(*ptep)) { - spin_unlock(ptl); - __migration_entry_wait(mm, ptep, ptl); - goto retry; - } - } - spin_unlock(ptl); - return page; -} - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); @@ -622,7 +586,7 @@ static int __init hugetlbpage_init(void) if (pdshift > shift) { if (!IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(pdshift - shift); - } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || + } else if (IS_ENABLED(CONFIG_PPC_E500) || IS_ENABLED(CONFIG_PPC_8xx)) { pgtable_cache_add(PTE_T_ORDER); } @@ -650,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void) */ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) { - VM_WARN_ON(order < MAX_ORDER); + if (order) hugetlb_cma_reserve(order); - } } diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 119ef491f797..d3a7726ecf51 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -126,7 +126,7 @@ void pgtable_cache_add(unsigned int shift) * as to leave enough 0 bits in the address to contain it. */ unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, HUGEPD_SHIFT_MASK + 1); - struct kmem_cache *new; + struct kmem_cache *new = NULL; /* It would be nice if this was a BUILD_BUG_ON(), but at the * moment, gcc doesn't seem to recognize is_power_of_2 as a @@ -139,7 +139,8 @@ void pgtable_cache_add(unsigned int shift) align = max_t(unsigned long, align, minalign); name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); - new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); + if (name) + new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); if (!new) panic("Could not allocate pgtable cache for order %d", shift); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 693a3a7a9463..4e71dfe7d026 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -39,6 +39,7 @@ #include <asm/hugetlb.h> #include <asm/kup.h> #include <asm/kasan.h> +#include <asm/fixmap.h> #include <mm/mmu_decl.h> @@ -69,44 +70,10 @@ EXPORT_SYMBOL(agp_special_page); void MMU_init(void); -/* - * this tells the system to map all of ram with the segregs - * (i.e. page tables) instead of the bats. - * -- Cort - */ -int __map_without_bats; -int __map_without_ltlbs; - /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; /* - * Check for command-line options that affect what MMU_init will do. - */ -static void __init MMU_setup(void) -{ - /* Check for nobats option (used in mapin_ram). */ - if (strstr(boot_command_line, "nobats")) { - __map_without_bats = 1; - } - - if (strstr(boot_command_line, "noltlbs")) { - __map_without_ltlbs = 1; - } - if (IS_ENABLED(CONFIG_PPC_8xx)) - return; - - if (IS_ENABLED(CONFIG_KFENCE)) - __map_without_ltlbs = 1; - - if (debug_pagealloc_enabled()) - __map_without_ltlbs = 1; - - if (strict_kernel_rwx_enabled()) - __map_without_ltlbs = 1; -} - -/* * MMU_init sets up the basic memory mappings for the kernel, * including both RAM and possibly some I/O regions, * and sets up the page tables and the MMU hardware ready to go. @@ -116,31 +83,15 @@ void __init MMU_init(void) if (ppc_md.progress) ppc_md.progress("MMU:enter", 0x111); - /* parse args from command line */ - MMU_setup(); - - /* - * Reserve gigantic pages for hugetlb. This MUST occur before - * lowmem_end_addr is initialized below. - */ - if (memblock.memory.cnt > 1) { -#ifndef CONFIG_WII - memblock_enforce_memory_limit(memblock.memory.regions[0].size); - pr_warn("Only using first contiguous memory region\n"); -#else - wii_memory_fixups(); -#endif - } - total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; lowmem_end_addr = memstart_addr + total_lowmem; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB * entries, so we need to adjust lowmem to match the amount we can map * in the fixed entries */ adjust_total_lowmem(); -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; @@ -176,6 +127,8 @@ void __init MMU_init(void) setup_kup(); + update_mmu_feature_fixups(MMU_FTR_KUAP); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 05b0d584e50b..d96bbc001e73 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -40,6 +40,7 @@ #include <linux/of_fdt.h> #include <linux/libfdt.h> #include <linux/memremap.h> +#include <linux/memory.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -92,7 +93,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad * a page table lookup here because with the hash translation we don't keep * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { struct page *start; unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; @@ -183,13 +184,13 @@ static __meminit int vmemmap_list_populate(unsigned long phys, return 0; } -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, - unsigned long page_size) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); - if ((start_pfn + nr_pfn) > altmap->end_pfn) + if ((start_pfn + nr_pfn - 1) > altmap->end_pfn) return true; if (start_pfn < altmap->base_pfn) @@ -198,8 +199,8 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; @@ -272,6 +273,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { @@ -303,8 +316,8 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -void __ref vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; unsigned long page_order = get_order(page_size); @@ -314,8 +327,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, start = ALIGN_DOWN(start, page_size); if (altmap) { alt_start = altmap->base_pfn; - alt_end = altmap->base_pfn + altmap->reserve + - altmap->free + altmap->alloc + altmap->align; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; } pr_debug("vmemmap_free %lx...%lx\n", start, end); @@ -362,6 +374,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, vmemmap_remove_mapping(start, page_size); } } + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); +} + #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) @@ -471,6 +494,130 @@ static int __init dt_scan_mmu_pid_width(unsigned long node, return 1; } +/* + * Outside hotplug the kernel uses this value to map the kernel direct map + * with radix. To be compatible with older kernels, let's keep this value + * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map + * things with 1GB size in the case where we don't support hotplug. + */ +#ifndef CONFIG_MEMORY_HOTPLUG +#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M +#else +#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE +#endif + +static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size) +{ + unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE; + + for (; *block_size > min_memory_block_size; *block_size >>= 2) { + if ((mem_size & *block_size) == 0) + break; + } +} + +static int __init probe_memory_block_size(unsigned long node, const char *uname, int + depth, void *data) +{ + const char *type; + unsigned long *block_size = (unsigned long *)data; + const __be32 *reg, *endp; + int l; + + if (depth != 1) + return 0; + /* + * If we have dynamic-reconfiguration-memory node, use the + * lmb value. + */ + if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) { + + const __be32 *prop; + + prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); + + if (!prop || l < dt_root_size_cells * sizeof(__be32)) + /* + * Nothing in the device tree + */ + *block_size = DEFAULT_MEMORY_BLOCK_SIZE; + else + *block_size = of_read_number(prop, dt_root_size_cells); + /* + * We have found the final value. Don't probe further. + */ + return 1; + } + /* + * Find all the device tree nodes of memory type and make sure + * the area can be mapped using the memory block size value + * we end up using. We start with 1G value and keep reducing + * it such that we can map the entire area using memory_block_size. + * This will be used on powernv and older pseries that don't + * have ibm,lmb-size node. + * For ex: with P5 we can end up with + * memory@0 -> 128MB + * memory@128M -> 64M + * This will end up using 64MB memory block size value. + */ + type = of_get_flat_dt_prop(node, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (!reg) + reg = of_get_flat_dt_prop(node, "reg", &l); + if (!reg) + return 0; + + endp = reg + (l / sizeof(__be32)); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { + const char *compatible; + u64 size; + + dt_mem_next_cell(dt_root_addr_cells, ®); + size = dt_mem_next_cell(dt_root_size_cells, ®); + + if (size) { + update_memory_block_size(block_size, size); + continue; + } + /* + * ibm,coherent-device-memory with linux,usable-memory = 0 + * Force 256MiB block size. Work around for GPUs on P9 PowerNV + * linux,usable-memory == 0 implies driver managed memory and + * we can't use large memory block size due to hotplug/unplug + * limitations. + */ + compatible = of_get_flat_dt_prop(node, "compatible", NULL); + if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) { + if (*block_size > SZ_256M) + *block_size = SZ_256M; + /* + * We keep 256M as the upper limit with GPU present. + */ + return 0; + } + } + /* continue looking for other memory device types */ + return 0; +} + +/* + * start with 1G memory block size. Early init will + * fix this with correct value. + */ +unsigned long memory_block_size __ro_after_init = 1UL << 30; +static void __init early_init_memory_block_size(void) +{ + /* + * We need to do memory_block_size probe early so that + * radix__early_init_mmu() can use this as limit for + * mapping page size. + */ + of_scan_flat_dt(probe_memory_block_size, &memory_block_size); +} + void __init mmu_early_init_devtree(void) { bool hvmode = !!(mfmsr() & MSR_HV); @@ -504,6 +651,8 @@ void __init mmu_early_init_devtree(void) if (!hvmode) early_check_vec5(); + early_init_memory_block_size(); + if (early_radix_enabled()) { radix__early_init_devtree(); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 4f12504fb405..7b0afcabd89f 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) { pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); @@ -50,10 +50,6 @@ void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long f if (pte_write(pte)) pte = pte_mkdirty(pte); - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - pte = pte_mkprivileged(pte); - if (iowa_is_active()) return iowa_ioremap(addr, size, pte_pgprot(pte), caller); return __ioremap_caller(addr, size, pte_pgprot(pte), caller); @@ -66,7 +62,7 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long i; for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); + int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot)); if (WARN_ON_ONCE(err)) /* Should clean up */ return err; @@ -74,27 +70,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, return 0; } - -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller) -{ - struct vm_struct *area; - int ret; - unsigned long va; - - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = pa; - va = (unsigned long)area->addr; - - ret = ioremap_page_range(va, va + size, pa, prot); - if (!ret) - return (void __iomem *)area->addr + offset; - - vunmap_range(va, va + size); - free_vm_area(area); - - return NULL; -} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 9d13143b8be4..ca5bc6be3e6f 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -22,6 +22,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call int err; /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. * Before then, we use space going down from IOREMAP_TOP @@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16 * 1024 * 1024) - p += _ISA_MEM_BASE; - #ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. @@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call return (void __iomem *)v + offset; if (slab_is_available()) - return do_ioremap(p, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); /* * Should check if it is a candidate for a BAT mapping @@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr) if (v_block_mapped((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long)addr < ioremap_bot) - vunmap((void *)(PAGE_MASK & (unsigned long)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 3acece00b33e..d24e5f166723 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; if (slab_is_available()) - return do_ioremap(paligned, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); @@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, */ void iounmap(volatile void __iomem *token) { - void *addr; - if (!slab_is_available()) return; - addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); - - if ((unsigned long)addr < ioremap_bot) { - pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); - return; - } - vunmap(addr); + generic_iounmap(PCI_FIX_ADDR(token)); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index 4999aadb1867..f9522fd70b2f 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n +KCOV_INSTRUMENT := n obj-$(CONFIG_PPC32) += init_32.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index f3e4d069e0ba..aa9aa11927b2 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -25,7 +25,7 @@ static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) int i; for (i = 0; i < PTRS_PER_PTE; i++, ptep++) - __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1); } int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) @@ -64,6 +64,7 @@ int __init __weak kasan_init_region(void *start, size_t size) if (ret) return ret; + k_start = k_start & PAGE_MASK; block = memblock_alloc(k_end - k_start, PAGE_SIZE); if (!block) return -ENOMEM; diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c new file mode 100644 index 000000000000..11519e88dc6b --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3e powerpc + * + * Copyright 2022, Christophe Leroy, CS GROUP France + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/set_memory.h> + +#include <asm/pgalloc.h> + +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} + +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (kasan_pud_table(*p4dp)) { + pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (kasan_pmd_table(*pudp)) { + pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (kasan_pte_table(*pmdp)) { + ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0); + + return 0; +} + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_early_init(void) +{ + int i; + unsigned long addr; + pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START); + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE) + p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud); +} + +void __init kasan_init(void) +{ + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region((void *)start, (void *)end); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 0da5566d6b84..9300d641cf9a 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -99,4 +99,6 @@ void __init kasan_init(void) pr_info("KASAN init done\n"); } +void __init kasan_early_init(void) { } + void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index a97128a48817..3a440004b97d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -25,6 +25,8 @@ #include <asm/mmzone.h> #include <asm/ftrace.h> #include <asm/code-patching.h> +#include <asm/setup.h> +#include <asm/fixmap.h> #include <mm/mmu_decl.h> @@ -33,18 +35,18 @@ unsigned long long memory_limit; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot) +pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size, + pgprot_t vma_prot) { if (ppc_md.phys_mem_access_prot) - return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); + return ppc_md.phys_mem_access_prot(pfn, size, vma_prot); if (!page_is_ram(pfn)) vma_prot = pgprot_noncached(vma_prot); return vma_prot; } -EXPORT_SYMBOL(phys_mem_access_prot); +EXPORT_SYMBOL(__phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG static DEFINE_MUTEX(linear_mapping_mutex); @@ -54,6 +56,7 @@ int memory_add_physaddr_to_nid(u64 start) { return hot_add_scn_to_nid(start); } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif int __weak create_section_mapping(unsigned long start, unsigned long end, @@ -286,7 +289,6 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_pfn); kasan_late_init(); @@ -300,13 +302,13 @@ void __init mem_init(void) for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; struct page *page = pfn_to_page(pfn); - if (!memblock_is_reserved(paddr)) + if (memblock_is_memory(paddr) && !memblock_is_reserved(paddr)) free_highmem_page(page); } } #endif /* CONFIG_HIGHMEM */ -#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) +#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up * functions.... do it here for the non-smp case. @@ -342,7 +344,6 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); - static_branch_enable(&init_mem_is_free); free_initmem_default(POISON_FREE_INITMEM); ftrace_free_init_tramp(); } diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 1fb9c99f8679..b24c19078eb1 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -43,11 +43,13 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + int cpu = smp_processor_id(); bool new_on_cpu = false; /* Mark this context has been used on the new CPU */ - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { - cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + VM_WARN_ON_ONCE(next == &init_mm); + cpumask_set_cpu(cpu, mm_cpumask(next)); inc_mm_active_cpus(next); /* @@ -100,6 +102,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * sub architectures. Out of line for now */ switch_mmu_context(prev, next, tsk); + + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev))); } #ifndef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 63c4b1a4d435..8e84bc214d13 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) @@ -55,7 +55,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); trace_tlbie(0, 0, address, pid, 0, 0, 0); } -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) extern void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, } #endif /* CONFIG_PPC_8xx */ -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -92,29 +92,16 @@ extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); -extern int __map_without_bats; -extern unsigned int rtas_data, rtas_size; - -struct hash_pte; extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ extern unsigned long __max_low_memory; -extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; extern phys_addr_t total_lowmem; extern phys_addr_t memstart_addr; extern phys_addr_t lowmem_end_addr; -#ifdef CONFIG_WII -extern unsigned long wii_hole_start; -extern unsigned long wii_hole_size; - -extern unsigned long wii_mmu_mapin_mem2(unsigned long top); -extern void wii_memory_fixups(void); -#endif - /* ...and now those things that may be slightly different between processor * architectures. -- Dan */ @@ -123,18 +110,18 @@ extern void MMU_init_hw(void); void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif +void mmu_init_secondary(int cpu); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init); -extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys); #ifdef CONFIG_PPC32 extern void adjust_total_lowmem(void); extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); void reloc_kernel_entry(void *fdt, int addr); +void relocate_init(u64 dt_ptr, phys_addr_t start); extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); @@ -161,9 +148,9 @@ struct tlbcam { extern struct tlbcam TLBCAM[NUM_TLBCAMS]; #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx) /* 6xx have BATS */ -/* FSL_BOOKE have TLBCAM */ +/* PPC_85xx have TLBCAM */ /* 8xx have LTLB */ phys_addr_t v_block_mapped(unsigned long va); unsigned long p_block_mapped(phys_addr_t pa); @@ -172,7 +159,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) void mmu_mark_initmem_nx(void); void mmu_mark_rodata_ro(void); #else @@ -184,13 +171,14 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); } + +#ifdef CONFIG_MEMORY_HOTPLUG +int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif + +int hash__kernel_map_pages(struct page *page, int numpages, int enable); diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c index b32e465a3d52..e835e80c09db 100644 --- a/arch/powerpc/mm/nohash/40x.c +++ b/arch/powerpc/mm/nohash/40x.c @@ -43,26 +43,30 @@ #include <mm/mmu_decl.h> -extern int __map_without_ltlbs; /* * MMU_init_hw does the chip-specific initialization of the MMU hardware. */ void __init MMU_init_hw(void) { + int i; + unsigned long zpr; + /* * The Zone Protection Register (ZPR) defines how protection will - * be applied to every page which is a member of a given zone. At - * present, we utilize only two of the 4xx's zones. + * be applied to every page which is a member of a given zone. * The zone index bits (of ZSEL) in the PTE are used for software - * indicators, except the LSB. For user access, zone 1 is used, - * for kernel access, zone 0 is used. We set all but zone 1 - * to zero, allowing only kernel access as indicated in the PTE. - * For zone 1, we set a 01 binary (a value of 10 will not work) + * indicators. We use the 4 upper bits of virtual address to select + * the zone. We set all zones above TASK_SIZE to zero, allowing + * only kernel access as indicated in the PTE. For zones below + * TASK_SIZE, we set a 01 binary (a value of 10 will not work) * to allow user access as indicated in the PTE. This also allows * kernel access as indicated in the PTE. */ - mtspr(SPRN_ZPR, 0x10000000); + for (i = 0, zpr = 0; i < TASK_SIZE >> 28; i++) + zpr |= 1 << (30 - i * 2); + + mtspr(SPRN_ZPR, zpr); flush_instruction_cache(); @@ -94,7 +98,13 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) p = 0; s = total_lowmem; - if (__map_without_ltlbs) + if (IS_ENABLED(CONFIG_KFENCE)) + return 0; + + if (debug_pagealloc_enabled()) + return 0; + + if (strict_kernel_rwx_enabled()) return 0; while (s >= LARGE_PAGE_SIZE_16M) { diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 27f9186ae374..6be6421086ed 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -10,12 +10,12 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> +#include <asm/fixmap.h> + #include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) -extern int __map_without_ltlbs; - static unsigned long block_mapped_ram; /* @@ -28,8 +28,6 @@ phys_addr_t v_block_mapped(unsigned long va) if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) return __pa(va); return 0; @@ -45,8 +43,6 @@ unsigned long p_block_mapped(phys_addr_t pa) if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; - if (__map_without_ltlbs) - return 0; if (pa < block_mapped_ram) return (unsigned long)__va(pa); return 0; @@ -97,7 +93,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot))) return -EINVAL; - set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot))); + set_huge_pte_at(&init_mm, va, ptep, + pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize); return 0; } @@ -153,9 +150,6 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) mmu_mapin_immr(); - if (__map_without_ltlbs) - return 0; - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true); if (debug_pagealloc_enabled_or_kfence()) { top = boundary; @@ -179,8 +173,8 @@ void mmu_mark_initmem_nx(void) unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8; unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, false); - mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); + if (!debug_pagealloc_enabled_or_kfence()) + mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); mmu_pin_tlb(block_mapped_ram, false); } diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b467a25ee155..f3894e79d5f7 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -7,13 +7,13 @@ obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_book3e.o +obj-$(CONFIG_PPC_E500) += e500.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_fsl_book3e.o := n +KCOV_INSTRUMENT_e500.o := n diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index b80fc4a91a53..1c5e4ecbebeb 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -71,7 +71,7 @@ static void __init *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/e500.c index b8ae6c08c06f..921c3521ec11 100644 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -59,7 +59,7 @@ static struct { phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* * Return PA for this VA if it is mapped by a CAM, or 0 */ @@ -117,15 +117,15 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0; if (mmu_has_feature(MMU_FTR_BIG_PHYS)) TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { + if (!is_kernel_addr(virt)) { TLBCAM[index].MAS3 |= MAS3_UR; TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0; } else { TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; } @@ -135,8 +135,8 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, tlbcam_addrs[index].phys = phys; } -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) +static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) { unsigned int camsize = __ilog2(ram); unsigned int align = __ffs(virt | phys); diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 307ca919d393..a134d28a0e4d 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -45,7 +45,9 @@ static inline void book3e_tlb_lock(void) if (!cpu_has_feature(CPU_FTR_SMT)) return; - asm volatile("1: lbarx %0, 0, %1;" + asm volatile(".machine push;" + ".machine e6500;" + "1: lbarx %0, 0, %1;" "cmpwi %0, 0;" "bne 2f;" "stbcx. %2, 0, %1;" @@ -56,6 +58,7 @@ static inline void book3e_tlb_lock(void) "bne 2b;" "b 1b;" "3:" + ".machine pop;" : "=&r" (tmp) : "r" (&paca->tcd_ptr->lock), "r" (token) : "memory"); @@ -103,21 +106,11 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) int found = 0; mtspr(SPRN_MAS6, pid << 16); - if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { - asm volatile( - "li %0,0\n" - "tlbsx. 0,%1\n" - "bne 1f\n" - "li %0,1\n" - "1:\n" - : "=&r"(found) : "r"(ea)); - } else { - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - } + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); return found; } @@ -169,13 +162,9 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) mtspr(SPRN_MAS1, mas1); mtspr(SPRN_MAS2, mas2); - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); asm volatile ("tlbwe"); @@ -189,7 +178,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 0d04f9d5da8d..cdff129abb14 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,13 +13,12 @@ #include <linux/delay.h> #include <linux/memblock.h> #include <linux/libfdt.h> -#include <linux/crash_core.h> +#include <linux/crash_reserve.h> #include <linux/of.h> #include <linux/of_fdt.h> #include <asm/cacheflush.h> #include <asm/kdump.h> #include <mm/mmu_decl.h> -#include <generated/utsrelease.h> struct regions { unsigned long pa_start; @@ -174,12 +173,12 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base); + &crash_base, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c index 552becf90e97..e1f7de2e54ec 100644 --- a/arch/powerpc/mm/nohash/kup.c +++ b/arch/powerpc/mm/nohash/kup.c @@ -5,7 +5,6 @@ #include <linux/export.h> #include <linux/init.h> -#include <linux/jump_label.h> #include <linux/printk.h> #include <linux/smp.h> @@ -13,21 +12,18 @@ #include <asm/smp.h> #ifdef CONFIG_PPC_KUAP -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); - void setup_kuap(bool disabled) { if (disabled) { if (IS_ENABLED(CONFIG_40x)) disable_kuep = true; if (smp_processor_id() == boot_cpuid) - static_branch_enable(&disable_kuap_key); + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; return; } pr_info("Activating Kernel Userspace Access Protection\n"); - __prevent_user_access(KUAP_READ_WRITE); + prevent_user_access(KUAP_READ_WRITE); } #endif diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 5e7ccb48b79c..5ffa0af4328a 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -49,8 +49,7 @@ * other sizes not listed here. The .ind field is only used on MMUs that have * indirect page table entries. */ -#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -81,7 +80,20 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; -#elif defined(CONFIG_PPC_8xx) + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +#ifdef CONFIG_PPC_8xx struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -96,53 +108,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .shift = 23, }, }; -#else -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .ind = 20, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_16K] = { - .shift = 14, - .enc = BOOK3E_PAGESZ_16K, - }, - [MMU_PAGE_64K] = { - .shift = 16, - .ind = 28, - .enc = BOOK3E_PAGESZ_64K, - }, - [MMU_PAGE_1M] = { - .shift = 20, - .enc = BOOK3E_PAGESZ_1M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .ind = 36, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#endif /* CONFIG_FSL_BOOKE */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} -#else -static inline int mmu_get_tsize(int psize) -{ - /* This isn't used on !Book3E for now */ - return 0; -} -#endif /* CONFIG_PPC_BOOK3E_MMU */ +#endif /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash @@ -166,7 +132,7 @@ int extlb_level_exc; #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 /* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ DEFINE_PER_CPU(int, next_tlbcam_idx); EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); @@ -218,6 +184,14 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(local_flush_tlb_page); + +void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + __local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0); +} +EXPORT_SYMBOL(local_flush_tlb_page_psize); + #endif /* @@ -344,17 +318,6 @@ EXPORT_SYMBOL(flush_tlb_page); #endif /* CONFIG_SMP */ -#ifdef CONFIG_PPC_47x -void __init early_init_mmu_47x(void) -{ -#ifdef CONFIG_SMP - unsigned long root = of_get_flat_dt_root(); - if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) - mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); -#endif /* CONFIG_SMP */ -} -#endif /* CONFIG_PPC_47x */ - /* * Flush kernel TLB entries in the given range */ @@ -441,7 +404,7 @@ static void __init setup_page_sizes(void) unsigned int eptcfg; int i, psize; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 unsigned int mmucfg = mfspr(SPRN_MMUCFG); int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); @@ -584,7 +547,7 @@ static void __init setup_mmu_htw(void) patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); break; -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 case PPC_HTW_E6500: extlb_level_exc = EX_TLB_SIZE; patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); @@ -627,7 +590,7 @@ static void early_init_this_mmu(void) } mtspr(SPRN_MAS4, mas4); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; bool map = true; @@ -680,7 +643,7 @@ static void __init early_init_mmu_global(void) /* Look for HW tablewalk support */ setup_mmu_htw(); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { if (book3e_htw_mode == PPC_HTW_NONE) { extlb_level_exc = EX_TLB_SIZE; @@ -701,7 +664,7 @@ static void __init early_init_mmu_global(void) static void __init early_mmu_set_memory_limit(void) { -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { /* * Limit memory so we dont have linear faults. @@ -750,7 +713,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * We crop it to the size of the first MEMBLOCK to * avoid going over total available memory just in case... */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; unsigned int num_cams; @@ -772,8 +735,10 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, #else /* ! CONFIG_PPC64 */ void __init early_init_mmu(void) { -#ifdef CONFIG_PPC_47x - early_init_mmu_47x(); -#endif + unsigned long root = of_get_flat_dt_root(); + + if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) && + of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index dd39074de9af..e1199608ff4d 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -186,7 +186,7 @@ _GLOBAL(_tlbivax_bcast) isync PPC_TLBIVAX(0, R3) isync - eieio + mbar tlbsync BEGIN_FTR_SECTION b 1f @@ -221,7 +221,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) blr #endif /* CONFIG_PPC_47x */ -#elif defined(CONFIG_FSL_BOOKE) +#elif defined(CONFIG_PPC_85xx) /* * FSL BookE implementations. * @@ -294,7 +294,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) /* * New Book3E (>= 2.06) implementation * @@ -355,7 +355,7 @@ _GLOBAL(_tlbivax_bcast) rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND 1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ PPC_TLBIVAX(0,R3) - eieio + mbar tlbsync sync wrtee r10 @@ -364,7 +364,7 @@ _GLOBAL(_tlbivax_bcast) #error Unsupported processor type ! #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) /* * extern void loadcam_entry(unsigned int index) * diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 8b97c4acfebf..7e0b8fe1c279 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -61,7 +61,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) ld r14,PACAPGD(r13) std r15,EX_TLB_R15(r12) std r10,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION mfspr r11, SPRN_SRR1 andi. r10,r11,MSR_PR @@ -70,14 +69,11 @@ START_BTB_FLUSH_SECTION 1: END_BTB_FLUSH_SECTION std r7,EX_TLB_R7(r12) -#endif .endm .macro tlb_epilog_bolted ld r14,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E ld r7,EX_TLB_R7(r12) -#endif ld r10,EX_TLB_R10(r12) ld r11,EX_TLB_R11(r12) ld r13,EX_TLB_R13(r12) @@ -152,16 +148,7 @@ tlb_miss_common_bolted: clrrdi r15,r15,3 beq tlb_miss_fault_bolted /* No PGDIR, bail */ -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE - ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -222,10 +209,11 @@ itlb_miss_kernel_bolted: tlb_miss_kernel_bolted: mfspr r10,SPRN_MAS1 ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + andi. r15,r15,1 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted + bne+ tlb_miss_common_bolted tlb_miss_fault_bolted: /* We need to check if it was an instruction miss */ @@ -256,7 +244,6 @@ itlb_miss_fault_bolted: beq tlb_miss_user_bolted b itlb_miss_kernel_bolted -#ifdef CONFIG_PPC_FSL_BOOK3E /* * TLB miss handling for e6500 and derivatives, using hardware tablewalk. * @@ -364,7 +351,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) mfspr r15,SPRN_MAS2 isync - tlbilxva 0,r15 + PPC_TLBILX_VA(0,R15) isync mtspr SPRN_MAS6,r10 @@ -507,7 +494,9 @@ tlb_miss_huge_e6500: tlb_miss_kernel_e6500: ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + xoris r15,r15,0xc /* Check for vmalloc region */ + cmplwi cr1,r15,1 beq+ cr1,tlb_miss_common_e6500 tlb_miss_fault_e6500: @@ -521,7 +510,6 @@ dtlb_miss_fault_e6500: itlb_miss_fault_e6500: tlb_epilog_bolted b exc_instruction_storage_book3e -#endif /* CONFIG_PPC_FSL_BOOK3E */ /********************************************************************** * * @@ -541,16 +529,18 @@ itlb_miss_fault_e6500: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* The page tables are mapped virtually linear. At this point, though, * we don't know whether we are trying to fault in a first level * virtual address or a virtual page table address. We can get that * from bit 0x1 of the region ID which we have set for a page table */ - andi. r10,r15,0x1 + andis. r10,r15,0x1 bne- virt_page_table_tlb_miss std r14,EX_TLB_ESR(r12); /* save ESR */ @@ -562,7 +552,7 @@ itlb_miss_fault_e6500: /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ /* We pre-test some combination of permissions to avoid double * faults: @@ -583,13 +573,12 @@ itlb_miss_fault_e6500: */ rlwimi r11,r14,32-19,27,27 rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss + beq normal_tlb_miss_user /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -615,27 +604,28 @@ itlb_miss_fault_e6500: * * Faulting address is SRR0 which is already in r16 */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ + srdi r15,r16,44 /* get region */ + xoris r15,r15,0xc + cmpldi cr0,r15,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r15,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h - cmpldi cr0,r15,0 /* Check for user region */ + srdi. r15,r16,60 /* Check for user region */ std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss + beq normal_tlb_miss_user li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h /* XXX replace the RMW cycles with immediate loads + writes */ mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss + beq+ cr1,normal_tlb_miss /* We got a crappy address, just fault */ TLB_MISS_EPILOG_ERROR @@ -653,6 +643,12 @@ itlb_miss_fault_e6500: * r11 = PTE permission mask * r10 = crap (free to use) */ +normal_tlb_miss_user: +#ifdef CONFIG_PPC_KUAP + mfspr r14,SPRN_MAS1 + rlwinm. r14,r14,0,0x3fff0000 + beq- normal_tlb_miss_access_fault /* KUAP fault */ +#endif normal_tlb_miss: /* So we first construct the page table address. We do that by * shifting the bottom of the address (not the region ID) by @@ -662,32 +658,19 @@ normal_tlb_miss: * NOTE: For 64K pages, we do things slightly differently in * order to handle the weird page table format used by linux */ - ori r10,r15,0x1 + srdi r15,r16,44 + oris r10,r15,0x1 rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 - sldi r15,r10,60 - clrrdi r14,r14,3 + sldi r15,r10,44 + clrrdi r14,r14,19 or r10,r15,r14 -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ andc. r15,r11,r14 bne- normal_tlb_miss_access_fault -#ifdef CONFIG_PPC_KUAP - mfspr r11,SPRN_MAS1 - rlwinm. r10,r11,0,0x3fff0000 - beq- normal_tlb_miss_access_fault /* KUAP fault */ -#endif /* Now we build the MAS: * @@ -709,9 +692,7 @@ finish_normal_tlb_miss: rldicl r10,r14,64-8,64-8 cmpldi cr0,r10,BOOK3E_PAGESZ_4K beq- 1f -#ifndef CONFIG_PPC_KUAP mfspr r11,SPRN_MAS1 -#endif rlwimi r11,r14,31,21,24 rlwinm r11,r11,0,21,19 mtspr SPRN_MAS1,r11 @@ -728,13 +709,9 @@ finish_normal_tlb_miss: li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 1: -BEGIN_MMU_FTR_SECTION srdi r16,r15,32 mtspr SPRN_MAS3,r15 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -786,6 +763,7 @@ normal_tlb_miss_access_fault: */ virt_page_table_tlb_miss: /* Are we hitting a kernel page table ? */ + srdi r15,r16,60 andi. r10,r15,0x8 /* The cool thing now is that r10 contains 0 for user and 8 for kernel, @@ -810,18 +788,12 @@ virt_page_table_tlb_miss: #else 1: #endif -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Now, we need to walk the page tables. First check if we are in * range. */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + cmpldi r10,0x80 bne- virt_page_table_tlb_miss_fault /* Get the PGD pointer */ @@ -867,41 +839,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_EPILOG_SUCCESS rfi @@ -969,23 +912,24 @@ virt_page_table_tlb_miss_whacko_fault: */ mfspr r14,SPRN_ESR mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss + beq+ cr1,htw_tlb_miss /* We got a crappy address, just fault with whatever DEAR and ESR * are here @@ -1011,19 +955,20 @@ virt_page_table_tlb_miss_whacko_fault: * * Faulting address is SRR0 which is already in r16 */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ + srdi r11,r16,44 /* get region */ + xoris r11,r11,0xc + cmpldi cr0,r11,0 /* linear mapping ? */ beq tlb_load_linear /* yes -> go to linear map load */ + cmpldi cr1,r11,1 /* vmalloc mapping ? */ /* We do the user/kernel test for the PID here along with the RW test */ - cmpldi cr0,r11,0 /* Check for user region */ + srdi. r11,r16,60 /* Check for user region */ ld r15,PACAPGD(r13) /* Load user pgdir */ beq htw_tlb_miss /* XXX replace the RMW cycles with immediate loads + writes */ 1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ @@ -1116,13 +1061,9 @@ htw_tlb_miss: */ ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -1177,8 +1118,8 @@ tlb_load_linear: * we only use 1G pages for now. That might have to be changed in a * final implementation, especially when dealing with hypervisors */ - ld r11,PACATOC(r13) - ld r11,linear_map_top@got(r11) + __LOAD_PACA_TOC(r11) + LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top) ld r10,0(r11) tovirt(10,10) cmpld cr0,r16,r10 @@ -1203,13 +1144,9 @@ tlb_load_linear: clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 0801b2ce9b7d..a490724e84ad 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/of.h> +#include <linux/of_address.h> #include <linux/pfn.h> #include <linux/cpuset.h> #include <linux/node.h> @@ -33,6 +34,7 @@ #include <asm/hvcall.h> #include <asm/setup.h> #include <asm/vdso.h> +#include <asm/vphn.h> #include <asm/drmem.h> static int numa_enabled = 1; @@ -366,6 +368,7 @@ void update_numa_distance(struct device_node *node) WARN(numa_distance_table[nid][nid] == -1, "NUMA distance details for node %d not provided\n", nid); } +EXPORT_SYMBOL_GPL(update_numa_distance); /* * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} @@ -1108,7 +1111,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { - struct device_node *rtas; + struct device_node *rtas, *root; const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -1129,10 +1132,12 @@ static void __init find_possible_nodes(void) * If the LPAR is migratable, new nodes might be activated after a LPM, * so we should consider the max number in that case. */ - if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) + root = of_find_node_by_path("/"); + if (!of_get_property(root, "ibm,migratable-partition", NULL)) domains = of_get_property(rtas, "ibm,current-associativity-domains", &prop_length); + of_node_put(root); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -1160,6 +1165,9 @@ void __init mem_topology_setup(void) { int cpu; + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; + /* * Linux/mm assumes node 0 to be online at boot. However this is not * true on PowerPC, where node 0 is similar to any other node, it @@ -1204,9 +1212,6 @@ void __init initmem_init(void) { int nid; - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; - memblock_dump_all(); for_each_online_node(nid) { @@ -1288,23 +1293,15 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr) int nid = NUMA_NO_NODE; for_each_node_by_type(memory, "memory") { - unsigned long start, size; - int ranges; - const __be32 *memcell_buf; - unsigned int len; + int i = 0; - memcell_buf = of_get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - - /* ranges in cell */ - ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + while (1) { + struct resource res; - while (ranges--) { - start = read_n_cells(n_mem_addr_cells, &memcell_buf); - size = read_n_cells(n_mem_size_cells, &memcell_buf); + if (of_address_to_resource(memory, i++, &res)) + break; - if ((scn_addr < start) || (scn_addr >= (start + size))) + if ((scn_addr < res.start) || (scn_addr > res.end)) continue; nid = of_node_to_nid_single(memory); diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c index 6163e484bc6d..ac22bf28086f 100644 --- a/arch/powerpc/mm/pageattr.c +++ b/arch/powerpc/mm/pageattr.c @@ -14,6 +14,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <mm/mmu_decl.h> static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr, unsigned long old, unsigned long new) @@ -38,6 +39,10 @@ static int change_page_attr(pte_t *ptep, unsigned long addr, void *data) /* Don't clear DIRTY bit */ pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO); break; + case SET_MEMORY_ROX: + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_ROX); + break; case SET_MEMORY_RW: pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW); break; @@ -97,3 +102,26 @@ int change_memory_attr(unsigned long addr, int numpages, long action) return apply_to_existing_page_range(&init_mm, start, size, change_page_attr, (void *)action); } + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +#ifdef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + int err; + unsigned long addr = (unsigned long)page_address(page); + + if (PageHighMem(page)) + return; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) + err = hash__kernel_map_pages(page, numpages, enable); + else if (enable) + err = set_memory_p(addr, numpages); + else + err = set_memory_np(addr, numpages); + + if (err) + panic("%s: changing memory protections failed\n", __func__); +} +#endif +#endif diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 20652daa1d7e..8c31802f97e8 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -18,15 +18,15 @@ void pte_frag_destroy(void *pte_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pte_frag); + ptdesc = virt_to_ptdesc(pte_frag); /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pte_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } } else { - page = alloc_page(PGALLOC_GFP); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP, 0); + if (!ptdesc) return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return ret; spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!pte_frag_get(&mm->context))) { - atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); } spin_unlock(&mm->page_table_lock); @@ -106,17 +106,40 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) return __alloc_for_ptecache(mm, kernel); } -void pte_fragment_free(unsigned long *table, int kernel) +static void pte_free_now(struct rcu_head *head) { - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc; - if (PageReserved(page)) - return free_reserved_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); +} - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - if (!kernel) - pgtable_pte_page_dtor(page); - __free_page(page); +void pte_fragment_free(unsigned long *table, int kernel) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + if (kernel) + pagetable_free(ptdesc); + else if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + else + pte_free_now(&ptdesc->pt_rcu_head); } } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index e6166b71d36d..9e7ba9c3851f 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -46,19 +46,19 @@ static inline int is_exec_fault(void) * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ -static inline int pte_looks_normal(pte_t pte) +static inline int pte_looks_normal(pte_t pte, unsigned long addr) { if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; - if (pte_user(pte)) + if (!is_kernel_addr(addr)) return 1; } return 0; } -static struct page *maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } #ifdef CONFIG_PPC_BOOK3S @@ -79,17 +79,17 @@ static struct page *maybe_pte_to_page(pte_t pte) * support falls into the same category. */ -static pte_t set_pte_filter_hash(pte_t pte) +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &pg->flags)) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } return pte; @@ -97,41 +97,43 @@ static pte_t set_pte_filter_hash(pte_t pte) #else /* CONFIG_PPC_BOOK3S */ -static pte_t set_pte_filter_hash(pte_t pte) { return pte; } +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; } #endif /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so * instead we "filter out" the exec permission for non clean pages. + * + * This is also called once for the folio. So only work with folio->flags here. */ -static inline pte_t set_pte_filter(pte_t pte) +static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) { - struct page *pg; + struct folio *folio; if (radix_enabled()) return pte; if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return set_pte_filter_hash(pte); + return set_pte_filter_hash(pte, addr); /* No exec permission in the first place, move on */ - if (!pte_exec(pte) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte, addr)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); return pte; } @@ -142,7 +144,7 @@ static inline pte_t set_pte_filter(pte_t pte) static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) return pte; @@ -168,17 +170,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) goto bail; /* Clean the page and set PG_dcache_clean */ - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); bail: return pte_mkexec(pte); @@ -187,23 +189,39 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { - /* - * Make sure hardware valid bit is not set. We don't do - * tlb flush for this update. - */ - VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this - * is called. + * is called. Filter the pte value and use the filtered value + * to setup all the ptes in the range. + */ + pte = set_pte_filter(pte, addr); + + /* + * We don't need to call arch_enter/leave_lazy_mmu_mode() + * because we expect set_ptes to be only be used on not present + * and not hw_valid ptes. Hence there is no translation cache flush + * involved that need to be batched. */ - pte = set_pte_filter(pte); + for (;;) { - /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte = pte_next_pfn(pte); + } } void unmap_kernel_page(unsigned long va) @@ -279,7 +297,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) { pmd_t *pmd = pmd_off(mm, addr); pte_basic_t val; @@ -292,7 +311,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = set_pte_filter(pte); + pte = set_pte_filter(pte, addr); val = pte_val(pte); @@ -311,6 +330,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; @@ -329,8 +350,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ @@ -387,7 +410,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (p4d_none(p4d)) return NULL; - if (p4d_is_leaf(p4d)) { + if (p4d_leaf(p4d)) { ret_pte = (pte_t *)p4dp; goto out; } @@ -409,7 +432,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pud_none(pud)) return NULL; - if (pud_is_leaf(pud)) { + if (pud_leaf(pud)) { ret_pte = (pte_t *)pudp; goto out; } @@ -448,7 +471,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, goto out; } - if (pmd_is_leaf(pmd)) { + if (pmd_leaf(pmd)) { ret_pte = (pte_t *)pmdp; goto out; } @@ -472,3 +495,27 @@ out: return ret_pte; } EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXECONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a56ade39dc68..face94977cb2 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -135,9 +135,9 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - if (v_block_mapped((unsigned long)_sinittext)) { - mmu_mark_initmem_nx(); - } else { + mmu_mark_initmem_nx(); + + if (!v_block_mapped((unsigned long)_sinittext)) { set_memory_nx((unsigned long)_sinittext, numpages); set_memory_rw((unsigned long)_sinittext, numpages); } @@ -153,35 +153,17 @@ void mark_rodata_ro(void) if (v_block_mapped((unsigned long)_stext + 1)) { mmu_mark_rodata_ro(); - ptdump_check_wx(); return; } /* - * mark .text and .rodata as read only. Use __init_begin rather than - * __end_rodata to cover NOTES and EXCEPTION_TABLE. + * mark text and rodata as read only. __end_rodata is set by + * powerpc's linker script and includes tables and data + * requiring relocation which are not put in RO_DATA. */ - numpages = PFN_UP((unsigned long)__init_begin) - + numpages = PFN_UP((unsigned long)__end_rodata) - PFN_DOWN((unsigned long)_stext); set_memory_ro((unsigned long)_stext, numpages); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } #endif - -#if defined(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && defined(CONFIG_DEBUG_PAGEALLOC) -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long addr = (unsigned long)page_address(page); - - if (PageHighMem(page)) - return; - - if (enable) - set_memory_p(addr, numpages); - else - set_memory_np(addr, numpages); -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5ac1fd30341b..9b99113cb51a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -100,7 +100,7 @@ EXPORT_SYMBOL(__pte_frag_size_shift); /* 4 level page table */ struct page *p4d_page(p4d_t p4d) { - if (p4d_is_leaf(p4d)) { + if (p4d_leaf(p4d)) { if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) VM_WARN_ON(!p4d_huge(p4d)); return pte_page(p4d_pte(p4d)); @@ -111,7 +111,7 @@ struct page *p4d_page(p4d_t p4d) struct page *pud_page(pud_t pud) { - if (pud_is_leaf(pud)) { + if (pud_leaf(pud)) { if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) VM_WARN_ON(!pud_huge(pud)); return pte_page(pud_pte(pud)); @@ -125,14 +125,14 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_is_leaf(pmd)) { + if (pmd_leaf(pmd)) { /* * vmalloc_to_page may be called on any vmap address (not only * vmalloc), and it uses pmd_page() etc., when huge vmap is * enabled so these checks can't be used. */ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) - VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); + VM_WARN_ON(!(pmd_leaf(pmd) || pmd_huge(pmd))); return pte_page(pmd_pte(pmd)); } return virt_to_page(pmd_page_vaddr(pmd)); @@ -150,9 +150,6 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index fac932eb8f9a..b5c79b11ea3c 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -21,11 +21,6 @@ static const struct flag_info flag_array[] = { .set = "huge", .clear = " ", }, { - .mask = _PAGE_SH, - .val = 0, - .set = "user", - .clear = " ", - }, { .mask = _PAGE_RO | _PAGE_NA, .val = 0, .set = "rw", diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index b533caaf0910..dc896d2874f3 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -4,7 +4,7 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_E500) += shared.o obj-$(CONFIG_PPC_BOOK3S_32) += shared.o obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2313053fe679..9dc239967b77 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx) + if (!st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) return; - WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; @@ -326,8 +327,7 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -343,15 +343,22 @@ void ptdump_check_wx(void) } }; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return true; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } -#endif static int __init ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index 03607ab90c66..39c30c62b7ea 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -11,15 +11,15 @@ static const struct flag_info flag_array[] = { { - .mask = _PAGE_USER, - .val = _PAGE_USER, - .set = "user", - .clear = " ", + .mask = _PAGE_READ, + .val = 0, + .set = " ", + .clear = "r", }, { - .mask = _PAGE_RW, - .val = _PAGE_RW, - .set = "rw", - .clear = "r ", + .mask = _PAGE_WRITE, + .val = 0, + .set = " ", + .clear = "w", }, { .mask = _PAGE_EXEC, .val = _PAGE_EXEC, |