diff options
Diffstat (limited to 'arch/powerpc/mm/book3s64')
-rw-r--r-- | arch/powerpc/mm/book3s64/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_native.c | 6 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_utils.c | 105 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/iommu_api.c | 48 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/mmu_context.c | 1 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/pgtable.c | 114 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_pgtable.c | 194 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_tlb.c | 338 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/subpage_prot.c | 12 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/vphn.c | 73 | ||||
-rw-r--r-- | arch/powerpc/mm/book3s64/vphn.h | 16 |
11 files changed, 537 insertions, 371 deletions
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 974b4fc19f4f..fd393b8be14f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_PPC_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 30d62ffe3310..90ab4f31e2b3 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -41,7 +41,7 @@ #define HPTE_LOCK_BIT (56+3) #endif -DEFINE_RAW_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) { @@ -56,7 +56,7 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) * tlbiel instruction for hash, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, unsigned int pid, unsigned int ric, unsigned int prs) { @@ -112,7 +112,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } void hash__tlbiel_all(unsigned int action) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 28ced26f2a00..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -34,6 +34,7 @@ #include <linux/libfdt.h> #include <linux/pkeys.h> #include <linux/hugetlb.h> +#include <linux/cpu.h> #include <asm/debugfs.h> #include <asm/processor.h> @@ -61,6 +62,7 @@ #include <asm/ps3.h> #include <asm/pte-walk.h> #include <asm/asm-prototypes.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> @@ -271,10 +273,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - /* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel @@ -684,10 +682,8 @@ static void __init htab_init_page_sizes(void) if (mmu_psize_defs[MMU_PAGE_16M].shift && memblock_phys_mem_size() >= 0x40000000) mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; else - mmu_vmemmap_psize = MMU_PAGE_4K; + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* CONFIG_SPARSEMEM_VMEMMAP */ printk(KERN_DEBUG "Page orders: linear mapping = %d, " @@ -825,7 +821,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now, UPRT is 0 and we have no segment table. */ htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); pr_info("Partition table %p\n", partition_tb); } @@ -859,12 +855,6 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; - /* - * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall - * to inform the hypervisor that we wish to use the HPT. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table(0, 0, 0); #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -981,7 +971,7 @@ void __init hash__early_init_devtree(void) htab_scan_page_sizes(); } -struct hash_mm_context init_hash_mm_context; +static struct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void) { #ifndef CONFIG_PPC_64K_PAGES @@ -1077,8 +1067,8 @@ void hash__early_init_mmu_secondary(void) if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); } /* Initialize SLB */ slb_initialize(); @@ -1462,8 +1452,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) +int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, + unsigned long msr) { unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; @@ -1520,8 +1510,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +static void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1601,6 +1591,57 @@ out_exit: local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + if (radix_enabled()) { + prefetch((void *)address); + return; + } + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, address, is_exec, trap); +} + #ifdef CONFIG_PPC_MEM_KEYS /* * Return the protection key associated with the given address and the @@ -1707,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; @@ -1901,11 +1942,20 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, * * For guests on platforms before POWER9, we clamp the it limit to 1G * to avoid some funky things such as RTAS bugs etc... + * + * On POWER9 we limit to 1TB in case the host erroneously told us that + * the RMA was >1TB. Effective address bits 0:23 are treated as zero + * (meaning the access is aliased to zero i.e. addr = addr % 1TB) + * for virtual real mode addressing and so it doesn't make sense to + * have an area larger than 1TB as it can't be addressed. */ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { ppc64_rma_size = first_memblock_size; if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + else + ppc64_rma_size = min_t(u64, ppc64_rma_size, + 1UL << SID_SHIFT_1T); /* Finally limit subsequent allocations */ memblock_set_current_limit(ppc64_rma_size); @@ -1924,10 +1974,16 @@ static int hpt_order_get(void *data, u64 *val) static int hpt_order_set(void *data, u64 val) { + int ret; + if (!mmu_hash_ops.resize_hpt) return -ENODEV; - return mmu_hash_ops.resize_hpt(val); + cpus_read_lock(); + ret = mmu_hash_ops.resize_hpt(val); + cpus_read_unlock(); + + return ret; } DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); @@ -1950,7 +2006,4 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); - pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); - pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); - pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); } diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 90ee3a89722c..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -14,6 +14,7 @@ #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/sizes.h> +#include <linux/mm.h> #include <asm/mmu_context.h> #include <asm/pte-walk.h> #include <linux/mm_inline.h> @@ -46,40 +47,6 @@ struct mm_iommu_table_group_mem_t { u64 dev_hpa; /* Device memory base address */ }; -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - bool mm_iommu_preregistered(struct mm_struct *mm) { return !list_empty(&mm->context.iommu_group_mem_list); @@ -96,7 +63,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); + ret = account_locked_vm(mm, entries, true); if (ret) return ret; @@ -162,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch @@ -211,7 +175,7 @@ free_exit: kfree(mem); unlock_exit: - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + account_locked_vm(mm, locked_entries, false); return ret; } @@ -311,7 +275,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) unlock_exit: mutex_unlock(&mem_list_mutex); - mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + account_locked_vm(mm, unlock_entries, false); return ret; } diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 794404d50a85..2d0cb5ba9a47 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -174,7 +174,6 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); - mm->context.npu_context = NULL; mm->context.hash_context = NULL; return index; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 01bc9663360d..75483b40fcb1 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -8,10 +8,13 @@ #include <linux/memblock.h> #include <misc/cxl-base.h> +#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> #include <asm/powernv.h> +#include <asm/firmware.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> #include <trace/events/thp.h> @@ -21,9 +24,6 @@ EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -72,7 +72,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); + WARN_ON(!(pmd_large(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); @@ -205,37 +205,61 @@ void __init mmu_partition_table_init(void) * 64 K size. */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); + set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); } +static void flush_partition(unsigned int lpid, bool radix) +{ + if (radix) { + radix__flush_all_lpid(lpid); + radix__flush_all_lpid_guest(lpid); + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } +} + void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) + unsigned long dw1, bool flush) { unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + /* + * When ultravisor is enabled, the partition table is stored in secure + * memory and can only be accessed doing an ultravisor call. However, we + * maintain a copy of the partition table in normal memory to allow Nest + * MMU translations to occur (for normal VMs). + * + * Therefore, here we always update partition_tb, regardless of whether + * we are running under an ultravisor or not. + */ partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. + * If ultravisor is enabled, we do an ultravisor call to register the + * partition table entry (PATE), which also do a global flush of TLBs + * and partition table caches for the lpid. Otherwise, just do the + * flush. The type of flush (hash or radix) depends on what the previous + * use of the partition ID was, not the new use. */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + uv_register_pate(lpid, dw0, dw1); + pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", + dw0, dw1); + } else if (flush) { + /* + * Boot does not need to flush, because MMU is off and each + * CPU does a tlbiel_all() before switching them on, which + * flushes everything. + */ + flush_partition(lpid, (old & PATB_HR)); } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); @@ -446,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } + +/* + * Does the CPU support tlbie? + */ +bool tlbie_capable __read_mostly = true; +EXPORT_SYMBOL(tlbie_capable); + +/* + * Should tlbie be used for management of CPU TLBs, for kernel and process + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM + * guest address spaces. + */ +bool tlbie_enabled __read_mostly = true; + +static int __init setup_disable_tlbie(char *str) +{ + if (!radix_enabled()) { + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); + return 1; + } + + tlbie_capable = false; + tlbie_enabled = false; + + return 1; +} +__setup("disable_tlbie", setup_disable_tlbie); + +static int __init pgtable_debugfs_setup(void) +{ + if (!tlbie_capable) + return 0; + + /* + * There is no locking vs tlb flushing when changing this value. + * The tlb flushers will see one value or another, and use either + * tlbie or tlbiel with IPIs. In both cases the TLBs will be + * invalidated as expected. + */ + debugfs_create_bool("tlbie_enabled", 0600, + powerpc_debugfs_root, + &tlbie_enabled); + + return 0; +} +arch_initcall(pgtable_debugfs_setup); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 273ae66a9a45..3a1fbf9cb8f8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "radix-mmu: " fmt +#include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> @@ -26,25 +27,13 @@ #include <asm/sections.h> #include <asm/trace.h> #include <asm/uaccess.h> +#include <asm/ultravisor.h> #include <trace/events/thp.h> unsigned int mmu_pid_bits; unsigned int mmu_base_pid; -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) { @@ -198,14 +187,14 @@ void radix__change_memory_range(unsigned long start, unsigned long end, pudp = pud_alloc(&init_mm, pgdp, idx); if (!pudp) continue; - if (pud_huge(*pudp)) { + if (pud_is_leaf(*pudp)) { ptep = (pte_t *)pudp; goto update_the_pte; } pmdp = pmd_alloc(&init_mm, pudp, idx); if (!pmdp) continue; - if (pmd_huge(*pmdp)) { + if (pmd_is_leaf(*pmdp)) { ptep = pmdp_ptep(pmdp); goto update_the_pte; } @@ -319,7 +308,7 @@ static int __meminit create_physical_mapping(unsigned long start, return 0; } -void __init radix_init_pgtable(void) +static void __init radix_init_pgtable(void) { unsigned long rts_field; struct memblock_region *reg; @@ -379,18 +368,6 @@ void __init radix_init_pgtable(void) */ rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. - */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); /* * The init_mm context is given the first available (non-zero) PID, @@ -411,20 +388,15 @@ void __init radix_init_pgtable(void) static void __init radix_init_partition_table(void) { - unsigned long rts_field, dw0; + unsigned long rts_field, dw0, dw1; mmu_partition_table_init(); rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); + dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; + mmu_partition_table_set_entry(0, dw0, dw1, false); pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); -} - -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; } static int __init get_idx_from_shift(unsigned int shift) @@ -515,14 +487,6 @@ void __init radix__early_init_devtree(void) mmu_psize_defs[MMU_PAGE_64K].shift = 16; mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ return; } @@ -587,7 +551,13 @@ void __init radix__early_init_mmu(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } else + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* * initialize page table size @@ -622,8 +592,9 @@ void __init radix__early_init_mmu(void) __pmd_frag_nr = RADIX_PMD_FRAG_NR; __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + radix_init_pgtable(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -634,11 +605,9 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__early_init_mmu_secondary(void) @@ -651,14 +620,14 @@ void radix__early_init_mmu_secondary(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); } radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__mmu_cleanup_all(void) @@ -668,7 +637,7 @@ void radix__mmu_cleanup_all(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); + set_ptcr_when_no_uv(0); powernv_set_nmmu_ptcr(0); radix__flush_tlb_all(); } @@ -738,8 +707,8 @@ static int __meminit stop_machine_change_mapping(void *data) spin_unlock(&init_mm.page_table_lock); pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); + create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); + create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); spin_lock(&init_mm.page_table_lock); return 0; } @@ -832,7 +801,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, if (!pmd_present(*pmd)) continue; - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); continue; } @@ -857,7 +826,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr, if (!pud_present(*pud)) continue; - if (pud_huge(*pud)) { + if (pud_is_leaf(*pud)) { split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); continue; } @@ -883,7 +852,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) if (!pgd_present(*pgd)) continue; - if (pgd_huge(*pgd)) { + if (pgd_is_leaf(*pgd)) { split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); continue; } @@ -903,7 +872,7 @@ int __meminit radix__create_section_mapping(unsigned long start, unsigned long e return -1; } - return create_physical_mapping(start, end, nid); + return create_physical_mapping(__pa(start), __pa(end), nid); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -1118,3 +1087,108 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); } + +int __init arch_ioremap_pud_supported(void) +{ + /* HPT does not cope with large pages in the vmalloc area */ + return radix_enabled(); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return radix_enabled(); +} + +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} + +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pud; + pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); + + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (pud_huge(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd; + int i; + + pmd = (pmd_t *)pud_page_vaddr(*pud); + pud_clear(pud); + + flush_tlb_kernel_range(addr, addr + PUD_SIZE); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd[i])) { + pte_t *pte; + pte = (pte_t *)pmd_page_vaddr(pmd[i]); + + pte_free_kernel(&init_mm, pte); + } + } + + pmd_free(&init_mm, pmd); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pmd; + pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); + + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + + pte_free_kernel(&init_mm, pte); + + return 1; +} + +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index bb9835681315..631be42abd33 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -25,7 +25,7 @@ * tlbiel instruction for radix, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, +static __always_inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, unsigned int pid, unsigned int ric, unsigned int prs) { @@ -51,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) * and partition table entries. Then flush the remaining sets of the * TLB. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - /* Do the same for process scoped entries. */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) { + /* MSR[HV] should flush partition scope translations first. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + } + + /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); @@ -83,7 +87,7 @@ void radix__tlbiel_all(unsigned int action) else WARN(1, "%s called on pre-POWER9 CPU\n", __func__); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } static __always_inline void __tlbiel_pid(unsigned long pid, int set, @@ -116,22 +120,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -146,25 +134,22 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ + rs = lpid; prs = 1; /* process scoped */ r = 1; /* radix format */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } - -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -179,8 +164,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -195,8 +180,8 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -235,7 +220,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set in radix mode and 256 set in hpt mode. */ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; @@ -258,7 +243,7 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory"); } static inline void _tlbie_pid(unsigned long pid, unsigned long ric) @@ -285,32 +270,37 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) +struct tlbiel_pid { + unsigned long pid; + unsigned long ric; +}; + +static void do_tlbiel_pid(void *info) { - int set; + struct tlbiel_pid *t = info; - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_pid(t->pid, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_pid(t->pid, RIC_FLUSH_PWC); + else + _tlbiel_pid(t->pid, RIC_FLUSH_ALL); +} - asm volatile("ptesync": : :"memory"); +static inline void _tlbiel_pid_multicast(struct mm_struct *mm, + unsigned long pid, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_pid t = { .pid = pid, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1); /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Always want the CPU translations to be invalidated with tlbiel in + * these paths, so while coprocessors must use tlbie, we can not + * optimise away the tlbiel component. */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_pid(pid, RIC_FLUSH_ALL); } static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) @@ -337,35 +327,28 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } - static inline void __tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) @@ -377,8 +360,8 @@ static inline void __tlbiel_va_range(unsigned long start, unsigned long end, __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); } -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) +static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); @@ -409,8 +392,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } -static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) +static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); @@ -420,7 +403,54 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, +struct tlbiel_va { + unsigned long pid; + unsigned long va; + unsigned long psize; + unsigned long ric; +}; + +static void do_tlbiel_va(void *info) +{ + struct tlbiel_va *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); + else + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_va_multicast(struct mm_struct *mm, + unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); +} + +struct tlbiel_va_range { + unsigned long pid; + unsigned long start; + unsigned long end; + unsigned long page_size; + unsigned long psize; + bool also_pwc; +}; + +static void do_tlbiel_va_range(void *info) +{ + struct tlbiel_va_range *t = info; + + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, + t->psize, t->also_pwc); +} + +static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); @@ -443,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va_range t = { .start = start, .end = end, + .pid = pid, .page_size = page_size, + .psize = psize, .also_pwc = also_pwc }; + + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); +} + /* * Base TLB flushing operations: * @@ -580,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto local; } - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } else { local: _tlbiel_pid(pid, RIC_FLUSH_TLB); @@ -609,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) goto local; } } - _tlbie_pid(pid, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } else { local: _tlbiel_pid(pid, RIC_FLUSH_ALL); @@ -644,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + else + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); } else { local: _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); @@ -666,9 +721,35 @@ EXPORT_SYMBOL(radix__flush_tlb_page); #define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ +static void do_tlbiel_kernel(void *info) +{ + _tlbiel_pid(0, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_kernel_broadcast(void) +{ + on_each_cpu(do_tlbiel_kernel, NULL, 1); + if (tlbie_capable) { + /* + * Coherent accelerators don't refcount kernel memory mappings, + * so have to always issue a tlbie for them. This is quite a + * slow path anyway. + */ + _tlbie_pid(0, RIC_FLUSH_ALL); + } +} + +/* + * If kernel TLBIs ever become local rather than global, then + * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it + * assumes kernel TLBIs are global. + */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - _tlbie_pid(0, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(0, RIC_FLUSH_ALL); + else + _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); @@ -724,10 +805,14 @@ is_local: if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } } else { bool hflush = flush_all_sizes; @@ -752,8 +837,8 @@ is_local: gflush = false; } - asm volatile("ptesync": : :"memory"); if (local) { + asm volatile("ptesync": : :"memory"); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, @@ -762,7 +847,8 @@ is_local: __tlbiel_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); - } else { + } else if (cputlb_use_tlbie()) { + asm volatile("ptesync": : :"memory"); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, @@ -772,6 +858,15 @@ is_local: PUD_SIZE, MMU_PAGE_1G); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } else { + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, mmu_virtual_psize, false); + if (hflush) + _tlbiel_va_range_multicast(mm, + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); + if (gflush) + _tlbiel_va_range_multicast(mm, + gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false); } } preempt_enable(); @@ -830,32 +925,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ -void radix__flush_tlb_lpid(unsigned int lpid) +void radix__flush_all_lpid(unsigned int lpid) { _tlbie_lpid(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); +EXPORT_SYMBOL_GPL(radix__flush_all_lpid); /* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. + * Flush process scoped translations from LPID (=LPIDR) */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +void radix__flush_all_lpid_guest(unsigned int lpid) { - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); + _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); @@ -961,16 +1043,26 @@ is_local: if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } } else { if (local) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else + else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, psize, also_pwc); } preempt_enable(); } @@ -1012,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + if (cputlb_use_tlbie()) + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + else + _tlbiel_va_range_multicast(mm, + addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 9ba07e55c489..2ef24a53f4c9 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/types.h> -#include <linux/mm.h> +#include <linux/pagewalk.h> #include <linux/hugetlb.h> #include <linux/syscalls.h> @@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; /* * We don't try too hard, we just mark all the vma in that range @@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); + walk_page_vma(vma, &subpage_walk_ops, NULL); vma = vma->vm_next; } } diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c deleted file mode 100644 index 0ee7734afb50..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/byteorder.h> -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* - * Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* - * Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h deleted file mode 100644 index f0b93c2dd578..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) - -extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); - -#endif |