aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/powerpc/mm/book3s64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm/book3s64')
-rw-r--r--arch/powerpc/mm/book3s64/hash_4k.c5
-rw-r--r--arch/powerpc/mm/book3s64/hash_64k.c10
-rw-r--r--arch/powerpc/mm/book3s64/hash_hugepage.c15
-rw-r--r--arch/powerpc/mm/book3s64/hash_native.c80
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c15
-rw-r--r--arch/powerpc/mm/book3s64/hash_tlb.c6
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c194
-rw-r--r--arch/powerpc/mm/book3s64/hugetlbpage.c5
-rw-r--r--arch/powerpc/mm/book3s64/internal.h11
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c4
-rw-r--r--arch/powerpc/mm/book3s64/mmu_context.c10
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c128
-rw-r--r--arch/powerpc/mm/book3s64/pkeys.c6
-rw-r--r--arch/powerpc/mm/book3s64/radix_hugetlbpage.c14
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c758
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c377
-rw-r--r--arch/powerpc/mm/book3s64/slb.c1
-rw-r--r--arch/powerpc/mm/book3s64/subpage_prot.c18
18 files changed, 1241 insertions, 416 deletions
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
index 7de1a8a0c62a..02acbfd05b46 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, int subpg_prot)
@@ -118,6 +120,9 @@ repeat:
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
index 998c6817ed47..954af420f358 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
/*
* Return true, if the entry has a slot value which
* the software considers as invalid.
@@ -216,6 +218,9 @@ repeat:
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
new_pte |= H_PAGE_HASHPTE;
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -327,7 +332,12 @@ repeat:
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
index c0fabe6c5a12..15d6f3ea7178 100644
--- a/arch/powerpc/mm/book3s64/hash_hugepage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -59,16 +59,13 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
rflags = htab_convert_pte_flags(new_pmd, flags);
-#if 0
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+ /*
+ * THPs are only supported on platforms that can do mixed page size
+ * segments (MPSS) and all such platforms have coherent icache. Hence we
+ * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on
+ * noexecute fault.
+ */
- /*
- * No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case
- */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
- }
-#endif
/*
* Find the slot index details for this ea, using base page size.
*/
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 623a7b7ab38b..430d1d935a7c 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -43,6 +43,29 @@
static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map hpte_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);
+
+static void acquire_hpte_lock(void)
+{
+ lock_map_acquire(&hpte_lock_map);
+}
+
+static void release_hpte_lock(void)
+{
+ lock_map_release(&hpte_lock_map);
+}
+#else
+static void acquire_hpte_lock(void)
+{
+}
+
+static void release_hpte_lock(void)
+{
+}
+#endif
+
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
{
@@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ acquire_hpte_lock();
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
@@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ release_hpte_lock();
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
@@ -243,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
+ unsigned long flags;
int i;
+ local_irq_save(flags);
+
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
@@ -263,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
hptep++;
}
- if (i == HPTES_PER_GROUP)
+ if (i == HPTES_PER_GROUP) {
+ local_irq_restore(flags);
return -1;
+ }
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
@@ -286,19 +316,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
+ release_hpte_lock();
hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
+ local_irq_restore(flags);
+
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static long native_hpte_remove(unsigned long hpte_group)
{
+ unsigned long hpte_v, flags;
struct hash_pte *hptep;
int i;
int slot_offset;
- unsigned long hpte_v;
+
+ local_irq_save(flags);
DBG_LOW(" remove(group=%lx)\n", hpte_group);
@@ -323,12 +358,16 @@ static long native_hpte_remove(unsigned long hpte_group)
slot_offset &= 0x7;
}
- if (i == HPTES_PER_GROUP)
- return -1;
+ if (i == HPTES_PER_GROUP) {
+ i = -1;
+ goto out;
+ }
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
-
+out:
+ local_irq_restore(flags);
return i;
}
@@ -339,6 +378,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0, local = 0;
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
@@ -382,6 +424,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
if (!(flags & HPTE_NOHPTE_UPDATE))
tlbie(vpn, bpsize, apsize, ssize, local);
+ local_irq_restore(irqflags);
+
return ret;
}
@@ -445,6 +489,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -463,6 +510,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
}
/*
@@ -476,6 +525,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -493,6 +545,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
/* Invalidate the TLB */
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
+
return 0;
}
@@ -517,10 +572,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
/* recheck with locks held */
hpte_v = hpte_get_old_v(hptep);
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
- else
+ } else
native_unlock_hpte(hptep);
}
/*
@@ -580,10 +636,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- /*
- * Invalidate the hpte. NOTE: this also unlocks it
- */
-
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
} else
native_unlock_hpte(hptep);
@@ -765,8 +819,10 @@ static void native_flush_hash_range(unsigned long number, int local)
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
- else
+ else {
+ release_hpte_lock();
hptep->v = 0;
+ }
} pte_iterate_hashed_end();
}
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 2e0cad5817ba..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -13,6 +13,7 @@
#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
+#include <asm/firmware.h>
#include <mm/mmu_decl.h>
@@ -213,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
old = be64_to_cpu(old_be);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
@@ -255,7 +256,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
* the __collapse_huge_page_copy can result in copying
* the old content.
*/
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
return pmd;
}
@@ -403,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
struct change_memory_parms {
unsigned long start, end, newpp;
- unsigned int step, nr_cpus, master_cpu;
+ unsigned int step, nr_cpus;
+ atomic_t master_cpu;
atomic_t cpu_counter;
};
@@ -477,7 +479,8 @@ static int change_memory_range_fn(void *data)
{
struct change_memory_parms *parms = data;
- if (parms->master_cpu != smp_processor_id())
+ // First CPU goes through, all others wait.
+ if (atomic_xchg(&parms->master_cpu, 1) == 1)
return chmem_secondary_loop(parms);
// Wait for all but one CPU (this one) to call-in
@@ -515,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
chmem_parms.end = end;
chmem_parms.step = step;
chmem_parms.newpp = newpp;
- chmem_parms.master_cpu = smp_processor_id();
+ atomic_set(&chmem_parms.master_cpu, 0);
cpus_read_lock();
@@ -540,7 +543,7 @@ void hash__mark_rodata_ro(void)
unsigned long start, end, pp;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index eb0bccaf221e..21fcad97ae80 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
local_irq_restore(flags);
}
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte_t *start_pte;
@@ -239,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
start_pte = pte_offset_map(pmd, addr);
+ if (!start_pte)
+ goto out;
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
if (pteval & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
+ pte_unmap(start_pte);
+out:
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index fc92613dc2bf..01c3b4b65241 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -123,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
static u8 *linear_map_hash_slots;
static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
struct mmu_hash_ops mmu_hash_ops;
EXPORT_SYMBOL(mmu_hash_ops);
@@ -313,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags
else
rflags |= 0x3;
}
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+ /*
+ * We should never hit this in normal fault handling because
+ * a permission check (check_pte_access()) will bubble this
+ * to higher level linux handler even for PAGE_NONE.
+ */
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
@@ -408,7 +412,7 @@ repeat:
ssize);
if (ret == -1) {
/*
- * Try to to keep bolted entries in primary.
+ * Try to keep bolted entries in primary.
* Remove non bolted entries and try insert again
*/
ret = mmu_hash_ops.hpte_remove(hpteg);
@@ -427,11 +431,9 @@ repeat:
break;
cond_resched();
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled() &&
+ if (debug_pagealloc_enabled_or_kfence() &&
(paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
}
return ret < 0 ? ret : 0;
}
@@ -476,7 +478,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
return ret;
}
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
static int __init parse_disable_1tb_segments(char *p)
{
@@ -485,6 +487,40 @@ static int __init parse_disable_1tb_segments(char *p)
}
early_param("disable_1tb_segments", parse_disable_1tb_segments);
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+ stress_hpt_enabled = true;
+ return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+ unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+ /*
+ * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+ * to allow practical forward progress. Bare metal returns 1, which
+ * seems to help uncover more bugs.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return STRESS_MAX_GROUPS;
+ else
+ return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -778,7 +814,7 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();
- if (!debug_pagealloc_enabled()) {
+ if (!debug_pagealloc_enabled_or_kfence()) {
/*
* Pick a size for the linear mapping. Currently, we only
* support 16M, 1M and 4K which is the default
@@ -981,6 +1017,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
pr_info("Partition table %p\n", partition_tb);
}
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+ int next_cpu;
+
+ hpt_clear_stress();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ tlbiel_all();
+
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
static void __init htab_initialize(void)
{
unsigned long table;
@@ -1000,6 +1053,21 @@ static void __init htab_initialize(void)
if (stress_slb_enabled)
static_branch_enable(&stress_slb_key);
+ if (stress_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&stress_hpt_key);
+ // Too early to use nr_cpu_ids, so use NR_CPUS
+ tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+ __alignof__(struct stress_hpt_struct),
+ 0, MEMBLOCK_ALLOC_ANYWHERE);
+ memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+ stress_hpt_struct = __va(tmp);
+
+ timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer(&stress_hpt_timer);
+ }
+
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
@@ -1066,8 +1134,7 @@ static void __init htab_initialize(void)
prot = pgprot_val(PAGE_KERNEL);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
+ if (debug_pagealloc_enabled_or_kfence()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
@@ -1076,7 +1143,6 @@ static void __init htab_initialize(void)
panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
__func__, linear_map_hash_count, &ppc64_rma_size);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
/* create bolted the linear mapping in the hash table */
for_each_mem_range(i, &base, &end) {
@@ -1248,18 +1314,19 @@ void hash__early_init_mmu_secondary(void)
*/
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
- struct page *page;
+ struct folio *folio;
if (!pfn_valid(pte_pfn(pte)))
return pp;
- page = pte_page(pte);
+ folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags) &&
+ !folio_test_reserved(folio)) {
if (trap == INTERRUPT_INST_STORAGE) {
- flush_dcache_icache_page(page);
- set_bit(PG_dcache_clean, &page->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
} else
pp |= HPTE_R_N;
}
@@ -1781,7 +1848,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
/*
@@ -1791,9 +1858,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long trap;
bool is_exec;
- if (radix_enabled())
- return;
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
@@ -1990,7 +2054,72 @@ repeat:
return slot;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
+void hpt_clear_stress(void)
+{
+ int cpu = raw_smp_processor_id();
+ int g;
+
+ for (g = 0; g < stress_nr_groups(); g++) {
+ unsigned long last_group;
+ last_group = stress_hpt_struct[cpu].last_group[g];
+
+ if (last_group != -1UL) {
+ int i;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[g] = -1;
+ }
+ }
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
+
+ last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+ if (hpte_group == last_group)
+ return;
+
+ if (last_group != -1UL) {
+ int i;
+ /*
+ * Concurrent CPUs might be inserting into this group, so
+ * give up after a number of iterations, to prevent a live
+ * lock.
+ */
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+ }
+
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch to get the TLB loaded, then
+ * remove the PTE before returning from fault interrupt, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ memmove(&stress_hpt_struct[cpu].last_group[1],
+ &stress_hpt_struct[cpu].last_group[0],
+ (stress_nr_groups() - 1) * sizeof(unsigned long));
+ stress_hpt_struct[cpu].last_group[0] = hpte_group;
+ }
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
unsigned long hash;
@@ -2005,15 +2134,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
if (!vsid)
return;
+ if (linear_map_hash_slots[lmi] & 0x80)
+ return;
+
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
HPTE_V_BOLTED,
mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
+ raw_spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
}
static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -2023,11 +2155,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+ raw_spin_lock(&linear_map_hash_lock);
+ if (!(linear_map_hash_slots[lmi] & 0x80)) {
+ raw_spin_unlock(&linear_map_hash_lock);
+ return;
+ }
hidx = linear_map_hash_slots[lmi] & 0x7f;
linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -2037,7 +2172,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
mmu_kernel_ssize, 0);
}
-void hash__kernel_map_pages(struct page *page, int numpages, int enable)
+int hash__kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long flags, vaddr, lmi;
int i;
@@ -2054,8 +2189,9 @@ void hash__kernel_map_pages(struct page *page, int numpages, int enable)
kernel_unmap_linear_page(vaddr, lmi);
}
local_irq_restore(flags);
+ return 0;
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index 3bc0eb21b2a0..5a2e512e96db 100644
--- a/arch/powerpc/mm/book3s64/hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -143,11 +143,14 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
+ unsigned long psize;
if (radix_enabled())
return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
old_pte, pte);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+
+ psize = huge_page_size(hstate_vma(vma));
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
void __init hugetlbpage_init_defaultsize(void)
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
index 5045048ce244..a57a25f06a21 100644
--- a/arch/powerpc/mm/book3s64/internal.h
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -13,6 +13,17 @@ static inline bool stress_slb(void)
return static_branch_unlikely(&stress_slb_key);
}
+extern bool stress_hpt_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+ return static_branch_unlikely(&stress_hpt_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group);
+
void slb_setup_new_exec(void);
void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 7fcfba162e0d..c0e8d597e4cb 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
}
mmap_read_lock(mm);
- chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+ chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
sizeof(struct vm_area_struct *);
chunk = min(chunk, entries);
for (entry = 0; entry < entries; entry += chunk) {
@@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
- mem->hpages + entry, NULL);
+ mem->hpages + entry);
if (ret == n) {
pinned += n;
continue;
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index c766e4c26e42..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx)
static void pmd_frag_destroy(void *pmd_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pmd_frag);
+ ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 7b9966402b25..83823db3488b 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -9,6 +9,7 @@
#include <linux/memremap.h>
#include <linux/pkeys.h>
#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
#include <misc/cxl-base.h>
#include <asm/pgalloc.h>
@@ -64,11 +65,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
return changed;
}
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
@@ -84,12 +113,29 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
assert_spin_locked(pmd_lockptr(mm, pmdp));
- WARN_ON(!(pmd_large(pmd)));
+ WARN_ON(!(pmd_leaf(pmd)));
#endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_leaf(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
+}
+
static void do_serialize(void *arg)
{
/* We've taken the IPI, so try to trim the mask while here */
@@ -100,14 +146,14 @@ static void do_serialize(void *arg)
}
/*
- * Serialize against find_current_mm_pte which does lock-less
+ * Serialize against __find_linux_pte() which does lock-less
* lookup in page tables with local interrupts disabled. For huge pages
* it casts pmd_t to pte_t. Since format of pte_t is different from
* pmd_t we want to prevent transit from pmd pointing to page table
* to pmd pointing to huge page (and back) while interrupts are disabled.
* We clear pmd to possibly replace it with page table pointer in
* different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
+ * __find_linux_pte() to finish.
*/
void serialize_against_pte_lookup(struct mm_struct *mm)
{
@@ -147,11 +193,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
return pmd;
}
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
+ !pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
/*
* At some point we should be able to get rid of
* pmd_mkhuge() and mk_huge_pmd() when we update all the
@@ -166,6 +236,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
+}
+
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
return pfn_pmd(page_to_pfn(page), pgprot);
@@ -306,22 +385,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -331,12 +410,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
+ * If we find ptdesc_page set, we return
* the allocated page with single fragment
* count.
*/
if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
@@ -357,15 +436,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
void pmd_fragment_free(unsigned long *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
- if (PageReserved(page))
- return free_reserved_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -463,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
set_pte_at(vma->vm_mm, addr, ptep, pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* For hash translation mode, we use the deposited table to store hash slot
* information and they are stored at PTRS_PER_PMD offset from related pmd
@@ -484,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
return true;
}
+#endif
/*
* Does the CPU support tlbie?
@@ -553,8 +634,13 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- unsigned long prot = pgprot_val(protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+ unsigned long prot;
+
+ /* Radix supports execute-only, but protection_map maps X -> RX */
+ if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))
+ vm_flags |= VM_READ;
+
+ prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]);
if (vm_flags & VM_SAO)
prot |= _PAGE_SAO;
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 753e62ba67af..a974baf8f327 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -10,6 +10,7 @@
#include <asm/mmu.h>
#include <asm/setup.h>
#include <asm/smp.h>
+#include <asm/firmware.h>
#include <linux/pkeys.h>
#include <linux/of_fdt.h>
@@ -88,7 +89,8 @@ static int __init scan_pkey_feature(void)
unsigned long pvr = mfspr(SPRN_PVR);
if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
- PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9)
+ PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 ||
+ PVR_VER(pvr) == PVR_HX_C2000)
pkeys_total = 32;
}
}
@@ -290,7 +292,7 @@ void setup_kuap(bool disabled)
if (smp_processor_id() == boot_cpuid) {
pr_info("Activating Kernel Userspace Access Prevention\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUAP;
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUAP;
}
/*
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index d2fb776febb4..35fd2a95be24 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
else
radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
@@ -46,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
pte_t old_pte, pte_t pte)
{
struct mm_struct *mm = vma->vm_mm;
+ unsigned long psize = huge_page_size(hstate_vma(vma));
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
- (atomic_read(&mm->context.copros) > 0))
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ atomic_read(&mm->context.copros) > 0)
radix__flush_hugetlb_page(vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index db2f3d193448..15e88f1439ec 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -30,11 +30,13 @@
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
+#include <asm/set_memory.h>
#include <trace/events/thp.h>
+#include <mm/mmu_decl.h>
+
unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -202,14 +204,14 @@ static void radix__change_memory_range(unsigned long start, unsigned long end,
pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
- if (pud_is_leaf(*pudp)) {
+ if (pud_leaf(*pudp)) {
ptep = (pte_t *)pudp;
goto update_the_pte;
}
pmdp = pmd_alloc(&init_mm, pudp, idx);
if (!pmdp)
continue;
- if (pmd_is_leaf(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
ptep = pmdp_ptep(pmdp);
goto update_the_pte;
}
@@ -228,9 +230,17 @@ void radix__mark_rodata_ro(void)
unsigned long start, end;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
radix__change_memory_range(start, end, _PAGE_WRITE);
+
+ for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
+ end = start + PAGE_SIZE;
+ if (overlaps_interrupt_vector_text(start, end))
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+ else
+ break;
+ }
}
void radix__mark_initmem_nx(void)
@@ -259,21 +269,40 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
- if (addr < __pa_symbol(__init_begin))
- return __pa_symbol(__init_begin);
+ unsigned long stext_phys;
+
+ stext_phys = __pa_symbol(_stext);
+
+ // Relocatable kernel running at non-zero real address
+ if (stext_phys != 0) {
+ // The end of interrupts code at zero is a rodata boundary
+ unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
+ if (addr < end_intr)
+ return end_intr;
+
+ // Start of relocated kernel text is a rodata boundary
+ if (addr < stext_phys)
+ return stext_phys;
+ }
+
+ if (addr < __pa_symbol(__srwx_boundary))
+ return __pa_symbol(__srwx_boundary);
#endif
return end;
}
static int __meminit create_physical_mapping(unsigned long start,
unsigned long end,
- unsigned long max_mapping_size,
int nid, pgprot_t _prot)
{
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
+ unsigned long max_mapping_size = memory_block_size;
+
+ if (debug_pagealloc_enabled_or_kfence())
+ max_mapping_size = PAGE_SIZE;
start = ALIGN(start, PAGE_SIZE);
end = ALIGN_DOWN(end, PAGE_SIZE);
@@ -352,7 +381,6 @@ static void __init radix_init_pgtable(void)
}
WARN_ON(create_physical_mapping(start, end,
- radix_mem_block_size,
-1, PAGE_KERNEL));
}
@@ -473,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
return 1;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char *uname, int
- depth, void *data)
-{
- unsigned long *mem_block_size = (unsigned long *)data;
- const __be32 *prop;
- int len;
-
- if (depth != 1)
- return 0;
-
- if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
- if (!prop || len < dt_root_size_cells * sizeof(__be32))
- /*
- * Nothing in the device tree
- */
- *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
- else
- *mem_block_size = of_read_number(prop, dt_root_size_cells);
- return 1;
-}
-
-static unsigned long __init radix_memory_block_size(void)
-{
- unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
- /*
- * OPAL firmware feature is set by now. Hence we are ok
- * to test OPAL feature.
- */
- if (firmware_has_feature(FW_FEATURE_OPAL))
- mem_block_size = 1UL * 1024 * 1024 * 1024;
- else
- of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
- return mem_block_size;
-}
-
-#else /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long __init radix_memory_block_size(void)
-{
- return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
void __init radix__early_init_devtree(void)
{
int rc;
@@ -548,16 +524,6 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
- /*
- * Max mapping size used when mapping pages. We don't use
- * ppc_md.memory_block_size() here because this get called
- * early and we don't have machine probe called yet. Also
- * the pseries implementation only check for ibm,lmb-size.
- * All hypervisor supporting radix do expose that device
- * tree node.
- */
- radix_mem_block_size = radix_memory_block_size();
return;
}
@@ -572,17 +538,6 @@ void __init radix__early_init_mmu(void)
#else
mmu_virtual_psize = MMU_PAGE_4K;
#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- } else
- mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
#endif
/*
* initialize page table size
@@ -715,10 +670,60 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
}
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
- unsigned long next;
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+ return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
+
+ /*
+ * with 2M vmemmap mmaping we can have things setup
+ * such that even though atlmap is specified we never
+ * used altmap.
+ */
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
+ }
+
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
pte_t *pte;
pte = pte_start + pte_index(addr);
@@ -730,23 +735,28 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
if (!pte_present(*pte))
continue;
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
}
-
- pte_clear(&init_mm, addr, pte);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
}
+ if (direct)
+ update_page_count(mmu_virtual_psize, -pages);
}
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
@@ -757,26 +767,36 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
if (!pmd_present(*pmd))
continue;
- if (pmd_is_leaf(*pmd)) {
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE)) {
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (pmd_leaf(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
}
- pte_clear(&init_mm, addr, (pte_t *)pmd);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
continue;
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
free_pte_table(pte_base, pmd);
}
+ if (direct)
+ update_page_count(MMU_PAGE_2M, -pages);
}
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
@@ -787,23 +807,28 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
if (!pud_present(*pud))
continue;
- if (pud_is_leaf(*pud)) {
+ if (pud_leaf(*pud)) {
if (!IS_ALIGNED(addr, PUD_SIZE) ||
!IS_ALIGNED(next, PUD_SIZE)) {
WARN_ONCE(1, "%s: unaligned range\n", __func__);
continue;
}
pte_clear(&init_mm, addr, (pte_t *)pud);
+ pages++;
continue;
}
pmd_base = pud_pgtable(*pud);
- remove_pmd_table(pmd_base, addr, next);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
free_pmd_table(pmd_base, pud);
}
+ if (direct)
+ update_page_count(MMU_PAGE_1G, -pages);
}
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long addr, next;
pud_t *pud_base;
@@ -820,7 +845,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
if (!p4d_present(*p4d))
continue;
- if (p4d_is_leaf(*p4d)) {
+ if (p4d_leaf(*p4d)) {
if (!IS_ALIGNED(addr, P4D_SIZE) ||
!IS_ALIGNED(next, P4D_SIZE)) {
WARN_ONCE(1, "%s: unaligned range\n", __func__);
@@ -832,7 +857,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
}
pud_base = p4d_pgtable(*p4d);
- remove_pud_table(pud_base, addr, next);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
free_pud_table(pud_base, p4d);
}
@@ -850,12 +875,12 @@ int __meminit radix__create_section_mapping(unsigned long start,
}
return create_physical_mapping(__pa(start), __pa(end),
- radix_mem_block_size, nid, prot);
+ nid, prot);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
- remove_pagetable(start, end);
+ remove_pagetable(start, end, true, NULL);
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -873,7 +898,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long phys)
{
/* Create a PTE encoding */
- unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
int ret;
@@ -882,26 +906,438 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return -1;
}
- ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
BUG_ON(ret);
return 0;
}
+
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_leaf(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only called by certain ZONE_DEVICE path,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+ * keep it simple by checking addr PMD_SIZE alignment
+ * and verifying the device boundary condition.
+ * For us to use a pmd mapping, both addr and pfn should
+ * be aligned. We skip if addr is not aligned and for
+ * pfn we hope we have extra area in the altmap that
+ * can help to find an aligned block. This can result
+ * in altmap block allocation failures, in which case
+ * we fallback to RAM for vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else if (altmap) {
+ /*
+ * A vmemmap block allocation can fail due to
+ * alignment requirements and we trying to align
+ * things aggressively there by running out of
+ * space. Try base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+ * If a huge mapping exist due to early call to
+ * vmemmap_populate, let's try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+ * Not able allocate higher order memory to back memmap
+ * or we found a pointer to pte page. Allocate base page
+ * size vmemmap
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+ * Check if there exist a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+ * Populate the tail pages vmemmap page
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+ * we want to map things as base page size mapping so that
+ * we can save space in vmemmap. We could have huge mapping
+ * covering out both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+ * this request fall in those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+ * if the address is aligned to huge page size it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Populate the tail pages vmemmap page
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+ * get the 2nd mapping details
+ * Also create it if that doesn't exist
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
+
+
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- remove_pagetable(start, start + page_size);
+ remove_pagetable(start, start + page_size, true, NULL);
}
-#endif
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void radix__kernel_map_pages(struct page *page, int numpages, int enable)
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
- pr_warn_once("DEBUG_PAGEALLOC not supported in radix mode\n");
+ remove_pagetable(start, end, false, altmap);
}
#endif
+#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -916,8 +1352,25 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
- old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
+ old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
return old;
}
@@ -937,15 +1390,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
pmd = *pmdp;
pmd_clear(pmdp);
- /*
- * pmdp collapse_flush need to ensure that there are no parallel gup
- * walk after this call. This is needed so that we can have stable
- * page ref count when collapsing a page. We don't allow a collapse page
- * if we have gup taken on the page. We can ensure that by sending IPI
- * because gup walk happens with IRQ disabled.
- */
- serialize_against_pte_lookup(vma->vm_mm);
-
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
return pmd;
@@ -1007,27 +1451,43 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
return old_pmd;
}
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
pte_t entry, unsigned long address, int psize)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
- _PAGE_RW | _PAGE_EXEC);
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
+ _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
/*
- * To avoid NMMU hang while relaxing access, we need mark
- * the pte invalid in between.
+ * On POWER9, the NMMU is not able to relax PTE access permissions
+ * for a translation with a TLB. The PTE must be invalidated, TLB
+ * flushed before the new PTE is installed.
+ *
+ * This only needs to be done for radix, because hash translation does
+ * flush when updating the linux pte (and we don't support NMMU
+ * accelerators on HPT on POWER9 anyway XXX: do we?).
+ *
+ * POWER10 (and P9P) NMMU does behave as per ISA.
*/
- if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
+ atomic_read(&mm->context.copros) > 0) {
unsigned long old_pte, new_pte;
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
- /*
- * new value of pte
- */
new_pte = old_pte | set;
radix__flush_tlb_page_psize(mm, address, psize);
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
@@ -1035,9 +1495,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
__radix_pte_update(ptep, 0, set);
/*
* Book3S does not require a TLB flush when relaxing access
- * restrictions when the address space is not attached to a
- * NMMU, because the core MMU will reload the pte after taking
- * an access fault, which is defined by the architecture.
+ * restrictions when the address space (modulo the POWER9 nest
+ * MMU issue above) because the MMU will reload the PTE after
+ * taking an access fault, as defined by the architecture. See
+ * "Setting a Reference or Change Bit or Upgrading Access
+ * Authority (PTE Subject to Atomic Hardware Updates)" in
+ * Power ISA Version 3.1B.
*/
}
/* See ptesync comment in radix__set_pte_at */
@@ -1050,11 +1513,12 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value. We need to do this only for radix, because hash
- * translation does flush when updating the linux pte.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
(atomic_read(&mm->context.copros) > 0))
radix__flush_tlb_page(vma, addr);
@@ -1076,7 +1540,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
int pud_clear_huge(pud_t *pud)
{
- if (pud_is_leaf(*pud)) {
+ if (pud_leaf(*pud)) {
pud_clear(pud);
return 1;
}
@@ -1123,7 +1587,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
int pmd_clear_huge(pmd_t *pmd)
{
- if (pmd_is_leaf(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pmd_clear(pmd);
return 1;
}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index dda51fef2d2e..9e1f6558d026 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -127,21 +127,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_pid_lpid(unsigned long pid,
- unsigned long lpid,
- unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -202,23 +187,6 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
- unsigned long lpid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap, unsigned long ric)
{
@@ -264,22 +232,6 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
}
}
-static inline void fixup_tlbie_va_range_lpid(unsigned long va,
- unsigned long pid,
- unsigned long lpid,
- unsigned long ap)
-{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_pid(unsigned long pid)
{
/*
@@ -299,26 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
-static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
-{
- /*
- * We can use any address for the invalidation, pick one which is
- * probably unused as an optimisation.
- */
- unsigned long va = ((1UL << 52) - 1);
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
- RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
{
@@ -416,31 +348,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
- unsigned long ric)
-{
- asm volatile("ptesync" : : : "memory");
-
- /*
- * Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
- * in the asm statement.
- */
- switch (ric) {
- case RIC_FLUSH_TLB:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
- fixup_tlbie_pid_lpid(pid, lpid);
- break;
- case RIC_FLUSH_PWC:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- break;
- case RIC_FLUSH_ALL:
- default:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
- fixup_tlbie_pid_lpid(pid, lpid);
- }
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
struct tlbiel_pid {
unsigned long pid;
unsigned long ric;
@@ -566,20 +473,6 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
fixup_tlbie_va_range(addr - page_size, pid, ap);
}
-static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize)
-{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
- for (addr = start; addr < end; addr += page_size)
- __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
-
- fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
-}
-
static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -660,18 +553,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize, bool also_pwc)
-{
- asm volatile("ptesync" : : : "memory");
- if (also_pwc)
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-
static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
@@ -700,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
*/
void radix__local_flush_tlb_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
@@ -713,12 +595,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm);
#ifndef CONFIG_SMP
void radix__local_flush_all_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
@@ -732,12 +615,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -755,10 +639,18 @@ EXPORT_SYMBOL(radix__local_flush_tlb_page);
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
/*
- * P9 nest MMU has issues with the page walk cache
- * caching PTEs and not flushing them properly when
- * RIC = 0 for a PID/LPID invalidate
+ * The P9 nest MMU has issues with the page walk cache caching PTEs
+ * and not flushing them when RIC = 0 for a PID/LPID invalidate.
+ *
+ * This may have been fixed in shipping firmware (by disabling PWC
+ * or preventing it from caching PTEs), but until that is confirmed,
+ * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes
+ * to RIC=2.
+ *
+ * POWER10 (and P9P) does not have this problem.
*/
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ return false;
if (atomic_read(&mm->context.copros) > 0)
return true;
return false;
@@ -784,12 +676,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
goto out;
if (current->active_mm == mm) {
+ unsigned long flags;
+
WARN_ON_ONCE(current->mm != NULL);
- /* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ /*
+ * It is a kernel thread and is using mm as the lazy tlb, so
+ * switch it to init_mm. This is not always called from IPI
+ * (e.g., flush_type_needed), so must disable irqs.
+ */
+ local_irq_save(flags);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
+ local_irq_restore(flags);
}
/*
@@ -801,7 +701,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
* that's what the caller expects.
*/
if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
- atomic_dec(&mm->context.active_cpus);
+ dec_mm_active_cpus(mm);
cpumask_clear_cpu(cpu, mm_cpumask(mm));
always_flush = true;
}
@@ -937,7 +837,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -968,6 +868,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
}
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -977,7 +878,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1001,6 +902,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
void radix__flush_all_mm(struct mm_struct *mm)
@@ -1016,7 +918,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1096,6 +998,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+/*
+ * Doesn't appear to be used anywhere. Remove.
+ */
#define TLB_FLUSH_ALL -1UL
/*
@@ -1117,23 +1022,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid, flush_pwc = false;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1171,15 +1075,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
}
} else {
- bool hflush = false;
+ bool hflush;
unsigned long hstart, hend;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- hstart = (start + PMD_SIZE - 1) & PMD_MASK;
- hend = end & PMD_MASK;
- if (hstart < hend)
- hflush = true;
- }
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
if (type == FLUSH_TYPE_LOCAL) {
asm volatile("ptesync": : :"memory");
@@ -1210,6 +1111,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -1294,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (tlb->fullmm || tlb->need_flush_all) {
- __flush_all_mm(mm, true);
+ if (tlb->fullmm) {
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * Shootdown based lazy tlb mm refcounting means we
+ * have to IPI everyone in the mm_cpumask anyway soon
+ * when the mm goes away, so might as well do it as
+ * part of the final flush now.
+ *
+ * If lazy shootdown was improved to reduce IPIs (e.g.,
+ * by batching), then it may end up being better to use
+ * tlbies here instead.
+ */
+ preempt_disable();
+
+ smp_mb(); /* see radix__flush_tlb_mm */
+ exit_flush_lazy_tlbs(mm);
+ __flush_all_mm(mm, true);
+
+ preempt_enable();
+ } else {
+ __flush_all_mm(mm, true);
+ }
+
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
@@ -1317,25 +1240,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
- fullmm = (end == TLB_FLUSH_ALL);
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1377,6 +1297,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@ -1398,7 +1319,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
/* 4k page size, just blow the world */
@@ -1446,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
void radix__flush_tlb_all(void)
{
unsigned long rb,prs,r,rs;
@@ -1471,6 +1399,127 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
/*
* Performs process-scoped invalidations for a given LPID
* as part of H_RPT_INVALIDATE hcall.
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 6956f637a38c..f2708c8629a5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -13,6 +13,7 @@
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
+#include <asm/lppaca.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 60c6ea16a972..ec98e526167e 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -71,6 +71,8 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
if (pmd_none(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return;
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
pte_update(mm, addr, pte, 0, 0, 0);
@@ -143,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops subpage_walk_ops = {
.pmd_entry = subpage_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, addr);
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
- vma = find_vma(mm, addr);
- /*
- * If the range is in unmapped range, just return
- */
- if (vma && ((addr + len) <= vma->vm_start))
- return;
-
- while (vma) {
- if (vma->vm_start >= (addr + len))
- break;
- vma->vm_flags |= VM_NOHUGEPAGE;
+ for_each_vma_range(vmi, vma, addr + len) {
+ vm_flags_set(vma, VM_NOHUGEPAGE);
walk_page_vma(vma, &subpage_walk_ops, NULL);
- vma = vma->vm_next;
}
}
#else