aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm/book3s64/pgtable.c
diff options
context:
space:
mode:
authorAneesh Kumar K.V <aneesh.kumar@linux.ibm.com>2020-05-05 12:47:29 +0530
committerMichael Ellerman <mpe@ellerman.id.au>2020-05-05 21:20:16 +1000
commit75358ea359e7c0dfceb3c7b3d854570b4260cb7f (patch)
treeefa39d494826c9623198d66794b55f3cea021e7d /arch/powerpc/mm/book3s64/pgtable.c
parentpowerpc/mm/book3s64: Avoid sending IPI on clearing PMD (diff)
downloadlinux-dev-75358ea359e7c0dfceb3c7b3d854570b4260cb7f.tar.xz
linux-dev-75358ea359e7c0dfceb3c7b3d854570b4260cb7f.zip
powerpc/mm/book3s64: Fix MADV_DONTNEED and parallel page fault race
MADV_DONTNEED holds mmap_sem in read mode and that implies a parallel page fault is possible and the kernel can end up with a level 1 PTE entry (THP entry) converted to a level 0 PTE entry without flushing the THP TLB entry. Most architectures including POWER have issues with kernel instantiating a level 0 PTE entry while holding level 1 TLB entries. The code sequence I am looking at is down_read(mmap_sem) down_read(mmap_sem) zap_pmd_range() zap_huge_pmd() pmd lock held pmd_cleared table details added to mmu_gather pmd_unlock() insert a level 0 PTE entry() tlb_finish_mmu(). Fix this by forcing a tlb flush before releasing pmd lock if this is not a fullmm invalidate. We can safely skip this invalidate for task exit case (fullmm invalidate) because in that case we are sure there can be no parallel fault handlers. This do change the Qemu guest RAM del/unplug time as below 128 core, 496GB guest: Without patch: munmap start: timer = 196449 ms, PID=6681 munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms With patch: munmap start: timer = 196345 ms, PID=6879 munmap finish: timer = 196714 ms, PID=6879 - delta = 369ms Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200505071729.54912-23-aneesh.kumar@linux.ibm.com
Diffstat (limited to 'arch/powerpc/mm/book3s64/pgtable.c')
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c18
1 files changed, 18 insertions, 0 deletions
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 127325ead505..54b6d6d103ea 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -112,6 +112,24 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
return __pmd(old_pmd);
}
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp, int full)
+{
+ pmd_t pmd;
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));