From 691e95fd7396905a38d98919e9c150dbc3ea21a3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 30 Mar 2015 10:41:03 +0530 Subject: powerpc/mm/thp: Make page table walk safe against thp split/collapse We can disable a THP split or a hugepage collapse by disabling irq. We do send IPI to all the cpus in the early part of split/collapse, and disabling local irq ensure we don't make progress with split/collapse. If the THP is getting split we return NULL from find_linux_pte_or_hugepte(). For all the current callers it should be ok. We need to be careful if we want to use returned pte_t pointer outside the irq disabled region. W.r.t to THP split, the pfn remains the same, but then a hugepage collapse will result in a pfn change. There are few steps we can take to avoid a hugepage collapse.One way is to take page reference inside the irq disable region. Other option is to take mmap_sem so that a parallel collapse will not happen. We can also disable collapse by taking pmd_lock. Another method used by kvm subsystem is to check whether we had a mmu_notifer update in between using mmu_notifier_retry(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 2cb278a2f658..a9dbb27ca887 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { /* Only called for hugetlbfs pages, hence can ignore THP */ - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -682,28 +682,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * We are holding mmap_sem, so a parallel huge page collapse cannot run. + * To prevent hugepage split, disable irq. + */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; unsigned shift; - unsigned long mask; + unsigned long mask, flags; /* * Transparent hugepages are handled by generic code. We can skip them * here. */ + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. */ - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) { + local_irq_restore(flags); return ERR_PTR(-EINVAL); - + } mask = (1UL << shift) - 1; page = pte_page(*ptep); if (page) page += (address & mask) / PAGE_SIZE; + local_irq_restore(flags); return page; } @@ -950,9 +957,12 @@ void flush_dcache_icache_hugepage(struct page *page) * * So long as we atomically load page table pointers we are safe against teardown, * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -1031,7 +1041,7 @@ out: *shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) -- cgit v1.2.3-59-g8ed1b