diff options
Diffstat (limited to 'mm/rmap.c')
-rw-r--r-- | mm/rmap.c | 121 |
1 files changed, 113 insertions, 8 deletions
diff --git a/mm/rmap.c b/mm/rmap.c index ced14f1af6dc..b874c4761e84 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -63,6 +63,7 @@ #include <linux/hugetlb.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> +#include <linux/memremap.h> #include <asm/tlbflush.h> @@ -390,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->degree--; continue; } @@ -424,7 +425,7 @@ static void anon_vma_ctor(void *data) init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - anon_vma->rb_root = RB_ROOT; + anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) @@ -605,6 +606,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) tlb_ubc->flush_required = true; /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + mm->tlb_flush_batched = true; + + /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. @@ -631,6 +639,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } + +/* + * Reclaim unmaps pages under the PTL but do not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and mumap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + if (mm->tlb_flush_batched) { + flush_tlb_mm(mm); + + /* + * Do not allow the compiler to re-order the clearing of + * tlb_flush_batched before the tlb is flushed. + */ + barrier(); + mm->tlb_flush_batched = false; + } +} #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { @@ -851,11 +888,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; + unsigned long start = address, end; int *cleaned = arg; + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free from this function. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { + unsigned long cstart, cend; int ret = 0; - address = pvmw.address; + + cstart = address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -868,6 +915,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); + cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -882,6 +930,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); + cstart &= PMD_MASK; + cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -890,11 +940,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, } if (ret) { - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); (*cleaned)++; } } + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return true; } @@ -1288,18 +1340,44 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; bool ret = true; + unsigned long start = address, end; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && + is_zone_device_page(page) && !is_device_private_page(page)) + return true; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, - flags & TTU_MIGRATION, page); + flags & TTU_SPLIT_FREEZE, page); } + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free in this function as call of try_to_unmap() + * must hold a reference on the page. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte && (flags & TTU_MIGRATION)) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + + if (!PageAnon(page)) + continue; + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced @@ -1330,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address = pvmw.address; + if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION) && + is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, 0); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + goto discard; + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { @@ -1385,7 +1484,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { + (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; pte_t swp_pte; /* @@ -1409,6 +1508,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); ret = false; + /* We have to invalidate as we cleared the pte */ page_vma_mapped_walk_done(&pvmw); break; } @@ -1454,8 +1554,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, discard: page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_page(mm, address); + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } + + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return ret; } @@ -1510,7 +1614,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) + if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) + && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) |