aboutsummaryrefslogtreecommitdiffstats
path: root/mm/rmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c121
1 files changed, 113 insertions, 8 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index ced14f1af6dc..b874c4761e84 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -63,6 +63,7 @@
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
+#include <linux/memremap.h>
#include <asm/tlbflush.h>
@@ -390,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
anon_vma->parent->degree--;
continue;
}
@@ -424,7 +425,7 @@ static void anon_vma_ctor(void *data)
init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
- anon_vma->rb_root = RB_ROOT;
+ anon_vma->rb_root = RB_ROOT_CACHED;
}
void __init anon_vma_init(void)
@@ -605,6 +606,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
tlb_ubc->flush_required = true;
/*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
+ /*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
* before the page is queued for IO.
@@ -631,6 +639,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (mm->tlb_flush_batched) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the tlb is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
@@ -851,11 +888,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
+ unsigned long start = address, end;
int *cleaned = arg;
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free from this function.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
+ unsigned long cstart, cend;
int ret = 0;
- address = pvmw.address;
+
+ cstart = address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -868,6 +915,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
+ cend = cstart + PAGE_SIZE;
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -882,6 +930,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
+ cstart &= PMD_MASK;
+ cend = cstart + PMD_SIZE;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -890,11 +940,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
if (ret) {
- mmu_notifier_invalidate_page(vma->vm_mm, address);
+ mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
(*cleaned)++;
}
}
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return true;
}
@@ -1288,18 +1340,44 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
+ unsigned long start = address, end;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
- flags & TTU_MIGRATION, page);
+ flags & TTU_SPLIT_FREEZE, page);
}
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free in this function as call of try_to_unmap()
+ * must hold a reference on the page.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte && (flags & TTU_MIGRATION)) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+
+ if (!PageAnon(page))
+ continue;
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1330,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address = pvmw.address;
+ if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, 0);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ goto discard;
+ }
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
@@ -1385,7 +1484,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
dec_mm_counter(mm, mm_counter(page));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION)) {
+ (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
swp_entry_t entry;
pte_t swp_pte;
/*
@@ -1409,6 +1508,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
WARN_ON_ONCE(1);
ret = false;
+ /* We have to invalidate as we cleared the pte */
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1454,8 +1554,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_page(mm, address);
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
}
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return ret;
}
@@ -1510,7 +1614,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
+ if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
+ && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)