From 1897bdc4d33167e9036460631d1349e59d841f2d Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 13 Nov 2014 13:46:09 +1100
Subject: mmu_notifier: add mmu_notifier_invalidate_range()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This notifier closes an important gap in the current mmu_notifier
implementation: the existing callbacks are called too early or too late to
reliably manage a non-CPU TLB. Specifically, invalidate_range_start() is
called when all pages are still mapped, and invalidate_range_end() when all
pages are unmapped and potentially freed.

This is fine when the users of the mmu_notifiers manage their own SoftTLB,
like KVM does. When the TLB is managed in software it is easy to wipe out
entries for a given range and prevent new entries from being established
until invalidate_range_end() is called.

But when the user of mmu_notifiers has to manage a hardware TLB, it can
still wipe out TLB entries in invalidate_range_start(), but it cannot make
sure that no new TLB entries in the given range are established between
invalidate_range_start() and invalidate_range_end().

To avoid silent data corruption, the entries in the non-CPU TLB need to be
flushed when the pages are unmapped (at this point in time no _new_ TLB
entries can be established in the non-CPU TLB) but not yet freed (as the
non-CPU TLB may still have _existing_ entries pointing to the pages about
to be freed).

To fix this problem we need to catch the moment when the Linux VMM flushes
remote TLBs (a non-CPU TLB is, in effect, just another remote TLB), as this
is the point in time when the pages are unmapped but _not_ yet freed. The
mmu_notifier_invalidate_range() function aims to catch that moment.

IOMMU code will be one user of the notifier callback. Currently this is
only the AMD IOMMUv2 driver, but its code is about to be generalized and
converted to a generic IOMMU-API extension to fit the needs of similar
functionality in other IOMMUs as well.

The current attempt in the AMD IOMMUv2 driver to work around the
invalidate_range_start/end() shortcoming is to assign an empty page table
to the non-CPU TLB between any invalidate_range_start/end calls. With the
empty page table assigned, every page-table walk to re-fill the non-CPU TLB
causes a page fault that is reported to the IOMMU driver via an interrupt,
possibly causing interrupt storms.

The page-fault handler in the AMD IOMMUv2 driver doesn't handle the fault
if an invalidate_range_start/end pair is active; it just reports SUCCESS
back to the device and lets it re-fault the page. But existing hardware
(newer Radeon GPUs) that makes use of this feature doesn't re-fault
indefinitely: after a certain number of faults for the same address the
device enters a failure state and needs to be reset.

To avoid the GPUs entering a failure state we need to get rid of the
empty-page-table workaround and use the mmu_notifier_invalidate_range()
function introduced with this patch.
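As an illustration of how a driver that manages a hardware (non-CPU) TLB
might consume the new notifier once the callback is wired up by the
follow-on patches, here is a minimal sketch. struct my_tlb, struct
my_device and my_dev_flush_iotlb() are hypothetical placeholders, not part
of this patch:

#include <linux/kernel.h>
#include <linux/mmu_notifier.h>

/* Hypothetical per-device state -- not part of this patch. */
struct my_device;
void my_dev_flush_iotlb(struct my_device *dev,
                        unsigned long start, unsigned long end);

struct my_tlb {
        struct mmu_notifier mn;
        struct my_device *dev;
};

static void my_tlb_invalidate_range(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        struct my_tlb *tlb = container_of(mn, struct my_tlb, mn);

        /*
         * Runs while the pages are unmapped but not yet freed, and may be
         * called under the page-table lock, so it must not sleep.
         * Flushing here ensures the device holds no stale translations by
         * the time the pages go back to the allocator.
         */
        my_dev_flush_iotlb(tlb->dev, start, end);
}

static const struct mmu_notifier_ops my_tlb_mmu_notifier_ops = {
        .invalidate_range = my_tlb_invalidate_range,
};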
Signed-off-by: Joerg Roedel
Reviewed-by: Andrea Arcangeli
Reviewed-by: Jérôme Glisse
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Hugh Dickins
Cc: Mel Gorman
Cc: Johannes Weiner
Cc: Jay Cornwall
Cc: Oded Gabbay
Cc: Suravee Suthikulpanit
Cc: Jesse Barnes
Cc: David Woodhouse
Signed-off-by: Andrew Morton
Signed-off-by: Oded Gabbay
---
 include/linux/mmu_notifier.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
(limited to 'include/linux/mmu_notifier.h')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 88787bb4b3b9..17907908d1df 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -242,6 +242,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                 __mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
+static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
+                                 unsigned long start, unsigned long end)
+{
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
         mm->mmu_notifier_mm = NULL;
@@ -342,6 +347,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 }
 
+static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
+                                 unsigned long start, unsigned long end)
+{
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
--
cgit v1.2.3-59-g8ed1b


From 34ee645e83b60ae3d5955f70ab9ab9a159136673 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 13 Nov 2014 13:46:09 +1100
Subject: mmu_notifier: call mmu_notifier_invalidate_range() from VMM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add calls to the new mmu_notifier_invalidate_range() function to all places
in the VMM that need it.

Signed-off-by: Joerg Roedel
Reviewed-by: Andrea Arcangeli
Reviewed-by: Jérôme Glisse
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Hugh Dickins
Cc: Mel Gorman
Cc: Johannes Weiner
Cc: Jay Cornwall
Cc: Oded Gabbay
Cc: Suravee Suthikulpanit
Cc: Jesse Barnes
Cc: David Woodhouse
Signed-off-by: Andrew Morton
Signed-off-by: Oded Gabbay
---
 include/linux/mmu_notifier.h | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/events/uprobes.c      |  2 +-
 mm/fremap.c                  |  2 +-
 mm/huge_memory.c             |  9 +++++----
 mm/hugetlb.c                 |  7 ++++++-
 mm/ksm.c                     |  4 ++--
 mm/memory.c                  |  3 ++-
 mm/migrate.c                 |  3 ++-
 mm/rmap.c                    |  2 +-
 9 files changed, 61 insertions(+), 12 deletions(-)
(limited to 'include/linux/mmu_notifier.h')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 17907908d1df..966da2b4b803 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -284,6 +284,44 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
         __young;                                                        \
 })
 
+#define ptep_clear_flush_notify(__vma, __address, __ptep)               \
+({                                                                      \
+        unsigned long ___addr = __address & PAGE_MASK;                  \
+        struct mm_struct *___mm = (__vma)->vm_mm;                       \
+        pte_t ___pte;                                                   \
+                                                                        \
+        ___pte = ptep_clear_flush(__vma, __address, __ptep);            \
+        mmu_notifier_invalidate_range(___mm, ___addr,                   \
+                                      ___addr + PAGE_SIZE);             \
+                                                                        \
+        ___pte;                                                         \
+})
+
+#define pmdp_clear_flush_notify(__vma, __haddr, __pmd)                  \
+({                                                                      \
+        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
+        struct mm_struct *___mm = (__vma)->vm_mm;                       \
+        pmd_t ___pmd;                                                   \
+                                                                        \
+        ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd);               \
+        mmu_notifier_invalidate_range(___mm, ___haddr,                  \
+                                      ___haddr + HPAGE_PMD_SIZE);       \
+                                                                        \
+        ___pmd;                                                         \
+})
+
+#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd)                 \
+({                                                                      \
+        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
+        pmd_t ___pmd;                                                   \
+                                                                        \
+        ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd);              \
+        mmu_notifier_invalidate_range(__mm, ___haddr,                   \
+                                      ___haddr + HPAGE_PMD_SIZE);       \
+                                                                        \
+        ___pmd;                                                         \
+})
+
 /*
  * set_pte_at_notify() sets the pte _after_ running the notifier.
  * This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -362,6 +400,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_flush_notify ptep_clear_flush
+#define pmdp_clear_flush_notify pmdp_clear_flush
+#define pmdp_get_and_clear_notify pmdp_get_and_clear
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..bc143cf56cab 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
         }
 
         flush_cache_page(vma, addr, pte_pfn(*ptep));
-        ptep_clear_flush(vma, addr, ptep);
+        ptep_clear_flush_notify(vma, addr, ptep);
         set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
         page_remove_rmap(page);

diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..9129013732d7 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 
         if (pte_present(pte)) {
                 flush_cache_page(vma, addr, pte_pfn(pte));
-                pte = ptep_clear_flush(vma, addr, ptep);
+                pte = ptep_clear_flush_notify(vma, addr, ptep);
                 page = vm_normal_page(vma, addr, pte);
                 if (page) {
                         if (pte_dirty(pte))

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..1d89526ed531 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1036,7 +1036,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                 goto out_free_pages;
         VM_BUG_ON_PAGE(!PageHead(page), page);
 
-        pmdp_clear_flush(vma, haddr, pmd);
+        pmdp_clear_flush_notify(vma, haddr, pmd);
         /* leave pmd empty until pte is filled */
 
         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1179,7 +1179,7 @@ alloc:
                 pmd_t entry;
                 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-                pmdp_clear_flush(vma, haddr, pmd);
+                pmdp_clear_flush_notify(vma, haddr, pmd);
                 page_add_new_anon_rmap(new_page, vma, haddr);
                 mem_cgroup_commit_charge(new_page, memcg, false);
                 lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1512,7 +1512,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                 pmd_t entry;
                 ret = 1;
                 if (!prot_numa) {
-                        entry = pmdp_get_and_clear(mm, addr, pmd);
+                        entry = pmdp_get_and_clear_notify(mm, addr, pmd);
                         if (pmd_numa(entry))
                                 entry = pmd_mknonnuma(entry);
                         entry = pmd_modify(entry, newprot);
@@ -1644,6 +1644,7 @@ static int __split_huge_page_splitting(struct page *page,
                  * serialize against split_huge_page*.
                  */
                 pmdp_splitting_flush(vma, address, pmd);
+                ret = 1;
                 spin_unlock(ptl);
         }
 
@@ -2834,7 +2835,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         pmd_t _pmd;
         int i;
 
-        pmdp_clear_flush(vma, haddr, pmd);
+        pmdp_clear_flush_notify(vma, haddr, pmd);
         /* leave pmd empty until pte is filled */
 
         pgtable = pgtable_trans_huge_withdraw(mm, pmd);

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..2e6add04fa1b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2598,8 +2598,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         }
                         set_huge_pte_at(dst, addr, dst_pte, entry);
                 } else {
-                        if (cow)
+                        if (cow) {
                                 huge_ptep_set_wrprotect(src, addr, src_pte);
+                                mmu_notifier_invalidate_range(src, mmun_start,
+                                                              mmun_end);
+                        }
                         entry = huge_ptep_get(src_pte);
                         ptepage = pte_page(entry);
                         get_page(ptepage);
@@ -2899,6 +2902,7 @@ retry_avoidcopy:
 
         /* Break COW */
         huge_ptep_clear_flush(vma, address, ptep);
+        mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
         set_huge_pte_at(mm, address, ptep,
                         make_huge_pte(vma, new_page, 1));
         page_remove_rmap(old_page);
@@ -3374,6 +3378,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * and that page table be reused and filled with junk.
          */
         flush_tlb_range(vma, start, end);
+        mmu_notifier_invalidate_range(mm, start, end);
         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 
         mmu_notifier_invalidate_range_end(mm, start, end);

diff --git a/mm/ksm.c b/mm/ksm.c
index 6b2e337bc03c..d247efab5073 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -892,7 +892,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
          * this assure us that no O_DIRECT can happen after the check
          * or in the middle of the check.
          */
-        entry = ptep_clear_flush(vma, addr, ptep);
+        entry = ptep_clear_flush_notify(vma, addr, ptep);
         /*
          * Check that no O_DIRECT or similar I/O is in progress on the
          * page
@@ -960,7 +960,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
         page_add_anon_rmap(kpage, vma, addr);
 
         flush_cache_page(vma, addr, pte_pfn(*ptep));
-        ptep_clear_flush(vma, addr, ptep);
+        ptep_clear_flush_notify(vma, addr, ptep);
         set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
         page_remove_rmap(page);

diff --git a/mm/memory.c b/mm/memory.c
index 3e503831e042..655fd3d34bb0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,6 +238,7 @@ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
         tlb->need_flush = 0;
         tlb_flush(tlb);
+        mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
         tlb_table_flush(tlb);
 #endif
@@ -2234,7 +2235,7 @@ gotten:
                  * seen in the presence of one thread doing SMC and another
                  * thread doing COW.
                  */
-                ptep_clear_flush(vma, address, page_table);
+                ptep_clear_flush_notify(vma, address, page_table);
                 page_add_new_anon_rmap(new_page, vma, address);
                 mem_cgroup_commit_charge(new_page, memcg, false);
                 lru_cache_add_active_or_unevictable(new_page, vma);

diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..41945cb0ca38 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1854,7 +1854,7 @@ fail_putback:
          */
         flush_cache_range(vma, mmun_start, mmun_end);
         page_add_anon_rmap(new_page, vma, mmun_start);
-        pmdp_clear_flush(vma, mmun_start, pmd);
+        pmdp_clear_flush_notify(vma, mmun_start, pmd);
         set_pmd_at(mm, mmun_start, pmd, entry);
         flush_tlb_range(vma, mmun_start, mmun_end);
         update_mmu_cache_pmd(vma, address, &entry);
@@ -1862,6 +1862,7 @@ fail_putback:
         if (page_count(page) != 2) {
                 set_pmd_at(mm, mmun_start, pmd, orig_entry);
                 flush_tlb_range(vma, mmun_start, mmun_end);
+                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                 update_mmu_cache_pmd(vma, address, &entry);
                 page_remove_rmap(new_page);
                 goto fail_putback;

diff --git a/mm/rmap.c b/mm/rmap.c
index 19886fb2f13a..d3eb1e02d1c6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1378,7 +1378,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 
                 /* Nuke the page table entry. */
                 flush_cache_page(vma, address, pte_pfn(*pte));
-                pteval = ptep_clear_flush(vma, address, pte);
+                pteval = ptep_clear_flush_notify(vma, address, pte);
 
                 /* If nonlinear, store the file page offset in the pte. */
                 if (page->index != linear_page_index(vma, address)) {
--
cgit v1.2.3-59-g8ed1b


From 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 13 Nov 2014 13:46:09 +1100
Subject: mmu_notifier: add the callback for mmu_notifier_invalidate_range()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the mmu_notifier_invalidate_range() calls are in place, add the
callback to allow subsystems to register against it.

Signed-off-by: Joerg Roedel
Reviewed-by: Andrea Arcangeli
Reviewed-by: Jérôme Glisse
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Hugh Dickins
Cc: Mel Gorman
Cc: Johannes Weiner
Cc: Jay Cornwall
Cc: Oded Gabbay
Cc: Suravee Suthikulpanit
Cc: Jesse Barnes
Cc: David Woodhouse
Signed-off-by: Andrew Morton
Signed-off-by: Oded Gabbay
---
 include/linux/mmu_notifier.h | 37 ++++++++++++++++++++++++++++++++-----
 mm/mmu_notifier.c            | 25 +++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 5 deletions(-)
(limited to 'include/linux/mmu_notifier.h')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 966da2b4b803..94d19f64cecf 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -98,11 +98,11 @@ struct mmu_notifier_ops {
         /*
          * invalidate_range_start() and invalidate_range_end() must be
          * paired and are called only when the mmap_sem and/or the
-         * locks protecting the reverse maps are held. The subsystem
-         * must guarantee that no additional references are taken to
-         * the pages in the range established between the call to
-         * invalidate_range_start() and the matching call to
-         * invalidate_range_end().
+         * locks protecting the reverse maps are held. If the subsystem
+         * can't guarantee that no additional references are taken to
+         * the pages in the range, it has to implement the
+         * invalidate_range() notifier to remove any references taken
+         * after invalidate_range_start().
          *
          * Invalidation of multiple concurrent ranges may be
          * optionally permitted by the driver. Either way the
@@ -144,6 +144,29 @@ struct mmu_notifier_ops {
         void (*invalidate_range_end)(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end);
+
+        /*
+         * invalidate_range() is either called between
+         * invalidate_range_start() and invalidate_range_end() when the
+         * VM has to free pages that where unmapped, but before the
+         * pages are actually freed, or outside of _start()/_end() when
+         * a (remote) TLB is necessary.
+         *
+         * If invalidate_range() is used to manage a non-CPU TLB with
+         * shared page-tables, it not necessary to implement the
+         * invalidate_range_start()/end() notifiers, as
+         * invalidate_range() alread catches the points in time when an
+         * external TLB range needs to be flushed.
+         *
+         * The invalidate_range() function is called under the ptl
+         * spin-lock and not allowed to sleep.
+         *
+         * Note that this function might be called with just a sub-range
+         * of what was passed to invalidate_range_start()/end(), if
+         * called between those functions.
+         */
+        void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
+                                 unsigned long start, unsigned long end);
 };
 
 /*
@@ -190,6 +213,8 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
                                   unsigned long start, unsigned long end);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                                   unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+                                  unsigned long start, unsigned long end);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -245,6 +270,8 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                                   unsigned long start, unsigned long end)
 {
+        if (mm_has_notifiers(mm))
+                __mmu_notifier_invalidate_range(mm, start, end);
 }
 
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 2c8da9825fe3..3b9b3d0741b2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -193,6 +193,16 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 
         id = srcu_read_lock(&srcu);
         hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+                /*
+                 * Call invalidate_range here too to avoid the need for the
+                 * subsystem of having to register an invalidate_range_end
+                 * call-back when there is invalidate_range already. Usually a
+                 * subsystem registers either invalidate_range_start()/end() or
+                 * invalidate_range(), so this will be no additional overhead
+                 * (besides the pointer check).
+                 */
+                if (mn->ops->invalidate_range)
+                        mn->ops->invalidate_range(mn, mm, start, end);
                 if (mn->ops->invalidate_range_end)
                         mn->ops->invalidate_range_end(mn, mm, start, end);
         }
@@ -200,6 +210,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
 
+void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+                                  unsigned long start, unsigned long end)
+{
+        struct mmu_notifier *mn;
+        int id;
+
+        id = srcu_read_lock(&srcu);
+        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+                if (mn->ops->invalidate_range)
+                        mn->ops->invalidate_range(mn, mm, start, end);
+        }
+        srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     int take_mmap_sem)
--
cgit v1.2.3-59-g8ed1b
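As a usage sketch (again not part of these patches), a subsystem whose
device shares the CPU page tables could now register just the
invalidate_range() callback and drop the empty-page-table workaround
entirely. This builds on the hypothetical struct my_tlb and
my_tlb_mmu_notifier_ops from the sketch after the first patch:

#include <linux/mmu_notifier.h>
#include <linux/sched.h>

/*
 * Usage sketch only -- assumes the hypothetical struct my_tlb and
 * my_tlb_mmu_notifier_ops shown after the first patch above.
 */
static int my_tlb_bind_mm(struct my_tlb *tlb, struct mm_struct *mm)
{
        int ret;

        tlb->mn.ops = &my_tlb_mmu_notifier_ops;

        /* mmu_notifier_register() takes mmap_sem internally. */
        ret = mmu_notifier_register(&tlb->mn, mm);
        if (ret)
                return ret;

        /*
         * From here on, invalidate_range() fires whenever the kernel
         * flushes TLBs for pages that are unmapped but not yet freed.
         * Because the device walks the CPU page tables, no
         * invalidate_range_start()/end() handlers are required.
         */
        return 0;
}

Tearing the binding down again goes through mmu_notifier_unregister(), as
with the existing notifiers.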