aboutsummaryrefslogtreecommitdiffstats
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c156
1 files changed, 76 insertions, 80 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
}
static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
-
-static inline bool is_huge_zero_page(struct page *page)
-{
- return ACCESS_ONCE(huge_zero_page) == page;
-}
+struct page *huge_zero_page __read_mostly;
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
-static inline struct page *alloc_hugepage_vma(int defrag,
- struct vm_area_struct *vma,
- unsigned long haddr, int nd,
- gfp_t extra_gfp)
-{
- return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
- HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
{
+ gfp_t gfp;
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return 0;
}
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow())
- new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- else
+ !transparent_hugepage_debug_cow()) {
+ gfp_t gfp;
+
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ } else
new_page = NULL;
if (unlikely(!new_page)) {
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
page = pmd_page(*pmd);
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
+ page = pmd_page(*pmdp);
spin_unlock(ptl);
- wait_migrate_huge_page(vma->anon_vma, pmdp);
+ wait_on_page_locked(page);
goto out;
}
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
+ * and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
- pmd = pmd_mknonnuma(pmd);
+ pmd = pmd_modify(pmd, vma->vm_page_prot);
set_pmd_at(mm, haddr, pmdp, pmd);
- VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- spinlock_t *ptl;
- int ret = 0;
-
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- /*
- * All logical pages in the range are present
- * if backed by a huge page.
- */
- spin_unlock(ptl);
- memset(vec, 1, (end - addr) >> PAGE_SHIFT);
- ret = 1;
- }
-
- return ret;
-}
-
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
- ret = 1;
- if (!prot_numa) {
+
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!prot_numa || !pmd_protnone(*pmd)) {
+ ret = 1;
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
- } else {
- struct page *page = pmd_page(*pmd);
-
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
- !pmd_numa(*pmd)) {
- pmdp_set_numa(mm, addr, pmd);
- ret = HPAGE_PMD_NR;
- }
}
spin_unlock(ptl);
}
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
/*
- * Note that pmd_numa is not transferred deliberately
- * to avoid any possibility that pte_numa leaks to
- * a PROT_NONE VMA by accident.
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
*/
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int referenced = 0, none = 0;
+ int none = 0;
+ bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
else
goto out;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out;
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page))
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
- goto out;
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ if (pte_write(pteval)) {
+ writable = true;
+ } else {
+ if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /*
+ * Page is not in the swap cache. It can be collapsed
+ * into a THP.
+ */
+ }
+
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (likely(referenced))
+ if (likely(referenced && writable))
return 1;
out:
release_pte_pages(pte, _pte);
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, referenced = 0, none = 0;
+ int ret = 0, none = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
+ bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
else
goto out_unmap;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out_unmap;
+ if (pte_write(pteval))
+ writable = true;
+
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page))
goto out_unmap;
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (referenced)
+ if (referenced && writable)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);