diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 137 |
1 files changed, 83 insertions, 54 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 626e93db28ba..078832cf3636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); static int khugepaged_slab_init(void); +static void khugepaged_slab_exit(void); #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!khugepaged_enabled()) - return 0; - for_each_populated_zone(zone) nr_zones++; @@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void) setup_per_zone_wmarks(); return 0; } -late_initcall(set_recommended_min_free_kbytes); -static int start_khugepaged(void) +static int start_stop_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { @@ -156,6 +153,7 @@ static int start_khugepaged(void) pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; + goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) @@ -166,7 +164,7 @@ static int start_khugepaged(void) kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } - +fail: return err; } @@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void) struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return ACCESS_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_page); zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); @@ -202,7 +200,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return ACCESS_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_page); } static void put_huge_zero_page(void) @@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj, int err; mutex_lock(&khugepaged_mutex); - err = start_khugepaged(); + err = start_stop_khugepaged(); mutex_unlock(&khugepaged_mutex); if (err) @@ -634,27 +632,38 @@ static int __init hugepage_init(void) err = hugepage_init_sysfs(&hugepage_kobj); if (err) - return err; + goto err_sysfs; err = khugepaged_slab_init(); if (err) - goto out; + goto err_slab; - register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker); + if (err) + goto err_hzp_shrinker; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; + return 0; + } - start_khugepaged(); + err = start_stop_khugepaged(); + if (err) + goto err_khugepaged; return 0; -out: +err_khugepaged: + unregister_shrinker(&huge_zero_page_shrinker); +err_hzp_shrinker: + khugepaged_slab_exit(); +err_slab: hugepage_exit_sysfs(hugepage_kobj); +err_sysfs: return err; } subsys_initcall(hugepage_init); @@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *page) + struct page *page, gfp_t gfp) { struct mem_cgroup *memcg; pgtable_t pgtable; @@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) return VM_FAULT_OOM; pgtable = pte_alloc_one(mm, haddr); @@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t huge_gfp; /* for allocation and charge */ ptl = pmd_lockptr(mm, pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - gfp_t gfp; - - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); - new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -1130,8 +1138,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) @@ -1260,6 +1267,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + bool was_writable; int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -1291,17 +1299,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, flags |= TNF_FAULT_LOCAL; } - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - * - * FIXME! This checks "pmd_dirty()" as an approximation of - * "is this a read-only page", since checking "pmd_write()" - * is even more broken. We haven't actually turned this into - * a writable page, so pmd_write() will always be false. - */ - if (!pmd_dirty(pmd)) + /* See similar comment in do_numa_page for explanation */ + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* @@ -1358,12 +1357,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); + was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); + if (was_writable) + pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); @@ -1487,6 +1491,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; /* @@ -1502,9 +1507,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (!prot_numa || !pmd_protnone(*pmd)) { entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); + BUG_ON(!preserve_write && pmd_write(entry)); } spin_unlock(ptl); } @@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void) return 0; } +static void __init khugepaged_slab_exit(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -2109,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) { while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval)) + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) release_pte_page(pte_page(pteval)); } } @@ -2120,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int none = 0; + int none_or_zero = 0; bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2207,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, pte_t pteval = *_pte; struct page *src_page; - if (pte_none(pteval)) { + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); @@ -2311,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { @@ -2326,8 +2350,7 @@ static struct page */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( - khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2380,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); + return *hpage; } #endif @@ -2421,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm, struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* Only allocate from the target node */ + gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); if (!new_page) return; if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) + gfp, &memcg))) return; /* @@ -2543,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none = 0; + int ret = 0, none_or_zero = 0; struct page *page; unsigned long _address; spinlock_t *ptl; @@ -2561,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; |