Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	98
1 file changed, 61 insertions(+), 37 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25ef59b7ee34..622cced74fd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+	struct mempolicy *pol;
+	/*
+	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+	 * specified, to express a general desire to stay on the current
+	 * node for optimistic allocation attempts. If the defrag mode
+	 * and/or madvise hint requires the direct reclaim then we prefer
+	 * to fallback to other node rather than node reclaim because that
+	 * can lead to excessive reclaim even though there is free memory
+	 * on other nodes. We expect that NUMA preferences are specified
+	 * by memory policies.
+	 */
+	pol = get_vma_policy(vma, addr);
+	if (pol->mode != MPOL_BIND)
+		this_node = __GFP_THISNODE;
+	mpol_cond_put(pol);
+#endif
 
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM);
+							     __GFP_KSWAPD_RECLAIM | this_node);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     0);
-	return GFP_TRANSHUGE_LIGHT;
+							     this_node);
+	return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
-	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+					   haddr, numa_node_id());
 	} else
 		new_page = NULL;
 
@@ -2330,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	}
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
 	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
 		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2345,7 +2365,7 @@ static void freeze_page(struct page *page)
 	VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
 	int i;
 	if (PageTransHuge(page)) {
@@ -2382,6 +2402,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
 			 (1L << PG_unevictable) |
 			 (1L << PG_dirty)));
 
+	/* ->mapping in first tail page is compound_mapcount */
+	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+			page_tail);
+	page_tail->mapping = head->mapping;
+	page_tail->index = head->index + tail;
+
 	/* Page flags must be visible before we make the page non-compound. */
 	smp_wmb();
 
@@ -2402,12 +2428,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	if (page_is_idle(head))
 		set_page_idle(page_tail);
 
-	/* ->mapping in first tail page is compound_mapcount */
-	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-			page_tail);
-	page_tail->mapping = head->mapping;
-
-	page_tail->index = head->index + tail;
 	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 
 	/*
@@ -2419,12 +2439,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-		unsigned long flags)
+		pgoff_t end, unsigned long flags)
 {
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
-	pgoff_t end = -1;
 	int i;
 
 	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2432,9 +2451,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	if (!PageAnon(page))
-		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
 		/* Some pages can be beyond i_size: drop them from page cache */
@@ -2450,20 +2466,20 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	ClearPageCompound(head);
 	/* See comment in __split_huge_page_tail() */
 	if (PageAnon(head)) {
-		/* Additional pin to radix tree of swap cache */
+		/* Additional pin to swap cache */
 		if (PageSwapCache(head))
 			page_ref_add(head, 2);
 		else
 			page_ref_inc(head);
 	} else {
-		/* Additional pin to radix tree */
+		/* Additional pin to page cache */
 		page_ref_add(head, 2);
 		xa_unlock(&head->mapping->i_pages);
 	}
 
 	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-	unfreeze_page(head);
+	remap_page(head);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		struct page *subpage = head + i;
@@ -2568,7 +2584,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
 {
 	int extra_pins;
 
-	/* Additional pins from radix tree */
+	/* Additional pins from page cache */
 	if (PageAnon(page))
 		extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
 	else
@@ -2606,6 +2622,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	int count, mapcount, extra_pins, ret;
 	bool mlocked;
 	unsigned long flags;
+	pgoff_t end;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2628,6 +2645,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			ret = -EBUSY;
 			goto out;
 		}
+		end = -1;
 		mapping = NULL;
 		anon_vma_lock_write(anon_vma);
 	} else {
@@ -2641,10 +2659,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
+
+		/*
+		 *__split_huge_page() may need to trim off pages beyond EOF:
+		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+		 * which cannot be nested inside the page tree lock. So note
+		 * end now: i_size itself may be changed at any moment, but
+		 * head page lock is good enough to serialize the trimming.
+		 */
+		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
 	}
 
 	/*
-	 * Racy check if we can split the page, before freeze_page() will
+	 * Racy check if we can split the page, before unmap_page() will
 	 * split PMDs
 	 */
 	if (!can_split_huge_page(head, &extra_pins)) {
@@ -2653,7 +2680,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	}
 
 	mlocked = PageMlocked(page);
-	freeze_page(head);
+	unmap_page(head);
 	VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
 	/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2664,17 +2691,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
 
 	if (mapping) {
-		void **pslot;
+		XA_STATE(xas, &mapping->i_pages, page_index(head));
 
-		xa_lock(&mapping->i_pages);
-		pslot = radix_tree_lookup_slot(&mapping->i_pages,
-				page_index(head));
 		/*
-		 * Check if the head page is present in radix tree.
+		 * Check if the head page is present in page cache.
 		 * We assume all tail are present too, if head is there.
 		 */
-		if (radix_tree_deref_slot_protected(pslot,
-					&mapping->i_pages.xa_lock) != head)
+		xa_lock(&mapping->i_pages);
+		if (xas_load(&xas) != head)
 			goto fail;
 	}
 
@@ -2690,7 +2714,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
-		__split_huge_page(page, list, flags);
+		__split_huge_page(page, list, end, flags);
 		if (PageSwapCache(head)) {
 			swp_entry_t entry = { .val = page_private(head) };
 
@@ -2710,7 +2734,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:		if (mapping)
 			xa_unlock(&mapping->i_pages);
 		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-		unfreeze_page(head);
+		remap_page(head);
 		ret = -EBUSY;
 	}
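
A note on the first hunk: the change is essentially a small decision table. __GFP_THISNODE is folded into the returned mask only for the variants that do not direct-reclaim, and only when the VMA's memory policy is not MPOL_BIND; the madvised direct-reclaim branches deliberately leave it out so the allocation can fall back to other nodes instead of reclaiming locally. The sketch below is a minimal user-space model of that table, not kernel code: the flag values are arbitrary stand-ins, and the defrag_mode/madvised/mpol_bind parameters stand in for the transparent_hugepage_flags bits, the VM_HUGEPAGE test and the get_vma_policy() lookup.

/*
 * User-space model of the gfp-mask selection in the patched
 * alloc_hugepage_direct_gfpmask().  Flag values, the defrag_mode enum and
 * the madvised/mpol_bind parameters are made-up stand-ins, not the kernel's.
 */
#include <stdio.h>

#define GFP_TRANSHUGE_LIGHT	0x01
#define __GFP_DIRECT_RECLAIM	0x02
#define __GFP_KSWAPD_RECLAIM	0x04
#define __GFP_NORETRY		0x08
#define __GFP_THISNODE		0x10
#define GFP_TRANSHUGE		(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)

enum defrag_mode { DEFRAG_ALWAYS, DEFRAG_DEFER, DEFRAG_DEFER_MADVISE,
		   DEFRAG_MADVISE, DEFRAG_NEVER };

static unsigned int thp_gfpmask(enum defrag_mode mode, int madvised, int mpol_bind)
{
	/*
	 * As in the patch: prefer the local node only for the optimistic
	 * (no direct reclaim) attempts, and never override an MPOL_BIND
	 * policy that already constrains the allowed nodes.
	 */
	unsigned int this_node = mpol_bind ? 0 : __GFP_THISNODE;

	switch (mode) {
	case DEFRAG_ALWAYS:
		return GFP_TRANSHUGE | (madvised ? 0 : __GFP_NORETRY);
	case DEFRAG_DEFER:
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
	case DEFRAG_DEFER_MADVISE:
		return GFP_TRANSHUGE_LIGHT | (madvised ? __GFP_DIRECT_RECLAIM :
					      __GFP_KSWAPD_RECLAIM | this_node);
	case DEFRAG_MADVISE:
		return GFP_TRANSHUGE_LIGHT | (madvised ? __GFP_DIRECT_RECLAIM :
					      this_node);
	default: /* DEFRAG_NEVER */
		return GFP_TRANSHUGE_LIGHT | this_node;
	}
}

int main(void)
{
	printf("defer,   plain vma:     %#x\n", thp_gfpmask(DEFRAG_DEFER, 0, 0));
	printf("madvise, plain vma:     %#x\n", thp_gfpmask(DEFRAG_MADVISE, 0, 0));
	printf("madvise, MADV_HUGEPAGE: %#x\n", thp_gfpmask(DEFRAG_MADVISE, 1, 0));
	printf("defer,   MPOL_BIND vma: %#x\n", thp_gfpmask(DEFRAG_DEFER, 0, 1));
	return 0;
}

The interesting comparison is DEFRAG_MADVISE with and without MADV_HUGEPAGE: the hint trades the local-node preference (__GFP_THISNODE) for __GFP_DIRECT_RECLAIM.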
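
A second note, on the split path: split_huge_page_to_list() now computes end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) while only the head page lock is held (on 32-bit, i_size_read() takes an irq-unsafe seqlock that must not nest inside the page tree lock) and passes it down to __split_huge_page(), which drops any subpage whose index lands at or beyond it. The arithmetic is just a round-up division and an index compare; the sketch below replays it with made-up values for the file size and the huge page's starting index, assuming x86_64 page-size constants.

/*
 * Model of the "trim pages beyond EOF" arithmetic used by the split path.
 * PAGE_SIZE and HPAGE_PMD_NR match x86_64 defaults; i_size and head_index
 * are made-up example values, not read from any real inode or struct page.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_PMD_NR	512UL			/* 2MB THP = 512 base pages */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 5 * 1024 * 1024 + 123;	/* ~5MB file */
	unsigned long head_index = 1024;	/* THP covers offsets 4MB..6MB */

	/* What split_huge_page_to_list() now notes before the tree lock */
	unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);

	unsigned long kept = 0, dropped = 0;

	/*
	 * The kernel walks only the tail pages and handles the head
	 * separately; here we just count which of the 512 subpage indexes
	 * (page_tail->index = head->index + tail) land at or beyond end.
	 */
	for (unsigned long i = 0; i < HPAGE_PMD_NR; i++) {
		if (head_index + i >= end)
			dropped++;	/* beyond i_size: dropped from page cache */
		else
			kept++;
	}
	printf("end=%lu: keep %lu subpages, drop %lu beyond EOF\n",
	       end, kept, dropped);
	return 0;
}

With these numbers end works out to 1281, so indexes 1281..1535 of a huge page covering indexes 1024..1535 are beyond EOF and would be dropped from the page cache during the split, while the first 257 subpages are kept.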