aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
author Dmitry Torokhov <dmitry.torokhov@gmail.com> 2024-07-15 14:03:44 -0700
committer Dmitry Torokhov <dmitry.torokhov@gmail.com> 2024-07-15 14:03:44 -0700
commit a23e1966932464e1c5226cb9ac4ce1d5fc10ba22 (patch)
tree bf5f1b57faa01ca31656bfc48c7d6b6f0bc39189 /mm/page_alloc.c
parent Input: ads7846 - use spi_device_id table (diff)
parent Input: yealink - simplify locking in sysfs attribute handling (diff)
download linux-rng-a23e1966932464e1c5226cb9ac4ce1d5fc10ba22.tar.xz
linux-rng-a23e1966932464e1c5226cb9ac4ce1d5fc10ba22.zip
Merge branch 'next' into for-linus
Prepare input updates for 6.11 merge window.
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c 780
1 files changed, 477 insertions, 303 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..14d39f34d336 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -32,6 +32,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/pagevec.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
@@ -52,6 +53,7 @@
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
@@ -284,17 +286,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
- [NULL_COMPOUND_DTOR] = NULL,
- [COMPOUND_PAGE_DTOR] = free_compound_page,
-#ifdef CONFIG_HUGETLB_PAGE
- [HUGETLB_PAGE_DTOR] = free_huge_page,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
-#endif
-};
-
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
@@ -371,10 +362,16 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
-static __always_inline
-unsigned long __get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn,
- unsigned long mask)
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
@@ -393,24 +390,10 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
return (word >> bitidx) & mask;
}
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
-{
- return __get_pfnblock_flags_mask(page, pfn, mask);
-}
-
static __always_inline int get_pfnblock_migratetype(const struct page *page,
unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -459,7 +442,7 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
- int ret = 0;
+ int ret;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
@@ -468,8 +451,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
- if (!zone_spans_pfn(zone, pfn))
- ret = 1;
+ ret = !zone_spans_pfn(zone, pfn);
} while (zone_span_seqretry(zone, seq));
if (ret)
@@ -483,19 +465,19 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int __maybe_unused bad_range(struct zone *zone, struct page *page)
+static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
- return 1;
+ return true;
if (zone != page_zone(page))
- return 1;
+ return true;
- return 0;
+ return false;
}
#else
-static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
+static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
- return 0;
+ return false;
}
#endif
@@ -539,8 +521,6 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- int base = order;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
@@ -550,7 +530,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
- return (MIGRATE_PCPTYPES * base) + migratetype;
+ return (MIGRATE_PCPTYPES * order) + migratetype;
}
static inline int pindex_to_order(unsigned int pindex)
@@ -594,19 +574,10 @@ static inline void free_the_page(struct page *page, unsigned int order)
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
*
- * The first tail page's ->compound_dtor holds the offset in array of compound
- * page destructors. See compound_page_dtors.
- *
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
-void free_compound_page(struct page *page)
-{
- mem_cgroup_uncharge(page_folio(page));
- free_the_page(page, compound_order(page));
-}
-
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
@@ -621,10 +592,16 @@ void prep_compound_page(struct page *page, unsigned int order)
void destroy_large_folio(struct folio *folio)
{
- enum compound_dtor_id dtor = folio->_folio_dtor;
+ if (folio_test_hugetlb(folio)) {
+ free_huge_folio(folio);
+ return;
+ }
+
+ if (folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
- VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
- compound_page_dtors[dtor](&folio->page);
+ mem_cgroup_uncharge(folio);
+ free_the_page(&folio->page, folio_order(folio));
}
static inline void set_buddy_order(struct page *page, unsigned int order)
@@ -751,7 +728,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
unsigned long higher_page_pfn;
struct page *higher_page;
- if (order >= MAX_ORDER - 1)
+ if (order >= MAX_PAGE_ORDER - 1)
return false;
higher_page_pfn = buddy_pfn & pfn;
@@ -806,7 +783,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- while (order < MAX_ORDER) {
+ while (order < MAX_PAGE_ORDER) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
@@ -824,7 +801,7 @@ static inline void __free_one_page(struct page *page,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
- int buddy_mt = get_pageblock_migratetype(buddy);
+ int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
if (migratetype != buddy_mt
&& (!migratetype_is_mergeable(migratetype) ||
@@ -900,7 +877,7 @@ int split_free_page(struct page *free_page,
goto out;
}
- mt = get_pageblock_migratetype(free_page);
+ mt = get_pfnblock_migratetype(free_page, free_page_pfn);
if (likely(!is_migrate_isolate(mt)))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -940,6 +917,9 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
+#ifdef CONFIG_PAGE_POOL
+ ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
+#endif
(page->flags & check_flags)))
return false;
@@ -966,6 +946,10 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
+#ifdef CONFIG_PAGE_POOL
+ if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ bad_reason = "page_pool leak";
+#endif
return bad_reason;
}
@@ -1078,12 +1062,12 @@ out:
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return deferred_pages_enabled();
- return page_kasan_tag(page) == 0xff;
+ return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
static void kernel_init_pages(struct page *page, int numpages)
@@ -1097,42 +1081,40 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
-static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, fpi_t fpi_flags)
+__always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order)
{
int bad = 0;
- bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
+ bool compound = PageCompound(page);
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
kmsan_free_page(page, order);
+ if (memcg_kmem_online() && PageMemcgKmem(page))
+ __memcg_kmem_uncharge_page(page, order);
+
if (unlikely(PageHWPoison(page)) && !order) {
- /*
- * Do not let hwpoison pages hit pcplists/buddy
- * Untie memcg state and reset page's owner
- */
- if (memcg_kmem_online() && PageMemcgKmem(page))
- __memcg_kmem_uncharge_page(page, order);
+ /* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
return false;
}
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
- bool compound = PageCompound(page);
int i;
- VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
if (compound)
- ClearPageHasHWPoisoned(page);
+ page[1].flags &= ~PAGE_FLAGS_SECOND;
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1147,8 +1129,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_online() && PageMemcgKmem(page))
- __memcg_kmem_uncharge_page(page, order);
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1210,8 +1190,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int pindex)
{
unsigned long flags;
- int min_pindex = 0;
- int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
bool isolated_pageblocks;
struct page *page;
@@ -1234,17 +1212,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Remove pages from lists in a round-robin fashion. */
do {
- if (++pindex > max_pindex)
- pindex = min_pindex;
+ if (++pindex > NR_PCP_LISTS - 1)
+ pindex = 0;
list = &pcp->lists[pindex];
- if (!list_empty(list))
- break;
-
- if (pindex == max_pindex)
- max_pindex--;
- if (pindex == min_pindex)
- min_pindex++;
- } while (1);
+ } while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order;
@@ -1292,12 +1263,11 @@ static void free_one_page(struct zone *zone,
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, fpi_flags))
+ if (!free_pages_prepare(page, order))
return;
/*
@@ -1307,13 +1277,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
*/
migratetype = get_pfnblock_migratetype(page, pfn);
- spin_lock_irqsave(&zone->lock, flags);
- if (unlikely(has_isolate_pageblock(zone) ||
- is_migrate_isolate(migratetype))) {
- migratetype = get_pfnblock_migratetype(page, pfn);
- }
- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
- spin_unlock_irqrestore(&zone->lock, flags);
+ free_one_page(zone, page, pfn, order, migratetype, fpi_flags);
__count_vm_events(PGFREE, 1 << order);
}
@@ -1341,7 +1305,7 @@ void __free_pages_core(struct page *page, unsigned int order)
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
if (page_contains_unaccepted(page, order)) {
- if (order == MAX_ORDER && __free_unaccepted(page))
+ if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
return;
accept_page(page, order);
@@ -1371,7 +1335,7 @@ void __free_pages_core(struct page *page, unsigned int order)
*
* Note: the function may return non-NULL struct page even for a page block
* which contains a memory hole (i.e. there is no physical memory for a subset
- * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
+ * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
* will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
* even though the start pfn is online and valid. This should be safe most of
* the time because struct pages are still initialized via init_unavailable_range()
@@ -1459,14 +1423,14 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static int check_new_page(struct page *page)
+static bool check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
- return 0;
+ return false;
check_new_page_bad(page);
- return 1;
+ return true;
}
static inline bool check_new_pages(struct page *page, unsigned int order)
@@ -1604,7 +1568,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
struct page *page;
/* Find a page of the appropriate size in the preferred list */
- for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
+ for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
@@ -1834,6 +1798,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
+
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
@@ -1855,14 +1823,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
else
alike_pages = 0;
}
-
- /* moving whole block can fail due to zone boundary conditions */
- if (!free_pages)
- goto single_page;
-
/*
* If a sufficient number of pages in the block are either free or of
- * comparable migratability as our allocation, claim the whole block.
+ * compatible migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
@@ -1912,17 +1875,20 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
- unsigned int alloc_order)
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
{
int mt;
unsigned long max_managed, flags;
/*
- * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+ * The number reserved is bounded: the minimum is 1 pageblock, the
+ * maximum is roughly 1% of a zone. But if 1% of a zone falls below
+ * a pageblock size, then don't reserve any pageblocks.
* Check is race-prone but harmless.
*/
- max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
if (zone->nr_reserved_highatomic >= max_managed)
return;
@@ -1976,7 +1942,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
continue;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -2060,7 +2026,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER; current_order >= min_order;
+ for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2086,8 +2052,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
return false;
find_smallest:
- for (current_order = order; current_order <= MAX_ORDER;
- current_order++) {
+ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
@@ -2099,7 +2064,7 @@ find_smallest:
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
- VM_BUG_ON(current_order > MAX_ORDER);
+ VM_BUG_ON(current_order > MAX_PAGE_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
@@ -2192,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
+/*
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there is additional work to do.
+ */
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+{
+ int high_min, to_drain, batch;
+ int todo = 0;
+
+ high_min = READ_ONCE(pcp->high_min);
+ batch = READ_ONCE(pcp->batch);
+ /*
+ * Decrease pcp->high periodically to try to free possibly
+ * idle PCP pages. Also, avoid freeing too many pages at once,
+ * to control latency. This caps the pcp->high decrement too.
+ */
+ if (pcp->high > high_min) {
+ pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ pcp->high - (pcp->high >> 3), high_min);
+ if (pcp->high > high_min)
+ todo++;
+ }
+
+ to_drain = pcp->count - pcp->high;
+ if (to_drain > 0) {
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ spin_unlock(&pcp->lock);
+ todo++;
+ }
+
+ return todo;
+}
+
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -2345,7 +2344,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
{
int migratetype;
- if (!free_pages_prepare(page, order, FPI_NONE))
+ if (!free_pages_prepare(page, order))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2353,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
- bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
int min_nr_free, max_nr_free;
- /* Free everything if batch freeing high-order pages. */
+ /* Free as much as possible if batch freeing high-order pages. */
if (unlikely(free_high))
- return pcp->count;
+ return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
@@ -2371,61 +2369,107 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
max_nr_free = high - batch;
/*
- * Double the number of pages freed each time there is subsequent
- * freeing of pages without any allocation.
+ * Raise the batch size to the number of consecutively freed
+ * pages to reduce zone lock contention.
*/
- batch <<= pcp->free_factor;
- if (batch < max_nr_free)
- pcp->free_factor++;
- batch = clamp(batch, min_nr_free, max_nr_free);
+ batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
return batch;
}
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
- bool free_high)
+ int batch, bool free_high)
{
- int high = READ_ONCE(pcp->high);
+ int high, high_min, high_max;
+
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
- if (unlikely(!high || free_high))
+ if (unlikely(!high))
return 0;
- if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
- return high;
+ if (unlikely(free_high)) {
+ pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ high_min);
+ return 0;
+ }
/*
* If reclaim is active, limit the number of pages that can be
* stored on pcp lists
*/
- return min(READ_ONCE(pcp->batch) << 2, high);
+ if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
+ int free_count = max_t(int, pcp->free_count, batch);
+
+ pcp->high = max(high - free_count, high_min);
+ return min(batch << 2, pcp->high);
+ }
+
+ if (high_min == high_max)
+ return high;
+
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
+ int free_count = max_t(int, pcp->free_count, batch);
+
+ pcp->high = max(high - free_count, high_min);
+ high = max(pcp->count, high_min);
+ } else if (pcp->count >= high) {
+ int need_high = pcp->free_count + batch;
+
+ /* pcp->high should be large enough to hold batch freed pages */
+ if (pcp->high < need_high)
+ pcp->high = clamp(need_high, high_min, high_max);
+ }
+
+ return high;
}
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
struct page *page, int migratetype,
unsigned int order)
{
- int high;
+ int high, batch;
int pindex;
- bool free_high;
+ bool free_high = false;
+ /*
+ * On freeing, reduce the number of pages that are batch allocated.
+ * See nr_pcp_alloc() where alloc_factor is increased for subsequent
+ * allocations.
+ */
+ pcp->alloc_factor >>= 1;
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
+ batch = READ_ONCE(pcp->batch);
/*
* As high-order pages other than THP's stored on PCP can contribute
* to fragmentation, limit the number stored when PCP is heavily
* freeing without allocation. The remainder after bulk freeing
* stops will be drained from vmstat refresh context.
*/
- free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
-
- high = nr_pcp_high(pcp, zone, free_high);
+ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
+ free_high = (pcp->free_count >= batch &&
+ (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+ (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+ pcp->count >= READ_ONCE(batch)));
+ pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
+ } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
+ pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
+ }
+ if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
+ pcp->free_count += (1 << order);
+ high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
- int batch = READ_ONCE(pcp->batch);
-
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0))
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
}
}
@@ -2438,7 +2482,7 @@ void free_unref_page(struct page *page, unsigned int order)
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
- int migratetype;
+ int migratetype, pcpmigratetype;
if (!free_unref_page_prepare(page, pfn, order))
return;
@@ -2446,24 +2490,24 @@ void free_unref_page(struct page *page, unsigned int order)
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Place ISOLATE pages on the isolated list because they are being
- * offlined but treat HIGHATOMIC as movable pages so we can get those
- * areas back if necessary. Otherwise, we may have to free
+ * offlined but treat HIGHATOMIC and CMA as movable pages so we can
+ * get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- migratetype = get_pcppage_migratetype(page);
+ migratetype = pcpmigratetype = get_pcppage_migratetype(page);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
- migratetype = MIGRATE_MOVABLE;
+ pcpmigratetype = MIGRATE_MOVABLE;
}
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_unref_page_commit(zone, pcp, page, pcpmigratetype, order);
pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
@@ -2472,66 +2516,70 @@ void free_unref_page(struct page *page, unsigned int order)
}
/*
- * Free a list of 0-order pages
+ * Free a batch of folios
*/
-void free_unref_page_list(struct list_head *list)
+void free_unref_folios(struct folio_batch *folios)
{
unsigned long __maybe_unused UP_flags;
- struct page *page, *next;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
- int batch_count = 0;
- int migratetype;
+ int i, j, migratetype;
+
+ /* Prepare folios for freeing */
+ for (i = 0, j = 0; i < folios->nr; i++) {
+ struct folio *folio = folios->folios[i];
+ unsigned long pfn = folio_pfn(folio);
+ unsigned int order = folio_order(folio);
- /* Prepare pages for freeing */
- list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_to_pfn(page);
- if (!free_unref_page_prepare(page, pfn, 0)) {
- list_del(&page->lru);
+ if (order > 0 && folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
+ if (!free_unref_page_prepare(&folio->page, pfn, order))
continue;
- }
/*
- * Free isolated pages directly to the allocator, see
- * comment in free_unref_page.
+ * Free isolated folios and orders not handled on the PCP
+ * directly to the allocator, see comment in free_unref_page.
*/
- migratetype = get_pcppage_migratetype(page);
- if (unlikely(is_migrate_isolate(migratetype))) {
- list_del(&page->lru);
- free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+ migratetype = get_pcppage_migratetype(&folio->page);
+ if (!pcp_allowed_order(order) ||
+ is_migrate_isolate(migratetype)) {
+ free_one_page(folio_zone(folio), &folio->page, pfn,
+ order, migratetype, FPI_NONE);
continue;
}
+ folio->private = (void *)(unsigned long)order;
+ if (j != i)
+ folios->folios[j] = folio;
+ j++;
}
+ folios->nr = j;
- list_for_each_entry_safe(page, next, list, lru) {
- struct zone *zone = page_zone(page);
+ for (i = 0; i < folios->nr; i++) {
+ struct folio *folio = folios->folios[i];
+ struct zone *zone = folio_zone(folio);
+ unsigned int order = (unsigned long)folio->private;
- list_del(&page->lru);
- migratetype = get_pcppage_migratetype(page);
+ folio->private = NULL;
+ migratetype = get_pcppage_migratetype(&folio->page);
- /*
- * Either different zone requiring a different pcp lock or
- * excessive lock hold times when freeing a large list of
- * pages.
- */
- if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
+ /* Different zone requires a different pcp lock */
+ if (zone != locked_zone) {
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
- batch_count = 0;
-
/*
- * trylock is necessary as pages may be getting freed
+ * trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
pcp_trylock_finish(UP_flags);
- free_one_page(zone, page, page_to_pfn(page),
- 0, migratetype, FPI_NONE);
+ free_one_page(zone, &folio->page,
+ folio_pfn(folio), order,
+ migratetype, FPI_NONE);
locked_zone = NULL;
continue;
}
@@ -2545,15 +2593,16 @@ void free_unref_page_list(struct list_head *list)
if (unlikely(migratetype >= MIGRATE_PCPTYPES))
migratetype = MIGRATE_MOVABLE;
- trace_mm_page_free_batched(page);
- free_unref_page_commit(zone, pcp, page, migratetype, 0);
- batch_count++;
+ trace_mm_page_free_batched(&folio->page);
+ free_unref_page_commit(zone, pcp, &folio->page, migratetype,
+ order);
}
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
+ folio_batch_reinit(folios);
}
/*
@@ -2573,8 +2622,8 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, 1 << order);
- split_page_memcg(page, 1 << order);
+ split_page_owner(page, order, 0);
+ split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2679,12 +2728,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
spin_lock_irqsave(&zone->lock, flags);
- /*
- * order-0 request can reach here when the pcplist is skipped
- * due to non-CMA allocation context. HIGHATOMIC area is
- * reserved for high-order atomic allocation, so order-0
- * request should skip it.
- */
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -2715,6 +2758,56 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
return page;
}
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
+{
+ int high, base_batch, batch, max_nr_alloc;
+ int high_max, high_min;
+
+ base_batch = READ_ONCE(pcp->batch);
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
+
+ /* Check for PCP disabled or boot pageset */
+ if (unlikely(high < base_batch))
+ return 1;
+
+ if (order)
+ batch = base_batch;
+ else
+ batch = (base_batch << pcp->alloc_factor);
+
+ /*
+ * If we had a larger pcp->high, we could avoid allocating from
+ * the zone.
+ */
+ if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
+ high = pcp->high = min(high + batch, high_max);
+
+ if (!order) {
+ max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
+ /*
+ * Double the number of pages allocated each time there is
+ * subsequent allocation of order-0 pages without any freeing.
+ */
+ if (batch <= max_nr_alloc &&
+ pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+ pcp->alloc_factor++;
+ batch = min(batch, max_nr_alloc);
+ }
+
+ /*
+ * Scale batch relative to order if batch implies free pages
+ * can be stored on the PCP. Batch can be 1 for small zones or
+ * for boot pagesets which should never store free pages as
+ * the pages may belong to arbitrary zones.
+ */
+ if (batch > 1)
+ batch = max(batch >> order, 2);
+
+ return batch;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -2727,18 +2820,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
do {
if (list_empty(list)) {
- int batch = READ_ONCE(pcp->batch);
+ int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;
- /*
- * Scale batch relative to order if batch implies
- * free pages can be stored on the PCP. Batch can
- * be 1 for small zones or for boot pagesets which
- * should never store free pages as the pages may
- * belong to arbitrary zones.
- */
- if (batch > 1)
- batch = max(batch >> order, 2);
alloced = rmqueue_bulk(zone, order,
batch, list,
migratetype, alloc_flags);
@@ -2779,7 +2863,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
- pcp->free_factor >>= 1;
+ pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
pcp_spin_unlock(pcp);
@@ -2818,17 +2902,10 @@ struct page *rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
if (likely(pcp_allowed_order(order))) {
- /*
- * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
- * we need to skip it when CMA area isn't allowed.
- */
- if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
- migratetype != MIGRATE_MOVABLE) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- migratetype, alloc_flags);
- if (likely(page))
- goto out;
- }
+ page = rmqueue_pcplist(preferred_zone, zone, order,
+ migratetype, alloc_flags);
+ if (likely(page))
+ goto out;
}
page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
@@ -2935,7 +3012,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
/* For a high-order request, check at least one suitable page is free */
- for (o = order; o <= MAX_ORDER; o++) {
+ for (o = order; o < NR_PAGE_ORDERS; o++) {
struct free_area *area = &z->free_area[o];
int mt;
@@ -3166,6 +3243,25 @@ retry:
}
}
+ /*
+ * Detect whether the number of free pages is below the high
+ * watermark. If so, we will decrease pcp->high and free
+ * PCP pages in the free path to reduce the possibility of
+ * premature page reclaim. Detection is done here to avoid
+ * doing it in the hotter free path.
+ */
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
+ goto check_alloc_wmark;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_fast(zone, order, mark,
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask))
+ goto try_this_zone;
+ else
+ set_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+check_alloc_wmark:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
@@ -3225,7 +3321,7 @@ try_this_zone:
* if the pageblock should be reserved for the future
*/
if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
- reserve_highatomic_pageblock(page, zone, order);
+ reserve_highatomic_pageblock(page, zone);
return page;
} else {
@@ -3860,14 +3956,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
else
(*no_progress_loops)++;
- /*
- * Make sure we converge to OOM if we cannot make any progress
- * several times in the row.
- */
- if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
- /* Before OOM, exhaust highatomic_reserve */
- return unreserve_highatomic_pageblock(ac, true);
- }
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES)
+ goto out;
+
/*
* Keep reclaiming pages while there is a chance this will lead
@@ -3910,6 +4001,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
schedule_timeout_uninterruptible(1);
else
cond_resched();
+out:
+ /* Before OOM, exhaust highatomic_reserve */
+ if (!ret)
+ return unreserve_highatomic_pageblock(ac, true);
+
return ret;
}
@@ -3951,6 +4047,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ bool can_compact = gfp_compaction_allowed(gfp_mask);
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4021,7 +4118,7 @@ restart:
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim &&
+ if (can_direct_reclaim && can_compact &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
@@ -4119,9 +4216,10 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_RETRY_MAYFAIL
+ * __GFP_RETRY_MAYFAIL and we can compact
*/
- if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ if (costly_order && (!can_compact ||
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -4134,7 +4232,7 @@ retry:
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 &&
+ if (did_some_progress > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
@@ -4450,7 +4548,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
- if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
+ if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
return NULL;
gfp &= gfp_allowed_mask;
@@ -4507,11 +4605,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
- preferred_nid, nodemask);
-
- if (page && order > 1)
- prep_transhuge_page(page);
- return (struct folio *)page;
+ preferred_nid, nodemask);
+ return page_rmappable_folio(page);
}
EXPORT_SYMBOL(__folio_alloc);
@@ -4598,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC;
+ gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
@@ -4612,6 +4707,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
return page;
}
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+ if (!nc->va)
+ return;
+
+ __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+ nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
@@ -4621,9 +4726,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
}
EXPORT_SYMBOL(__page_frag_cache_drain);
-void *page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -4692,7 +4797,7 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_align);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
@@ -4714,8 +4819,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *page = virt_to_page((void *)addr);
struct page *last = page + nr;
- split_page_owner(page, 1 << order);
- split_page_memcg(page, 1 << order);
+ split_page_owner(page, order, 0);
+ split_page_memcg(page, order, 0);
while (page < --last)
set_page_refcounted(last);
@@ -4735,7 +4840,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* minimum number of pages to satisfy the request. alloc_pages() can only
* allocate memory in power-of-two pages.
*
- * This function is also limited by MAX_ORDER.
+ * This function is also limited by MAX_PAGE_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
*
@@ -4928,8 +5033,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
+ /*
+ * Use the local node if we haven't already, unless it is memoryless,
+ * in which case skip it and fall back to other nodes.
+ */
+ if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
node_set(node, *used_node_mask);
return node;
}
@@ -5139,19 +5247,17 @@ static void __build_all_zonelists(void *data)
unsigned long flags;
/*
- * Explicitly disable this CPU's interrupts before taking seqlock
- * to prevent any IRQ handler from calling into the page allocator
- * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ * The zonelist_update_seq must be acquired with irqsave because the
+ * reader can be invoked from IRQ with GFP_ATOMIC.
*/
- local_irq_save(flags);
+ write_seqlock_irqsave(&zonelist_update_seq, flags);
/*
- * Explicitly disable this CPU's synchronous printk() before taking
- * seqlock to prevent any printk() from trying to hold port->lock, for
+ * Also disable synchronous printk() to prevent any printk() from
+ * trying to hold port->lock, for
* tty_insert_flip_string_and_push_buffer() on other CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
*/
printk_deferred_enter();
- write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5294,8 @@ static void __build_all_zonelists(void *data)
#endif
}
- write_sequnlock(&zonelist_update_seq);
printk_deferred_exit();
- local_irq_restore(flags);
+ write_sequnlock_irqrestore(&zonelist_update_seq, flags);
}
static noinline void __init
@@ -5308,14 +5413,15 @@ static int zone_batchsize(struct zone *zone)
}
static int percpu_pagelist_high_fraction;
-static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online,
+ int high_fraction)
{
#ifdef CONFIG_MMU
int high;
int nr_split_cpus;
unsigned long total_pages;
- if (!percpu_pagelist_high_fraction) {
+ if (!high_fraction) {
/*
* By default, the high value of the pcp is based on the zone
* low watermark so that if they are full then background
@@ -5328,15 +5434,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
* value is based on a fraction of the managed pages in the
* zone.
*/
- total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+ total_pages = zone_managed_pages(zone) / high_fraction;
}
/*
* Split the high value across all online CPUs local to the zone. Note
* that early in boot that CPUs may not be online yet and that during
* CPU hotplug that the cpumask is not yet updated when a CPU is being
- * onlined. For memory nodes that have no CPUs, split pcp->high across
- * all online CPUs to mitigate the risk that reclaim is triggered
+ * onlined. For memory nodes that have no CPUs, split the high value
+ * across all online CPUs to mitigate the risk that reclaim is triggered
* prematurely due to pages stored on pcp lists.
*/
nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
@@ -5364,19 +5470,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
* However, guaranteeing these relations at all times would require e.g. write
* barriers here but also careful usage of read barriers at the read side, and
* thus be prone to error and bad for performance. Thus the update only prevents
- * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
+ * should ensure they can cope with those fields changing asynchronously, and
+ * fully trust only the pcp->count field on the local CPU with interrupts
+ * disabled.
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
* exist).
*/
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
- unsigned long batch)
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
+ unsigned long high_max, unsigned long batch)
{
WRITE_ONCE(pcp->batch, batch);
- WRITE_ONCE(pcp->high, high);
+ WRITE_ONCE(pcp->high_min, high_min);
+ WRITE_ONCE(pcp->high_max, high_max);
}
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
@@ -5396,20 +5504,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
* need to be as careful as pageset_update() as nobody can access the
* pageset yet.
*/
- pcp->high = BOOT_PAGESET_HIGH;
+ pcp->high_min = BOOT_PAGESET_HIGH;
+ pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_factor = 0;
+ pcp->free_count = 0;
}
-static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
- unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
+ unsigned long high_max, unsigned long batch)
{
struct per_cpu_pages *pcp;
int cpu;
for_each_possible_cpu(cpu) {
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- pageset_update(pcp, high, batch);
+ pageset_update(pcp, high_min, high_max, batch);
}
}
@@ -5419,19 +5528,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
*/
static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
- int new_high, new_batch;
+ int new_high_min, new_high_max, new_batch;
new_batch = max(1, zone_batchsize(zone));
- new_high = zone_highsize(zone, new_batch, cpu_online);
+ if (percpu_pagelist_high_fraction) {
+ new_high_min = zone_highsize(zone, new_batch, cpu_online,
+ percpu_pagelist_high_fraction);
+ /*
+ * PCP high is tuned manually; disable auto-tuning by
+ * setting high_min and high_max to the manual value.
+ */
+ new_high_max = new_high_min;
+ } else {
+ new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
+ new_high_max = zone_highsize(zone, new_batch, cpu_online,
+ MIN_PERCPU_PAGELIST_HIGH_FRACTION);
+ }
- if (zone->pageset_high == new_high &&
+ if (zone->pageset_high_min == new_high_min &&
+ zone->pageset_high_max == new_high_max &&
zone->pageset_batch == new_batch)
return;
- zone->pageset_high = new_high;
+ zone->pageset_high_min = new_high_min;
+ zone->pageset_high_max = new_high_max;
zone->pageset_batch = new_batch;
- __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
+ __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
+ new_batch);
}
void __meminit setup_zone_pageset(struct zone *zone)
@@ -5466,6 +5590,36 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
+{
+ struct per_cpu_pages *pcp;
+ struct cpu_cacheinfo *cci;
+
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+ * If the CPU's data cache slice is large enough, "pcp->batch"
+ * pages can be kept in the PCP before draining it when
+ * high-order pages are freed consecutively without allocation.
+ * This can reduce zone lock contention without hurting the
+ * sharing of cache-hot pages.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
+}
+
+void setup_pcp_cacheinfo(unsigned int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zone_pcp_update_cacheinfo(zone, cpu);
+}
+
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
@@ -5507,7 +5661,8 @@ __meminit void zone_pcp_init(struct zone *zone)
*/
zone->per_cpu_pageset = &boot_pageset;
zone->per_cpu_zonestats = &boot_zonestats;
- zone->pageset_high = BOOT_PAGESET_HIGH;
+ zone->pageset_high_min = BOOT_PAGESET_HIGH;
+ zone->pageset_high_max = BOOT_PAGESET_HIGH;
zone->pageset_batch = BOOT_PAGESET_BATCH;
if (populated_zone(zone))
@@ -5694,9 +5849,9 @@ static void __setup_per_zone_wmarks(void)
struct zone *zone;
unsigned long flags;
- /* Calculate total number of !ZONE_HIGHMEM pages */
+ /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */
for_each_zone(zone) {
- if (!is_highmem(zone))
+ if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
lowmem_pages += zone_managed_pages(zone);
}
@@ -5705,16 +5860,16 @@ static void __setup_per_zone_wmarks(void)
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone_managed_pages(zone);
- do_div(tmp, lowmem_pages);
- if (is_highmem(zone)) {
+ tmp = div64_ul(tmp, lowmem_pages);
+ if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
- * need highmem pages, so cap pages_min to a small
- * value here.
+ * need highmem and movable zone pages, so cap pages_min
+ * to a small value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control async page reclaim, and so should
- * not be capped for highmem.
+ * not be capped for highmem and movable zones.
*/
unsigned long min_pages;
@@ -6079,9 +6234,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/* [start, end) must belong to a single zone. */
+/*
+ * [start, end) must belong to a single zone.
+ * @migratetype: used to filter the type of migration in
+ * trace_mm_alloc_contig_migrate_range_info.
+ */
int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int migratetype)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6092,6 +6252,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
.nid = zone_to_nid(cc->zone),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ struct page *page;
+ unsigned long total_mapped = 0;
+ unsigned long total_migrated = 0;
+ unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6117,9 +6281,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
+ if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
+ total_reclaimed += nr_reclaimed;
+ list_for_each_entry(page, &cc->migratepages, lru)
+ total_mapped += page_mapcount(page);
+ }
+
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
+ if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
+ total_migrated += cc->nr_migratepages;
+
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6133,9 +6306,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
- return ret;
}
- return 0;
+
+ trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
+ total_migrated,
+ total_reclaimed,
+ total_mapped);
+ return (ret < 0) ? ret : 0;
}
/**
@@ -6215,7 +6392,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end);
+ ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
if (ret && ret != -EBUSY)
goto done;
ret = 0;
@@ -6240,7 +6417,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order > MAX_ORDER) {
+ if (++order > MAX_PAGE_ORDER) {
outer_start = start;
break;
}
@@ -6409,13 +6586,14 @@ EXPORT_SYMBOL(free_contig_range);
void zone_pcp_disable(struct zone *zone)
{
mutex_lock(&pcp_batch_high_lock);
- __zone_set_pageset_high_and_batch(zone, 0, 1);
+ __zone_set_pageset_high_and_batch(zone, 0, 0, 1);
__drain_all_pages(zone, true);
}
void zone_pcp_enable(struct zone *zone)
{
- __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+ zone->pageset_high_max, zone->pageset_batch);
mutex_unlock(&pcp_batch_high_lock);
}
@@ -6493,7 +6671,7 @@ bool is_free_buddy_page(struct page *page)
unsigned long pfn = page_to_pfn(page);
unsigned int order;
- for (order = 0; order <= MAX_ORDER; order++) {
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
if (PageBuddy(page_head) &&
@@ -6501,7 +6679,7 @@ bool is_free_buddy_page(struct page *page)
break;
}
- return order <= MAX_ORDER;
+ return order <= MAX_PAGE_ORDER;
}
EXPORT_SYMBOL(is_free_buddy_page);
@@ -6515,28 +6693,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
int migratetype)
{
unsigned long size = 1 << high;
- struct page *current_buddy, *next_page;
+ struct page *current_buddy;
while (high > low) {
high--;
size >>= 1;
if (target >= &page[size]) {
- next_page = page + size;
current_buddy = page;
+ page = page + size;
} else {
- next_page = page;
current_buddy = page + size;
}
if (set_page_guard(zone, current_buddy, high, migratetype))
continue;
- if (current_buddy != target) {
- add_to_free_list(current_buddy, zone, high, migratetype);
- set_buddy_order(current_buddy, high);
- page = next_page;
- }
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
}
}
@@ -6552,7 +6726,7 @@ bool take_page_off_buddy(struct page *page)
bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
@@ -6677,9 +6851,9 @@ static bool try_to_accept_memory_one(struct zone *zone)
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
- accept_page(page, MAX_ORDER);
+ accept_page(page, MAX_PAGE_ORDER);
- __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+ __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
if (last)
static_branch_dec(&zones_with_unaccepted_pages);