Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  492
1 file changed, 291 insertions(+), 201 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..76c9688b6a0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized during early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it to ULONG_MAX.
+ */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
- unsigned long max_initialise;
- unsigned long reserved_lowmem;
+ phys_addr_t start_addr, end_addr;
+ unsigned long max_pgcnt;
+ unsigned long reserved;
/*
* Initialise at least 2G of a node but also take into account the
* two large system hashes that can take up 1GB for 0.25TB/node.
*/
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
+ max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
/*
* Compensate for all the memblock reservations (e.g. crash kernel)
* from the initial estimation to make sure we will initialize enough
* memory to boot.
*/
- reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
- pgdat->node_start_pfn + max_initialise);
- max_initialise += reserved_lowmem;
+ start_addr = PFN_PHYS(pgdat->node_start_pfn);
+ end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+ reserved = memblock_reserved_memory_within(start_addr, end_addr);
+ max_pgcnt += PHYS_PFN(reserved);
- pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+ pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
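
To make the arithmetic above concrete, here is a minimal userspace sketch of the same calculation, assuming 4K pages and a made-up node with 1TB spanned and 512MB reserved by memblock; the numbers are illustrative, not taken from the patch.

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4K pages */
#define PFN_PHYS(x)	((unsigned long long)(x) << PAGE_SHIFT)
#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))

int main(void)
{
	/* hypothetical node: 1 TB spanned, 512 MB reserved by memblock */
	unsigned long node_spanned_pages = 1UL << (40 - PAGE_SHIFT);
	unsigned long long reserved_bytes = 512ULL << 20;
	unsigned long max_pgcnt;

	/* at least 2G worth of pages, or spanned/256 (the 1GB-per-0.25TB rule) */
	max_pgcnt = 2UL << (30 - PAGE_SHIFT);
	if (node_spanned_pages >> 8 > max_pgcnt)
		max_pgcnt = node_spanned_pages >> 8;

	/* compensate for memblock reservations, as the helper above does */
	max_pgcnt += PHYS_PFN(reserved_bytes);

	printf("static_init_pgcnt would be %lu pages (%llu MB)\n",
	       max_pgcnt, PFN_PHYS(max_pgcnt) >> 20);
	return 0;
}

The helper then caps the result with min(max_pgcnt, node_spanned_pages); on this made-up node that cap changes nothing.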
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_size) &&
+ if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
/*
* Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(struct page *page,
- unsigned long pfn, int nr_pages)
+static void __init deferred_free_range(unsigned long pfn,
+ unsigned long nr_pages)
{
- int i;
+ struct page *page;
+ unsigned long i;
- if (!page)
+ if (!nr_pages)
return;
+ page = pfn_to_page(pfn);
+
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
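
The alignment test kept above frees a whole pageblock at once only when the run starts on a pageblock boundary. A minimal sketch of that check, assuming pageblock_nr_pages is 512 (2MB pageblocks with 4K pages):

#include <assert.h>
#include <stdbool.h>

#define pageblock_nr_pages 512UL	/* assumption: 2MB pageblocks, 4K pages */

/* true when [pfn, pfn + nr_pages) is one full, naturally aligned pageblock */
static bool is_whole_aligned_pageblock(unsigned long pfn, unsigned long nr_pages)
{
	return nr_pages == pageblock_nr_pages &&
	       (pfn & (pageblock_nr_pages - 1)) == 0;
}

int main(void)
{
	assert(is_whole_aligned_pageblock(1024, 512));	/* aligned head */
	assert(!is_whole_aligned_pageblock(1025, 512));	/* misaligned head */
	assert(!is_whole_aligned_pageblock(1024, 256));	/* partial block */
	return 0;
}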
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
complete(&pgdat_init_all_done_comp);
}
+/*
+ * Helper for deferred_init_range(): free the given range, reset the counters,
+ * and return the number of pages freed.
+ */
+static inline unsigned long __init __def_free(unsigned long *nr_free,
+ unsigned long *free_base_pfn,
+ struct page **page)
+{
+ unsigned long nr = *nr_free;
+
+ deferred_free_range(*free_base_pfn, nr);
+ *free_base_pfn = 0;
+ *nr_free = 0;
+ *page = NULL;
+
+ return nr;
+}
+
+static unsigned long __init deferred_init_range(int nid, int zid,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long free_base_pfn = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_free = 0;
+ struct page *page = NULL;
+ unsigned long pfn;
+
+ /*
+ * First we check if pfn is valid on architectures where it is possible
+ * to have holes within pageblock_nr_pages. On systems where it is not
+ * possible, this function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the
+ * validity of the head pfn.
+ *
+ * meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn may fall between the start and end of a node, yet
+ * not belong to that node.
+ *
+ * Finally, we minimize pfn page lookups and scheduler checks by
+ * performing them only once every pageblock_nr_pages.
+ *
+ * We do it in two loops: first we initialize struct page, then free to
+ * the buddy allocator, because while we are freeing pages we can access
+ * pages that are ahead (computing buddy page in __free_one_page()).
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+ if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
+ if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ if (page && (pfn & nr_pgmask))
+ page++;
+ else
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zid, nid);
+ cond_resched();
+ }
+ }
+ }
+
+ page = NULL;
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (page && (pfn & nr_pgmask)) {
+ page++;
+ nr_free++;
+ } else {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ page = pfn_to_page(pfn);
+ free_base_pfn = pfn;
+ nr_free = 1;
+ cond_resched();
+ }
+ }
+ /* Free the last block of pages to allocator */
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+
+ return nr_pages;
+}
+
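The second loop of deferred_init_range() is an instance of a generic accumulate-and-flush pattern: extend a contiguous run while pfns stay valid, and flush the run whenever a hole or node boundary breaks it. A minimal userspace sketch of just that pattern, with a hypothetical flush() standing in for deferred_free_range():

#include <stdio.h>

/* stand-in for deferred_free_range(): report a contiguous run */
static unsigned long flush(unsigned long base, unsigned long nr)
{
	if (nr)
		printf("free run: pfn %lu..%lu (%lu pages)\n",
		       base, base + nr - 1, nr);
	return nr;
}

int main(void)
{
	/* hypothetical validity map: 0 marks a hole that breaks a run */
	int valid[] = { 1, 1, 1, 0, 1, 1, 0, 0, 1 };
	unsigned long base = 0, nr = 0, freed = 0, pfn;

	for (pfn = 0; pfn < sizeof(valid) / sizeof(valid[0]); pfn++) {
		if (!valid[pfn]) {
			freed += flush(base, nr);	/* break: flush the run */
			nr = 0;
		} else if (nr) {
			nr++;				/* extend the current run */
		} else {
			base = pfn;			/* start a new run */
			nr = 1;
		}
	}
	freed += flush(base, nr);			/* flush the trailing run */
	printf("%lu pages freed\n", freed);
	return 0;
}
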
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
int nid = pgdat->node_id;
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long start = jiffies;
unsigned long nr_pages = 0;
- unsigned long walk_start, walk_end;
- int i, zid;
+ unsigned long spfn, epfn;
+ phys_addr_t spa, epa;
+ int zid;
struct zone *zone;
unsigned long first_init_pfn = pgdat->first_deferred_pfn;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ u64 i;
if (first_init_pfn == ULONG_MAX) {
pgdat_init_report_one_done();
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
+ first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
- for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
- unsigned long pfn, end_pfn;
- struct page *page = NULL;
- struct page *free_base_page = NULL;
- unsigned long free_base_pfn = 0;
- int nr_to_free = 0;
-
- end_pfn = min(walk_end, zone_end_pfn(zone));
- pfn = first_init_pfn;
- if (pfn < walk_start)
- pfn = walk_start;
- if (pfn < zone->zone_start_pfn)
- pfn = zone->zone_start_pfn;
-
- for (; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- goto free_range;
-
- /*
- * Ensure pfn_valid is checked every
- * pageblock_nr_pages for memory holes
- */
- if ((pfn & (pageblock_nr_pages - 1)) == 0) {
- if (!pfn_valid(pfn)) {
- page = NULL;
- goto free_range;
- }
- }
-
- if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- page = NULL;
- goto free_range;
- }
-
- /* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
- page++;
- } else {
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page,
- free_base_pfn, nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
-
- page = pfn_to_page(pfn);
- cond_resched();
- }
-
- if (page->flags) {
- VM_BUG_ON(page_zone(page) != zone);
- goto free_range;
- }
-
- __init_single_page(page, pfn, zid, nid);
- if (!free_base_page) {
- free_base_page = page;
- free_base_pfn = pfn;
- nr_to_free = 0;
- }
- nr_to_free++;
-
- /* Where possible, batch up pages for a single free */
- continue;
-free_range:
- /* Free the current block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn,
- nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
- }
- /* Free the last block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
-
- first_init_pfn = max(end_pfn, first_init_pfn);
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ nr_pages += deferred_init_range(nid, zid, spfn, epfn);
}
/* Sanity check that the next zone really is unpopulated */
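
The PFN_UP()/PFN_DOWN() pair above rounds the memblock range inward so that only fully backed pages are passed to deferred_init_range(). A minimal sketch with made-up, deliberately unaligned physical addresses:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	/* hypothetical free memblock range that is not page aligned */
	unsigned long long spa = 0x100000800ULL;	/* starts mid-page */
	unsigned long long epa = 0x100003800ULL;	/* ends mid-page   */

	/* round the start up and the end down: only whole pages remain */
	printf("spfn=%llu epfn=%llu (%llu whole pages)\n",
	       PFN_UP(spa), PFN_DOWN(epa), PFN_DOWN(epa) - PFN_UP(spa));
	return 0;
}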
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
-static inline
+static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
};
#ifdef CONFIG_CMA
-static struct page *__rmqueue_cma_fallback(struct zone *zone,
+static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
-static inline bool
+static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+static __always_inline struct page *
+__rmqueue(struct zone *zone, unsigned int order, int migratetype)
{
struct page *page;
@@ -2315,7 +2347,7 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, bool cold)
+ int migratetype)
{
int i, alloced = 0;
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
continue;
/*
- * Split buddy pages returned by expand() are received here
- * in physical page order. The page is added to the callers and
- * list and the list head then moves forward. From the callers
- * perspective, the linked list is ordered by page number in
- * some conditions. This is useful for IO devices that can
- * merge IO requests if the physical pages are ordered
- * properly.
+ * Split buddy pages returned by expand() are received here in
+ * physical page order. The page is added to the tail of the
+ * caller's list. From the caller's perspective, the linked list
+ * is therefore ordered by page number under some conditions,
+ * which is useful for IO devices that can merge IO requests if
+ * the physical pages are ordered properly.
*/
- if (likely(!cold))
- list_add(&page->lru, list);
- else
- list_add_tail(&page->lru, list);
- list = &page->lru;
+ list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
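
The rewritten comment relies on list_add_tail() preserving the order in which expand() hands back pages. A kernel-style sketch of that property, using illustrative demo_* names rather than anything from the patch:

#include <linux/list.h>
#include <linux/printk.h>

struct demo_page {
	unsigned long pfn;
	struct list_head lru;
};

/*
 * Illustrative only: appending with list_add_tail() keeps entries in the
 * order they were produced, which is what the comment above relies on for
 * ascending physical page order on the caller's list.
 */
static void demo_fill(struct list_head *list, struct demo_page *pages, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		pages[i].pfn = 1000 + i;		/* pretend ascending pfns */
		list_add_tail(&pages[i].lru, list);	/* tail add keeps order */
	}
}

static void demo_dump(struct list_head *list)
{
	struct demo_page *p;

	list_for_each_entry(p, list, lru)
		pr_info("pfn %lu\n", p->pfn);		/* 1000, 1001, 1002, ... */
}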
@@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
- /* Workqueues cannot recurse */
- if (current->flags & PF_WQ_WORKER)
- return;
-
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
@@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone)
}
#endif /* CONFIG_PM */
-/*
- * Free a 0-order page
- * cold == true ? free a cold page : free a hot page
- */
-void free_hot_cold_page(struct page *page, bool cold)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pcp_prepare(page))
- return;
+ return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- local_irq_save(flags);
+ return true;
+}
+
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ int migratetype;
+
+ migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
/*
@@ -2620,38 +2646,73 @@ void free_hot_cold_page(struct page *page, bool cold)
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
- goto out;
+ return;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (!cold)
- list_add(&page->lru, &pcp->lists[migratetype]);
- else
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
+}
-out:
+/*
+ * Free a 0-order page
+ */
+void free_unref_page(struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_unref_page_prepare(page, pfn))
+ return;
+
+ local_irq_save(flags);
+ free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, bool cold)
+void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ unsigned long flags, pfn;
+ int batch_count = 0;
+ /* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- trace_mm_page_free_batched(page, cold);
- free_hot_cold_page(page, cold);
+ pfn = page_to_pfn(page);
+ if (!free_unref_page_prepare(page, pfn))
+ list_del(&page->lru);
+ set_page_private(page, pfn);
}
+
+ local_irq_save(flags);
+ list_for_each_entry_safe(page, next, list, lru) {
+ unsigned long pfn = page_private(page);
+
+ set_page_private(page, 0);
+ trace_mm_page_free_batched(page);
+ free_unref_page_commit(page, pfn);
+
+ /*
+ * Guard against excessive IRQ disabled times when we get
+ * a large list of pages to free.
+ */
+ if (++batch_count == SWAP_CLUSTER_MAX) {
+ local_irq_restore(flags);
+ batch_count = 0;
+ local_irq_save(flags);
+ }
+ }
+ local_irq_restore(flags);
}
/*
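
The SWAP_CLUSTER_MAX guard in free_unref_page_list() is a general pattern for bounding IRQ-disabled time while walking a long list. A kernel-style sketch of the same pattern around a hypothetical process_one() callback (DEMO_BATCH plays the role of SWAP_CLUSTER_MAX):

#include <linux/irqflags.h>
#include <linux/list.h>

#define DEMO_BATCH	32	/* assumed flush interval, like SWAP_CLUSTER_MAX */

struct demo_item {
	struct list_head node;
};

static void process_one(struct demo_item *item)
{
	/* hypothetical per-item work done with IRQs disabled */
}

static void process_list_bounded_irqoff(struct list_head *list)
{
	struct demo_item *item, *next;
	unsigned long flags;
	int batch_count = 0;

	local_irq_save(flags);
	list_for_each_entry_safe(item, next, list, node) {
		process_one(item);

		/* briefly re-enable IRQs so a long list cannot stall them */
		if (++batch_count == DEMO_BATCH) {
			local_irq_restore(flags);
			batch_count = 0;
			local_irq_save(flags);
		}
	}
	local_irq_restore(flags);
}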
@@ -2669,15 +2730,6 @@ void split_page(struct page *page, unsigned int order)
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
-#ifdef CONFIG_KMEMCHECK
- /*
- * Split shadow pages too, because free(page[0]) would
- * otherwise free the whole shadow.
- */
- if (kmemcheck_page_is_tracked(page))
- split_page(virt_to_page(page[0].shadow), order);
-#endif
-
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order);
@@ -2743,6 +2795,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
+ /* skip NUMA counter updates if NUMA stats are disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return;
+
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
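
The vm_numa_stat_key check above uses the static-branch machinery, so disabled NUMA stats cost only a patched jump in the hot path. A sketch of the general API, with illustrative demo_* names:

#include <linux/jump_label.h>
#include <linux/types.h>

/* default-true key, like vm_numa_stat_key: counting is on until disabled */
DEFINE_STATIC_KEY_TRUE(demo_stats_key);

static void demo_count_event(void)
{
	/* compiled as a jump label: no flag load on the hot path */
	if (!static_branch_likely(&demo_stats_key))
		return;

	/* ... update the statistic ... */
}

/* e.g. flipped from a sysctl handler, as the patch does for NUMA stats */
static void demo_set_stats(bool enable)
{
	if (enable)
		static_branch_enable(&demo_stats_key);
	else
		static_branch_disable(&demo_stats_key);
}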
@@ -2758,7 +2814,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
- bool cold, struct per_cpu_pages *pcp,
+ struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
@@ -2767,16 +2823,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype, cold);
+ migratetype);
if (unlikely(list_empty(list)))
return NULL;
}
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
-
+ page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
@@ -2791,14 +2843,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
{
struct per_cpu_pages *pcp;
struct list_head *list;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -3006,9 +3057,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
if (!area->nr_free)
continue;
- if (alloc_harder)
- return true;
-
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!list_empty(&area->free_list[mt]))
return true;
@@ -3020,6 +3068,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
+ if (alloc_harder &&
+ !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ return true;
}
return false;
}
@@ -3235,20 +3286,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
- pr_warn("%s: ", current->comm);
-
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_cont("%pV", &vaf);
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ current->comm, &vaf, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
va_end(args);
- pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
- if (nodemask)
- pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
- else
- pr_cont("(null)\n");
-
cpuset_print_current_mems_allowed();
dump_stack();
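
The consolidated pr_warn() above relies on the kernel's %pV extension, which prints a caller-supplied format through struct va_format so the whole line is emitted in one call. A sketch of that pattern with an illustrative wrapper name:

#include <linux/kernel.h>
#include <linux/printk.h>

/* illustrative wrapper: prefix a caller-supplied message with our context */
static __printf(2, 3) void demo_warn(const char *who, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* a single pr_warn(): the message cannot be interleaved mid-line */
	pr_warn("%s: %pV\n", who, &vaf);
	va_end(args);
}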
@@ -3868,8 +3913,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
- unsigned long alloc_start = jiffies;
- unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
int reserve_flags;
@@ -4001,14 +4044,6 @@ retry:
if (!can_direct_reclaim)
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
@@ -4223,9 +4258,6 @@ out:
page = NULL;
}
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
@@ -4262,7 +4294,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -4320,7 +4352,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
unsigned int order = compound_order(page);
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -6126,6 +6158,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
}
}
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
@@ -6135,7 +6168,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
if (!pgdat->node_spanned_pages)
return;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6157,6 +6189,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
+ pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ __func__, pgdat->node_id, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6169,8 +6204,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
+#else
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6197,16 +6234,51 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
- nid, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
-#endif
reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Only struct pages that are backed by physical memory are zeroed and
+ * initialized by going through __init_single_page(). But, there are some
+ * struct pages which are reserved in the memblock allocator and whose fields
+ * may be accessed (for example, page_to_pfn() accesses page->flags in some
+ * configurations). We must explicitly zero those struct pages.
+ */
+void __paginginit zero_resv_unavail(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ u64 i, pgcnt;
+
+ /*
+ * Loop through ranges that are reserved, but do not have reported
+ * physical memory backing.
+ */
+ pgcnt = 0;
+ for_each_resv_unavail_range(i, &start, &end) {
+ for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+ continue;
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+ }
+
+ /*
+ * Struct pages that do not have backing memory. This could be because
+ * firmware is using some of this memory, or for some other reason.
+ * Once memblock is changed so that such behaviour is not allowed, i.e.
+ * the list of "reserved" memory must be a subset of the list of "memory",
+ * this code can be removed.
+ */
+ if (pgcnt)
+ pr_info("Reserved but unavailable: %lld pages", pgcnt);
+}
+#endif /* CONFIG_HAVE_MEMBLOCK */
+
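mm_zero_struct_page() used by zero_resv_unavail() is, conceptually, a memset of a single struct page, so later readers of fields such as page->flags never see uninitialized memory. A minimal sketch with a stand-in layout (the real struct page lives in <linux/mm_types.h>):

#include <string.h>

/* stand-in layout; the real struct page is far larger and more subtle */
struct demo_page {
	unsigned long flags;
	void *mapping;
	unsigned long private;
};

/*
 * Conceptually what mm_zero_struct_page() provides: a struct page that will
 * never go through __init_single_page() still reads back as all zeroes, so
 * helpers that touch page->flags do not see uninitialized memory.
 */
static inline void demo_zero_struct_page(struct demo_page *page)
{
	memset(page, 0, sizeof(*page));
}
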
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#if MAX_NUMNODES > 1
@@ -6630,6 +6702,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
+ zero_resv_unavail();
}
static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6866,7 @@ void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+ zero_resv_unavail();
}
static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7379,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
- /*
- * memblock allocator returns zeroed memory already, so HASH_ZERO is
- * currently not used when HASH_EARLY is specified.
- */
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
- if (flags & HASH_EARLY)
- table = memblock_virt_alloc_nopanic(size, 0);
- else if (hashdist)
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_virt_alloc_nopanic(size, 0);
+ else
+ table = memblock_virt_alloc_raw(size, 0);
+ } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- else {
+ } else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
@@ -7353,10 +7426,10 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ int migratetype,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
- int mt;
/*
* For avoiding noise data, lru_add_drain_all() should be called
@@ -7364,8 +7437,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
*/
if (zone_idx(zone) == ZONE_MOVABLE)
return false;
- mt = get_pageblock_migratetype(page);
- if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
+
+ /*
+ * CMA allocations (alloc_contig_range) really need to isolate CMA
+ * pageblocks even when they are not in fact movable, so consider
+ * them movable here.
+ */
+ if (is_migrate_cma(migratetype) &&
+ is_migrate_cma(get_pageblock_migratetype(page)))
return false;
pfn = page_to_pfn(page);
@@ -7377,6 +7456,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
page = pfn_to_page(check);
+ if (PageReserved(page))
+ return true;
+
/*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages because we don't
@@ -7450,7 +7532,7 @@ bool is_pageblock_removable_nolock(struct page *page)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, true);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
}
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
@@ -7546,6 +7628,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .no_set_skip_hint = true,
.gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
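
For context on the compact_control setup above: alloc_contig_range() is the entry point CMA uses to claim a physically contiguous pfn range, and a successful range is later released with free_contig_range(). A kernel-style usage sketch with made-up pfn arguments and an illustrative function name:

#include <linux/gfp.h>
#include <linux/mmzone.h>

/* illustrative only: try to claim [start_pfn, end_pfn) as MIGRATE_CMA pages */
static int demo_grab_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_CMA, GFP_KERNEL);
	if (ret)
		return ret;	/* e.g. -EBUSY: nothing was allocated */

	/* ... use the physically contiguous range ... */

	free_contig_range(start_pfn, end_pfn - start_pfn);
	return 0;
}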
@@ -7582,11 +7665,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/*
* In case of -EBUSY, we'd like to know which page causes problem.
- * So, just fall through. We will check it in test_pages_isolated().
+ * So, just fall through. test_pages_isolated() has a tracepoint
+ * which will report the busy page.
+ *
+ * It is possible that busy pages could become available before
+ * the call to test_pages_isolated, and the range will actually be
+ * allocated. So, if we fall through, be sure to clear ret so that
+ * -EBUSY is not accidentally used or returned to the caller.
*/
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
+	ret = 0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES