Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  141
1 file changed, 95 insertions(+), 46 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef19f22b2b7d..48aaf7b9f253 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
-#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -126,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
@@ -207,6 +224,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
"HighMem",
#endif
"Movable",
+#ifdef CONFIG_ZONE_DEVICE
+ "Device",
+#endif
};
int min_free_kbytes = 1024;
@@ -789,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- mt = get_freepage_migratetype(page);
+
+ mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
@@ -953,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- set_freepage_migratetype(page, migratetype);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -981,21 +1004,21 @@ static void __init __free_pages_boot_core(struct page *page,
#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
- /* The system will behave unpredictably otherwise */
- BUG_ON(system_state != SYSTEM_BOOTING);
-
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
+ if (nid < 0)
+ nid = 0;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
}
#endif
@@ -1060,7 +1083,15 @@ static void __init deferred_free_range(struct page *page,
__free_pages_boot_core(page, pfn, 0);
}
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
@@ -1077,7 +1108,7 @@ static int __init deferred_init_memmap(void *data)
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (first_init_pfn == ULONG_MAX) {
- up_read(&pgdat_init_rwsem);
+ pgdat_init_report_one_done();
return 0;
}
@@ -1177,7 +1208,8 @@ free_range:
pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
jiffies_to_msecs(jiffies - start));
- up_read(&pgdat_init_rwsem);
+
+ pgdat_init_report_one_done();
return 0;
}
@@ -1185,14 +1217,17 @@ void __init page_alloc_init_late(void)
{
int nid;
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
- down_read(&pgdat_init_rwsem);
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
- down_write(&pgdat_init_rwsem);
- up_write(&pgdat_init_rwsem);
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -1285,6 +1320,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -1328,12 +1367,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_owner(page, order, gfp_flags);
/*
- * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+ * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
- page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ set_page_pfmemalloc(page);
+ else
+ clear_page_pfmemalloc(page);
return 0;
}
@@ -1362,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -1439,7 +1481,6 @@ int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
- set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1609,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
expand(zone, page, order, current_order, area,
start_migratetype);
/*
- * The freepage_migratetype may differ from pageblock's
+ * The pcppage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
+ * find_suitable_fallback(). This is OK as long as it does not
+ * differ for MIGRATE_CMA pageblocks. Those can be used as
+ * fallback only via special __rmqueue_cma_fallback() function
*/
- set_freepage_migratetype(page, start_migratetype);
+ set_pcppage_migratetype(page, start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -1692,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1889,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -2094,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pcppage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2675,6 +2715,12 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct page *page;
*did_some_progress = 0;
@@ -2726,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
@@ -3330,7 +3375,7 @@ refill:
atomic_add(size - 1, &page->_count);
/* reset page count bias and offset to start of new frag */
- nc->pfmemalloc = page->pfmemalloc;
+ nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = size;
nc->offset = size;
}
@@ -3469,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -5045,6 +5088,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
{
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
/* Get the start and end of the zone */
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
@@ -5108,6 +5155,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
@@ -5277,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
*
* NOTE: pgdat should get zeroed by caller.
*/
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5429,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5441,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn);
+ free_area_init_core(pgdat);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5452,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
*/
void __init setup_nr_node_ids(void)
{
- unsigned int node;
- unsigned int highest = 0;
+ unsigned int highest;
- for_each_node_mask(node, node_possible_map)
- highest = node;
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
@@ -5977,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
*
- * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
@@ -6030,7 +6079,7 @@ void __init page_alloc_init(void)
}
/*
- * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
@@ -6074,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
/*
* setup_per_zone_lowmem_reserve - called whenever
- * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/