aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-01-27 11:14:02 -0500
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-01-27 11:14:02 -0500
commit6c02b7b1610f873888af20f291c07730889ff0f9 (patch)
tree1b33e6642cc81605b8d37c0bda0abff0ba64fa2d /mm/page_alloc.c
parentx86: xen: size struct xen_spinlock to always fit in arch_spinlock_t (diff)
parentLinux 3.3-rc1 (diff)
downloadlinux-dev-6c02b7b1610f873888af20f291c07730889ff0f9.tar.xz
linux-dev-6c02b7b1610f873888af20f291c07730889ff0f9.zip
Merge commit 'v3.3-rc1' into stable/for-linus-fixes-3.3
* commit 'v3.3-rc1': (9775 commits) Linux 3.3-rc1 x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits qnx4: don't leak ->BitMap on late failure exits qnx4: reduce the insane nesting in qnx4_checkroot() qnx4: di_fname is an array, for crying out loud... KEYS: Permit key_serial() to be called with a const key pointer keys: fix user_defined key sparse messages ima: fix cred sparse warning uml: fix compile for x86-64 MPILIB: Add a missing ENOMEM check tpm: fix (ACPI S3) suspend regression nvme: fix merge error due to change of 'make_request_fn' fn type xen: using EXPORT_SYMBOL requires including export.h gpio: tps65910: Use correct offset for gpio initialization acpi/apei/einj: Add extensions to EINJ from rev 5.0 of acpi spec intel_idle: Split up and provide per CPU initialization func ACPI processor: Remove unneeded variable passed by acpi_processor_hotadd_init V2 tg3: Fix single-vector MSI-X code openvswitch: Fix multipart datapath dumps. ipv6: fix per device IP snmp counters ...
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c583
1 files changed, 260 insertions, 323 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6ce27331834c..0027d8f4a1bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~GFP_IOFS;
}
+
+bool pm_suspended_storage(void)
+{
+ if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ return false;
+ return true;
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -181,42 +197,17 @@ static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- #ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * MAX_ACTIVE_REGIONS determines the maximum number of distinct ranges
- * of memory (RAM) that may be registered with add_active_range().
- * Ranges passed to add_active_range() will be merged if possible so
- * the number of times add_active_range() can be called is related to
- * the number of nodes and the number of holes
- */
- #ifdef CONFIG_MAX_ACTIVE_REGIONS
- /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
- #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
- #else
- #if MAX_NUMNODES >= 32
- /* If there can be many nodes, allow up to 50 holes per node */
- #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
- #else
- /* By default, allow up to 256 distinct regions */
- #define MAX_ACTIVE_REGIONS 256
- #endif
- #endif
-
- static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
- static int __meminitdata nr_nodemap_entries;
-#endif /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
- static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
- static unsigned long __initdata required_kernelcore;
- static unsigned long __initdata required_movablecore;
- static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-
- /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
- int movable_zone;
- EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -336,8 +327,8 @@ out:
*
* The remaining PAGE_SIZE pages are called "tail pages".
*
- * All pages have PG_compound set. All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set. All tail pages have their ->first_page
+ * pointing at the head page.
*
* The first tail page's ->lru.next holds the address of the compound page's
* put_page() function. Its ->lru.prev holds the order of allocation.
@@ -359,8 +350,8 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
-
__SetPageTail(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
@@ -406,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
clear_highpage(page + i);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
static inline void set_page_order(struct page *page, int order)
{
set_page_private(page, order);
@@ -463,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
+ return 1;
+ }
+
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
return 1;
@@ -519,11 +546,19 @@ static inline void __free_one_page(struct page *page,
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
-
- /* Our buddy is free, merge with it and move up one order. */
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
+ /*
+ * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy)) {
+ clear_page_guard_flag(buddy);
+ set_page_private(page, 0);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ } else {
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ }
combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
@@ -657,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
int i;
int bad = 0;
- trace_mm_page_free_direct(page, order);
+ trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
if (PageAnon(page))
@@ -695,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
- if (order == 0) {
- __ClearPageReserved(page);
- set_page_count(page, 0);
- set_page_refcounted(page);
- __free_page(page);
- } else {
- int loop;
+ unsigned int nr_pages = 1 << order;
+ unsigned int loop;
- prefetchw(page);
- for (loop = 0; loop < (1 << order); loop++) {
- struct page *p = &page[loop];
+ prefetchw(page);
+ for (loop = 0; loop < nr_pages; loop++) {
+ struct page *p = &page[loop];
- if (loop + 1 < (1 << order))
- prefetchw(p + 1);
- __ClearPageReserved(p);
- set_page_count(p, 0);
- }
-
- set_page_refcounted(page);
- __free_pages(page, order);
+ if (loop + 1 < nr_pages)
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
}
+
+ set_page_refcounted(page);
+ __free_pages(page, order);
}
@@ -749,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
high--;
size >>= 1;
VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (high < debug_guardpage_minorder()) {
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ INIT_LIST_HEAD(&page[size].lru);
+ set_page_guard_flag(&page[size]);
+ set_page_private(&page[size], high);
+ /* Guard pages are not available for any usage */
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ continue;
+ }
+#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -1214,6 +1257,19 @@ out:
}
/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ trace_mm_page_free_batched(page, cold);
+ free_hot_cold_page(page, cold);
+ }
+}
+
+/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
@@ -1411,7 +1467,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
static int __init fail_page_alloc_debugfs(void)
{
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -1460,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
long min = mark;
int o;
- free_pages -= (1 << order) + 1;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
@@ -1670,6 +1726,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1759,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
return;
/*
@@ -1798,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long did_some_progress,
unsigned long pages_reclaimed)
{
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
return 0;
+ /* Always retry if specifically requested */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ /*
+ * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ * making forward progress without invoking OOM. Suspend also disables
+ * storage devices so kswapd will not help. Bail if we are suspending.
+ */
+ if (!did_some_progress && pm_suspended_storage())
+ return 0;
+
/*
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
* means __GFP_NOFAIL, but that may not be true in other
@@ -1822,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
return 1;
- /*
- * Don't let big-order allocations loop unless the caller
- * explicitly requests that.
- */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
return 0;
}
@@ -1889,13 +1981,19 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
+ return NULL;
+
+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
return NULL;
+ }
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
@@ -1924,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);
cond_resched();
}
@@ -1936,8 +2040,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2087,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2167,12 +2273,22 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -2221,7 +2337,8 @@ rebalance:
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
@@ -2235,8 +2352,9 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
@@ -2331,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __pagevec_free(struct pagevec *pvec)
-{
- int i = pagevec_count(pvec);
-
- while (--i >= 0) {
- trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
- free_hot_cold_page(pvec->pages[i], pvec->cold);
- }
-}
-
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
@@ -3380,9 +3488,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
unsigned long block_migratetype;
int reserve;
- /* Get the start pfn, end pfn and the number of blocks to reserve */
+ /*
+ * Get the start pfn, end pfn and the number of blocks to reserve
+ * We have to be careful to be aligned to pageblock_nr_pages to
+ * make sure that we always check pfn_valid for the first page in
+ * the block.
+ */
start_pfn = zone->zone_start_pfn;
end_pfn = start_pfn + zone->spanned_pages;
+ start_pfn = roundup(start_pfn, pageblock_nr_pages);
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
@@ -3404,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
if (page_to_nid(page) != zone_to_nid(zone))
continue;
- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);
- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}
/*
@@ -3734,7 +3856,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
return 0;
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -4002,7 +4124,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
-#else
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zones_size)
@@ -4020,7 +4142,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
return zholes_size[zone_type];
}
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
@@ -4140,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list l;
+ enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4190,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l)
- INIT_LIST_HEAD(&zone->lru[l].list);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4243,10 +4365,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4271,7 +4393,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat, zones_size, zholes_size);
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#if MAX_NUMNODES > 1
/*
@@ -4292,201 +4414,6 @@ static inline void setup_nr_node_ids(void)
}
#endif
-#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-/*
- * Common iterator interface used to define for_each_mem_pfn_range().
- */
-void __meminit __next_mem_pfn_range(int *idx, int nid,
- unsigned long *out_start_pfn,
- unsigned long *out_end_pfn, int *out_nid)
-{
- struct node_active_region *r = NULL;
-
- while (++*idx < nr_nodemap_entries) {
- if (nid == MAX_NUMNODES || nid == early_node_map[*idx].nid) {
- r = &early_node_map[*idx];
- break;
- }
- }
- if (!r) {
- *idx = -1;
- return;
- }
-
- if (out_start_pfn)
- *out_start_pfn = r->start_pfn;
- if (out_end_pfn)
- *out_end_pfn = r->end_pfn;
- if (out_nid)
- *out_nid = r->nid;
-}
-
-/**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
- *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
- */
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- int i;
-
- mminit_dprintk(MMINIT_TRACE, "memory_register",
- "Entering add_active_range(%d, %#lx, %#lx) "
- "%d entries of %d used\n",
- nid, start_pfn, end_pfn,
- nr_nodemap_entries, MAX_ACTIVE_REGIONS);
-
- mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
-
- /* Merge with existing active regions if possible */
- for (i = 0; i < nr_nodemap_entries; i++) {
- if (early_node_map[i].nid != nid)
- continue;
-
- /* Skip if an existing region covers this new one */
- if (start_pfn >= early_node_map[i].start_pfn &&
- end_pfn <= early_node_map[i].end_pfn)
- return;
-
- /* Merge forward if suitable */
- if (start_pfn <= early_node_map[i].end_pfn &&
- end_pfn > early_node_map[i].end_pfn) {
- early_node_map[i].end_pfn = end_pfn;
- return;
- }
-
- /* Merge backward if suitable */
- if (start_pfn < early_node_map[i].start_pfn &&
- end_pfn >= early_node_map[i].start_pfn) {
- early_node_map[i].start_pfn = start_pfn;
- return;
- }
- }
-
- /* Check that early_node_map is large enough */
- if (i >= MAX_ACTIVE_REGIONS) {
- printk(KERN_CRIT "More than %d memory regions, truncating\n",
- MAX_ACTIVE_REGIONS);
- return;
- }
-
- early_node_map[i].nid = nid;
- early_node_map[i].start_pfn = start_pfn;
- early_node_map[i].end_pfn = end_pfn;
- nr_nodemap_entries = i + 1;
-}
-
-/**
- * remove_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @start_pfn: The new PFN of the range
- * @end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept near the end physical page range that has already been
- * registered. This function allows an arch to shrink an existing registered
- * range.
- */
-void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long this_start_pfn, this_end_pfn;
- int i, j;
- int removed = 0;
-
- printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
- nid, start_pfn, end_pfn);
-
- /* Find the old active region end and shrink */
- for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
- if (this_start_pfn >= start_pfn && this_end_pfn <= end_pfn) {
- /* clear it */
- early_node_map[i].start_pfn = 0;
- early_node_map[i].end_pfn = 0;
- removed = 1;
- continue;
- }
- if (this_start_pfn < start_pfn && this_end_pfn > start_pfn) {
- early_node_map[i].end_pfn = start_pfn;
- if (this_end_pfn > end_pfn)
- add_active_range(nid, end_pfn, this_end_pfn);
- continue;
- }
- if (this_start_pfn >= start_pfn && this_end_pfn > end_pfn &&
- this_start_pfn < end_pfn) {
- early_node_map[i].start_pfn = end_pfn;
- continue;
- }
- }
-
- if (!removed)
- return;
-
- /* remove the blank ones */
- for (i = nr_nodemap_entries - 1; i > 0; i--) {
- if (early_node_map[i].nid != nid)
- continue;
- if (early_node_map[i].end_pfn)
- continue;
- /* we found it, get rid of it */
- for (j = i; j < nr_nodemap_entries - 1; j++)
- memcpy(&early_node_map[j], &early_node_map[j+1],
- sizeof(early_node_map[j]));
- j = nr_nodemap_entries - 1;
- memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
- nr_nodemap_entries--;
- }
-}
-
-/**
- * remove_all_active_ranges - Remove all currently registered regions
- *
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
- */
-void __init remove_all_active_ranges(void)
-{
- memset(early_node_map, 0, sizeof(early_node_map));
- nr_nodemap_entries = 0;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
- struct node_active_region *arange = (struct node_active_region *)a;
- struct node_active_region *brange = (struct node_active_region *)b;
-
- /* Done this way to avoid overflows */
- if (arange->start_pfn > brange->start_pfn)
- return 1;
- if (arange->start_pfn < brange->start_pfn)
- return -1;
-
- return 0;
-}
-
-/* sort the node_map by start_pfn */
-void __init sort_node_map(void)
-{
- sort(early_node_map, (size_t)nr_nodemap_entries,
- sizeof(struct node_active_region),
- cmp_node_active_region, NULL);
-}
-#else /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline void sort_node_map(void)
-{
-}
-#endif
-
/**
* node_map_pfn_alignment - determine the maximum internode alignment
*
@@ -4740,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (zone->present_pages)
+ if (zone->present_pages) {
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ break;
+ }
}
#endif
}
@@ -4764,9 +4693,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
unsigned long start_pfn, end_pfn;
int i, nid;
- /* Sort early_node_map as initialisation assumes it is sorted */
- sort_node_map();
-
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
@@ -4867,7 +4793,7 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/**
* set_dma_reserve - set the specified number of pages reserved in the first zone
@@ -4951,8 +4877,19 @@ static void calculate_totalreserve_pages(void)
if (max > zone->present_pages)
max = zone->present_pages;
reserve_pages += max;
+ /*
+ * Lowmem reserves are not available to
+ * GFP_HIGHUSER page cache allocations and
+ * kswapd tries to balance zones to their high
+ * watermark. As a result, neither should be
+ * regarded as dirtyable memory, to prevent a
+ * situation where reclaim has to clean pages
+ * in order to balance the zones.
+ */
+ zone->dirty_balance_reserve = max;
}
}
+ dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}