Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c | 319
1 file changed, 269 insertions(+), 50 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d5566..431214b941ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/memory.h>
+#include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -56,6 +57,22 @@
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
 /*
  * Array of node states.
  */
@@ -475,6 +492,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +507,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -505,8 +521,29 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
@@ -599,20 +636,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	spin_unlock(&zone->lock);
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0 ; i < (1 << order) ; ++i)
-		bad += free_pages_check(page + i);
+	for (i = 0; i < (1 << order); i++) {
+		struct page *pg = page + i;
+
+		if (PageAnon(pg))
+			pg->mapping = NULL;
+		bad += free_pages_check(pg);
+	}
 	if (bad)
-		return;
+		return false;
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int wasMlocked = __TestClearPageMlocked(page);
+
+	if (!free_pages_prepare(page, order))
+		return;
+
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -1107,21 +1158,9 @@ void free_hot_cold_page(struct page *page, int cold)
 	int migratetype;
 	int wasMlocked = __TestClearPageMlocked(page);
 
-	trace_mm_page_free_direct(page, 0);
-	kmemcheck_free_shadow(page, 0);
-
-	if (PageAnon(page))
-		page->mapping = NULL;
-	if (free_pages_check(page))
+	if (!free_pages_prepare(page, 0))
 		return;
 
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-	}
-	arch_free_page(page, 0);
-	kernel_map_pages(page, 1, 0);
-
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
 	local_irq_save(flags);
@@ -1188,6 +1227,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
@@ -1693,6 +1777,62 @@ out:
 	return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page;
+
+	if (!order || compaction_deferred(preferred_zone))
+		return NULL;
+
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+								nodemask);
+	if (*did_some_progress != COMPACT_SKIPPED) {
+
+		/* Page migration frees to the PCP lists but we want merging */
+		drain_pages(get_cpu());
+		put_cpu();
+
+		page = get_page_from_freelist(gfp_mask, nodemask,
+				order, zonelist, high_zoneidx,
+				alloc_flags, preferred_zone,
+				migratetype);
+		if (page) {
+			preferred_zone->compact_considered = 0;
+			preferred_zone->compact_defer_shift = 0;
+			count_vm_event(COMPACTSUCCESS);
+			return page;
+		}
+
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+	}
+
+	return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2019,15 @@ rebalance:
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
+	/* Try direct compaction */
+	page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+	if (page)
+		goto got_pg;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -1970,10 +2119,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
-	if (!preferred_zone)
+	if (!preferred_zone) {
+		put_mems_allowed();
 		return NULL;
+	}
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2135,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	put_mems_allowed();
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 	return page;
@@ -2434,8 +2587,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			strncpy((char*)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order)
-			build_all_zonelists();
+		} else if (oldval != user_zonelist_order) {
+			mutex_lock(&zonelists_mutex);
+			build_all_zonelists(NULL);
+			mutex_unlock(&zonelists_mutex);
+		}
 	}
 out:
 	mutex_unlock(&zl_order_mutex);
@@ -2579,10 +2735,10 @@ static int default_zonelist_order(void)
 	struct zone *z;
 	int average_size;
 	/*
-         * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+         * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
 	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
 	 */
 	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
 	low_kmem_size = 0;
@@ -2594,6 +2750,15 @@ static int default_zonelist_order(void)
 				if (zone_type < ZONE_NORMAL)
 					low_kmem_size += z->present_pages;
 				total_size += z->present_pages;
+			} else if (zone_type == ZONE_NORMAL) {
+				/*
+				 * If any node has only lowmem, then node order
+				 * is preferred to allow kernel allocations
+				 * locally; otherwise, they can easily infringe
+				 * on other nodes when there is an abundance of
+				 * lowmem available to allocate from.
+				 */
+				return ZONELIST_ORDER_NODE;
 			}
 		}
 	}
@@ -2707,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
 		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
 }
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+	struct zone *zone;
+
+	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+				   gfp_zone(GFP_KERNEL),
+				   NULL,
+				   &zone);
+	return zone->node;
+}
+#endif
 
 #else	/* CONFIG_NUMA */
 
@@ -2776,9 +2959,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
@@ -2793,6 +2983,14 @@ static int __build_all_zonelists(void *dummy)
 		build_zonelist_cache(pgdat);
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* Setup real pagesets for the new zone */
+	if (data) {
+		struct zone *zone = data;
+		setup_zone_pageset(zone);
+	}
+#endif
+
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -2806,13 +3004,31 @@ static int __build_all_zonelists(void *dummy)
 	 * needs the percpu allocator in order to allocate its pagesets
 	 * (a chicken-egg dilemma).
 	 */
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+		/*
+		 * We now know the "local memory node" for each node--
+		 * i.e., the node of the first zone in the generic zonelist.
+		 * Set up numa_mem percpu variable for on-line cpus.  During
+		 * boot, only the boot cpu should be on-line;  we'll init the
+		 * secondary cpus' numa_mem as they come on-line.  During
+		 * node/memory hotplug, we'll fixup all on-line cpus.
+		 */
+		if (cpu_online(cpu))
+			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+	}
+
 	return 0;
 }
 
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
 {
 	set_zonelist_order();
 
@@ -2823,7 +3039,7 @@ void build_all_zonelists(void)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, data, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -3146,31 +3362,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 		pcp->batch = PAGE_SHIFT * 8;
 }
 
+static __meminit void setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+		setup_pageset(pcp, zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(pcp,
+				(zone->present_pages /
+					percpu_pagelist_fraction));
+	}
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used by this processorr
- * after setup_per_cpu_pageset().
  */
 void __init setup_per_cpu_pageset(void)
 {
 	struct zone *zone;
-	int cpu;
-
-	for_each_populated_zone(zone) {
-		zone->pageset = alloc_percpu(struct per_cpu_pageset);
-
-		for_each_possible_cpu(cpu) {
-			struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
-
-			setup_pageset(pcp, zone_batchsize(zone));
-			if (percpu_pagelist_fraction)
-				setup_pagelist_highmark(pcp,
-					(zone->present_pages /
-						percpu_pagelist_fraction));
-		}
-	}
+
+	for_each_populated_zone(zone)
+		setup_zone_pageset(zone);
 }
 
 static noinline __init_refok
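The tail-placement heuristic added to __free_one_page() above relies on the buddy index arithmetic behind __page_find_buddy() and __find_combined_index(): the buddy of a block is found by flipping bit 'order' of its index, and the merged block starts at the index with that bit cleared. The standalone C program below is a minimal userspace sketch of that arithmetic on plain indices; the helper names and the example index are chosen here for illustration only, it is not kernel code.

#include <stdio.h>

/* Buddy of the block at page_idx: flip the order-th bit of the index. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* Start index of the block obtained by merging page_idx with its buddy. */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* an order-2 block at index 12 */
	unsigned int order = 2;
	unsigned long combined = combined_index(page_idx, order);

	printf("order-%u buddy of %lu: %lu\n",
	       order, page_idx, buddy_index(page_idx, order));
	printf("combined order-%u block starts at %lu\n",
	       order + 1, combined);
	/* This is the "buddy of the next-highest order" the new code tests. */
	printf("order-%u buddy of the combined block: %lu\n",
	       order + 1, buddy_index(combined, order + 1));
	return 0;
}

If that next-highest-order buddy is itself free, the freed page goes to the tail of its free list, giving the pending merge a better chance before the page is reallocated.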
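On the COMPACTFAIL path, __alloc_pages_direct_compact() calls defer_compaction() and later consults compaction_deferred(), and on success it clears compact_considered and compact_defer_shift. Those helpers live outside this file, in the compaction headers added by the same series. The following is a minimal userspace sketch of that exponential-backoff pattern, not the kernel implementation: struct zone_stub, the cap value, and the demo loop are invented here for illustration.

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEFER_SHIFT 6	/* illustrative cap on the backoff exponent */

struct zone_stub {
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

/* Called after a failed compaction run: back off exponentially. */
static void defer_compaction(struct zone_stub *zone)
{
	zone->compact_considered = 0;
	if (++zone->compact_defer_shift > MAX_DEFER_SHIFT)
		zone->compact_defer_shift = MAX_DEFER_SHIFT;
}

/* Returns true while compaction should still be skipped for this zone. */
static bool compaction_deferred(struct zone_stub *zone)
{
	unsigned int defer_limit = 1U << zone->compact_defer_shift;

	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	return zone->compact_considered < defer_limit;
}

int main(void)
{
	struct zone_stub zone = { 0, 0 };
	int attempt;

	defer_compaction(&zone);	/* pretend one compaction run just failed */
	for (attempt = 1; attempt <= 4; attempt++)
		printf("attempt %d: deferred=%d\n", attempt,
		       compaction_deferred(&zone));
	return 0;
}

Each failure doubles how many allocation attempts skip compaction, which is why a successful compacted allocation in the patch resets both counters to zero.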
