aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c516
1 files changed, 237 insertions, 279 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1bad301820c7..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,8 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -2535,9 +2537,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2552,6 +2559,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2565,8 +2577,13 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -2934,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2947,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
- else
- min -= min / 4;
+ } else {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that it's definitely going to be in
+ * the exit path shortly and free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3188,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
+ if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3275,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3494,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3508,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3516,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -3607,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
{
- if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ if (!tsk_is_oom_victim(tsk))
return false;
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+ return false;
+
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
if (gfp_mask & __GFP_MEMALLOC)
- return true;
+ return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- return true;
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- return true;
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
- return false;
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
@@ -3774,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
+ int reserve_flags;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3879,15 +3977,16 @@ retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
- if (gfp_pfmemalloc_allowed(gfp_mask))
- alloc_flags = ALLOC_NO_WATERMARKS;
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = reserve_flags;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
@@ -3964,8 +4063,8 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) &&
- (alloc_flags == ALLOC_NO_WATERMARKS ||
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4045,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -4447,7 +4547,7 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -4476,7 +4576,7 @@ void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_page_state(NR_FREE_PAGES);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -4611,11 +4711,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE),
- global_page_state(NR_FREE_PAGES),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
free_pcp,
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4777,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
- zoneref_set_zone(zone,
- &zonelist->_zonerefs[nr_zones++]);
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
@@ -4796,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
return nr_zones;
}
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
#ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
static int __parse_numa_zonelist_order(char *s)
{
- if (*s == 'd' || *s == 'D') {
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
- } else if (*s == 'n' || *s == 'N') {
- user_zonelist_order = ZONELIST_ORDER_NODE;
- } else if (*s == 'z' || *s == 'Z') {
- user_zonelist_order = ZONELIST_ORDER_ZONE;
- } else {
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
+ /*
+ * We used to support different zonlists modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still use the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4849,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- int ret;
-
if (!s)
return 0;
- ret = __parse_numa_zonelist_order(s);
- if (ret == 0)
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
- return ret;
+ return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
+char numa_zonelist_order[] = "Node";
+
/*
* sysctl handler for numa_zonelist_order
*/
@@ -4869,42 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ char *str;
int ret;
- static DEFINE_MUTEX(zl_order_mutex);
- mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
- ret = proc_dostring(table, write, buffer, length, ppos);
- if (ret)
- goto out;
- if (write) {
- int oldval = user_zonelist_order;
+ if (!write)
+ return proc_dostring(table, write, buffer, length, ppos);
+ str = memdup_user_nul(buffer, 16);
+ if (IS_ERR(str))
+ return PTR_ERR(str);
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
- /*
- * bogus value. restore saved string
- */
- strncpy((char *)table->data, saved_string,
- NUMA_ZONELIST_ORDER_LEN);
- user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order) {
- mem_hotplug_begin();
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- mem_hotplug_done();
- }
- }
-out:
- mutex_unlock(&zl_order_mutex);
+ ret = __parse_numa_zonelist_order(str);
+ kfree(str);
return ret;
}
@@ -4978,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -4996,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -5011,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
- int pos, j, node;
- int zone_type; /* needs to be signed */
- struct zone *z;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- pos = 0;
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (managed_zone(z)) {
- zoneref_set_zone(z,
- &zonelist->_zonerefs[pos++]);
- check_highest_zone(zone_type);
- }
- }
- }
- zonelist->_zonerefs[pos].zone = NULL;
- zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
- current_zonelist_order = default_zonelist_order();
- else
- current_zonelist_order = user_zonelist_order;
-}
static void build_zonelists(pg_data_t *pgdat)
{
- int i, node, load;
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
- struct zonelist *zonelist;
- unsigned int order = current_zonelist_order;
-
- /* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zonerefs[0].zone = NULL;
- zonelist->_zonerefs[0].zone_idx = 0;
- }
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5092,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- i = 0;
-
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
@@ -5104,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat)
node_distance(local_node, prev_node))
node_load[node] = load;
+ node_order[nr_nodes++] = node;
prev_node = node;
load--;
- if (order == ZONELIST_ORDER_NODE)
- build_zonelists_in_node_order(pgdat, node);
- else
- node_order[i++] = node; /* remember order */
- }
-
- if (order == ZONELIST_ORDER_ZONE) {
- /* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, i);
}
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
@@ -5142,21 +5111,17 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void set_zonelist_order(void)
-{
- current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
@@ -5169,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
@@ -5201,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat)
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
-
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
{
int nid;
- int cpu;
+ int __maybe_unused cpu;
pg_data_t *self = data;
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- }
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
-
- build_zonelists(pgdat);
- }
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
- /*
- * Initialize the boot_pagesets that are going to be used
- * for bootstrapping processors. The real pagesets for
- * each zone will be allocated later when the per cpu
- * allocator is available.
- *
- * boot_pagesets are used also for bootstrapping offline
- * cpus if the system is already booted because the pagesets
- * are needed to initialize allocators on a specific cpu too.
- * F.e. the percpu allocator needs the page allocator which
- * needs the percpu allocator in order to allocate its pagesets
- * (a chicken-egg dilemma).
- */
- for_each_possible_cpu(cpu) {
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ build_zonelists(pgdat);
+ }
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
@@ -5255,45 +5204,53 @@ static int __build_all_zonelists(void *data)
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
- if (cpu_online(cpu))
+ for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
- return 0;
+ spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
+ int cpu;
+
__build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
- * Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
{
- set_zonelist_order();
-
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
- if (zone)
- setup_zone_pageset(zone);
-#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+ __build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5309,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
@@ -5565,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
pageset_set_high_and_batch(zone, pcp);
}
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7019,9 +6975,11 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
- mutex_lock(&zonelists_mutex);
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
__setup_per_zone_wmarks();
- mutex_unlock(&zonelists_mutex);
+ spin_unlock(&lock);
}
/*