Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               23
-rw-r--r--  mm/Makefile               3
-rw-r--r--  mm/balloon_compaction.c 302
-rw-r--r--  mm/bootmem.c             79
-rw-r--r--  mm/compaction.c         160
-rw-r--r--  mm/dmapool.c             55
-rw-r--r--  mm/highmem.c             29
-rw-r--r--  mm/huge_memory.c        641
-rw-r--r--  mm/hugetlb.c             52
-rw-r--r--  mm/hugetlb_cgroup.c      23
-rw-r--r--  mm/internal.h            12
-rw-r--r--  mm/ksm.c                 27
-rw-r--r--  mm/memcontrol.c         229
-rw-r--r--  mm/memory-failure.c      43
-rw-r--r--  mm/memory.c             229
-rw-r--r--  mm/memory_hotplug.c     424
-rw-r--r--  mm/mempolicy.c          340
-rw-r--r--  mm/migrate.c            438
-rw-r--r--  mm/mmap.c               569
-rw-r--r--  mm/mprotect.c           137
-rw-r--r--  mm/mremap.c               4
-rw-r--r--  mm/nobootmem.c           22
-rw-r--r--  mm/nommu.c               15
-rw-r--r--  mm/oom_kill.c           138
-rw-r--r--  mm/page-writeback.c      11
-rw-r--r--  mm/page_alloc.c         314
-rw-r--r--  mm/page_cgroup.c          5
-rw-r--r--  mm/page_isolation.c      27
-rw-r--r--  mm/pagewalk.c             2
-rw-r--r--  mm/percpu.c               5
-rw-r--r--  mm/pgtable-generic.c      9
-rw-r--r--  mm/rmap.c               134
-rw-r--r--  mm/shmem.c              118
-rw-r--r--  mm/slub.c                 4
-rw-r--r--  mm/sparse.c              35
-rw-r--r--  mm/swapfile.c            31
-rw-r--r--  mm/util.c                 2
-rw-r--r--  mm/vmalloc.c              4
-rw-r--r--  mm/vmscan.c             111
-rw-r--r--  mm/vmstat.c              28
40 files changed, 3630 insertions, 1204 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..71259e052ce8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,14 @@ config NO_BOOTMEM
config MEMORY_ISOLATION
boolean
+config MOVABLE_NODE
+ boolean "Enable to assign a node which has only movable memory"
+ depends on HAVE_MEMBLOCK
+ depends on NO_BOOTMEM
+ depends on X86_64
+ depends on NUMA
+ depends on BROKEN
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -188,6 +196,21 @@ config SPLIT_PTLOCK_CPUS
default "4"
#
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+ bool "Allow for balloon memory compaction/migration"
+ def_bool y
+ depends on COMPACTION && VIRTIO_BALLOON
+ help
+ Memory fragmentation introduced by ballooning might significantly
+ reduce the number of 2MB contiguous memory blocks that can be used
+ within a guest, thus imposing performance penalties associated with
+ the reduced number of transparent huge pages that could be used by
+ the guest workload. Allowing compaction and migration of memory
+ pages enlisted as part of memory balloon devices avoids this
+ scenario and helps improve memory defragmentation.
+
+#
# support for memory compaction
config COMPACTION
bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o interval_tree.o $(mmu-y)
+ compaction.o balloon_compaction.o \
+ interval_tree.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+/*
+ * balloon_devinfo_alloc - allocates a balloon device information descriptor.
+ * @balloon_dev_descriptor: pointer to reference the balloon device which
+ * this struct balloon_dev_info will be servicing.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct balloon_dev_info which will be used to reference a balloon device
+ * as well as to keep track of the balloon device page list.
+ */
+struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+{
+ struct balloon_dev_info *b_dev_info;
+ b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
+ if (!b_dev_info)
+ return ERR_PTR(-ENOMEM);
+
+ b_dev_info->balloon_device = balloon_dev_descriptor;
+ b_dev_info->mapping = NULL;
+ b_dev_info->isolated_pages = 0;
+ spin_lock_init(&b_dev_info->pages_lock);
+ INIT_LIST_HEAD(&b_dev_info->pages);
+
+ return b_dev_info;
+}
+EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+
+/*
+ * balloon_page_enqueue - allocates a new page and inserts it into the balloon
+ * page list.
+ * @b_dev_info: balloon device descriptor where we will insert a new page
+ *
+ * Driver must call it to properly allocate a new enlisted balloon page
+ * before definitively removing it from the guest system.
+ * This function returns the address of the newly enqueued page, or
+ * NULL if we fail to allocate a new page this turn.
+ */
+struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+{
+ unsigned long flags;
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
+ if (!page)
+ return NULL;
+
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point.
+ */
+ BUG_ON(!trylock_page(page));
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - removes a page from balloon's page list and returns
+ * its address to allow the driver to release the page.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
+ *
+ * Driver must call it to properly de-allocate a previously enlisted balloon
+ * page before definitively releasing it back to the guest system.
+ * This function returns the address of the dequeued page, or NULL if the
+ * balloon's page list is temporarily empty because compaction has
+ * isolated its pages.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ bool dequeued_page;
+
+ dequeued_page = false;
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ /*
+ * Block others from accessing the 'page' while we get around to
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (trylock_page(page)) {
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ /*
+ * Raise the page refcount here to prevent any wrong
+ * attempt to isolate this page, in case of colliding
+ * with balloon_page_isolate() just after we release
+ * the page lock.
+ *
+ * balloon_page_free() will take care of dropping
+ * this extra refcount later.
+ */
+ get_page(page);
+ balloon_page_delete(page);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ dequeued_page = true;
+ break;
+ }
+ }
+
+ if (!dequeued_page) {
+ /*
+ * If we are unable to dequeue a balloon page because the page
+ * list is empty and there are no isolated pages, then something
+ * went wrong and some balloon pages are lost.
+ * BUG() here, otherwise the balloon driver may get stuck into
+ * an infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ if (unlikely(list_empty(&b_dev_info->pages) &&
+ !b_dev_info->isolated_pages))
+ BUG();
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
+ * @b_dev_info: holds the balloon device information descriptor.
+ * @a_ops: balloon_mapping address_space_operations descriptor.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct address_space which will be used as the special page->mapping for
+ * balloon device enlisted page instances.
+ */
+struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+ const struct address_space_operations *a_ops)
+{
+ struct address_space *mapping;
+
+ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Give a clean 'zeroed' status to all elements of this special
+ * balloon page->mapping struct address_space instance.
+ */
+ address_space_init_once(mapping);
+
+ /*
+ * Set mapping->flags appropriately, to allow balloon pages
+ * ->mapping identification.
+ */
+ mapping_set_balloon(mapping);
+ mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
+
+ /* balloon's page->mapping->a_ops callback descriptor */
+ mapping->a_ops = a_ops;
+
+ /*
+ * Establish a pointer reference back to the balloon device descriptor
+ * this particular page->mapping will be servicing.
+ * This is used by compaction / migration procedures to identify and
+ * access the balloon device pageset while isolating / migrating pages.
+ *
+ * As some balloon drivers can register multiple balloon devices
+ * for a single guest, this also helps compaction / migration to
+ * properly deal with multiple balloon pagesets, when required.
+ */
+ mapping->private_data = b_dev_info;
+ b_dev_info->mapping = mapping;
+
+ return mapping;
+}
+EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
+
+static inline void __isolate_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_del(&page->lru);
+ b_dev_info->isolated_pages++;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline void __putback_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_add(&page->lru, &b_dev_info->pages);
+ b_dev_info->isolated_pages--;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline int __migrate_balloon_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
+}
+
+/* __isolate_lru_page() counterpart for a ballooned page */
+bool balloon_page_isolate(struct page *page)
+{
+ /*
+ * Avoid burning cycles with pages that are yet under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a balloon page being freed under us and
+ * raise its refcount, preventing __free_pages() from doing its job,
+ * the put_page() at the end of this block will take care of
+ * releasing this page, thus avoiding a nasty leakage.
+ */
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * As balloon pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the balloon driver releasing a page.
+ *
+ * In order to avoid having an already isolated balloon page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released by
+ * the balloon driver, let's be sure we have the page lock
+ * before proceeding with the balloon page isolation steps.
+ */
+ if (likely(trylock_page(page))) {
+ /*
+ * A ballooned page, by default, has just one refcount.
+ * Prevent concurrent compaction threads from isolating
+ * an already isolated balloon page by refcount check.
+ */
+ if (__is_movable_balloon_page(page) &&
+ page_count(page) == 2) {
+ __isolate_balloon_page(page);
+ unlock_page(page);
+ return true;
+ }
+ unlock_page(page);
+ }
+ put_page(page);
+ }
+ return false;
+}
+
+/* putback_lru_page() counterpart for a ballooned page */
+void balloon_page_putback(struct page *page)
+{
+ /*
+ * 'lock_page()' stabilizes the page and prevents races against
+ * concurrent isolation threads attempting to re-isolate it.
+ */
+ lock_page(page);
+
+ if (__is_movable_balloon_page(page)) {
+ __putback_balloon_page(page);
+ /* drop the extra ref count taken for page isolation */
+ put_page(page);
+ } else {
+ WARN_ON(1);
+ dump_page(page);
+ }
+ unlock_page(page);
+}
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct address_space *mapping;
+ int rc = -EAGAIN;
+
+ /*
+ * Block others from accessing the 'newpage' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'newpage' at this point.
+ */
+ BUG_ON(!trylock_page(newpage));
+
+ if (WARN_ON(!__is_movable_balloon_page(page))) {
+ dump_page(page);
+ unlock_page(newpage);
+ return rc;
+ }
+
+ mapping = page->mapping;
+ if (mapping)
+ rc = __migrate_balloon_page(mapping, newpage, page, mode);
+
+ unlock_page(newpage);
+ return rc;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
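The new API above is meant to be driven by a balloon driver. A minimal, hypothetical sketch of that wiring follows; only the balloon_* calls and their signatures come from this patch (balloon_page_free() is referenced in its comments), every example_* name is invented for illustration, and the fragment is a sketch rather than something expected to build outside a kernel tree.

#include <linux/balloon_compaction.h>
#include <linux/err.h>

/* Hypothetical per-device state kept by the example driver. */
static struct balloon_dev_info *example_b_dev_info;

/* Driver-provided a_ops->migratepage callback; a real driver would move its
 * private state from 'page' to 'newpage' and tell the host about the swap. */
static int example_balloon_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode)
{
        return -EAGAIN;                 /* stub: keep the page where it is */
}

static const struct address_space_operations example_balloon_aops = {
        .migratepage = example_balloon_migratepage,
};

static int example_balloon_init(void *example_device)
{
        struct address_space *mapping;

        example_b_dev_info = balloon_devinfo_alloc(example_device);
        if (IS_ERR(example_b_dev_info))
                return PTR_ERR(example_b_dev_info);

        /* Special ->mapping so compaction can recognize balloon pages. */
        mapping = balloon_mapping_alloc(example_b_dev_info,
                                        &example_balloon_aops);
        if (IS_ERR(mapping))
                return PTR_ERR(mapping);
        return 0;
}

/* Inflate: steal one page from the guest and track it on the balloon list. */
static int example_balloon_inflate_one(void)
{
        struct page *page = balloon_page_enqueue(example_b_dev_info);

        return page ? 0 : -ENOMEM;      /* report 'page' to the host here */
}

/* Deflate: hand one tracked page back to the guest. */
static void example_balloon_deflate_one(void)
{
        struct page *page = balloon_page_dequeue(example_b_dev_info);

        if (page)
                balloon_page_free(page);        /* drops the extra refcount */
}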
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..1324cd74faec 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
/*
* free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting address of the range
+ * @addr: starting physical address of the range
* @size: size of the range in bytes
*
* This is only useful when the bootmem allocator has already been torn
* down, but we are still initializing the system. Pages are given directly
* to the page allocator, no bootmem metadata is updated because it is gone.
*/
-void __init free_bootmem_late(unsigned long addr, unsigned long size)
+void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
- kmemleak_free_part(__va(addr), size);
+ kmemleak_free_part(__va(physaddr), size);
- cursor = PFN_UP(addr);
- end = PFN_DOWN(addr + size);
+ cursor = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
for (; cursor < end; cursor++) {
__free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ if (!is_highmem(z))
+ z->managed_pages = 0;
+}
+
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
* @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
return free_all_bootmem_core(pgdat->bdata);
}
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +398,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
/**
* free_bootmem - mark a page range as usable
- * @addr: starting address of the range
+ * @addr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
*
* The range must be contiguous but may span node boundaries.
*/
-void __init free_bootmem(unsigned long addr, unsigned long size)
+void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
unsigned long start, end;
- kmemleak_free_part(__va(addr), size);
+ kmemleak_free_part(__va(physaddr), size);
- start = PFN_UP(addr);
- end = PFN_DOWN(addr + size);
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
mark_bootmem(start, end, 0, 0);
}
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
-int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
- int flags)
-{
- return reserve_bootmem(phys, len, flags);
-}
-
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
@@ -575,27 +590,6 @@ find_block:
return NULL;
}
-static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
- unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
-#ifdef CONFIG_HAVE_ARCH_BOOTMEM
- {
- bootmem_data_t *p_bdata;
-
- p_bdata = bootmem_arch_preferred_node(bdata, size, align,
- goal, limit);
- if (p_bdata)
- return alloc_bootmem_bdata(p_bdata, size, align,
- goal, limit);
- }
-#endif
- return NULL;
-}
-
static void * __init alloc_bootmem_core(unsigned long size,
unsigned long align,
unsigned long goal,
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
bootmem_data_t *bdata;
void *region;
- region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
- if (region)
- return region;
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
list_for_each_entry(bdata, &bdata_list, list) {
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
{
void *ptr;
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
again:
- ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
- align, goal, limit);
- if (ptr)
- return ptr;
/* do not panic in alloc_bootmem_bdata() */
if (limit && goal + size > limit)
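One detail worth spelling out about the kerneldoc above: because free_bootmem() and free_bootmem_late() round with PFN_UP()/PFN_DOWN(), only pages lying entirely inside [physaddr, physaddr + size) are returned to the page allocator, and partial pages at either end stay reserved. A small userspace illustration of that arithmetic (the PAGE_SHIFT value and the macro bodies are local assumptions mirroring the kernel's definitions):

#include <stdio.h>

#define PAGE_SHIFT 12UL                         /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
        unsigned long physaddr = 0x1800;        /* range starts mid-page */
        unsigned long size = 0x3000;            /* 12 KiB */
        unsigned long cursor = PFN_UP(physaddr);           /* 2 */
        unsigned long end = PFN_DOWN(physaddr + size);     /* 4 */

        /* Only pfns 2 and 3 get freed; the partial pages at both ends of
         * the byte range remain reserved, as the kerneldoc states. */
        printf("freeing pfns [%lu, %lu)\n", cursor, end);
        return 0;
}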
diff --git a/mm/compaction.c b/mm/compaction.c
index 9eef55838fca..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -214,60 +215,6 @@ static bool suitable_migration_target(struct page *page)
return false;
}
-static void compact_capture_page(struct compact_control *cc)
-{
- unsigned long flags;
- int mtype, mtype_low, mtype_high;
-
- if (!cc->page || *cc->page)
- return;
-
- /*
- * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
- * regardless of the migratetype of the freelist it is captured from.
- * This is fine because the order for a high-order MIGRATE_MOVABLE
- * allocation is typically at least a pageblock size and overall
- * fragmentation is not impaired. Other allocation types must
- * capture pages from their own migratelist because otherwise they
- * could pollute other pageblocks like MIGRATE_MOVABLE with
- * difficult to move pages and making fragmentation worse overall.
- */
- if (cc->migratetype == MIGRATE_MOVABLE) {
- mtype_low = 0;
- mtype_high = MIGRATE_PCPTYPES;
- } else {
- mtype_low = cc->migratetype;
- mtype_high = cc->migratetype + 1;
- }
-
- /* Speculatively examine the free lists without zone lock */
- for (mtype = mtype_low; mtype < mtype_high; mtype++) {
- int order;
- for (order = cc->order; order < MAX_ORDER; order++) {
- struct page *page;
- struct free_area *area;
- area = &(cc->zone->free_area[order]);
- if (list_empty(&area->free_list[mtype]))
- continue;
-
- /* Take the lock and attempt capture of the page */
- if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
- return;
- if (!list_empty(&area->free_list[mtype])) {
- page = list_entry(area->free_list[mtype].next,
- struct page, lru);
- if (capture_free_page(page, cc->order, mtype)) {
- spin_unlock_irqrestore(&cc->zone->lock,
- flags);
- *cc->page = page;
- return;
- }
- }
- spin_unlock_irqrestore(&cc->zone->lock, flags);
- }
- }
-}
-
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
+ count_vm_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_vm_events(COMPACTISOLATED, total_isolated);
+
return total_isolated;
}
@@ -565,9 +516,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
goto next_pageblock;
}
- /* Check may be lockless but that's ok as we recheck later */
- if (!PageLRU(page))
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU pages and balloon pages;
+ * skip any other type of page.
+ */
+ if (!PageLRU(page)) {
+ if (unlikely(balloon_page_movable(page))) {
+ if (locked && balloon_page_isolate(page)) {
+ /* Successfully isolated */
+ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+ goto check_compact_cluster;
+ }
+ }
continue;
+ }
/*
* PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +587,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
cc->nr_migratepages++;
nr_isolated++;
+check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -646,6 +613,10 @@ next_pageblock:
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_vm_events(COMPACTISOLATED, nr_isolated);
+
return low_pfn;
}
@@ -713,7 +684,15 @@ static void isolate_freepages(struct zone *zone,
/* Found a block suitable for isolating free pages from */
isolated = 0;
- end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+
+ /*
+ * As pfn may not start aligned, pfn+pageblock_nr_pages
+ * may cross a MAX_ORDER_NR_PAGES boundary and miss
+ * a pfn_valid check. Ensure isolate_freepages_block()
+ * only scans within a pageblock
+ */
+ end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ end_pfn = min(end_pfn, zone_end_pfn);
isolated = isolate_freepages_block(cc, pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
@@ -928,6 +907,60 @@ unsigned long compaction_suitable(struct zone *zone, int order)
return COMPACT_CONTINUE;
}
+static void compact_capture_page(struct compact_control *cc)
+{
+ unsigned long flags;
+ int mtype, mtype_low, mtype_high;
+
+ if (!cc->page || *cc->page)
+ return;
+
+ /*
+ * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+ * regardless of the migratetype of the freelist it is captured from.
+ * This is fine because the order for a high-order MIGRATE_MOVABLE
+ * allocation is typically at least a pageblock size and overall
+ * fragmentation is not impaired. Other allocation types must
+ * capture pages from their own migratelist because otherwise they
+ * could pollute other pageblocks like MIGRATE_MOVABLE with
+ * difficult to move pages and making fragmentation worse overall.
+ */
+ if (cc->migratetype == MIGRATE_MOVABLE) {
+ mtype_low = 0;
+ mtype_high = MIGRATE_PCPTYPES;
+ } else {
+ mtype_low = cc->migratetype;
+ mtype_high = cc->migratetype + 1;
+ }
+
+ /* Speculatively examine the free lists without zone lock */
+ for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+ int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct page *page;
+ struct free_area *area;
+ area = &(cc->zone->free_area[order]);
+ if (list_empty(&area->free_list[mtype]))
+ continue;
+
+ /* Take the lock and attempt capture of the page */
+ if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+ return;
+ if (!list_empty(&area->free_list[mtype])) {
+ page = list_entry(area->free_list[mtype].next,
+ struct page, lru);
+ if (capture_free_page(page, cc->order, mtype)) {
+ spin_unlock_irqrestore(&cc->zone->lock,
+ flags);
+ *cc->page = page;
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+ }
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
@@ -978,7 +1011,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
- putback_lru_pages(&cc->migratepages);
+ putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
@@ -990,20 +1023,17 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ MR_COMPACTION);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
- count_vm_event(COMPACTBLOCKS);
- count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
- if (nr_remaining)
- count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
- /* Release LRU pages not migrated */
+ /* Release isolated pages not migrated */
if (err) {
- putback_lru_pages(&cc->migratepages);
+ putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
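The end_pfn clamp added to isolate_freepages() above is plain pfn arithmetic, so a tiny userspace example may help; the 512-page pageblock size and the ALIGN() macro body are assumptions made for the illustration only:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed 2 MB pageblocks, 4 KiB pages */
        unsigned long zone_end_pfn = 4096;
        unsigned long pfn = 1000;               /* free scanner not pageblock aligned */

        unsigned long old_end = pfn + pageblock_nr_pages;            /* 1512: crosses pfn 1024 */
        unsigned long new_end = ALIGN(pfn + 1, pageblock_nr_pages);  /* 1024: stays in block */

        if (new_end > zone_end_pfn)
                new_end = zone_end_pfn;
        printf("old end_pfn=%lu, new end_pfn=%lu\n", old_end, new_end);
        return 0;
}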
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
size_t allocation;
size_t boundary;
char name[32];
- wait_queue_head_t waitq;
struct list_head pools;
};
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
unsigned int offset;
};
-#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
-
static DEFINE_MUTEX(pools_lock);
static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
- init_waitqueue_head(&retval->waitq);
if (dev) {
int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
- list_add(&page->page_list, &pool->page_list);
page->in_use = 0;
page->offset = 0;
} else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_sleep_if(mem_flags & __GFP_WAIT);
spin_lock_irqsave(&pool->lock, flags);
- restart:
list_for_each_entry(page, &pool->page_list, page_list) {
if (page->offset < pool->allocation)
goto ready;
}
- page = pool_alloc_page(pool, GFP_ATOMIC);
- if (!page) {
- if (mem_flags & __GFP_WAIT) {
- DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_UNINTERRUPTIBLE);
- __add_wait_queue(&pool->waitq, &wait);
- spin_unlock_irqrestore(&pool->lock, flags);
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
- schedule_timeout(POOL_TIMEOUT_JIFFIES);
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
- spin_lock_irqsave(&pool->lock, flags);
- __remove_wait_queue(&pool->waitq, &wait);
- goto restart;
- }
- retval = NULL;
- goto done;
- }
+ spin_lock_irqsave(&pool->lock, flags);
+ list_add(&page->page_list, &pool->page_list);
ready:
page->in_use++;
offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
retval = offset + page->vaddr;
*handle = offset + page->dma;
#ifdef DMAPOOL_DEBUG
+ {
+ int i;
+ u8 *data = retval;
+ /* page->offset is stored in first 4 bytes */
+ for (i = sizeof(page->offset); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+ else
+ pr_err("dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break;
+ }
+ }
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
- done:
spin_unlock_irqrestore(&pool->lock, flags);
return retval;
}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
- if (waitqueue_active(&pool->waitq))
- wake_up_locked(&pool->waitq);
/*
* Resist a temptation to do
* if (!is_page_busy(page)) pool_free_page(pool, page);
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..d999077431df 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr)
unsigned long addr = (unsigned long)vaddr;
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+ int i = PKMAP_NR(addr);
return pte_page(pkmap_page_table[i]);
}
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void)
* So no dangers, even with speculative execution.
*/
page = pte_page(pkmap_page_table[i]);
- pte_clear(&init_mm, (unsigned long)page_address(page),
- &pkmap_page_table[i]);
+ pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
need_flush = 1;
@@ -324,11 +323,7 @@ struct page_address_map {
struct list_head list;
};
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool; /* freelist */
-static spinlock_t pool_lock; /* protects page_address_pool */
+static struct page_address_map page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual)
pas = page_slot(page);
if (virtual) { /* Add */
- BUG_ON(list_empty(&page_address_pool));
-
- spin_lock_irqsave(&pool_lock, flags);
- pam = list_entry(page_address_pool.next,
- struct page_address_map, list);
- list_del(&pam->list);
- spin_unlock_irqrestore(&pool_lock, flags);
-
+ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
pam->page = page;
pam->virtual = virtual;
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual)
if (pam->page == page) {
list_del(&pam->list);
spin_unlock_irqrestore(&pas->lock, flags);
- spin_lock_irqsave(&pool_lock, flags);
- list_add_tail(&pam->list, &page_address_pool);
- spin_unlock_irqrestore(&pool_lock, flags);
goto done;
}
}
@@ -425,20 +410,14 @@ done:
return;
}
-static struct page_address_map page_address_maps[LAST_PKMAP];
-
void __init page_address_init(void)
{
int i;
- INIT_LIST_HEAD(&page_address_pool);
- for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
- list_add(&page_address_maps[i].list, &page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
- spin_lock_init(&pool_lock);
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..32754eece63e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
+#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/migrate.h>
+
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
return err;
}
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
+
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return 0;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS
static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
+ &use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
&debug_cow_attr.attr,
#endif
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
goto out;
}
+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -599,13 +691,22 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
return pmd;
}
+static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+{
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ return entry;
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
- entry = mk_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- entry = pmd_mkhuge(entry);
+ entry = mk_huge_pmd(page, vma);
/*
* The spinlocking to take the lru_lock inside
* page_add_new_anon_rmap() acts as a full memory
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+ return true;
+}
+
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ unsigned long zero_pfn;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {
@@ -710,7 +849,8 @@ out:
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
goto out_unlock;
}
+ /*
+ * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * under splitting, since we don't split the page itself, only the pmd
+ * into a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ unsigned long zero_pfn;
+ bool set;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
+ BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+ ret = 0;
+ goto out_unlock;
+ }
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(&src_mm->page_table_lock);
@@ -777,6 +937,102 @@ out:
return ret;
}
+void huge_pmd_set_accessed(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ int dirty)
+{
+ pmd_t entry;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto unlock;
+
+ entry = pmd_mkyoung(orig_pmd);
+ haddr = address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+ update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+ spin_unlock(&mm->page_table_lock);
+}
+
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_page;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
+ inc_mm_counter(mm, MM_ANONPAGES);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+out_free_page:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
+}
+
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
- haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
get_page(page);
spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
- if (ret & VM_FAULT_OOM)
- split_huge_page(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, orig_pmd, haddr);
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
+ put_page(page);
+ }
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
- split_huge_page(page);
- put_page(page);
+ if (page) {
+ split_huge_page(page);
+ put_page(page);
+ }
ret |= VM_FAULT_OOM;
goto out;
}
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ if (is_huge_zero_pmd(orig_pmd))
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
- put_page(page);
+ if (page)
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mn;
} else {
pmd_t entry;
- VM_BUG_ON(!PageHead(page));
- entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- entry = pmd_mkhuge(entry);
+ entry = mk_huge_pmd(new_page, vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- page_remove_rmap(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ } else {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
ret |= VM_FAULT_WRITE;
}
spin_unlock(&mm->page_table_lock);
@@ -1017,6 +1289,81 @@ out:
return page;
}
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
+
+ /* Confirm the PTE did not change while locked */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
+ goto clear_pmdnuma;
+ }
+
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+
+clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ if (page_locked)
+ unlock_page(page);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
pgtable = pgtable_trans_huge_withdraw(tlb->mm);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
- page = pmd_page(orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
pte_free(tlb->mm, pgtable);
ret = 1;
}
@@ -1099,7 +1452,7 @@ out:
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot)
+ unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ if (!prot_numa) {
+ entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
+ } else {
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd, *ret = NULL;
if (address & ~HPAGE_PMD_MASK)
goto out;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
-
- pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto out;
if (pmd_page(*pmd) != page)
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->mutex to
+ * and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1458,8 +1817,9 @@ int split_huge_page(struct page *page)
struct anon_vma *anon_vma;
int ret = 1;
+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
goto out;
ret = 0;
@@ -1472,7 +1832,7 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
out:
return ret;
}
@@ -1701,64 +2061,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
}
}
-static void release_all_pte_pages(pte_t *pte)
-{
- release_pte_pages(pte, pte + HPAGE_PMD_NR);
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
struct page *page;
pte_t *_pte;
- int referenced = 0, isolated = 0, none = 0;
+ int referenced = 0, none = 0;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval)) {
if (++none <= khugepaged_max_ptes_none)
continue;
- else {
- release_pte_pages(pte, _pte);
+ else
goto out;
- }
}
- if (!pte_present(pteval) || !pte_write(pteval)) {
- release_pte_pages(pte, _pte);
+ if (!pte_present(pteval) || !pte_write(pteval))
goto out;
- }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
- release_pte_pages(pte, _pte);
+ if (unlikely(!page))
goto out;
- }
+
VM_BUG_ON(PageCompound(page));
BUG_ON(!PageAnon(page));
VM_BUG_ON(!PageSwapBacked(page));
/* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1) {
- release_pte_pages(pte, _pte);
+ if (page_count(page) != 1)
goto out;
- }
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page)) {
- release_pte_pages(pte, _pte);
+ if (!trylock_page(page))
goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
if (isolate_lru_page(page)) {
unlock_page(page);
- release_pte_pages(pte, _pte);
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +2116,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
- if (unlikely(!referenced))
- release_all_pte_pages(pte);
- else
- isolated = 1;
+ if (likely(referenced))
+ return 1;
out:
- return isolated;
+ release_pte_pages(pte, _pte);
+ return 0;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +2262,26 @@ static struct page
}
#endif
+static bool hugepage_vma_check(struct vm_area_struct *vma)
+{
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ return false;
+
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ return true;
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
struct vm_area_struct *vma,
int node)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
@@ -1960,31 +2316,15 @@ static void collapse_huge_page(struct mm_struct *mm,
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
goto out;
-
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
-
- if (!vma->anon_vma || vma->vm_ops)
+ if (!hugepage_vma_check(vma))
goto out;
- if (is_vma_temporary_stack(vma))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, address);
- /* pmd can't go away or become huge under us */
- if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd))
goto out;
- anon_vma_lock(vma->anon_vma);
+ anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
@@ -2028,9 +2368,7 @@ static void collapse_huge_page(struct mm_struct *mm,
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_pmd(new_page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
- _pmd = pmd_mkhuge(_pmd);
+ _pmd = mk_huge_pmd(new_page, vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2402,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
unsigned long address,
struct page **hpage)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte, *_pte;
int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2412,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2523,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
-
- if ((!(vma->vm_flags & VM_HUGEPAGE) &&
- !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE)) {
- skip:
+ if (!hugepage_vma_check(vma)) {
+skip:
progress++;
continue;
}
- if (!vma->anon_vma || vma->vm_ops)
- goto skip;
- if (is_vma_temporary_stack(vma))
- goto skip;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
-
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart >= hend)
@@ -2356,19 +2677,65 @@ static int khugepaged(void *none)
return 0;
}
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
{
struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
@@ -2376,31 +2743,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
BUG_ON(pmd_trans_huge(*pmd));
}
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, address, pmd);
}
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
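For context: the callers above switch from split_huge_page_pmd(mm, pmd) to a vma/address based interface, and split_huge_page_pmd_mm() is added for callers that only hold an mm. A minimal sketch of the wrapper that presumably sits in include/linux/huge_mm.h after this change (reconstructed for illustration, not copied from that header):

	/* illustrative wrapper, assuming the usual pmd_trans_huge() fast path */
	#define split_huge_page_pmd_sketch(__vma, __address, __pmd)		\
		do {								\
			pmd_t *____pmd = (__pmd);				\
			if (unlikely(pmd_trans_huge(*____pmd)))			\
				__split_huge_page_pmd(__vma, __address,		\
						      ____pmd);			\
		} while (0)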
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..e5318c7793ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
/*
* Generic hugetlb support.
- * (C) William Irwin, April 2004
+ * (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
}
}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+ int nr_nodes = nodes_weight(node_states[N_MEMORY]);
while (nr_nodes) {
void *addr;
addr = __alloc_bootmem_node_nopanic(
NODE_DATA(hstate_next_node_to_alloc(h,
- &node_states[N_HIGH_MEMORY])),
+ &node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
- &node_states[N_HIGH_MEMORY]))
+ &node_states[N_MEMORY]))
break;
}
h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
} else if (nodes_allowed) {
/*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
init_nodemask_of_node(nodes_allowed, nid);
} else
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
* remove hstate attributes from any nodes that have them.
*/
for (nid = 0; nid < nr_node_ids; nid++)
- hugetlb_unregister_node(&node_devices[nid]);
+ hugetlb_unregister_node(node_devices[nid]);
}
/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_HIGH_MEMORY) {
- struct node *node = &node_devices[nid];
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
if (node->dev.id == nid)
hugetlb_register_node(node);
}
@@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
+ h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
/*
@@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
out:
@@ -2386,8 +2386,10 @@ again:
/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ pte_clear(mm, address, ptep);
continue;
+ }
page = pte_page(pte);
/*
@@ -3014,7 +3016,7 @@ same_page:
return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
@@ -3032,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (huge_pmd_unshare(mm, &address, ptep))
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
continue;
+ }
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
}
}
spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
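The hunk above makes hugetlb_change_protection() return the number of base pages whose protection actually changed (pages << h->order). A hedged sketch of a caller consuming that count; the wrapper name is hypothetical and the non-hugetlb branch is elided:

	static unsigned long change_prot_count_sketch(struct vm_area_struct *vma,
						      unsigned long start,
						      unsigned long end,
						      pgprot_t newprot)
	{
		/* hugetlb VMAs now report how many base pages were updated */
		if (is_vm_hugetlb_page(vma))
			return hugetlb_change_protection(vma, start, end, newprot);

		return 0;	/* regular VMAs are handled in mm/mprotect.c */
	}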
@@ -3170,7 +3178,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
spin_lock(&hugetlb_lock);
if (is_hugepage_on_freelist(hpage)) {
- list_del(&hpage->lru);
+ /*
+ * Hwpoisoned hugepage isn't linked to activelist or freelist,
+ * but dangling hpage->lru can trigger list-debug warnings
+ * (this happens when we call unpoison_memory() on it),
+ * so let it point to itself with list_del_init().
+ */
+ list_del_init(&hpage->lru);
set_page_refcounted(hpage);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
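This file's N_HIGH_MEMORY -> N_MEMORY conversions (and the matching ones in memcontrol.c and mempolicy.c below) move iteration to the node state meaning "the node has memory of any zone type". A minimal usage sketch, assuming only the standard nodemask helpers; the function name is invented:

	/* visit every node that currently has memory in any zone */
	static void for_each_memory_node_sketch(void (*fn)(int nid))
	{
		int nid;

		for_each_node_state(nid, N_MEMORY)
			fn(nid);
	}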
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..b5bde7a5c017 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
return false;
}
-static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
{
int idx;
struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
return &h_cgroup->css;
}
-static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
{
struct hugetlb_cgroup *h_cgroup;
@@ -155,18 +155,13 @@ out:
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
{
struct hstate *h;
struct page *page;
- int ret = 0, idx = 0;
+ int idx = 0;
do {
- if (cgroup_task_count(cgroup) ||
- !list_empty(&cgroup->children)) {
- ret = -EBUSY;
- goto out;
- }
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
}
cond_resched();
} while (hugetlb_cgroup_have_usage(cgroup));
-out:
- return ret;
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -411,8 +404,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
struct cgroup_subsys hugetlb_subsys = {
.name = "hugetlb",
- .create = hugetlb_cgroup_create,
- .pre_destroy = hugetlb_cgroup_pre_destroy,
- .destroy = hugetlb_cgroup_destroy,
- .subsys_id = hugetlb_subsys_id,
+ .css_alloc = hugetlb_cgroup_css_alloc,
+ .css_offline = hugetlb_cgroup_css_offline,
+ .css_free = hugetlb_cgroup_css_free,
+ .subsys_id = hugetlb_subsys_id,
};
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
/*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
* in mm/page_alloc.c
*/
extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -212,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);
local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
- __inc_zone_page_state(newpage, NR_MLOCK);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
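Several hunks in this series replace open-coded pgd/pud/pmd walks with the mm_find_pmd() helper declared above. A sketch reconstructed from the checks removed in huge_memory.c and ksm.c (the real implementation lives in mm/rmap.c and may differ in detail):

	static pmd_t *mm_find_pmd_sketch(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd = NULL;

		pgd = pgd_offset(mm, address);
		if (!pgd_present(*pgd))
			goto out;

		pud = pud_offset(pgd, address);
		if (!pud_present(*pud))
			goto out;

		pmd = pmd_offset(pud, address);
		if (!pmd_present(*pmd))
			pmd = NULL;
	out:
		return pmd;
	}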
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..82dfb4b54321 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
goto out;
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, addr);
BUG_ON(pmd_trans_huge(*pmd));
- if (!pmd_present(*pmd))
- goto out;
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1688,7 +1678,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1741,7 +1731,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
- int oom_score_adj;
-
- oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+ set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
- compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
- oom_score_adj);
+ clear_current_oom_origin();
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..bbfac5063ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,8 @@
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
+EXPORT_SYMBOL(mem_cgroup_subsys);
+
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -800,7 +802,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
int nid;
u64 total = 0;
- for_each_node_state(nid, N_HIGH_MEMORY)
+ for_each_node_state(nid, N_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
return total;
}
@@ -1015,13 +1017,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *memcg;
- if (!mm)
- return;
-
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
@@ -1040,7 +1039,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
out:
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_count_vm_event);
+EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1498,8 +1497,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
return limit;
}
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
- int order)
+static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
{
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
@@ -1644,9 +1643,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
return;
/* make a nodemask where this memcg uses memory from */
- memcg->scan_nodes = node_states[N_HIGH_MEMORY];
+ memcg->scan_nodes = node_states[N_MEMORY];
- for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+ for_each_node_mask(nid, node_states[N_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1716,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
/*
* Check rest of nodes.
*/
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
continue;
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2370,7 +2369,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
- VM_BUG_ON(css_is_removed(&memcg->css));
if (mem_cgroup_is_root(memcg))
goto done;
if (nr_pages == 1 && consume_stock(memcg))
@@ -2510,9 +2508,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
@@ -2709,13 +2707,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, anon, nr_pages);
- /*
- * We charges against "to" which may not have any tasks. Then, "to"
- * can be under rmdir(). But in current implementation, caller of
- * this function is just force_empty() and move charge, so it's
- * guaranteed that "to" is never removed. So, we don't check rmdir
- * status here.
- */
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -2729,10 +2720,27 @@ out:
return ret;
}
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if the page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finally, mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
*/
-
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
@@ -2742,9 +2750,7 @@ static int mem_cgroup_move_parent(struct page *page,
unsigned long uninitialized_var(flags);
int ret;
- /* Is ROOT ? */
- if (mem_cgroup_is_root(child))
- return -EINVAL;
+ VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
@@ -2761,8 +2767,10 @@ static int mem_cgroup_move_parent(struct page *page,
if (!parent)
parent = root_mem_cgroup;
- if (nr_pages > 1)
+ if (nr_pages > 1) {
+ VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
+ }
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
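The kernel-doc above stresses that every failure mode of mem_cgroup_move_parent() is transient. A hedged sketch of the resulting caller pattern (the helper below is invented; the real loop is mem_cgroup_force_empty_list() later in this diff, which also takes the zone lru_lock):

	static void reparent_lru_sketch(struct list_head *lru_list,
					struct mem_cgroup *memcg)
	{
		/* transient failures only: rescan until the list drains */
		while (!list_empty(lru_list)) {
			struct page *page = list_entry(lru_list->prev,
						       struct page, lru);

			if (mem_cgroup_move_parent(page, lookup_page_cgroup(page),
						   memcg))
				cond_resched();	/* lost a race, try again */
		}
	}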
@@ -2904,7 +2912,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
return;
if (!memcg)
return;
- cgroup_exclude_rmdir(&memcg->css);
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
@@ -2918,12 +2925,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
}
- /*
- * At swapin, we may charge account against cgroup which has no tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3288,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
- VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return;
+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
@@ -3358,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}
/* remove redundant charge if migration failed*/
@@ -3371,8 +3375,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
if (!memcg)
return;
- /* blocks rmdir() */
- cgroup_exclude_rmdir(&memcg->css);
+
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3406,13 +3409,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
*/
if (anon)
mem_cgroup_uncharge_page(used);
- /*
- * At migration, we may charge account against cgroup which has no
- * tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
/*
@@ -3712,17 +3708,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return nr_reclaimed;
}
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to clear
+ *
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages themselves - pages are moved to the parent (or root)
+ * group.
*/
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
struct lruvec *lruvec;
- unsigned long flags, loop;
+ unsigned long flags;
struct list_head *list;
struct page *busy;
struct zone *zone;
@@ -3731,11 +3732,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
list = &lruvec->lists[lru];
- loop = mem_cgroup_get_lru_size(lruvec, lru);
- /* give some margin against EBUSY etc...*/
- loop += 256;
busy = NULL;
- while (loop--) {
+ do {
struct page_cgroup *pc;
struct page *page;
@@ -3761,76 +3759,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
cond_resched();
} else
busy = NULL;
- }
- return !list_empty(list);
+ } while (!list_empty(list));
}
/*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge 0 if there is no task by moving
+ * all the charges and pages to the parent.
* This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
- int ret;
- int node, zid, shrink;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = memcg->css.cgroup;
-
- css_get(&memcg->css);
+ int node, zid;
- shrink = 0;
- /* should free all ? */
- if (free_all)
- goto try_to_free;
-move_account:
do {
- ret = -EBUSY;
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
- goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
- ret = 0;
mem_cgroup_start_move(memcg);
- for_each_node_state(node, N_HIGH_MEMORY) {
- for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+ for_each_node_state(node, N_MEMORY) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
enum lru_list lru;
for_each_lru(lru) {
- ret = mem_cgroup_force_empty_list(memcg,
+ mem_cgroup_force_empty_list(memcg,
node, zid, lru);
- if (ret)
- break;
}
}
- if (ret)
- break;
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
cond_resched();
- /* "ret" should also be checked to ensure all lists are empty. */
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
- css_put(&memcg->css);
- return ret;
-try_to_free:
+ /*
+ * This is a safety check because mem_cgroup_force_empty_list
+ * could have raced with mem_cgroup_replace_page_cache callers
+ * so the lru seemed empty but the page could have been added
+ * right after the check. RES_USAGE should be safe as we always
+ * charge before adding to the LRU.
+ */
+ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct cgroup *cgrp = memcg->css.cgroup;
+
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
- ret = -EBUSY;
- goto out;
- }
+ if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ return -EBUSY;
+
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
- shrink = 1;
while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending(current))
+ return -EINTR;
+
progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
@@ -3841,13 +3835,23 @@ try_to_free:
}
lru_add_drain();
- /* try move_account...there may be some *locked* pages. */
- goto move_account;
+ mem_cgroup_reparent_charges(memcg);
+
+ return 0;
}
static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ int ret;
+
+ if (mem_cgroup_is_root(memcg))
+ return -EINVAL;
+ css_get(&memcg->css);
+ ret = mem_cgroup_force_empty(memcg);
+ css_put(&memcg->css);
+
+ return ret;
}
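With the rework above there are now two distinct emptying paths. A short illustrative sketch; the wrapper is hypothetical, both callees are the real functions from this hunk:

	static void memcg_empty_paths_sketch(struct mem_cgroup *memcg,
					     bool user_request)
	{
		if (user_request)
			/* write to memory.force_empty: reclaim, then reparent */
			WARN_ON(mem_cgroup_force_empty(memcg));
		else
			/* cgroup removal (css_offline): reparent only, no reclaim */
			mem_cgroup_reparent_charges(memcg);
	}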
@@ -4120,7 +4124,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
@@ -4128,7 +4132,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4137,7 +4141,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4146,7 +4150,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4953,7 +4957,7 @@ err_cleanup:
}
static struct cgroup_subsys_state * __ref
-mem_cgroup_create(struct cgroup *cont)
+mem_cgroup_css_alloc(struct cgroup *cont)
{
struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
@@ -5034,14 +5038,14 @@ free_out:
return ERR_PTR(error);
}
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(memcg, false);
+ mem_cgroup_reparent_charges(memcg);
}
-static void mem_cgroup_destroy(struct cgroup *cont)
+static void mem_cgroup_css_free(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -5631,16 +5635,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
- .create = mem_cgroup_create,
- .pre_destroy = mem_cgroup_pre_destroy,
- .destroy = mem_cgroup_destroy,
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
.use_id = 1,
- .__DEPRECATED_clear_css_refs = true,
};
#ifdef CONFIG_MEMCG_SWAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b9034a..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma(av);
+ page_unlock_anon_vma_read(av);
}
/*
@@ -781,16 +781,16 @@ static struct page_state {
{ compound, compound, "huge", me_huge_page },
#endif
- { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
- { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
- { unevict, unevict, "unevictable LRU", me_pagecache_clean},
+ { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+ { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
- { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
- { mlock, mlock, "mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
- { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
+ { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
/*
@@ -812,14 +812,14 @@ static struct page_state {
#undef slab
#undef reserved
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
static void action_result(unsigned long pfn, char *msg, int result)
{
- struct page *page = pfn_to_page(pfn);
-
- printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
- pfn,
- PageDirty(page) ? "dirty " : "",
- msg, action_name[result]);
+ pr_err("MCE %#lx: %s page recovery: %s\n",
+ pfn, msg, action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
- set_migratetype_isolate(p);
+ set_migratetype_isolate(p, true);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_trans_head(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
+ if (PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/migrate.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -717,20 +718,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
@@ -1250,7 +1237,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1517,9 +1504,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto no_page_table;
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
spin_lock(&mm->page_table_lock);
@@ -1546,6 +1535,8 @@ split_fallthrough:
pte = *ptep;
if (!pte_present(pte))
goto no_page;
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -1697,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (gup_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+ * would be called on PROT_NONE ranges. We must never invoke
+ * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+ * page faults would unprotect the PROT_NONE ranges if
+ * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+ * bitflag. So to avoid that, don't set FOLL_NUMA if
+ * FOLL_FORCE is set.
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
i = 0;
do {
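A condensed, hypothetical helper showing the rule the comment above encodes: gup treats NUMA-hinting ptes as missing pages only when the caller did not pass FOLL_FORCE (the real checks are the pmd_numa()/pte_numa() tests added to follow_page() earlier in this file):

	static bool gup_skips_numa_pte_sketch(unsigned int gup_flags, pte_t pte)
	{
		if (!(gup_flags & FOLL_FORCE))
			gup_flags |= FOLL_NUMA;

		/* with FOLL_NUMA set, a NUMA-hinting pte looks like "no page" */
		return (gup_flags & FOLL_NUMA) && pte_numa(pte);
	}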
@@ -2794,13 +2798,8 @@ unlock:
oom_free_new:
page_cache_release(new_page);
oom:
- if (old_page) {
- if (page_mkwrite) {
- unlock_page(old_page);
- page_cache_release(old_page);
- }
+ if (old_page)
page_cache_release(old_page);
- }
return VM_FAULT_OOM;
unwritable_page:
@@ -3431,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int current_nid)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ int current_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ *
+ * ptep_modify_prot_start is not called as this is clearing
+ * the _PAGE_NUMA bit and it is not really expected that there
+ * would be concurrent hardware modifications to the PTE.
+ */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*ptep, pte))) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ pte = pte_mknonnuma(pte);
+ set_pte_at(mm, addr, ptep, pte);
+ update_mmu_cache(vma, addr, ptep);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
+ current_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+ /*
+ * Account for the fault against the current node if it is not
+ * being replaced regardless of where the page is located.
+ */
+ current_nid = numa_node_id();
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+
+out:
+ if (current_nid != -1)
+ task_numa_fault(current_nid, 1, migrated);
+ return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+ int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int curr_nid = local_nid;
+ int target_nid;
+ bool migrated;
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ /*
+ * Note that the NUMA fault is later accounted to either
+ * the node that is currently running or where the page is
+ * migrated to.
+ */
+ curr_nid = local_nid;
+ target_nid = numa_migrate_prep(page, vma, addr,
+ page_to_nid(page));
+ if (target_nid == -1) {
+ put_page(page);
+ continue;
+ }
+
+ /* Migrate to the requested node */
+ pte_unmap_unlock(pte, ptl);
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ curr_nid = target_nid;
+ task_numa_fault(curr_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
+ if (pte_numa(entry))
+ return do_numa_page(mm, vma, address, entry, pte, pmd);
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3702,13 @@ retry:
barrier();
if (pmd_trans_huge(orig_pmd)) {
- if (flags & FAULT_FLAG_WRITE &&
- !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd)) {
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ if (pmd_numa(orig_pmd))
+ return do_huge_pmd_numa_page(mm, vma, address,
+ orig_pmd, pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
/*
@@ -3550,17 +3719,25 @@ retry:
if (unlikely(ret & VM_FAULT_OOM))
goto retry;
return ret;
+ } else {
+ huge_pmd_set_accessed(mm, vma, address, pmd,
+ orig_pmd, dirty);
}
+
return 0;
}
}
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
+
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
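Taken together, the fault-path hunks above add three NUMA-hinting entry points. A condensed dispatch sketch (illustrative only; in the real code these checks are interleaved with the existing transparent-huge-page handling):

	static int numa_fault_dispatch_sketch(struct mm_struct *mm,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      pte_t entry, pte_t *pte,
					      pmd_t *pmd, pmd_t orig_pmd)
	{
		if (pmd_trans_huge(orig_pmd) && pmd_numa(orig_pmd))
			/* huge pmd marked for NUMA hinting */
			return do_huge_pmd_numa_page(mm, vma, address,
						     orig_pmd, pmd);

		if (pmd_numa(*pmd))
			/* regular pmd: fix up all NUMA ptes beneath it */
			return do_pmd_numa_page(mm, vma, address, pmd);

		if (pte_numa(entry))
			/* single pte marked for NUMA hinting */
			return do_numa_page(mm, vma, address, entry, pte, pmd);

		return 0;	/* not a NUMA hinting fault */
	}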
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..962e353aa86f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
+ static DEFINE_MUTEX(ppb_lock);
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
+
+ /*
+ * Please refer to comment for __free_pages_bootmem()
+ * for why we serialize here.
+ */
+ mutex_lock(&ppb_lock);
__free_pages_bootmem(page, 0);
+ mutex_unlock(&ppb_lock);
}
}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writelock(zone);
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
- if (start_pfn < zone->zone_start_pfn)
+ if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
+static void resize_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ zone_span_writelock(zone);
+
+ if (end_pfn - start_pfn) {
+ zone->zone_start_pfn = start_pfn;
+ zone->spanned_pages = end_pfn - start_pfn;
+ } else {
+ /*
+ * make it consistent with free_area_init_core(),
+ * if spanned_pages = 0, then keep start_pfn = 0
+ */
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
+
+ zone_span_writeunlock(zone);
+}
+
+static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ enum zone_type zid = zone_idx(zone);
+ int nid = zone->zone_pgdat->node_id;
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ set_page_links(pfn_to_page(pfn), zid, nid, pfn);
+}
+
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z1_start_pfn;
+
+ if (!z1->wait_table) {
+ ret = init_currently_empty_zone(z1, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are higher than @z2 */
+ if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+ goto out_fail;
+ /* the moved-out part must be at the leftmost of @z2 */
+ if (start_pfn > z2->zone_start_pfn)
+ goto out_fail;
+ /* the range must overlap @z2 */
+ if (end_pfn <= z2->zone_start_pfn)
+ goto out_fail;
+
+ /* use start_pfn for z1's start_pfn if z1 is empty */
+ if (z1->spanned_pages)
+ z1_start_pfn = z1->zone_start_pfn;
+ else
+ z1_start_pfn = start_pfn;
+
+ resize_zone(z1, z1_start_pfn, end_pfn);
+ resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z1, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z2_end_pfn;
+
+ if (!z2->wait_table) {
+ ret = init_currently_empty_zone(z2, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are lower than @z1 */
+ if (z1->zone_start_pfn > start_pfn)
+ goto out_fail;
+ /* the moved-out part must be at the rightmost of @z1 */
+ if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
+ goto out_fail;
+ /* the range must overlap @z1 */
+ if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+ goto out_fail;
+
+ /* use end_pfn for z2's end_pfn if z2 is empty */
+ if (z2->spanned_pages)
+ z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+ else
+ z2_end_pfn = end_pfn;
+
+ resize_zone(z1, z1->zone_start_pfn, start_pfn);
+ resize_zone(z2, start_pfn, z2_end_pfn);
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z2, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long old_pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
- if (start_pfn < pgdat->node_start_pfn)
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,96 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
return 0;
}
+#ifdef CONFIG_MOVABLE_NODE
+/* with CONFIG_MOVABLE_NODE, we allow an online node to have no normal memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+ return true;
+}
+#else /* #ifdef CONFIG_MOVABLE_NODE */
+/* ensure every online node has NORMAL memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+ return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+}
+#endif /* #ifdef CONFIG_MOVABLE_NODE */
+
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ int nid = zone_to_nid(zone);
+ enum zone_type zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * if the memory to be online is in a zone of 0...zone_last, and
+ * the zones of 0...zone_last don't have memory before online, we will
+ * need to set the node to node_states[N_NORMAL_MEMORY] after
+ * the memory is online.
+ */
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+ arg->status_change_nid_normal = nid;
+ else
+ arg->status_change_nid_normal = -1;
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+ arg->status_change_nid_high = nid;
+ else
+ arg->status_change_nid_high = -1;
+#else
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * if the node doesn't have memory before online, we will need to
+ * set the node to node_states[N_MEMORY] after the memory
+ * is online.
+ */
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ else
+ arg->status_change_nid = -1;
+}
+
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_set_state(node, N_NORMAL_MEMORY);
+
+ if (arg->status_change_nid_high >= 0)
+ node_set_state(node, N_HIGH_MEMORY);
+
+ node_set_state(node, N_MEMORY);
+}
+
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long onlined_pages = 0;
struct zone *zone;
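node_states_check_changes_online() above fills the new status_change_nid_normal/_high hints alongside status_change_nid in struct memory_notify. A hedged sketch of a MEM_GOING_ONLINE notifier consuming them; the callback name is hypothetical:

	static int example_memory_callback(struct notifier_block *nb,
					   unsigned long action, void *data)
	{
		struct memory_notify *arg = data;

		if (action != MEM_GOING_ONLINE)
			return NOTIFY_OK;

		if (arg->status_change_nid >= 0) {
			/* this node is about to gain its first memory */
		}
		if (arg->status_change_nid_normal >= 0) {
			/* ... including its first ZONE_NORMAL (or lower) memory */
		}
		return NOTIFY_OK;
	}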
@@ -471,13 +688,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
struct memory_notify arg;
lock_memory_hotplug();
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_mutex.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+
+ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+ !can_online_high_movable(zone)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+
+ if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+ }
+ if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+ }
+
+ /* Previous code may have changed the zone of the pfn range */
+ zone = page_zone(pfn_to_page(pfn));
+
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
- arg.status_change_nid = -1;
+ node_states_check_changes_online(nr_pages, zone, &arg);
nid = page_to_nid(pfn_to_page(pfn));
- if (node_present_pages(nid) == 0)
- arg.status_change_nid = nid;
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -487,23 +731,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
return ret;
}
/*
- * This doesn't need a lock to do pfn_to_page().
- * The section can't be removed here because of the
- * memory_block->state_mutex.
- */
- zone = page_zone(pfn_to_page(pfn));
- /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
mutex_lock(&zonelists_mutex);
- if (!populated_zone(zone))
+ if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
+ build_all_zonelists(NULL, zone);
+ }
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ if (need_zonelists_rebuild)
+ zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +756,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
return ret;
}
+ zone->managed_pages += onlined_pages;
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
if (onlined_pages) {
- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ node_states_set_node(zone_to_nid(zone), &arg);
if (need_zonelists_rebuild)
- build_all_zonelists(NULL, zone);
+ build_all_zonelists(NULL, NULL);
else
zone_pcp_update(zone);
}
@@ -812,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* migrate_pages returns # of failed pages.
*/
ret = migrate_pages(&source, alloc_migrate_target, 0,
- true, MIGRATE_SYNC);
+ true, MIGRATE_SYNC,
+ MR_MEMORY_HOTPLUG);
if (ret)
putback_lru_pages(&source);
}
@@ -847,7 +1091,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
{
int ret;
long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+ ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
offlined = nr_pages;
if (!ret)
*(long *)data += offlined;
@@ -867,6 +1111,129 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
+#ifdef CONFIG_MOVABLE_NODE
+/* with CONFIG_MOVABLE_NODE, we allow an online node to have no normal memory */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+ return true;
+}
+#else /* #ifdef CONFIG_MOVABLE_NODE */
+/* ensure the node has NORMAL memory if it is still online */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt;
+
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+
+ if (present_pages > nr_pages)
+ return true;
+
+ present_pages = 0;
+ for (; zt <= ZONE_MOVABLE; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+
+ /*
+ * we can't offline the last normal memory until all
+ * higher memory is offlined.
+ */
+ return present_pages == 0;
+}
+#endif /* #ifdef CONFIG_MOVABLE_NODE */
+
+/* check which state of node_states will be changed when offline memory */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt, zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is in a zone of 0...zone_last,
+ * and it is the last present memory, 0...zone_last will
+ * become empty after offline, thus we can determine we will
+ * need to clear the node from node_states[N_NORMAL_MEMORY].
+ */
+ for (zt = 0; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_normal = zone_to_nid(zone);
+ else
+ arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_high = zone_to_nid(zone);
+ else
+ arg->status_change_nid_high = -1;
+#else
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ */
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_HIGH_MEMORY] will be changed
+ * If we try to offline the last present @nr_pages from the node,
+ * we can determine we will need to clear the node from
+ * node_states[N_HIGH_MEMORY].
+ */
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (nr_pages >= present_pages)
+ arg->status_change_nid = zone_to_nid(zone);
+ else
+ arg->status_change_nid = -1;
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_clear_state(node, N_NORMAL_MEMORY);
+
+ if ((N_MEMORY != N_NORMAL_MEMORY) &&
+ (arg->status_change_nid_high >= 0))
+ node_clear_state(node, N_HIGH_MEMORY);
+
+ if ((N_MEMORY != N_HIGH_MEMORY) &&
+ (arg->status_change_nid >= 0))
+ node_clear_state(node, N_MEMORY);
+}
+
static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
@@ -893,16 +1260,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
+ ret = -EINVAL;
+ if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+ goto out;
+
/* set above range as isolated */
- ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ ret = start_isolate_page_range(start_pfn, end_pfn,
+ MIGRATE_MOVABLE, true);
if (ret)
goto out;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
- arg.status_change_nid = -1;
- if (nr_pages >= node_present_pages(node))
- arg.status_change_nid = node;
+ node_states_check_changes_offline(nr_pages, zone, &arg);
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
ret = notifier_to_errno(ret);
@@ -943,10 +1313,10 @@ repeat:
goto repeat;
}
}
- /* drain all zone's lru pagevec, this is asyncronous... */
+ /* drain all zone's lru pagevec, this is asynchronous... */
lru_add_drain_all();
yield();
- /* drain pcp pages , this is synchrouns. */
+ /* drain pcp pages, this is synchronous. */
drain_all_pages();
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1325,13 @@ repeat:
goto failed_removal;
}
printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
- /* Ok, all of our target is islaoted.
+ /* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
/* reset pagetype flags and makes migrate type to be MOVABLE */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
+ zone->managed_pages -= offlined_pages;
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
@@ -975,10 +1346,9 @@ repeat:
} else
zone_pcp_update(zone);
- if (!node_present_pages(node)) {
- node_clear_state(node, N_HIGH_MEMORY);
+ node_states_clear_node(node, &arg);
+ if (arg.status_change_nid >= 0)
kswapd_stop(node);
- }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a54c294..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
.flags = MPOL_F_LOCAL,
};
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (!pol) {
+ node = numa_node_id();
+ if (node != -1)
+ pol = &preferred_node_policy[node];
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
+ }
+
+ return pol;
+}
+
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
- /* Check N_HIGH_MEMORY */
+ /* Check N_MEMORY */
nodes_and(nsc->mask1,
- cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
+ cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL; /* simply delete any existing policy */
+ return NULL;
}
VM_BUG_ON(!nodes);
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0;
}
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
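For orientation only, a hedged sketch of the round trip change_prot_numa() takes part in; the fault-side helpers named below are the ones introduced elsewhere in this series, and the flow is a simplification rather than patch content:

/*
 *	change_prot_numa(vma, addr, end);  // PTEs become _PAGE_NUMA (PROT_NONE-like)
 *	...                                // the task later touches a page in the range
 *	// the NUMA hinting fault clears the marker and may consult
 *	// mpol_misplaced() / migrate_misplaced_page() to move the page to
 *	// the node of the CPU that is actually using it
 */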
+
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
+ vma_migratable(vma))) {
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break;
}
}
+next:
prev = vma;
}
return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
- if (flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
+ if (!err) {
+ int nr_failed = 0;
+
if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_MEMORY])) {
err = -EINVAL;
goto out_put;
}
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
struct mempolicy *pol;
- struct zonelist *zl;
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -1926,23 +1997,11 @@ retry_cpuset:
return page;
}
- zl = policy_zonelist(gfp, pol, node);
- if (unlikely(mpol_needs_cond_ref(pol))) {
- /*
- * slow path: ref counted shared policy
- */
- struct page *page = __alloc_pages_nodemask(gfp, order,
- zl, policy_nodemask(gfp, pol));
- __mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
- goto retry_cpuset;
- return page;
- }
- /*
- * fast path: default or task policy
- */
- page = __alloc_pages_nodemask(gfp, order, zl,
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
+ if (unlikely(mpol_needs_cond_ref(pol)))
+ __mpol_put(pol);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
return new;
}
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
- * after return. Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
- struct mempolicy *frompol)
-{
- if (!mpol_needs_cond_ref(frompol))
- return frompol;
-
- *tompol = *frompol;
- tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
- __mpol_put(frompol);
- return tompol;
-}
-
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ int last_nid;
+
+ polnid = numa_node_id();
+
+ /*
+ * Multi-stage node selection is used in conjunction
+ * with a periodic migration fault to build a temporal
+ * task<->page relation. By using a two-stage filter we
+ * remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist
+ * probability, we can equate a task's usage of a
+ * particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and
+ * getting the same result twice in a row, given these
+ * samples are fully independent, is then given by
+ * P(p)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern.
+ *
+ * This quadratic squishes small probabilities, making
+ * it less likely we act on an unlikely task<->page
+ * relation.
+ */
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
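The two-stage filter described in the comment above reduces to "only act when two consecutive NUMA-fault samples name the same node". A minimal sketch of that idea, with record_and_test() standing in for the page_xchg_last_nid() exchange (the helper name is made up for illustration):

static bool record_and_test(int *last_nid, int this_nid)
{
	int prev = *last_nid;

	*last_nid = this_nid;		/* always record the newest sample */
	return prev == this_nid;	/* migrate only on a repeated sample */
}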
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2340,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
mutex_unlock(&p->mutex);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+ bool numabalancing_default = false;
+
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
+
+ if (nr_node_ids > 1 && !numabalancing_override) {
+ printk(KERN_INFO "Enabling automatic NUMA balancing. "
+ "Configure with numa_balancing= or sysctl");
+ set_numabalancing_state(numabalancing_default);
+ }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ numabalancing_override = true;
+
+ if (!strcmp(str, "enable")) {
+ set_numabalancing_state(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_numabalancing_state(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+ return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
@@ -2355,13 +2545,22 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
@@ -2381,6 +2580,8 @@ void __init numa_policy_init(void)
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
+
+ check_numabalancing_enable();
}
/* Reset policy of current process to default */
@@ -2397,14 +2598,13 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
- [MPOL_LOCAL] = "local"
+ [MPOL_LOCAL] = "local",
};
@@ -2442,7 +2642,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
- if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
@@ -2450,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (mode > MPOL_LOCAL)
+ if (mode >= MPOL_MAX)
goto out;
switch (mode) {
@@ -2476,7 +2676,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
- nodes = node_states[N_HIGH_MEMORY];
+ nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..32efd8028bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/balloon_compaction.h>
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
#include "internal.h"
/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- putback_lru_page(page);
+ putback_lru_page(page);
+ }
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function shall be used instead of putback_lru_pages(),
+ * whenever the isolated pageset has been built by isolate_migratepages_range().
+ */
+void putback_movable_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ if (unlikely(balloon_page_movable(page)))
+ balloon_page_putback(page);
+ else
+ putback_lru_page(page);
}
}
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
goto out;
ptl = &mm->page_table_lock;
} else {
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
goto out;
-
- pmd = pmd_offset(pud, addr);
if (pmd_trans_huge(*pmd))
goto out;
- if (!pmd_present(*pmd))
- goto out;
ptep = pte_offset_map(pmd, addr);
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
- int expected_count;
+ int expected_count = 0;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != 1)
return -EAGAIN;
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
spin_unlock_irq(&mapping->tree_lock);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
if (!mapping) {
if (page_count(page) != 1)
return -EAGAIN;
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
/*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
} while (bh != head);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*
* Return value:
* < 0 - error code
- * == 0 - success
+ * MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
else
rc = fallback_migrate_page(mapping, newpage, page, mode);
- if (rc) {
+ if (rc != MIGRATEPAGE_SUCCESS) {
newpage->mapping = NULL;
} else {
if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (PageAnon(page)) {
/*
- * Only page_lock_anon_vma() understands the subtleties of
+ * Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
}
+ if (unlikely(balloon_page_movable(page))) {
+ /*
+ * A ballooned page does not need any special attention from
+ * physical to virtual reverse mapping procedures.
+ * Skip any attempt to unmap PTEs or to remap swap cache,
+ * in order to avoid burning cycles at rmap level, and perform
+ * the page migration right away (protected by the page lock).
+ */
+ rc = balloon_page_migrate(newpage, page, mode);
+ goto uncharge;
+ }
+
/*
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
put_anon_vma(anon_vma);
uncharge:
- mem_cgroup_end_migration(mem, page, newpage, rc == 0);
+ mem_cgroup_end_migration(mem, page, newpage,
+ (rc == MIGRATEPAGE_SUCCESS ||
+ rc == MIGRATEPAGE_BALLOON_SUCCESS));
unlock:
unlock_page(page);
out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
goto out;
rc = __unmap_and_move(page, newpage, force, offlining, mode);
+
+ if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+ /*
+ * A ballooned page has been migrated already.
+ * Now it is time to wrap up the counters,
+ * hand the page back to the buddy allocator and return.
+ */
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ balloon_page_free(page);
+ return MIGRATEPAGE_SUCCESS;
+ }
out:
if (rc != -EAGAIN) {
/*
@@ -958,10 +1001,11 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
+ int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
case -EAGAIN:
retry++;
break;
- case 0:
+ case MIGRATEPAGE_SUCCESS:
+ nr_succeeded++;
break;
default:
/* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
}
}
}
- rc = 0;
+ rc = nr_failed + retry;
out:
+ if (nr_succeeded)
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ if (nr_failed)
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- if (rc)
- return rc;
-
- return nr_failed + retry;
+ return rc;
}
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
/* try again */
cond_resched();
break;
- case 0:
+ case MIGRATEPAGE_SUCCESS:
goto out;
default:
rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, MIGRATE_SYNC);
+ (unsigned long)pm, 0, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
if (node < 0 || node >= MAX_NUMNODES)
goto out_pm;
- if (!node_state(node, N_HIGH_MEMORY))
+ if (!node_state(node, N_MEMORY))
goto out_pm;
err = -EACCES;
@@ -1403,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks, which is a crude check.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ int nr_migrate_pages)
+{
+ int z;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable)
+ continue;
+
+ /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ 0, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data,
+ int **result)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_exact_node(nid,
+ (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN) &
+ ~GFP_IOFS, 0);
+ if (newpage)
+ page_xchg_last_nid(newpage, page_last_nid(page));
+
+ return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
+ * window of time. The defaults here say do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However,
+ * as it is faults that reset the window, PTE updates will happen unconditionally
+ * if there has not been a fault for @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
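A quick sanity check of the "1280M per second" figure quoted above, assuming 4KB pages (the exact numbers scale with PAGE_SHIFT):

/*
 *	ratelimit_pages = 128 << (20 - 12) = 32768 pages = 128MB per window
 *	migrate_interval_millisecs = 100   => 10 windows per second
 *	=> at most 128MB * 10 = 1280MB of NUMA migrations per second per node
 */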
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+ bool rate_limited = false;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ spin_lock(&pgdat->numabalancing_migrate_lock);
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies +
+ msecs_to_jiffies(migrate_interval_millisecs);
+ }
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+ rate_limited = true;
+ else
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+ return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int ret = 0;
+
+ /* Avoid migrating to a node that is nearly full */
+ if (migrate_balanced_pgdat(pgdat, 1)) {
+ int page_lru;
+
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ return 0;
+ }
+
+ /* Page is isolated */
+ ret = 1;
+ page_lru = page_is_file_cache(page);
+ if (!PageTransHuge(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+ else
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ HPAGE_PMD_NR);
+ }
+
+ /*
+ * Page is either isolated or there is not enough space on the target
+ * node. If isolated, then it has taken a reference count and the
+ * caller's reference can be safely dropped without the page
+ * disappearing underneath us during migration. Otherwise the page is
+ * not to be migrated but the caller's reference should still be
+ * dropped so it does not leak.
+ */
+ put_page(page);
+
+ return ret;
+}
+
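To make the reference-count contract spelled out above explicit, a hedged caller sketch; it simply paraphrases what migrate_misplaced_page() below does and is not additional patch content:

/*
 *	// the caller already holds a reference (e.g. taken in the fault path)
 *	if (!numamigrate_isolate_page(pgdat, page))	// that reference is
 *		return;					// always consumed here
 *	list_add(&page->lru, &migratepages);		// the isolation reference
 *	migrate_pages(&migratepages, ...);		// travels with the list
 */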
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1) {
+ put_page(page);
+ goto out;
+ }
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, 1)) {
+ put_page(page);
+ goto out;
+ }
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages,
+ alloc_misplaced_dst_page,
+ node, false, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ putback_lru_pages(&migratepages);
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+out:
+ return isolated;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+ goto out_dropref;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out_dropref;
+ }
+ page_xchg_last_nid(new_page, page_last_nid(page));
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ put_page(new_page);
+ goto out_keep_locked;
+ }
+
+ /* Prepare a page as a migration target */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ unlock_page(page);
+ putback_lru_page(page);
+
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out;
+ }
+
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = pmd_mknonnuma(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, entry);
+ page_remove_rmap(page);
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_dropref:
+ put_page(page);
+out_keep_locked:
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..f54b235f29a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
+#include <linux/rbtree_augmented.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
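A hedged usage sketch for the new export: a guest balloon driver could periodically sample the commitment and report it to its host, converting from pages to bytes first. The hv_post_pressure_report() name is hypothetical, not a real driver API:

/*
 *	u64 committed = (u64)vm_memory_committed() << PAGE_SHIFT;
 *	hv_post_pressure_report(committed);	// hypothetical host upcall
 */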
+
+/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
@@ -297,40 +312,88 @@ out:
return retval;
}
+static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max, subtree_gap;
+ max = vma->vm_start;
+ if (vma->vm_prev)
+ max -= vma->vm_prev->vm_end;
+ if (vma->vm_rb.rb_left) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ if (vma->vm_rb.rb_right) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ return max;
+}
+
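A small worked example of the value vma_compute_subtree_gap() maintains; all addresses are illustrative:

/*
 *	vma A: [0x1000, 0x3000)  gap before A = 0x1000 - 0      = 0x1000
 *	vma B: [0x8000, 0x9000)  gap before B = 0x8000 - 0x3000 = 0x5000
 *	vma C: [0xb000, 0xc000)  gap before C = 0xb000 - 0x9000 = 0x2000
 *
 * With B at the rbtree root and A, C as its children:
 *	B->rb_subtree_gap = max(0x5000, 0x1000, 0x2000) = 0x5000
 * so a search for any gap up to 0x5000 bytes knows this subtree is worth
 * visiting, while a larger request can skip it entirely.
 */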
#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
- int i = 0, j;
+ int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev)
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
- if (vma->vm_start < pend)
+ if (vma->vm_start < prev) {
+ printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ bug = 1;
+ }
+ if (vma->vm_start < pend) {
printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
- if (vma->vm_start > vma->vm_end)
- printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
+ bug = 1;
+ }
+ if (vma->vm_start > vma->vm_end) {
+ printk("vm_end %lx < vm_start %lx\n",
+ vma->vm_end, vma->vm_start);
+ bug = 1;
+ }
+ if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
+ printk("free gap %lx, correct %lx\n",
+ vma->rb_subtree_gap,
+ vma_compute_subtree_gap(vma));
+ bug = 1;
+ }
i++;
pn = nd;
prev = vma->vm_start;
pend = vma->vm_end;
}
j = 0;
- for (nd = pn; nd; nd = rb_prev(nd)) {
+ for (nd = pn; nd; nd = rb_prev(nd))
j++;
+ if (i != j) {
+ printk("backwards %d, forwards %d\n", j, i);
+ bug = 1;
+ }
+ return bug ? -1 : i;
+}
+
+static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
+{
+ struct rb_node *nd;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ struct vm_area_struct *vma;
+ vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+ BUG_ON(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
}
- if (i != j)
- printk("backwards %d, forwards %d\n", j, i), i = 0;
- return i;
}
void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
+ unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
while (vma) {
struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
vma_unlock_anon_vma(vma);
+ highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
}
- if (i != mm->map_count)
- printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ printk("map_count %d vm_next %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ if (highest_address != mm->highest_vm_end) {
+ printk("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
+ bug = 1;
+ }
i = browse_rb(&mm->mm_rb);
- if (i != mm->map_count)
- printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ printk("map_count %d rb %d\n", mm->map_count, i);
+ bug = 1;
+ }
BUG_ON(bug);
}
#else
+#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif
+RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+
+/*
+ * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
+ * vma->vm_prev->vm_end values changed, without modifying the vma's position
+ * in the rbtree.
+ */
+static void vma_gap_update(struct vm_area_struct *vma)
+{
+ /*
+ * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
+ * function that does exacltly what we want.
+ */
+ vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
+}
+
+static inline void vma_rb_insert(struct vm_area_struct *vma,
+ struct rb_root *root)
+{
+ /* All rb_subtree_gap values must be consistent prior to insertion */
+ validate_mm_rb(root, NULL);
+
+ rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+{
+ /*
+ * All rb_subtree_gap values must be consistent prior to erase,
+ * with the possible exception of the vma being erased.
+ */
+ validate_mm_rb(root, vma);
+
+ /*
+ * Note rb_erase_augmented is a fairly large inline function,
+ * so make sure we instantiate it only once with our desired
+ * augmented rbtree callbacks.
+ */
+ rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
+ /* Update tracking information for the gap following the new vma. */
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ mm->highest_vm_end = vma->vm_end;
+
+ /*
+ * vma->vm_prev wasn't known when we followed the rbtree to find the
+ * correct insertion point for that vma. As a result, we could not
+ * update the rb_subtree_gap values of its rbtree parents on the way down.
+ * So, we first insert the vma with a zero rb_subtree_gap value
+ * (to be consistent with what we did on the way down), and then
+ * immediately update the gap to the correct value. Finally we
+ * rebalance the rbtree after all augmented values have been set.
+ */
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ vma->rb_subtree_gap = 0;
+ vma_gap_update(vma);
+ vma_rb_insert(vma, &mm->mm_rb);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next;
- prev->vm_next = next;
+ vma_rb_erase(vma, &mm->mm_rb);
+ prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
+ bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
vma_interval_tree_remove(next, root);
}
- vma->vm_start = start;
- vma->vm_end = end;
+ if (start != vma->vm_start) {
+ vma->vm_start = start;
+ start_changed = true;
+ }
+ if (end != vma->vm_end) {
+ vma->vm_end = end;
+ end_changed = true;
+ }
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
+ } else {
+ if (start_changed)
+ vma_gap_update(vma);
+ if (end_changed) {
+ if (!next)
+ mm->highest_vm_end = end;
+ else if (!adjust_next)
+ vma_gap_update(next);
+ }
}
if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
- if (remove_next == 2) {
- next = vma->vm_next;
+ next = vma->vm_next;
+ if (remove_next == 2)
goto again;
- }
+ else if (next)
+ vma_gap_update(next);
+ else
+ mm->highest_vm_end = end;
}
if (insert && file)
uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
- VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE);
+ VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE,
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
@@ -1335,7 +1488,11 @@ munmap_back:
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
+ * Bug: If addr is changed, prev, rb_link, rb_parent should
+ * be updated for vma_link()
*/
+ WARN_ON_ONCE(addr != vma->vm_start);
+
addr = vma->vm_start;
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
return error;
}
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ /*
+ * We implement the search by looking for an rbtree node that
+ * immediately follows a suitable gap. That is,
+ * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
+ * - gap_end = vma->vm_start >= info->low_limit + length;
+ * - gap_end - gap_start >= length
+ */
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /* Adjust search limits by the desired length */
+ if (info->high_limit < length)
+ return -ENOMEM;
+ high_limit = info->high_limit - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ goto check_highest;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ goto check_highest;
+
+ while (true) {
+ /* Visit left subtree if it looks promising */
+ gap_end = vma->vm_start;
+ if (gap_end >= low_limit && vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+check_current:
+ /* Check if current node has a suitable gap */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+ if (gap_end >= low_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit right subtree if it looks promising */
+ if (vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ goto check_highest;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_left) {
+ gap_start = vma->vm_prev->vm_end;
+ gap_end = vma->vm_start;
+ goto check_current;
+ }
+ }
+ }
+
+check_highest:
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+
+found:
+ /* We found a suitable gap. Clip it with the original low_limit. */
+ if (gap_start < info->low_limit)
+ gap_start = info->low_limit;
+
+ /* Adjust gap address to the desired alignment */
+ gap_start += (info->align_offset - gap_start) & info->align_mask;
+
+ VM_BUG_ON(gap_start + info->length > info->high_limit);
+ VM_BUG_ON(gap_start + info->length > gap_end);
+ return gap_start;
+}
+
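A worked example of the final alignment step in unmapped_area() above, using illustrative numbers (64KB alignment, i.e. align_mask = 0xffff, with align_offset = 0x1000):

/*
 *	gap_start                    = 0x13000
 *	(0x1000 - 0x13000) & 0xffff  = 0xe000
 *	gap_start += 0xe000          -> 0x21000
 *	0x21000 & 0xffff             = 0x1000 == align_offset
 *
 * i.e. gap_start is bumped forward to the first address in the gap whose
 * low bits match the requested align_offset.
 */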
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /*
+ * Adjust search limits by the desired length.
+ * See implementation comment at top of unmapped_area().
+ */
+ gap_end = info->high_limit;
+ if (gap_end < length)
+ return -ENOMEM;
+ high_limit = gap_end - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ if (gap_start <= high_limit)
+ goto found_highest;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ return -ENOMEM;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ return -ENOMEM;
+
+ while (true) {
+ /* Visit right subtree if it looks promising */
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ if (gap_start <= high_limit && vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+check_current:
+ /* Check if current node has a suitable gap */
+ gap_end = vma->vm_start;
+ if (gap_end < low_limit)
+ return -ENOMEM;
+ if (gap_start <= high_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit left subtree if it looks promising */
+ if (vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ return -ENOMEM;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_right) {
+ gap_start = vma->vm_prev ?
+ vma->vm_prev->vm_end : 0;
+ goto check_current;
+ }
+ }
+ }
+
+found:
+ /* We found a suitable gap. Clip it with the original high_limit. */
+ if (gap_end > info->high_limit)
+ gap_end = info->high_limit;
+
+found_highest:
+ /* Compute highest gap address at the desired alignment */
+ gap_end -= info->length;
+ gap_end -= (gap_end - info->align_offset) & info->align_mask;
+
+ VM_BUG_ON(gap_end < info->low_limit);
+ VM_BUG_ON(gap_end < gap_start);
+ return gap_end;
+}
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info;
if (len > TASK_SIZE)
return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- addr = TASK_UNMAPPED_BASE;
- start_addr = addr;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = 0;
+ return vm_unmapped_area(&info);
}
#endif
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0, start_addr;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
-try_again:
- /* either no address requested or can't fit in requested address hole */
- start_addr = addr = mm->free_area_cache;
-
- if (addr < len)
- goto fail;
-
- addr -= len;
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- *
- * Note: this is different with the case of bottomup
- * which does the fully line-search, but we use find_vma
- * here that causes some holes skipped.
- */
- if (start_addr != mm->mmap_base) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = 0;
- goto try_again;
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = mm->mmap_base;
+ info.align_mask = 0;
+ addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ vma->vm_mm->highest_vm_end = address;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
perf_event_mmap(vma);
}
}
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
+ vma_gap_update(vma);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
perf_event_mmap(vma);
}
}
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
- if (vma)
+ if (vma) {
vma->vm_prev = prev;
+ vma_gap_update(vma);
+ } else
+ mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
+ down_write(&anon_vma->root->rwsem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->mutex. If some other vma in this mm shares
+ * anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..3dca970367db 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
-static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
+ struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
+ unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
+ bool updated = false;
ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ if (!prot_numa) {
+ ptent = pte_modify(ptent, newprot);
+ updated = true;
+ } else {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
+ ptent = pte_mknuma(ptent);
+ updated = true;
+ }
+ }
+ }
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
- if (dirty_accountable && pte_dirty(ptent))
+ if (dirty_accountable && pte_dirty(ptent)) {
ptent = pte_mkwrite(ptent);
+ updated = true;
+ }
+ if (updated)
+ pages++;
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,100 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+
+ *ret_all_same_node = all_same_node;
+ return pages;
}
-static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long pages = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma->vm_mm, pmd);
- else if (change_huge_pmd(vma, pmd, addr, newprot))
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) {
+ pages += HPAGE_PMD_NR;
continue;
+ }
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+
+ return pages;
}
-static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
+ unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable);
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
+
+ return pages;
}
-static void change_protection(struct vm_area_struct *vma,
+static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
+ unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(vma, pgd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
- flush_tlb_range(vma, start, end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+
+ return pages;
+}
+
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long pages;
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+
+ return pages;
}
int
@@ -213,12 +304,8 @@ success:
dirty_accountable = 1;
}
- mmu_notifier_invalidate_range_start(mm, start, end);
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
- else
- change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0);
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
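Two ideas in the mprotect changes above are easy to check in isolation: the callers now count how many entries actually changed so the TLB flush can be skipped when nothing did, and the prot_numa walk remembers whether every examined page sat on the same node so the whole PMD can be marked in one go later. A standalone sketch over a plain array of node ids (the data is made up for illustration):

    /* Sketch: count updates and detect a single-node range. */
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long scan_range(const int *nid, int n, bool *all_same_node)
    {
        unsigned long updated = 0;
        int last_nid = -1;

        for (int i = 0; i < n; i++) {
            if (last_nid == -1)
                last_nid = nid[i];
            if (nid[i] != last_nid)
                *all_same_node = false;
            updated++;   /* pretend every entry received a new protection */
        }
        return updated;
    }

    int main(void)
    {
        int nids[] = { 0, 0, 1, 0 };
        bool all_same_node = true;
        unsigned long pages = scan_range(nids, 4, &all_same_node);

        if (pages)       /* mirrors "only flush the TLB if we modified entries" */
            printf("flush range; all_same_node=%d\n", all_same_node);
        return 0;
    }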
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
}
}
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
need_flush = true;
continue;
} else if (!err) {
- split_huge_page_pmd(vma->vm_mm, old_pmd);
+ split_huge_page_pmd(vma, old_addr, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b31411..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
return count;
}
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and the bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ if (!is_highmem(z))
+ z->managed_pages = 0;
+}
+
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
* @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
/* free_low_memory_core_early(MAX_NUMNODES) will be called later */
return 0;
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
+
/*
* We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
diff --git a/mm/nommu.c b/mm/nommu.c
index 45131b41bcdb..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across the competing virtual machines it hosts.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(num_physpages);
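The counter exported here backs the Committed_AS figure that /proc/meminfo already reports, so a host-side or in-guest agent can watch it without any new interface. A small reader, assuming the usual /proc/meminfo line format:

    /* Read the guest's committed memory (in kB) from /proc/meminfo. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/meminfo", "r");
        char line[256];
        unsigned long committed_kb = 0;

        if (!f)
            return 1;
        while (fgets(line, sizeof(line), f)) {
            if (sscanf(line, "Committed_AS: %lu kB", &committed_kb) == 1)
                break;
        }
        fclose(f);
        printf("Committed_AS = %lu kB\n", committed_kb);
        return 0;
    }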
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
-/*
- * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
- * @old_val: old oom_score_adj for compare
- * @new_val: new oom_score_adj for swap
- *
- * Sets the oom_score_adj value for current to @new_val iff its present value is
- * @old_val. Usually used to reinstate a previous value to prevent racing with
- * userspacing tuning the value in the interim.
- */
-void compare_swap_oom_score_adj(int old_val, int new_val)
-{
- struct sighand_struct *sighand = current->sighand;
-
- spin_lock_irq(&sighand->siglock);
- if (current->signal->oom_score_adj == old_val)
- current->signal->oom_score_adj = new_val;
- trace_oom_score_adj_update(current);
- spin_unlock_irq(&sighand->siglock);
-}
-
-/**
- * test_set_oom_score_adj() - set current's oom_score_adj and return old value
- * @new_val: new oom_score_adj value
- *
- * Sets the oom_score_adj value for current to @new_val with proper
- * synchronization and returns the old value. Usually used to temporarily
- * set a value, save the old value in the caller, and then reinstate it later.
- */
-int test_set_oom_score_adj(int new_val)
-{
- struct sighand_struct *sighand = current->sighand;
- int old_val;
-
- spin_lock_irq(&sighand->siglock);
- old_val = current->signal->oom_score_adj;
- current->signal->oom_score_adj = new_val;
- trace_oom_score_adj_update(current);
- spin_unlock_irq(&sighand->siglock);
-
- return old_val;
-}
-
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
- adj = p->signal->oom_score_adj;
+ adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+ if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
*totalpages = total_swap_pages;
for_each_node_mask(nid, *nodemask)
*totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (!task->mm)
return OOM_SCAN_CONTINUE;
- if (task->flags & PF_EXITING) {
+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task))
+ return OOM_SCAN_SELECT;
+
+ if (task->flags & PF_EXITING && !force_kill) {
/*
- * If task is current and is in the process of releasing memory,
- * allow the "kill" to set TIF_MEMDIE, which will allow it to
- * access memory reserves. Otherwise, it may stall forever.
- *
- * The iteration isn't broken here, however, in case other
- * threads are found to have already been oom killed.
+ * If this task is not being ptraced on exit, then wait for it
+ * to finish before killing some other task unnecessarily.
*/
- if (task == current)
- return OOM_SCAN_SELECT;
- else if (!force_kill) {
- /*
- * If this task is not being ptraced on exit, then wait
- * for it to finish before killing some other task
- * unnecessarily.
- */
- if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
- return OOM_SCAN_ABORT;
- }
+ if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
+ return OOM_SCAN_ABORT;
}
return OOM_SCAN_OK;
}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%d\n",
+ "oom_score_adj=%hd\n",
current->comm, gfp_mask, order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
spin_unlock(&zone_scan_lock);
}
-/*
- * Try to acquire the oom killer lock for all system zones. Returns zero if a
- * parallel oom killing is taking place, otherwise locks all zones and returns
- * non-zero.
- */
-static int try_set_system_oom(void)
-{
- struct zone *zone;
- int ret = 1;
-
- spin_lock(&zone_scan_lock);
- for_each_populated_zone(zone)
- if (zone_is_oom_locked(zone)) {
- ret = 0;
- goto out;
- }
- for_each_populated_zone(zone)
- zone_set_flag(zone, ZONE_OOM_LOCKED);
-out:
- spin_unlock(&zone_scan_lock);
- return ret;
-}
-
-/*
- * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
- * attempts or page faults may now recall the oom killer, if necessary.
- */
-static void clear_system_oom(void)
-{
- struct zone *zone;
-
- spin_lock(&zone_scan_lock);
- for_each_populated_zone(zone)
- zone_clear_flag(zone, ZONE_OOM_LOCKED);
- spin_unlock(&zone_scan_lock);
-}
-
/**
* out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
return;
/*
- * If current has a pending SIGKILL, then automatically select it. The
- * goal is to allow it to allocate so that it may quickly exit and free
- * its memory.
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
*/
- if (fatal_signal_pending(current)) {
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -756,15 +671,16 @@ out:
/*
* The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
- * oom killing is already in progress so do nothing. If a task is found with
- * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
+ * parallel oom killing is already in progress so do nothing.
*/
void pagefault_out_of_memory(void)
{
- if (try_set_system_oom()) {
+ struct zonelist *zonelist = node_zonelist(first_online_node,
+ GFP_KERNEL);
+
+ if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
- clear_system_oom();
+ clear_zonelist_oom(zonelist, GFP_KERNEL);
}
- schedule_timeout_killable(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..6f4271224493 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
}
/*
- * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
- * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
- * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
- unsigned long nr_pages_dirtied)
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ratelimit;
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
*/
p = &__get_cpu_var(dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
+ unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(mapping, current->nr_dirtied);
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
void throttle_vm_writeout(gfp_t gfp_mask)
{
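The renamed helper keeps the calling pattern its kerneldoc describes: callers invoke it once per newly dirtied page, and the function itself decides, from a per-task counter and a ratelimit, when the expensive balancing work actually runs. The shape of that pattern reduced to plain userspace C (the threshold and the "expensive" step are invented for illustration):

    /* Sketch of "call cheaply every time, do real work rarely". */
    #include <stdio.h>

    static unsigned int nr_dirtied;            /* ~ current->nr_dirtied */
    static const unsigned int ratelimit = 32;  /* hypothetical threshold */

    static void balance_dirty_pages(void)      /* stands in for the expensive path */
    {
        printf("balancing after %u pages\n", nr_dirtied);
        nr_dirtied = 0;
    }

    static void dirtied_one_page(void)         /* call once per newly dirtied page */
    {
        if (++nr_dirtied >= ratelimit)
            balance_dirty_pages();
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            dirtied_one_page();
        return 0;
    }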
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bcb72c6e2b2d..d037c8bc1512 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
+#ifdef CONFIG_MOVABLE_NODE
+ [N_MEMORY] = { { [0] = 1UL } },
+#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -523,7 +526,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
- * -- wli
+ * -- nyc
*/
static inline void __free_one_page(struct page *page,
@@ -608,6 +611,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
+ reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -667,11 +671,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- if (is_migrate_cma(mt))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+ if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+ }
} while (--to_free && --batch_free && !list_empty(list));
}
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -730,6 +736,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() are used only at
+ * boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
@@ -745,6 +758,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
set_page_count(p, 0);
}
+ page_zone(page)->managed_pages += 1 << order;
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -780,7 +794,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
- * -- wli
+ * -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
@@ -1392,21 +1406,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
zone = page_zone(page);
order = page_order(page);
+ mt = get_pageblock_migratetype(page);
- /* Obey watermarks as if the page was being allocated */
- watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- return 0;
+ if (mt != MIGRATE_ISOLATE) {
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+ }
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
- mt = get_pageblock_migratetype(page);
- if (unlikely(mt != MIGRATE_ISOLATE))
- __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
-
if (alloc_order != order)
expand(zone, page, alloc_order, order,
&zone->free_area[order], migratetype);
@@ -1422,7 +1437,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
}
}
- return 1UL << order;
+ return 1UL << alloc_order;
}
/*
@@ -1692,7 +1707,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
@@ -1721,7 +1736,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
- &node_states[N_HIGH_MEMORY];
+ &node_states[N_MEMORY];
return allowednodes;
}
@@ -1871,7 +1886,7 @@ zonelist_scan:
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1932,8 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ !did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
@@ -1936,7 +1952,7 @@ zonelist_scan:
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
- if (NUMA_BUILD && zlc_active &&
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
@@ -1962,11 +1978,11 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
@@ -2266,7 +2282,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
zlc_clear_zones_full(zonelist);
retry:
@@ -2412,12 +2428,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx,
- zone_idx(preferred_zone));
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2512,7 @@ rebalance:
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
- (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+ (gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
@@ -2818,7 +2836,7 @@ unsigned int nr_free_pagecache_pages(void)
static inline void show_node(struct zone *zone)
{
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
printk("Node %d ", zone_to_nid(zone));
}
@@ -2876,6 +2894,31 @@ out:
#define K(x) ((x) << (PAGE_SHIFT-10))
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RESERVE] = 'R',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+ [MIGRATE_ISOLATE] = 'I',
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk("(%s) ", tmp);
+}
+
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -2950,6 +2993,7 @@ void show_free_areas(unsigned int filter)
" isolated(anon):%lukB"
" isolated(file):%lukB"
" present:%lukB"
+ " managed:%lukB"
" mlocked:%lukB"
" dirty:%lukB"
" writeback:%lukB"
@@ -2979,6 +3023,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_ISOLATED_ANON)),
K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
+ K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_FILE_DIRTY)),
K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3004,6 +3049,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
@@ -3012,12 +3058,24 @@ void show_free_areas(unsigned int filter)
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr[order] = zone->free_area[order].nr_free;
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!list_empty(&area->free_list[type]))
+ types[order] |= 1 << type;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++)
+ for (order = 0; order < MAX_ORDER; order++) {
printk("%lu*%lukB ", nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
printk("= %lukB\n", K(total));
}
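With this change each per-order entry in the free-area dump gains a parenthesised set of letters, e.g. "3*64kB (UEM)", naming the migrate types that still have free blocks at that order. A tiny decoder for those letters, following the types[] table added above:

    /* Decode the migrate-type letters printed by show_free_areas(). */
    #include <stdio.h>
    #include <string.h>

    static const char *decode(char c)
    {
        switch (c) {
        case 'U': return "unmovable";
        case 'E': return "reclaimable";
        case 'M': return "movable";
        case 'R': return "reserve";
        case 'C': return "CMA";
        case 'I': return "isolate";
        default:  return "?";
        }
    }

    int main(void)
    {
        const char *sample = "UEM";   /* as seen in a line like "3*64kB (UEM)" */

        for (size_t i = 0; i < strlen(sample); i++)
            printf("%c = %s\n", sample[i], decode(sample[i]));
        return 0;
    }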
@@ -3194,7 +3252,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
return node;
}
- for_each_node_state(n, N_HIGH_MEMORY) {
+ for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -3336,7 +3394,7 @@ static int default_zonelist_order(void)
* local memory, NODE_ORDER may be suitable.
*/
average_size = total_size /
- (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+ (nodes_weight(node_states[N_MEMORY]) + 1);
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
@@ -3826,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
+ reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4432,6 +4491,26 @@ void __init set_pageblock_order(void)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
+{
+ unsigned long pages = spanned_pages;
+
+ /*
+ * Provide a more accurate estimation if there are holes within
+ * the zone and SPARSEMEM is in use. If there are holes within the
+ * zone, each populated memory region may cost us one or two extra
+ * memmap pages due to alignment, because the memmap pages for each
+ * populated region may not be naturally aligned on a page boundary.
+ * So the (present_pages >> 4) heuristic is a tradeoff for that.
+ */
+ if (spanned_pages > present_pages + (present_pages >> 4) &&
+ IS_ENABLED(CONFIG_SPARSEMEM))
+ pages = present_pages;
+
+ return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
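The estimate added above is pure arithmetic, so it is easy to check with concrete numbers: the spanned size is trusted only while it does not exceed the present size by more than present_pages >> 4 (about 6%). A standalone re-implementation with example inputs; 4 KiB pages and a 64-byte struct page are assumptions, and the SPARSEMEM check is dropped:

    /* Standalone memmap-size estimate, for checking the numbers. */
    #include <stdio.h>

    #define PAGE_SIZE        4096UL
    #define STRUCT_PAGE_SIZE 64UL   /* assumed sizeof(struct page) */

    static unsigned long calc_memmap_size(unsigned long spanned, unsigned long present)
    {
        unsigned long pages = spanned;

        /* Large holes: base the estimate on present_pages instead. */
        if (spanned > present + (present >> 4))
            pages = present;

        /* Round the memmap bytes up to whole pages. */
        return (pages * STRUCT_PAGE_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
        printf("%lu memmap pages (no holes)\n",
               calc_memmap_size(262144, 262144));    /* 1 GiB zone -> 4096 */
        printf("%lu memmap pages (large holes)\n",
               calc_memmap_size(1048576, 262144));   /* falls back to present */
        return 0;
    }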
@@ -4449,54 +4528,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, memmap_pages;
+ unsigned long size, realsize, freesize, memmap_pages;
size = zone_spanned_pages_in_node(nid, j, zones_size);
- realsize = size - zone_absent_pages_in_node(nid, j,
+ realsize = freesize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
- * Adjust realsize so that it accounts for how much memory
+ * Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages =
- PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
- if (realsize >= memmap_pages) {
- realsize -= memmap_pages;
+ memmap_pages = calc_memmap_size(size, realsize);
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
- " %s zone: %lu pages exceeds realsize %lu\n",
- zone_names[j], memmap_pages, realsize);
+ " %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
/* Account for reserved pages */
- if (j == 0 && realsize > dma_reserve) {
- realsize -= dma_reserve;
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
- nr_kernel_pages += realsize;
- nr_all_pages += realsize;
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
zone->spanned_pages = size;
- zone->present_pages = realsize;
+ zone->present_pages = freesize;
+ /*
+ * Set an approximate value for lowmem here; it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+ zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+ zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
@@ -4687,7 +4779,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
*/
static unsigned long __init early_calculate_totalpages(void)
{
@@ -4700,7 +4792,7 @@ static unsigned long __init early_calculate_totalpages(void)
totalpages += pages;
if (pages)
- node_set_state(nid, N_HIGH_MEMORY);
+ node_set_state(nid, N_MEMORY);
}
return totalpages;
}
@@ -4717,9 +4809,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
- nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+ nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
- int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+ int usable_nodes = nodes_weight(node_states[N_MEMORY]);
/*
* If movablecore was specified, calculate what size of
@@ -4754,7 +4846,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
/*
@@ -4846,23 +4938,27 @@ restart:
out:
/* restore the node_state */
- node_states[N_HIGH_MEMORY] = saved_node_state;
+ node_states[N_MEMORY] = saved_node_state;
}
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
{
-#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
- for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ return;
+
+ for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (zone->present_pages) {
- node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+ zone_type <= ZONE_NORMAL)
+ node_set_state(nid, N_NORMAL_MEMORY);
break;
}
}
-#endif
}
/**
@@ -4945,8 +5041,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
- node_set_state(nid, N_HIGH_MEMORY);
- check_for_regular_memory(pgdat);
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat, nid);
}
}
@@ -5174,10 +5270,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
- zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
- zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
- zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
-
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -5575,7 +5667,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
* expect this function should be exact.
*/
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
int mt;
@@ -5610,6 +5703,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
continue;
}
+ /*
+ * The HWPoisoned page may not be in the buddy system, and
+ * page_count() is not 0.
+ */
+ if (skip_hwpoisoned_pages && PageHWPoison(page))
+ continue;
+
if (!PageLRU(page))
found++;
/*
@@ -5652,7 +5752,7 @@ bool is_pageblock_removable_nolock(struct page *page)
zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
- return !has_unmovable_pages(zone, page, 0);
+ return !has_unmovable_pages(zone, page, 0, true);
}
#ifdef CONFIG_CMA
@@ -5679,7 +5779,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned int tries = 0;
int ret = 0;
- migrate_prep_local();
+ migrate_prep();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
@@ -5707,61 +5807,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages,
alloc_migrate_target,
- 0, false, MIGRATE_SYNC);
+ 0, false, MIGRATE_SYNC,
+ MR_CMA);
}
- putback_lru_pages(&cc->migratepages);
+ putback_movable_pages(&cc->migratepages);
return ret > 0 ? 0 : ret;
}
-/*
- * Update zone's cma pages counter used for watermark level calculation.
- */
-static inline void __update_cma_watermarks(struct zone *zone, int count)
-{
- unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
- zone->min_cma_pages += count;
- spin_unlock_irqrestore(&zone->lock, flags);
- setup_per_zone_wmarks();
-}
-
-/*
- * Trigger memory pressure bump to reclaim some pages in order to be able to
- * allocate 'count' pages in single page units. Does similar work as
- *__alloc_pages_slowpath() function.
- */
-static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zonelist *zonelist = node_zonelist(0, gfp_mask);
- int did_some_progress = 0;
- int order = 1;
-
- /*
- * Increase level of watermarks to force kswapd do his job
- * to stabilise at new watermark level.
- */
- __update_cma_watermarks(zone, count);
-
- /* Obey watermarks as if the page was being allocated */
- while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
- wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
-
- did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
- NULL);
- if (!did_some_progress) {
- /* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, NULL, false);
- }
- }
-
- /* Restore original watermark levels. */
- __update_cma_watermarks(zone, -count);
-
- return count;
-}
-
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -5785,7 +5838,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
{
- struct zone *zone = page_zone(pfn_to_page(start));
unsigned long outer_start, outer_end;
int ret = 0, order;
@@ -5823,7 +5875,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ pfn_max_align_up(end), migratetype,
+ false);
if (ret)
return ret;
@@ -5862,18 +5915,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end)) {
+ if (test_pages_isolated(outer_start, end, false)) {
pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
outer_start, end);
ret = -EBUSY;
goto done;
}
- /*
- * Reclaim enough pages to make sure that contiguous allocation
- * will not starve the system.
- */
- __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
/* Grab isolated pages from freelists. */
outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5931,7 +5979,6 @@ void __meminit zone_pcp_update(struct zone *zone)
}
#endif
-#ifdef CONFIG_MEMORY_HOTREMOVE
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;
@@ -5951,6 +5998,7 @@ void zone_pcp_reset(struct zone *zone)
local_irq_restore(flags);
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
*/
@@ -5977,6 +6025,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
+ /*
+ * The HWPoisoned page may not be in the buddy system, and
+ * page_count() is not 0.
+ */
+ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+ pfn++;
+ SetPageReserved(page);
+ continue;
+ }
+
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
@@ -5987,8 +6045,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
- __mod_zone_page_state(zone, NR_FREE_PAGES,
- - (1UL << order));
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+ offline_page_cgroup(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
if (mem_cgroup_disabled())
return;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
start_pfn = node_start_pfn(nid);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..9d2264ea4606 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype)
zone->nr_pageblock_isolate--;
}
-int set_migratetype_isolate(struct page *page)
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page)
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found))
+ if (!has_unmovable_pages(zone, page, arg.pages_found,
+ skip_hwpoisoned_pages))
ret = 0;
/*
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype)
+ unsigned migratetype, bool skip_hwpoisoned_pages)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page)) {
+ if (page &&
+ set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
undo_pfn = pfn;
goto undo;
}
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Returns 1 if all pages in the range are isolated.
*/
static int
-__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
+ bool skip_hwpoisoned_pages)
{
struct page *page;
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
else if (page_count(page) == 0 &&
get_freepage_migratetype(page) == MIGRATE_ISOLATE)
pfn += 1;
+ else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
+ /*
+ * The HWPoisoned page may not be in the buddy
+ * system, and page_count() is not 0.
+ */
+ pfn++;
+ continue;
+ }
else
break;
}
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
return 1;
}
-int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+ bool skip_hwpoisoned_pages)
{
unsigned long pfn, flags;
struct page *page;
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
/* Check all pages are free or Marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+ ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+ skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index ddc5efb9c5bb..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
if (!chunk)
return;
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}
/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
+
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed, and
- * writable). Furthermore, we know it always gets set to a "more
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
+ if (pte_accessible(pte))
+ flush_tlb_page(vma, address);
return pte;
}
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
- * anon_vma->mutex
+ * anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*/
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma() such that
+ * Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * mutex_trylock() from page_lock_anon_vma(). This orders:
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma() VS put_anon_vma()
- * mutex_trylock() atomic_dec_and_test()
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
* LOCK MB
- * atomic_read() mutex_is_locked()
+ * atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
- if (mutex_is_locked(&anon_vma->root->mutex)) {
- anon_vma_lock(anon_vma);
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma);
}
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
root = new_root;
- mutex_lock(&root->mutex);
+ down_write(&root->rwsem);
}
return root;
}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
}
/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma);
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
- * needing to acquire the anon_vma->root->mutex.
+ * needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- mutex_init(&anon_vma->mutex);
+ init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT;
}
@@ -442,7 +442,7 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
- if (mutex_trylock(&root_anon_vma->mutex)) {
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
- mutex_unlock(&root_anon_vma->mutex);
+ up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock();
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
- * we'll deadlock on the anon_vma_lock() recursion.
+ * we'll deadlock on the anon_vma_lock_write() recursion.
*/
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
@@ -504,9 +504,9 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return address;
}
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ pmd = NULL;
+out:
+ return pmd;
+}
+
/*
* Check that @page is mapped at @address into @mm.
*
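The new helper only centralises a walk that several rmap paths used to open-code: descend pgd, pud, pmd and give up with NULL as soon as a level is not present. The same early-out shape in a self-contained toy with a three-level pointer table (none of this is kernel API):

    /* Toy three-level table: lookup returns NULL if any level is absent. */
    #include <stdio.h>

    struct l3 { int value; };
    struct l2 { struct l3 *l3; };
    struct l1 { struct l2 *l2; };

    static struct l3 *find_leaf(struct l1 *l1)
    {
        struct l2 *l2;
        struct l3 *l3 = NULL;

        if (!l1)
            goto out;
        l2 = l1->l2;
        if (!l2)
            goto out;
        l3 = l2->l3;   /* may still be NULL, like a missing pmd */
    out:
        return l3;
    }

    int main(void)
    {
        struct l3 leaf = { .value = 42 };
        struct l2 mid  = { .l3 = &leaf };
        struct l1 top  = { .l2 = &mid };
        struct l3 *hit = find_leaf(&top);

        printf("%d\n", hit ? hit->value : -1);
        return 0;
    }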
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
goto check;
}
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return NULL;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return NULL;
if (pmd_trans_huge(*pmd))
return NULL;
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
struct anon_vma_chain *avc;
int referenced = 0;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return referenced;
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return referenced;
}
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
* containing the swap entry, but page not yet written to swap.
*
* And we can skip it on file pages, so long as the filesystem
- * participates in dirty tracking; but need to catch shm and tmpfs
- * and ramfs pages which have been modified since creation by read
- * fault.
+ * participates in dirty tracking (note that this is not only an
+ * optimization but also solves problems caused by the dirty flag in
+ * the storage key getting set by a write from inside the kernel); but need to
+ * catch shm and tmpfs and ramfs pages which have been modified since
+ * creation by read fault.
*
* Note that mapping must be decided above, before decrementing
* mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ if (!PageHuge(page)) {
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+ }
set_pte_at(mm, address, pte,
- swp_entry_to_pte(make_hwpoison_entry(page)));
+ swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -1299,7 +1315,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
struct vm_area_struct *vma, struct page *check_page)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (end > vma->vm_end)
end = vma->vm_end;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return ret;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return ret;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return ret;
mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return ret;
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return ret;
}
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
int ret = SWAP_AGAIN;
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
return ret;
}
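
A minimal sketch of the read-side locking the renamed helpers provide, assuming a stand-alone walker is enough to show the pattern (the real user is rmap_walk_anon() above; example_walk_anon_read_side is hypothetical):

static void example_walk_anon_read_side(struct page *page, pgoff_t pgoff)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct anon_vma_chain *avc;

	if (!anon_vma)
		return;

	anon_vma_lock_read(anon_vma);	/* shared rwsem: other readers may walk concurrently */
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		/* inspect avc->vma; writers (vma insertion/removal) are excluded */
	}
	anon_vma_unlock_read(anon_vma);
}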
diff --git a/mm/shmem.c b/mm/shmem.c
index 89341b658bd0..03f9ba8fb8e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
-
- spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, index));
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
/* Bias interleave by inode number to distribute better across nodes */
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
- pvma.vm_policy = spol;
- return swapin_readahead(swap, gfp, &pvma, 0);
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+ page = swapin_readahead(swap, gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- /*
- * alloc_page_vma() will drop the shared policy reference
- */
- return alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
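
Both shmem helpers above now follow the same ownership rule: mpol_shared_policy_lookup() hands back a referenced policy, and the helper itself drops that reference with mpol_cond_put() once the page has been obtained, instead of passing a conditionally copied policy down. A condensed restatement of the pattern (example_shmem_alloc_with_policy is hypothetical):

static struct page *example_shmem_alloc_with_policy(gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;

	/* Pseudo vma that only carries the shared policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	page = alloc_page_vma(gfp, &pvma, 0);

	/* mpol_cond_put() only drops shared policies, hence "cond" */
	mpol_cond_put(pvma.vm_policy);
	return page;
}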
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pgoff_t index, pgoff_t end, int origin)
+{
+ struct page *page;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool done = false;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (origin == SEEK_DATA)
+ index = end;
+ break;
+ }
+ for (i = 0; i < pvec.nr; i++, index++) {
+ if (index < indices[i]) {
+ if (origin == SEEK_HOLE) {
+ done = true;
+ break;
+ }
+ index = indices[i];
+ }
+ page = pvec.pages[i];
+ if (page && !radix_tree_exceptional_entry(page)) {
+ if (!PageUptodate(page))
+ page = NULL;
+ }
+ if (index >= end ||
+ (page && origin == SEEK_DATA) ||
+ (!page && origin == SEEK_HOLE)) {
+ done = true;
+ break;
+ }
+ }
+ shmem_deswap_pagevec(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+ }
+ return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t start, end;
+ loff_t new_offset;
+
+ if (origin != SEEK_DATA && origin != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, origin,
+ MAX_LFS_FILESIZE, i_size_read(inode));
+ mutex_lock(&inode->i_mutex);
+ /* We're holding i_mutex so we can access i_size directly */
+
+ if (offset < 0)
+ offset = -EINVAL;
+ else if (offset >= inode->i_size)
+ offset = -ENXIO;
+ else {
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+ new_offset <<= PAGE_CACHE_SHIFT;
+ if (new_offset > offset) {
+ if (new_offset < inode->i_size)
+ offset = new_offset;
+ else if (origin == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ offset = inode->i_size;
+ }
+ }
+
+ if (offset >= 0 && offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
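
With shmem_file_llseek() wired into shmem_file_operations below, SEEK_DATA and SEEK_HOLE start working on tmpfs. A user-space sketch of what that enables; the file path is hypothetical and any sparse tmpfs file would do:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical path: any sparse file on a tmpfs mount works. */
	int fd = open("/dev/shm/sparse-example", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;

	data = lseek(fd, 0, SEEK_DATA);		/* first data at or after offset 0 */
	if (data >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);	/* first hole at or after that data */
		printf("data at %lld, next hole at %lld\n",
		       (long long)data, (long long)hole);
	}

	close(fd);
	return 0;
}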
+
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = generic_file_llseek,
+ .llseek = shmem_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..487f0bdd53c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;
- offline_node = marg->status_change_nid;
+ offline_node = marg->status_change_nid_normal;
/*
* If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
- int nid = marg->status_change_nid;
+ int nid = marg->status_change_nid_normal;
int ret = 0;
/*
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
}
#else
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
- memset(ret, 0, memmap_size);
return ret;
}
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
+ struct page *page = virt_to_page(memmap);
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (memmap) {
- struct page *memmap_page;
- memmap_page = virt_to_page(memmap);
-
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
- free_map_bootmem(memmap_page, nr_pages);
+ free_map_bootmem(memmap, nr_pages);
}
}
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
+ memset(memmap, 0, sizeof(struct page) * nr_pages);
+
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -773,6 +772,27 @@ out:
return ret;
}
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+ int i;
+
+ if (!memmap)
+ return;
+
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ if (PageHWPoison(&memmap[i])) {
+ atomic_long_sub(1, &mce_bad_pages);
+ ClearPageHWPoison(&memmap[i]);
+ }
+ }
+}
+#else
+static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
ms->pageblock_flags = NULL;
}
+ clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
free_section_usemap(memmap, usemap);
}
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
-static void enable_swap_info(struct swap_info_struct *p, int prio,
+static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
unsigned long *frontswap_map)
{
int i, prev;
- spin_lock(&swap_lock);
if (prio >= 0)
p->prio = prio;
else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+}
+
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
+{
+ spin_lock(&swap_lock);
+ _enable_swap_info(p, prio, swap_map, frontswap_map);
frontswap_init(p->type);
spin_unlock(&swap_lock);
}
+static void reinsert_swap_info(struct swap_info_struct *p)
+{
+ spin_lock(&swap_lock);
+ _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ spin_unlock(&swap_lock);
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int oom_score_adj;
int i, type, prev;
int err;
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+ set_current_oom_origin();
err = try_to_unuse(type, false, 0); /* force all pages to be unused */
- compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
+ clear_current_oom_origin();
if (err) {
- /*
- * reading p->prio and p->swap_map outside the lock is
- * safe here because only sys_swapon and sys_swapoff
- * change them, and there can be no other sys_swapon or
- * sys_swapoff for this swap_info_struct at this point.
- */
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ reinsert_swap_info(p);
goto out_dput;
}
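
swapoff now brackets the unuse work with the oom-origin helpers instead of saving and restoring oom_score_adj. A hedged sketch of that bracket pattern; the callback form and example_run_as_oom_origin are illustrative, the only real user here being the try_to_unuse() call above:

static int example_run_as_oom_origin(int (*op)(void))
{
	int err;

	set_current_oom_origin();	/* make this task the preferred OOM victim */
	err = op();			/* e.g. the try_to_unuse() call above */
	clear_current_oom_origin();

	return err;
}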
diff --git a/mm/util.c b/mm/util.c
index dc3036cdcc6a..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
*
* The contents of the object pointed to are preserved up to the
* lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
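
To make the corrected kernel-doc concrete, a small hypothetical example of the documented semantics, including the @new_size == 0 case (example_krealloc_semantics is made up):

static void example_krealloc_semantics(void)
{
	char *buf, *bigger;

	buf = kmalloc(64, GFP_KERNEL);
	if (!buf)
		return;

	bigger = krealloc(buf, 128, GFP_KERNEL);	/* first 64 bytes preserved on success */
	if (!bigger) {
		kfree(buf);				/* on failure the old object is untouched */
		return;
	}

	krealloc(bigger, 0, GFP_KERNEL);		/* @new_size == 0, non-NULL @p: frees it */
}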
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
- if (NUMA_BUILD) {
+ if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
- if (NUMA_BUILD) {
+ if (IS_ENABLED(CONFIG_NUMA)) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
if (ptr == NULL)
return -ENOMEM;
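
NUMA_BUILD is replaced by IS_ENABLED(CONFIG_NUMA), which evaluates to 1 when the option is enabled and 0 otherwise while keeping the guarded code visible to the compiler. A minimal sketch of the idiom (example_alloc_numa_counters is hypothetical):

static unsigned int *example_alloc_numa_counters(void)
{
	/*
	 * Unlike an #ifdef block, the branch is always parsed and
	 * type-checked, then discarded as dead code when NUMA is off.
	 */
	if (!IS_ENABLED(CONFIG_NUMA))
		return NULL;

	return kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
}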
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 48550c66f1f2..7f3096137b8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
if (global_reclaim(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
- /* If we have very few page cache pages,
- force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
+ /*
+ * If we have very few page cache pages, force-scan
+ * anon pages.
+ */
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
goto out;
+ } else if (!inactive_file_is_low_global(zone)) {
+ /*
+ * There is enough inactive page cache, do not
+ * reclaim anything from the working set right now.
+ */
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
+ goto out;
}
}
@@ -1752,7 +1763,7 @@ out:
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (COMPACTION_BUILD && sc->order &&
+ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -2005,7 +2016,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (zone->all_unreclaimable &&
sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- if (COMPACTION_BUILD) {
+ if (IS_ENABLED(CONFIG_COMPACTION)) {
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2207,9 +2218,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
* Throttle direct reclaimers if backing storage is backed by the network
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
* depleted. kswapd will continue to make progress and wake the processes
- * when the low watermark is reached
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
*/
-static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
struct zone *zone;
@@ -2224,13 +2238,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* processes to block on log_wait_commit().
*/
if (current->flags & PF_KTHREAD)
- return;
+ goto out;
+
+ /*
+ * If a fatal signal is pending, this process should not throttle.
+	 * It should return quickly so it can exit and free its memory.
+ */
+ if (fatal_signal_pending(current))
+ goto out;
/* Check if the pfmemalloc reserves are ok */
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
pgdat = zone->zone_pgdat;
if (pfmemalloc_watermark_ok(pgdat))
- return;
+ goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2267,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat), HZ);
- return;
+
+ goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat));
+
+check_pending:
+ if (fatal_signal_pending(current))
+ return true;
+
+out:
+ return false;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2302,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.gfp_mask = sc.gfp_mask,
};
- throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
-
/*
- * Do not enter reclaim if fatal signal is pending. 1 is returned so
- * that the page allocator does not consider triggering OOM
+ * Do not enter reclaim if fatal signal was delivered while throttled.
+ * 1 is returned so that the page allocator does not OOM kill at this
+ * point.
*/
- if (fatal_signal_pending(current))
+ if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2397,6 +2425,20 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
+static bool zone_balanced(struct zone *zone, int order,
+ unsigned long balance_gap, int classzone_idx)
+{
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
+ balance_gap, classzone_idx, 0))
+ return false;
+
+ if (IS_ENABLED(CONFIG_COMPACTION) && order &&
+ !compaction_suitable(zone, order))
+ return false;
+
+ return true;
+}
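
zone_balanced() folds together the two tests kswapd previously open-coded at each site: the high-watermark check (with an optional balance gap) and, for non-zero orders, compaction suitability. A thin hypothetical wrapper restating that equivalence (example_kswapd_zone_ok is made up; the real call sites follow in the hunks below):

static bool example_kswapd_zone_ok(struct zone *zone, int order,
				   unsigned long balance_gap, int classzone_idx)
{
	/*
	 * Equivalent to the two open-coded tests removed below:
	 * zone_watermark_ok_safe(zone, order,
	 *			  high_wmark_pages(zone) + balance_gap,
	 *			  classzone_idx, 0)
	 * and, for order > 0, a compaction_suitable(zone, order) check.
	 */
	return zone_balanced(zone, order, balance_gap, classzone_idx);
}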
+
/*
* pgdat_balanced is used when checking if a node is balanced for high-order
* allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2475,8 +2517,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
continue;
}
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- i, 0))
+ if (!zone_balanced(zone, order, 0, i))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2585,8 +2626,7 @@ loop_again:
break;
}
- if (!zone_watermark_ok_safe(zone, order,
- high_wmark_pages(zone), 0, 0)) {
+ if (!zone_balanced(zone, order, 0, 0)) {
end_zone = i;
break;
} else {
@@ -2656,15 +2696,14 @@ loop_again:
* Do not reclaim more than needed for compaction.
*/
testorder = order;
- if (COMPACTION_BUILD && order &&
+ if (IS_ENABLED(CONFIG_COMPACTION) && order &&
compaction_suitable(zone, order) !=
COMPACT_SKIPPED)
testorder = 0;
if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
- !zone_watermark_ok_safe(zone, testorder,
- high_wmark_pages(zone) + balance_gap,
- end_zone, 0)) {
+ !zone_balanced(zone, testorder,
+ balance_gap, end_zone)) {
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2691,8 +2730,7 @@ loop_again:
continue;
}
- if (!zone_watermark_ok_safe(zone, testorder,
- high_wmark_pages(zone), end_zone, 0)) {
+ if (!zone_balanced(zone, testorder, 0, end_zone)) {
all_zones_ok = 0;
/*
* We are still under min water mark. This
@@ -2797,29 +2835,10 @@ out:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable &&
- sc.priority != DEF_PRIORITY)
- continue;
-
- /* Would compaction fail due to lack of free memory? */
- if (COMPACTION_BUILD &&
- compaction_suitable(zone, order) == COMPACT_SKIPPED)
- goto loop_again;
-
- /* Confirm the zone is balanced for order-0 */
- if (!zone_watermark_ok(zone, 0,
- high_wmark_pages(zone), 0, 0)) {
- order = sc.order = 0;
- goto loop_again;
- }
-
/* Check if the memory needs to be defragmented. */
if (zone_watermark_ok(zone, order,
low_wmark_pages(zone), *classzone_idx, 0))
zones_need_compaction = 0;
-
- /* If balanced, clear the congested flag */
- zone_clear_flag(zone, ZONE_CONGESTED);
}
if (zones_need_compaction)
@@ -2944,7 +2963,7 @@ static int kswapd(void *p)
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
- int ret;
+ bool ret;
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3112,7 +3131,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
@@ -3168,7 +3187,7 @@ static int __init kswapd_init(void)
int nid;
swap_setup();
- for_each_node_state(nid, N_HIGH_MEMORY)
+ for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
"pgrotated",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+#endif
#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
"compact_stall",
"compact_fail",
"compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
- if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n high %lu"
"\n scanned %lu"
"\n spanned %lu"
- "\n present %lu",
+ "\n present %lu"
+ "\n managed %lu",
zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->pages_scanned,
zone->spanned_pages,
- zone->present_pages);
+ zone->present_pages,
+ zone->managed_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
- if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
walk_zones_in_node(m, pgdat, unusable_show_print);