aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig36
-rw-r--r--mm/Kconfig.debug4
-rw-r--r--mm/Makefile5
-rw-r--r--mm/backing-dev.c124
-rw-r--r--mm/balloon_compaction.c72
-rw-r--r--mm/compaction.c68
-rw-r--r--mm/debug.c31
-rw-r--r--mm/fadvise.c4
-rw-r--r--mm/filemap.c172
-rw-r--r--mm/frame_vector.c2
-rw-r--r--mm/gup.c143
-rw-r--r--mm/hmm.c496
-rw-r--r--mm/huge_memory.c136
-rw-r--r--mm/hugetlb.c113
-rw-r--r--mm/hugetlb_cgroup.c4
-rw-r--r--mm/init-mm.c3
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kasan/common.c42
-rw-r--r--mm/kasan/kasan.h14
-rw-r--r--mm/kasan/report.c44
-rw-r--r--mm/kasan/tags_report.c24
-rw-r--r--mm/khugepaged.c383
-rw-r--r--mm/kmemleak.c358
-rw-r--r--mm/ksm.c18
-rw-r--r--mm/madvise.c390
-rw-r--r--mm/memblock.c6
-rw-r--r--mm/memcontrol.c544
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory-failure.c36
-rw-r--r--mm/memory.c17
-rw-r--r--mm/memory_hotplug.c222
-rw-r--r--mm/mempolicy.c143
-rw-r--r--mm/memremap.c457
-rw-r--r--mm/migrate.c300
-rw-r--r--mm/mincore.c19
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmap.c54
-rw-r--r--mm/mmu_gather.c2
-rw-r--r--mm/mmu_notifier.c263
-rw-r--r--mm/mprotect.c28
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c26
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c99
-rw-r--r--mm/page_ext.c23
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/page_owner.c131
-rw-r--r--mm/page_poison.c2
-rw-r--r--mm/page_vma_mapped.c3
-rw-r--r--mm/pagewalk.c126
-rw-r--r--mm/percpu.c23
-rw-r--r--mm/quicklist.c103
-rw-r--r--mm/rmap.c34
-rw-r--r--mm/shmem.c403
-rw-r--r--mm/shuffle.c2
-rw-r--r--mm/slab.c3
-rw-r--r--mm/slab.h68
-rw-r--r--mm/slab_common.c65
-rw-r--r--mm/slob.c64
-rw-r--r--mm/slub.c108
-rw-r--r--mm/sparse.c27
-rw-r--r--mm/swap.c58
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/swapfile.c41
-rw-r--r--mm/truncate.c12
-rw-r--r--mm/usercopy.c10
-rw-r--r--mm/util.c122
-rw-r--r--mm/vmalloc.c115
-rw-r--r--mm/vmpressure.c20
-rw-r--r--mm/vmscan.c324
-rw-r--r--mm/vmstat.c27
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c88
-rw-r--r--mm/zpool.c16
-rw-r--r--mm/zsmalloc.c103
-rw-r--r--mm/zswap.c15
78 files changed, 4674 insertions, 2405 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec636a1fc..a5dae9a7eb51 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -273,11 +273,6 @@ config BOUNCE
by default when ZONE_DMA or HIGHMEM is selected, but you
may say n to override this.
-config NR_QUICK
- int
- depends on QUICKLIST
- default "1"
-
config VIRT_TO_BUS
bool
help
@@ -669,23 +664,17 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
-config MIGRATE_VMA_HELPER
- bool
-
config DEV_PAGEMAP_OPS
bool
+#
+# Helpers to mirror range of the CPU page tables of a process into device page
+# tables.
+#
config HMM_MIRROR
- bool "HMM mirror CPU page table into a device page table"
- depends on (X86_64 || PPC64)
- depends on MMU && 64BIT
- select MMU_NOTIFIER
- help
- Select HMM_MIRROR if you want to mirror range of the CPU page table of a
- process into a device page table. Here, mirror means "keep synchronized".
- Prerequisites: the device must provide the ability to write-protect its
- page tables (at PAGE_SIZE granularity), and must be able to recover from
- the resulting potential page faults.
+ bool
+ depends on MMU
+ depends on MMU_NOTIFIER
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
@@ -723,6 +712,17 @@ config GUP_BENCHMARK
config GUP_GET_PTE_LOW_HIGH
bool
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
config ARCH_HAS_PTE_SPECIAL
bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 82b6a20898bd..327b3ebf23bf 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC
Also, the state of page tracking structures is checked more often as
pages are being allocated and freed, as unexpected state changes
often happen for same reasons as memory corruption (e.g. double free,
- use-after-free).
+ use-after-free). The error reports for these checks can be augmented
+ with stack traces of last allocation and freeing of the page, when
+ PAGE_OWNER is also selected and enabled on boot.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/Makefile b/mm/Makefile
index 338e528ad436..d996846697ef 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
@@ -72,7 +75,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
@@ -102,5 +104,6 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
+obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e8e89158adec..c360f6a6c844 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
+#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
- * locking.
+ * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
+ * reader side locking.
*/
DEFINE_SPINLOCK(bdi_lock);
+static u64 bdi_id_cursor;
+static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
@@ -236,8 +239,8 @@ static int __init default_bdi_init(void)
{
int err;
- bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
+ WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
@@ -615,13 +618,12 @@ out_put:
}
/**
- * wb_get_create - get wb for a given memcg, create if necessary
+ * wb_get_lookup - get wb for a given memcg
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
- * @gfp: allocation mask to use
*
- * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
- * create one. The returned wb has its refcount incremented.
+ * Try to get the wb for @memcg_css on @bdi. The returned wb has its
+ * refcount incremented.
*
* This function uses css_get() on @memcg_css and thus expects its refcnt
* to be positive on invocation. IOW, rcu_read_lock() protection on
@@ -638,6 +640,39 @@ out_put:
* each lookup. On mismatch, the existing wb is discarded and a new one is
* created.
*/
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css)
+{
+ struct bdi_writeback *wb;
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+
+ return wb;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. See wb_get_lookup() for more details.
+ */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp)
@@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
return &bdi->wb;
do {
- rcu_read_lock();
- wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
- if (wb) {
- struct cgroup_subsys_state *blkcg_css;
-
- /* see whether the blkcg association has changed */
- blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
- &io_cgrp_subsys);
- if (unlikely(wb->blkcg_css != blkcg_css ||
- !wb_tryget(wb)))
- wb = NULL;
- css_put(blkcg_css);
- }
- rcu_read_unlock();
+ wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
return wb;
@@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
EXPORT_SYMBOL(bdi_alloc_node);
+static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
+{
+ struct rb_node **p = &bdi_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct backing_dev_info *bdi;
+
+ lockdep_assert_held(&bdi_lock);
+
+ while (*p) {
+ parent = *p;
+ bdi = rb_entry(parent, struct backing_dev_info, rb_node);
+
+ if (bdi->id > id)
+ p = &(*p)->rb_left;
+ else if (bdi->id < id)
+ p = &(*p)->rb_right;
+ else
+ break;
+ }
+
+ if (parentp)
+ *parentp = parent;
+ return p;
+}
+
+/**
+ * bdi_get_by_id - lookup and get bdi from its id
+ * @id: bdi id to lookup
+ *
+ * Find bdi matching @id and get it. Returns NULL if the matching bdi
+ * doesn't exist or is already unregistered.
+ */
+struct backing_dev_info *bdi_get_by_id(u64 id)
+{
+ struct backing_dev_info *bdi = NULL;
+ struct rb_node **p;
+
+ spin_lock_bh(&bdi_lock);
+ p = bdi_lookup_rb_node(id, NULL);
+ if (*p) {
+ bdi = rb_entry(*p, struct backing_dev_info, rb_node);
+ bdi_get(bdi);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return bdi;
+}
+
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
struct device *dev;
+ struct rb_node *parent, **p;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
@@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
+
+ bdi->id = ++bdi_id_cursor;
+
+ p = bdi_lookup_rb_node(bdi->id, &parent);
+ rb_link_node(&bdi->rb_node, parent, p);
+ rb_insert_color(&bdi->rb_node, &bdi_tree);
+
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
@@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner);
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi_lock);
+ rb_erase(&bdi->rb_node, &bdi_tree);
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 83a7b614061f..26de020aae7b 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -21,7 +21,6 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
* memory corruption is possible and we should stop execution.
*/
BUG_ON(!trylock_page(page));
- list_del(&page->lru);
balloon_page_insert(b_dev_info, page);
unlock_page(page);
__count_vm_event(BALLOON_INFLATE);
@@ -33,8 +32,8 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
* @b_dev_info: balloon device descriptor where we will insert a new page to
* @pages: pages to enqueue - allocated using balloon_page_alloc.
*
- * Driver must call it to properly enqueue a balloon pages before definitively
- * removing it from the guest system.
+ * Driver must call this function to properly enqueue balloon pages before
+ * definitively removing them from the guest system.
*
* Return: number of pages that were enqueued.
*/
@@ -47,6 +46,7 @@ size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_for_each_entry_safe(page, tmp, pages, lru) {
+ list_del(&page->lru);
balloon_page_enqueue_one(b_dev_info, page);
n_pages++;
}
@@ -63,12 +63,13 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
* @n_req_pages: number of requested pages.
*
* Driver must call this function to properly de-allocate a previous enlisted
- * balloon pages before definetively releasing it back to the guest system.
+ * balloon pages before definitively releasing it back to the guest system.
* This function tries to remove @n_req_pages from the ballooned pages and
* return them to the caller in the @pages list.
*
- * Note that this function may fail to dequeue some pages temporarily empty due
- * to compaction isolated pages.
+ * Note that this function may fail to dequeue some pages even if the balloon
+ * isn't empty - since the page list can be temporarily empty due to compaction
+ * of isolated pages.
*
* Return: number of pages that were added to the @pages list.
*/
@@ -112,31 +113,35 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
/*
* balloon_page_alloc - allocates a new page for insertion into the balloon
- * page list.
+ * page list.
+ *
+ * Driver must call this function to properly allocate a new balloon page.
+ * Driver must call balloon_page_enqueue before definitively removing the page
+ * from the guest system.
*
- * Driver must call it to properly allocate a new enlisted balloon page.
- * Driver must call balloon_page_enqueue before definitively removing it from
- * the guest system. This function returns the page address for the recently
- * allocated page or NULL in the case we fail to allocate a new page this turn.
+ * Return: struct page for the allocated page or NULL on allocation failure.
*/
struct page *balloon_page_alloc(void)
{
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY);
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN);
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_alloc);
/*
- * balloon_page_enqueue - allocates a new page and inserts it into the balloon
- * page list.
- * @b_dev_info: balloon device descriptor where we will insert a new page to
+ * balloon_page_enqueue - inserts a new page into the balloon page list.
+ *
+ * @b_dev_info: balloon device descriptor where we will insert a new page
* @page: new page to enqueue - allocated using balloon_page_alloc.
*
- * Driver must call it to properly enqueue a new allocated balloon page
- * before definitively removing it from the guest system.
- * This function returns the page address for the recently enqueued page or
- * NULL in the case we fail to allocate a new page this turn.
+ * Drivers must call this function to properly enqueue a new allocated balloon
+ * page before definitively removing the page from the guest system.
+ *
+ * Drivers must not call balloon_page_enqueue on pages that have been pushed to
+ * a list with balloon_page_push before removing them with balloon_page_pop. To
+ * enqueue a list of pages, use balloon_page_list_enqueue instead.
*/
void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
struct page *page)
@@ -151,14 +156,23 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
- * the its address to allow the driver release the page.
+ * its address to allow the driver to release the page.
* @b_dev_info: balloon device decriptor where we will grab a page from.
*
- * Driver must call it to properly de-allocate a previous enlisted balloon page
- * before definetively releasing it back to the guest system.
- * This function returns the page address for the recently dequeued page or
- * NULL in the case we find balloon's page list temporarily empty due to
- * compaction isolated pages.
+ * Driver must call this function to properly dequeue a previously enqueued page
+ * before definitively releasing it back to the guest system.
+ *
+ * Caller must perform its own accounting to ensure that this
+ * function is called only if some pages are actually enqueued.
+ *
+ * Note that this function may fail to dequeue some pages even if there are
+ * some enqueued pages - since the page list can be temporarily empty due to
+ * the compaction of isolated pages.
+ *
+ * TODO: remove the caller accounting requirements, and allow caller to wait
+ * until all pages can be dequeued.
+ *
+ * Return: struct page for the dequeued page, or NULL if no page was dequeued.
*/
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
@@ -171,9 +185,9 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
if (n_pages != 1) {
/*
* If we are unable to dequeue a balloon page because the page
- * list is empty and there is no isolated pages, then something
+ * list is empty and there are no isolated pages, then something
* went out of track and some balloon pages are lost.
- * BUG() here, otherwise the balloon driver may get stuck into
+ * BUG() here, otherwise the balloon driver may get stuck in
* an infinite loop while attempting to release all its pages.
*/
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
@@ -224,8 +238,8 @@ int balloon_page_migrate(struct address_space *mapping,
/*
* We can not easily support the no copy case here so ignore it as it
- * is unlikely to be use with ballon pages. See include/linux/hmm.h for
- * user of the MIGRATE_SYNC_NO_COPY mode.
+ * is unlikely to be used with balloon pages. See include/linux/hmm.h
+ * for a user of the MIGRATE_SYNC_NO_COPY mode.
*/
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
diff --git a/mm/compaction.c b/mm/compaction.c
index 9e1b9acb116b..672d3c78c6ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
/* Ensure the start of the pageblock or zone is online and valid */
block_pfn = pageblock_start_pfn(pfn);
- block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn));
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
if (block_page) {
page = block_page;
pfn = block_pfn;
}
/* Ensure the end of the pageblock or zone is online and valid */
- block_pfn += pageblock_nr_pages;
+ block_pfn = pageblock_end_pfn(pfn) - 1;
block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
end_page = pfn_to_online_page(block_pfn);
if (!end_page)
@@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
- } while (page < end_page);
+ } while (page <= end_page);
return false;
}
@@ -842,13 +843,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Periodically drop the lock (if held) regardless of its
- * contention, to give chance to IRQs. Abort async compaction
- * if contended.
+ * contention, to give chance to IRQs. Abort completely if
+ * a fatal signal is pending.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc))
- break;
+ flags, &locked, cc)) {
+ low_pfn = 0;
+ goto fatal_pending;
+ }
if (!pfn_valid_within(low_pfn))
goto isolate_fail;
@@ -967,7 +970,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
- low_pfn += (1UL << compound_order(page)) - 1;
+ low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
}
@@ -1060,6 +1063,7 @@ isolate_abort:
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
+fatal_pending:
cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1734,8 +1738,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
- struct compact_control *cc)
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
@@ -1753,8 +1756,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
*/
low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
- if (block_start_pfn < zone->zone_start_pfn)
- block_start_pfn = zone->zone_start_pfn;
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
/*
* fast_find_migrateblock marks a pageblock skipped so to avoid
@@ -1784,8 +1787,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
- page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
- zone);
+ page = pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone);
if (!page)
continue;
@@ -2075,6 +2078,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -2155,7 +2169,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
cc->rescan = true;
}
- switch (isolate_migratepages(cc->zone, cc)) {
+ switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
@@ -2278,10 +2292,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
.search_order = order,
.gfp_mask = gfp_mask,
@@ -2302,8 +2312,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
if (capture)
current->capture_control = &capc;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
ret = compact_zone(&cc, &capc);
@@ -2405,8 +2413,6 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -2420,11 +2426,7 @@ static void compact_node(int nid)
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
compact_zone(&cc, NULL);
@@ -2526,8 +2528,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
@@ -2551,16 +2551,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
+
+ cc.zone = zone;
status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
diff --git a/mm/debug.c b/mm/debug.c
index 8345bb6e4769..0461df1207cb 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -67,28 +67,31 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx",
- page, page_ref_count(page), mapcount,
- page->mapping, page_to_pgoff(page));
if (PageCompound(page))
- pr_cont(" compound_mapcount: %d", compound_mapcount(page));
- pr_cont("\n");
- if (PageAnon(page))
- pr_warn("anon ");
- else if (PageKsm(page))
- pr_warn("ksm ");
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px "
+ "index:%#lx compound_mapcount: %d\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page),
+ compound_mapcount(page));
+ else
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page));
+ if (PageKsm(page))
+ pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ else if (PageAnon(page))
+ pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
else if (mapping) {
- pr_warn("%ps ", mapping->a_ops);
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
- pr_warn("name:\"%pd\" ", dentry);
- }
+ pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
+ } else
+ pr_warn("%ps\n", mapping->a_ops);
+ pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
-
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..4f17c83db575 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
* deactivate the pages and clear PG_Referenced.
*/
-static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
- int advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
struct inode *inode;
struct address_space *mapping;
@@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
}
return 0;
}
+EXPORT_SYMBOL(generic_fadvise);
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index d0cf700bf201..85b7d087eb45 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -40,6 +40,7 @@
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
+#include <linux/ramfs.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -126,7 +127,7 @@ static void page_cache_delete(struct address_space *mapping,
/* hugetlb pages are represented by a single entry in the xarray */
if (!PageHuge(page)) {
xas_set_order(&xas, page->index, compound_order(page));
- nr = 1U << compound_order(page);
+ nr = compound_nr(page);
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -203,8 +204,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
- } else {
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
+ } else if (PageTransHuge(page)) {
+ __dec_node_page_state(page, NR_FILE_THPS);
+ filemap_nr_thps_dec(mapping);
}
/*
@@ -281,11 +283,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
@@ -294,40 +296,43 @@ static void page_cache_delete_batch(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + compound_nr(page) - 1 == xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
@@ -408,7 +413,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -617,10 +623,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- return (!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional);
+ if (dax_mapping(mapping))
+ return mapping->nrexceptional;
+
+ return mapping->nrpages;
}
int filemap_write_and_wait(struct address_space *mapping)
@@ -1516,7 +1525,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
@@ -1531,25 +1540,19 @@ repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
+ if (!page_cache_get_speculative(page))
goto repeat;
- }
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1646,7 +1649,7 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
@@ -1731,7 +1734,6 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1742,17 +1744,13 @@ unsigned find_get_entries(struct address_space *mapping,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -1761,7 +1759,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1803,33 +1801,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1874,7 +1866,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1884,24 +1875,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1937,7 +1923,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1948,26 +1933,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -2562,12 +2542,12 @@ retry_find:
goto out_retry;
/* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
@@ -2648,7 +2628,7 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2657,24 +2637,19 @@ void filemap_map_pages(struct vm_fault *vmf,
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
@@ -2988,6 +2963,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
loff_t count;
int ret;
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
if (!iov_iter_count(from))
return 0;
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c64dca6e27c2..c431ca81dad5 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
nr_frames = vec->nr_allocated;
+ start = untagged_addr(start);
+
down_read(&mm->mmap_sem);
locked = 1;
vma = find_vma_intersection(mm, start, start + 1);
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..8f236a335ae9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,85 +29,70 @@ struct follow_page_context {
unsigned int page_mask;
};
-typedef int (*set_dirty_func_t)(struct page *page);
-
-static void __put_user_pages_dirty(struct page **pages,
- unsigned long npages,
- set_dirty_func_t sdf)
-{
- unsigned long index;
-
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
-
- /*
- * Checking PageDirty at this point may race with
- * clear_page_dirty_for_io(), but that's OK. Two key cases:
- *
- * 1) This code sees the page as already dirty, so it skips
- * the call to sdf(). That could happen because
- * clear_page_dirty_for_io() called page_mkclean(),
- * followed by set_page_dirty(). However, now the page is
- * going to get written back, which meets the original
- * intention of setting it dirty, so all is well:
- * clear_page_dirty_for_io() goes on to call
- * TestClearPageDirty(), and write the page back.
- *
- * 2) This code sees the page as clean, so it calls sdf().
- * The page stays dirty, despite being written back, so it
- * gets written back again in the next writeback cycle.
- * This is harmless.
- */
- if (!PageDirty(page))
- sdf(page);
-
- put_user_page(page);
- }
-}
-
/**
- * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
+ * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
*
* "gup-pinned page" refers to a page that has had one of the get_user_pages()
* variants called on that page.
*
* For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, releases all pages using put_user_page(),
+ * possibly via put_user_pages(), for the non-dirty case.
*
* Please see the put_user_page() documentation for details.
*
- * set_page_dirty(), which does not lock the page, is used here.
- * Therefore, it is the caller's responsibility to ensure that this is
- * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), put_user_page().
*
*/
-void put_user_pages_dirty(struct page **pages, unsigned long npages)
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
{
- __put_user_pages_dirty(pages, npages, set_page_dirty);
-}
-EXPORT_SYMBOL(put_user_pages_dirty);
+ unsigned long index;
-/**
- * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
- * @npages: number of pages in the @pages array.
- *
- * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
- *
- * Please see the put_user_page() documentation for details.
- *
- * This is just like put_user_pages_dirty(), except that it invokes
- * set_page_dirty_lock(), instead of set_page_dirty().
- *
- */
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
-{
- __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+
+ if (!make_dirty) {
+ put_user_pages(pages, npages);
+ return;
+ }
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key
+ * cases:
+ *
+ * 1) This code sees the page as already dirty, so it
+ * skips the call to set_page_dirty(). That could happen
+ * because clear_page_dirty_for_io() called
+ * page_mkclean(), followed by set_page_dirty().
+ * However, now the page is going to get written back,
+ * which meets the original intention of setting it
+ * dirty, so all is well: clear_page_dirty_for_io() goes
+ * on to call TestClearPageDirty(), and write the page
+ * back.
+ *
+ * 2) This code sees the page as clean, so it calls
+ * set_page_dirty(). The page stays dirty, despite being
+ * written back, so it gets written back again in the
+ * next writeback cycle. This is harmless.
+ */
+ if (!PageDirty(page))
+ set_page_dirty_lock(page);
+ put_user_page(page);
+ }
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);
@@ -399,7 +384,7 @@ retry_locked:
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -408,7 +393,7 @@ retry_locked:
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
+ } else if (flags & FOLL_SPLIT) {
if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
@@ -420,6 +405,10 @@ retry_locked:
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
}
return ret ? ERR_PTR(ret) :
@@ -799,6 +788,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (!nr_pages)
return 0;
+ start = untagged_addr(start);
+
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
@@ -961,6 +952,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
vm_fault_t ret, major = 0;
+ address = untagged_addr(address);
+
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
@@ -1460,7 +1453,7 @@ check_again:
* gup may start from a tail page. Advance step by the left
* part.
*/
- step = (1 << compound_order(head)) - (pages[i] - head);
+ step = compound_nr(head) - (pages[i] - head);
/*
* If we get a page from the CMA zone, since we are going to
* be pinning these entries, we might as well move them out
@@ -1980,7 +1973,8 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
}
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
unsigned long pte_end;
struct page *head, *page;
@@ -1993,7 +1987,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
pte = READ_ONCE(*ptep);
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
/* hugepages are never "special" */
@@ -2030,7 +2024,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
}
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned int pdshift, unsigned long end, int write,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
pte_t *ptep;
@@ -2040,7 +2034,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);
@@ -2048,7 +2042,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned pdshift, unsigned long end, int write,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
@@ -2056,7 +2050,8 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
diff --git a/mm/hmm.c b/mm/hmm.c
index e1eedef129cf..902f5fa6bf93 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -8,7 +8,7 @@
* Refer to include/linux/hmm.h for information about heterogeneous memory
* management or HMM for short.
*/
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
@@ -26,101 +26,37 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-
-/**
- * hmm_get_or_create - register HMM against an mm (HMM internal)
- *
- * @mm: mm struct to attach to
- * Returns: returns an HMM object, either by referencing the existing
- * (per-process) object, or by creating a new one.
- *
- * This is not intended to be used directly by device drivers. If mm already
- * has an HMM struct then it get a reference on it and returns it. Otherwise
- * it allocates an HMM struct, initializes it, associate it with the mm and
- * returns it.
- */
-static struct hmm *hmm_get_or_create(struct mm_struct *mm)
+static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
{
struct hmm *hmm;
- lockdep_assert_held_write(&mm->mmap_sem);
-
- /* Abuse the page_table_lock to also protect mm->hmm. */
- spin_lock(&mm->page_table_lock);
- hmm = mm->hmm;
- if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
- goto out_unlock;
- spin_unlock(&mm->page_table_lock);
-
- hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+ hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
- hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
spin_lock_init(&hmm->ranges_lock);
- kref_init(&hmm->kref);
hmm->notifiers = 0;
- hmm->mm = mm;
-
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
- kfree(hmm);
- return NULL;
- }
-
- mmgrab(hmm->mm);
-
- /*
- * We hold the exclusive mmap_sem here so we know that mm->hmm is
- * still NULL or 0 kref, and is safe to update.
- */
- spin_lock(&mm->page_table_lock);
- mm->hmm = hmm;
-
-out_unlock:
- spin_unlock(&mm->page_table_lock);
- return hmm;
+ return &hmm->mmu_notifier;
}
-static void hmm_free_rcu(struct rcu_head *rcu)
+static void hmm_free_notifier(struct mmu_notifier *mn)
{
- struct hmm *hmm = container_of(rcu, struct hmm, rcu);
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- mmdrop(hmm->mm);
+ WARN_ON(!list_empty(&hmm->ranges));
+ WARN_ON(!list_empty(&hmm->mirrors));
kfree(hmm);
}
-static void hmm_free(struct kref *kref)
-{
- struct hmm *hmm = container_of(kref, struct hmm, kref);
-
- spin_lock(&hmm->mm->page_table_lock);
- if (hmm->mm->hmm == hmm)
- hmm->mm->hmm = NULL;
- spin_unlock(&hmm->mm->page_table_lock);
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
- mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
-}
-
-static inline void hmm_put(struct hmm *hmm)
-{
- kref_put(&hmm->kref, hmm_free);
-}
-
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
- /* Bail out if hmm is in the process of being freed */
- if (!kref_get_unless_zero(&hmm->kref))
- return;
-
/*
* Since hmm_range_register() holds the mmget() lock hmm_release() is
* prevented as long as a range exists.
@@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
mirror->ops->release(mirror);
}
up_read(&hmm->mirrors_sem);
-
- hmm_put(hmm);
}
static void notifiers_decrement(struct hmm *hmm)
@@ -165,23 +99,14 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
{
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
- struct hmm_update update;
struct hmm_range *range;
unsigned long flags;
int ret = 0;
- if (!kref_get_unless_zero(&hmm->kref))
- return 0;
-
- update.start = nrange->start;
- update.end = nrange->end;
- update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = mmu_notifier_range_blockable(nrange);
-
spin_lock_irqsave(&hmm->ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, &hmm->ranges, list) {
- if (update.end < range->start || update.start >= range->end)
+ if (nrange->end < range->start || nrange->start >= range->end)
continue;
range->valid = false;
@@ -198,9 +123,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
list_for_each_entry(mirror, &hmm->mirrors, list) {
int rc;
- rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
if (rc) {
- if (WARN_ON(update.blockable || rc != -EAGAIN))
+ if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
+ rc != -EAGAIN))
continue;
ret = -EAGAIN;
break;
@@ -211,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
out:
if (ret)
notifiers_decrement(hmm);
- hmm_put(hmm);
return ret;
}
@@ -220,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
{
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- if (!kref_get_unless_zero(&hmm->kref))
- return;
-
notifiers_decrement(hmm);
- hmm_put(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
.release = hmm_release,
.invalidate_range_start = hmm_invalidate_range_start,
.invalidate_range_end = hmm_invalidate_range_end,
+ .alloc_notifier = hmm_alloc_notifier,
+ .free_notifier = hmm_free_notifier,
};
/*
@@ -242,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
*
* To start mirroring a process address space, the device driver must register
* an HMM mirror struct.
+ *
+ * The caller cannot unregister the hmm_mirror while any ranges are
+ * registered.
+ *
+ * Callers using this function must put a call to mmu_notifier_synchronize()
+ * in their module exit functions.
*/
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
+ struct mmu_notifier *mn;
+
lockdep_assert_held_write(&mm->mmap_sem);
/* Sanity check */
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
- mirror->hmm = hmm_get_or_create(mm);
- if (!mirror->hmm)
- return -ENOMEM;
+ mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
down_write(&mirror->hmm->mirrors_sem);
list_add(&mirror->list, &mirror->hmm->mirrors);
@@ -277,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
down_write(&hmm->mirrors_sem);
list_del(&mirror->list);
up_write(&hmm->mirrors_sem);
- hmm_put(hmm);
+ mmu_notifier_put(&hmm->mmu_notifier);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
@@ -285,8 +217,7 @@ struct hmm_vma_walk {
struct hmm_range *range;
struct dev_pagemap *pgmap;
unsigned long last;
- bool fault;
- bool block;
+ unsigned int flags;
};
static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
@@ -298,17 +229,27 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
vm_fault_t ret;
- flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
- flags |= write_fault ? FAULT_FLAG_WRITE : 0;
+ if (!vma)
+ goto err;
+
+ if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
+ flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (write_fault)
+ flags |= FAULT_FLAG_WRITE;
+
ret = handle_mm_fault(vma, addr, flags);
- if (ret & VM_FAULT_RETRY)
+ if (ret & VM_FAULT_RETRY) {
+ /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
return -EAGAIN;
- if (ret & VM_FAULT_ERROR) {
- *pfn = range->values[HMM_PFN_ERROR];
- return -EFAULT;
}
+ if (ret & VM_FAULT_ERROR)
+ goto err;
return -EBUSY;
+
+err:
+ *pfn = range->values[HMM_PFN_ERROR];
+ return -EFAULT;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -328,8 +269,8 @@ static int hmm_pfns_bad(unsigned long addr,
}
/*
- * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
- * @start: range virtual start address (inclusive)
+ * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * @addr: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @fault: should we fault or not ?
* @write_fault: write fault ?
@@ -346,13 +287,15 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i, page_size;
+ unsigned long i;
hmm_vma_walk->last = addr;
- page_size = hmm_range_page_size(range);
- i = (addr - range->start) >> range->page_shift;
+ i = (addr - range->start) >> PAGE_SHIFT;
+
+ if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
+ return -EPERM;
- for (; addr < end; addr += page_size, i++) {
+ for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
@@ -373,15 +316,15 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
struct hmm_range *range = hmm_vma_walk->range;
- if (!hmm_vma_walk->fault)
+ if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
return;
/*
* So we not only consider the individual per page request we also
* consider the default flags requested for the range. The API can
- * be use in 2 fashions. The first one where the HMM user coalesce
- * multiple page fault into one request and set flags per pfns for
- * of those faults. The second one where the HMM user want to pre-
+ * be used 2 ways. The first one where the HMM user coalesces
+ * multiple page faults into one request and sets flags per pfn for
+ * those faults. The second one where the HMM user wants to pre-
* fault a range with specific flags. For the latter one it is a
* waste to have the user pre-fill the pfn arrays with a default
* flags value.
@@ -391,7 +334,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
- /* If this is device memory than only fault if explicitly requested */
+ /* If this is device memory then only fault if explicitly requested */
if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
/* Do we fault on device memory ? */
if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
@@ -418,7 +361,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
unsigned long i;
- if (!hmm_vma_walk->fault) {
+ if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
*fault = *write_fault = false;
return;
}
@@ -458,22 +401,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
}
-static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
-{
- if (!pud_present(pud))
- return 0;
- return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
-}
-
-static int hmm_vma_handle_pmd(struct mm_walk *walk,
- unsigned long addr,
- unsigned long end,
- uint64_t *pfns,
- pmd_t pmd)
-{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, uint64_t *pfns, pmd_t pmd)
+{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
@@ -488,7 +419,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
if (pmd_protnone(pmd) || fault || write_fault)
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
- pfn = pmd_pfn(pmd) + pte_index(addr);
+ pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
if (pmd_devmap(pmd)) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
@@ -504,11 +435,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
}
hmm_vma_walk->last = end;
return 0;
-#else
- /* If THP is not enabled then we should never reach that code ! */
- return -EINVAL;
-#endif
}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+/* stub to allow the code below to compile */
+int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, uint64_t *pfns, pmd_t pmd);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
@@ -525,7 +457,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
bool fault, write_fault;
uint64_t cpu_flags;
pte_t pte = *ptep;
@@ -546,6 +477,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
if (!non_swap_entry(entry)) {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -574,8 +508,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (fault || write_fault) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
- migration_entry_wait(vma->vm_mm,
- pmdp, addr);
+ migration_entry_wait(walk->mm, pmdp, addr);
return -EBUSY;
}
return 0;
@@ -623,21 +556,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
pmd_t pmd;
-
again:
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
- return hmm_pfns_bad(start, end, walk);
-
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
bool fault, write_fault;
unsigned long npages;
@@ -651,7 +579,7 @@ again:
0, &fault, &write_fault);
if (fault || write_fault) {
hmm_vma_walk->last = addr;
- pmd_migration_entry_wait(vma->vm_mm, pmdp);
+ pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
return 0;
@@ -660,11 +588,11 @@ again:
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
- * No need to take pmd_lock here, even if some other threads
+ * No need to take pmd_lock here, even if some other thread
* is splitting the huge pmd we will get that event through
* mmu_notifier callback.
*
- * So just read pmd value and check again its a transparent
+ * So just read pmd value and check again it's a transparent
* huge or device mapping one and compute corresponding pfn
* values.
*/
@@ -678,7 +606,7 @@ again:
}
/*
- * We have handled all the valid case above ie either none, migration,
+ * We have handled all the valid cases above ie either none, migration,
* huge or transparent huge. At this point either it is a valid pmd
* entry pointing to pte directory or it is a bad pmd that will not
* recover.
@@ -714,10 +642,19 @@ again:
return 0;
}
-static int hmm_vma_walk_pud(pud_t *pudp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
+ struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -781,42 +718,29 @@ again:
return 0;
}
+#else
+#define hmm_vma_walk_pud NULL
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long start, unsigned long end,
struct mm_walk *walk)
{
-#ifdef CONFIG_HUGETLB_PAGE
- unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ unsigned long addr = start, i, pfn;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- struct hstate *h = hstate_vma(vma);
uint64_t orig_pfn, cpu_flags;
bool fault, write_fault;
spinlock_t *ptl;
pte_t entry;
int ret = 0;
- size = 1UL << huge_page_shift(h);
- mask = size - 1;
- if (range->page_shift != PAGE_SHIFT) {
- /* Make sure we are looking at full page. */
- if (start & mask)
- return -EINVAL;
- if (end < (start + size))
- return -EINVAL;
- pfn_inc = size >> PAGE_SHIFT;
- } else {
- pfn_inc = 1;
- size = PAGE_SIZE;
- }
-
-
- ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
entry = huge_ptep_get(pte);
- i = (start - range->start) >> range->page_shift;
+ i = (start - range->start) >> PAGE_SHIFT;
orig_pfn = range->pfns[i];
range->pfns[i] = range->values[HMM_PFN_NONE];
cpu_flags = pte_to_hmm_pfn_flags(range, entry);
@@ -828,8 +752,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
goto unlock;
}
- pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
- for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
hmm_vma_walk->last = end;
@@ -841,10 +765,10 @@ unlock:
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
return ret;
-#else /* CONFIG_HUGETLB_PAGE */
- return -EINVAL;
-#endif
}
+#else
+#define hmm_vma_walk_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
@@ -859,44 +783,32 @@ static void hmm_pfns_clear(struct hmm_range *range,
* hmm_range_register() - start tracking change to CPU page table over a range
* @range: range
* @mm: the mm struct for the range of virtual address
- * @start: start virtual address (inclusive)
- * @end: end virtual address (exclusive)
- * @page_shift: expect page shift for the range
- * Returns 0 on success, -EFAULT if the address space is no longer valid
+ *
+ * Return: 0 on success, -EFAULT if the address space is no longer valid
*
* Track updates to the CPU page table see include/linux/hmm.h
*/
-int hmm_range_register(struct hmm_range *range,
- struct hmm_mirror *mirror,
- unsigned long start,
- unsigned long end,
- unsigned page_shift)
+int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
{
- unsigned long mask = ((1UL << page_shift) - 1UL);
struct hmm *hmm = mirror->hmm;
unsigned long flags;
range->valid = false;
range->hmm = NULL;
- if ((start & mask) || (end & mask))
+ if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
return -EINVAL;
- if (start >= end)
+ if (range->start >= range->end)
return -EINVAL;
- range->page_shift = page_shift;
- range->start = start;
- range->end = end;
-
/* Prevent hmm_release() from running while the range is valid */
- if (!mmget_not_zero(hmm->mm))
+ if (!mmget_not_zero(hmm->mmu_notifier.mm))
return -EFAULT;
/* Initialize range to track CPU page table updates. */
spin_lock_irqsave(&hmm->ranges_lock, flags);
range->hmm = hmm;
- kref_get(&hmm->kref);
list_add(&range->list, &hmm->ranges);
/*
@@ -928,8 +840,7 @@ void hmm_range_unregister(struct hmm_range *range)
spin_unlock_irqrestore(&hmm->ranges_lock, flags);
/* Drop reference taken by hmm_range_register() */
- mmput(hmm->mm);
- hmm_put(hmm);
+ mmput(hmm->mmu_notifier.mm);
/*
* The range is now invalid and the ref on the hmm is dropped, so
@@ -941,105 +852,33 @@ void hmm_range_unregister(struct hmm_range *range)
}
EXPORT_SYMBOL(hmm_range_unregister);
-/*
- * hmm_range_snapshot() - snapshot CPU page table for a range
- * @range: range
- * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * permission (for instance asking for write and range is read only),
- * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
- * vma or it is illegal to access that range), number of valid pages
- * in range->pfns[] (from range start address).
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See in include/linux/hmm.h for example
- * on how to use.
- */
-long hmm_range_snapshot(struct hmm_range *range)
-{
- const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
- unsigned long start = range->start, end;
- struct hmm_vma_walk hmm_vma_walk;
- struct hmm *hmm = range->hmm;
- struct vm_area_struct *vma;
- struct mm_walk mm_walk;
-
- lockdep_assert_held(&hmm->mm->mmap_sem);
- do {
- /* If range is no longer valid force retry. */
- if (!range->valid)
- return -EAGAIN;
-
- vma = find_vma(hmm->mm, start);
- if (vma == NULL || (vma->vm_flags & device_vma))
- return -EFAULT;
-
- if (is_vm_hugetlb_page(vma)) {
- if (huge_page_shift(hstate_vma(vma)) !=
- range->page_shift &&
- range->page_shift != PAGE_SHIFT)
- return -EINVAL;
- } else {
- if (range->page_shift != PAGE_SHIFT)
- return -EINVAL;
- }
-
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it
- * does not allow write access, either. HMM does not
- * support architecture that allow write without read.
- */
- hmm_pfns_clear(range, range->pfns,
- range->start, range->end);
- return -EPERM;
- }
-
- range->vma = vma;
- hmm_vma_walk.pgmap = NULL;
- hmm_vma_walk.last = start;
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- end = min(range->end, vma->vm_end);
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pud_entry = hmm_vma_walk_pud;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
- mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
-
- walk_page_range(start, end, &mm_walk);
- start = end;
- } while (start < range->end);
-
- return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(hmm_range_snapshot);
+static const struct mm_walk_ops hmm_walk_ops = {
+ .pud_entry = hmm_vma_walk_pud,
+ .pmd_entry = hmm_vma_walk_pmd,
+ .pte_hole = hmm_vma_walk_hole,
+ .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+};
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of valid pages in range->pfns[] (from range start
- * address). This may be zero. If the return value is negative,
- * then one of the following values may be returned:
+/**
+ * hmm_range_fault - try to fault some address in a virtual address range
+ * @range: range being faulted
+ * @flags: HMM_FAULT_* flags
*
- * -EINVAL invalid arguments or mm or virtual address are in an
- * invalid vma (for instance device file vma).
- * -ENOMEM: Out of memory.
- * -EPERM: Invalid permission (for instance asking for write and
- * range is read only).
- * -EAGAIN: If you need to retry and mmap_sem was drop. This can only
- * happens if block argument is false.
- * -EBUSY: If the the range is being invalidated and you should wait
- * for invalidation to finish.
- * -EFAULT: Invalid (ie either no valid vma or it is illegal to access
- * that range), number of valid pages in range->pfns[] (from
- * range start address).
+ * Return: the number of valid pages in range->pfns[] (from range start
+ * address), which may be zero. On error one of the following status codes
+ * can be returned:
+ *
+ * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
+ * (e.g., device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (e.g., asking for write and range is read
+ * only).
+ * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped.
+ * -EBUSY: The range has been invalidated and the caller needs to wait for
+ * the invalidation to finish.
+ * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access
+ * that range) number of valid pages in range->pfns[] (from
+ * range start address).
*
* This is similar to a regular CPU page fault except that it will not trigger
* any memory migration if the memory being faulted is not accessible by CPUs
@@ -1048,39 +887,26 @@ EXPORT_SYMBOL(hmm_range_snapshot);
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
*/
-long hmm_range_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
struct hmm *hmm = range->hmm;
struct vm_area_struct *vma;
- struct mm_walk mm_walk;
int ret;
- lockdep_assert_held(&hmm->mm->mmap_sem);
+ lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
do {
/* If range is no longer valid force retry. */
- if (!range->valid) {
- up_read(&hmm->mm->mmap_sem);
- return -EAGAIN;
- }
+ if (!range->valid)
+ return -EBUSY;
- vma = find_vma(hmm->mm, start);
+ vma = find_vma(hmm->mmu_notifier.mm, start);
if (vma == NULL || (vma->vm_flags & device_vma))
return -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- if (huge_page_shift(hstate_vma(vma)) !=
- range->page_shift &&
- range->page_shift != PAGE_SHIFT)
- return -EINVAL;
- } else {
- if (range->page_shift != PAGE_SHIFT)
- return -EINVAL;
- }
-
if (!(vma->vm_flags & VM_READ)) {
/*
* If vma do not allow read access, then assume that it
@@ -1092,27 +918,18 @@ long hmm_range_fault(struct hmm_range *range, bool block)
return -EPERM;
}
- range->vma = vma;
hmm_vma_walk.pgmap = NULL;
hmm_vma_walk.last = start;
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
+ hmm_vma_walk.flags = flags;
hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
end = min(range->end, vma->vm_end);
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pud_entry = hmm_vma_walk_pud;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
- mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+ walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+ &hmm_vma_walk);
do {
- ret = walk_page_range(start, end, &mm_walk);
+ ret = walk_page_range(vma->vm_mm, start, end,
+ &hmm_walk_ops, &hmm_vma_walk);
start = hmm_vma_walk.last;
/* Keep trying while the range is valid. */
@@ -1135,25 +952,22 @@ long hmm_range_fault(struct hmm_range *range, bool block)
EXPORT_SYMBOL(hmm_range_fault);
/**
- * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
- * @range: range being faulted
- * @device: device against to dma map page to
- * @daddrs: dma address of mapped pages
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
- * drop and you need to try again, some other error value otherwise
+ * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device to map page to
+ * @daddrs: array of dma addresses for the mapped pages
+ * @flags: HMM_FAULT_*
*
- * Note same usage pattern as hmm_range_fault().
+ * Return: the number of pages mapped on success (including zero), or any
+ * status return from hmm_range_fault() otherwise.
*/
-long hmm_range_dma_map(struct hmm_range *range,
- struct device *device,
- dma_addr_t *daddrs,
- bool block)
+long hmm_range_dma_map(struct hmm_range *range, struct device *device,
+ dma_addr_t *daddrs, unsigned int flags)
{
unsigned long i, npages, mapped;
long ret;
- ret = hmm_range_fault(range, block);
+ ret = hmm_range_fault(range, flags);
if (ret <= 0)
return ret ? ret : -EBUSY;
@@ -1224,7 +1038,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
/**
* hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
* @range: range being unmapped
- * @vma: the vma against which the range (optional)
* @device: device against which dma map was done
* @daddrs: dma address of mapped pages
* @dirty: dirty page if it had the write flag set
@@ -1236,7 +1049,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
* concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
*/
long hmm_range_dma_unmap(struct hmm_range *range,
- struct vm_area_struct *vma,
struct device *device,
dma_addr_t *daddrs,
bool dirty)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1334ede667a8..13cc93785006 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -495,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- /* ->lru in the tail pages is occupied by compound_head. */
- return &page[2].deferred_list;
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
+}
+#endif
void prep_transhuge_page(struct page *page)
{
@@ -2485,6 +2500,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct page *head = compound_head(page);
pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2492,6 +2509,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2501,17 +2526,28 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
}
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
- if (PageSwapCache(head))
+ if (PageSwapCache(head)) {
page_ref_add(head, 2);
- else
+ xa_unlock(&swap_cache->i_pages);
+ } else {
page_ref_inc(head);
+ }
} else {
/* Additional pin to page cache */
page_ref_add(head, 2);
@@ -2658,6 +2694,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2744,17 +2781,22 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&pgdata->split_queue_lock);
+ spin_lock(&ds_queue->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
- if (mapping)
- __dec_node_page_state(page, NR_SHMEM_THPS);
- spin_unlock(&pgdata->split_queue_lock);
+ if (mapping) {
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_THPS);
+ else
+ __dec_node_page_state(page, NR_FILE_THPS);
+ }
+
+ spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2771,7 +2813,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
}
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
@@ -2793,53 +2835,86 @@ out:
void free_transhuge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
unsigned long flags;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /*
+ * The try_to_unmap() in page reclaim path might reach here too,
+ * this may cause a race condition to corrupt deferred split queue.
+ * And, if page reclaim is already handling the same page, it is
+ * unnecessary to handle it again in shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &pgdata->split_queue);
- pgdata->split_queue_len++;
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return READ_ONCE(pgdata->split_queue_len);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &pgdata->split_queue) {
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
@@ -2847,12 +2922,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -2866,15 +2941,15 @@ next:
put_page(page);
}
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
- list_splice_tail(&list, &pgdata->split_queue);
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
- if (!split && list_empty(&pgdata->split_queue))
+ if (!split && list_empty(&ds_queue->split_queue))
return SHRINK_STOP;
return split;
}
@@ -2883,7 +2958,8 @@ static struct shrinker deferred_split_shrinker = {
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ede7e7f5d1ab..b45a95363a84 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1084,11 +1084,10 @@ static bool pfn_range_valid_gigantic(struct zone *z,
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
@@ -1405,12 +1404,25 @@ pgoff_t __basepage_index(struct page *page)
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
+ bool alloc_try_hard = true;
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1431,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+ * indicates an overall state change. Clear bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
return page;
}
@@ -1427,7 +1455,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
@@ -1435,7 +1464,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
- nid, nmask);
+ nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
@@ -1450,14 +1479,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
if (page)
break;
}
@@ -1601,7 +1632,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -1637,7 +1668,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2207,13 +2238,33 @@ static void __init gather_bootmem_prealloc(void)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY]))
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
break;
cond_resched();
}
@@ -2225,6 +2276,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
+
+ kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
@@ -2323,6 +2376,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
+
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we can not allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
spin_lock(&hugetlb_lock);
@@ -2356,6 +2420,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
/* Fall through to decrease pool */
@@ -2388,7 +2453,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2429,6 +2495,8 @@ out:
h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+
return 0;
}
@@ -3856,6 +3924,25 @@ retry:
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
+ /*
+ * Returning error will result in faulting task being
+ * sent SIGBUS. The hugetlb fault mutex prevents two
+ * tasks from racing to fault in the same page which
+ * could result in false unable to allocate errors.
+ * Page migration does not take the fault mutex, but
+ * does a clear then write of pte's under page table
+ * lock. Page fault code could race with migration,
+ * notice the clear pte and try to allocate a page
+ * here. Before returning error, get ptl and make
+ * sure there really is no pte entry.
+ */
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ ret = 0;
+ spin_unlock(ptl);
+ goto out;
+ }
+ spin_unlock(ptl);
ret = vmf_error(PTR_ERR(page));
goto out;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f3c05b..2ac38bdc18a1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
if (!page_hcg || page_hcg != h_cg)
goto out;
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
@@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..19603302a77f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
+#include <linux/mman.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
@@ -35,6 +36,6 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
- .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
+ .cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index e32390802fd3..0d5f720c75ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2277b82902d8..6814d6d6a023 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache)
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
const void *object)
{
- BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
return (void *)object + cache->kasan_info.alloc_meta_offset;
}
@@ -315,14 +314,31 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
return (void *)object + cache->kasan_info.free_meta_offset;
}
+
+static void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
void kasan_poison_slab(struct page *page)
{
unsigned long i;
- for (i = 0; i < (1 << compound_order(page)); i++)
+ for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << compound_order(page),
+ kasan_poison_shadow(page_address(page), page_size(page),
KASAN_KMALLOC_REDZONE);
}
@@ -407,8 +423,14 @@ static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return shadow_byte < 0 ||
shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
- else
- return tag != (u8)shadow_byte;
+
+ /* else CONFIG_KASAN_SW_TAGS: */
+ if ((u8)shadow_byte == KASAN_TAG_INVALID)
+ return true;
+ if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
+ return true;
+
+ return false;
}
static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
@@ -446,7 +468,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
unlikely(!(cache->flags & SLAB_KASAN)))
return false;
- set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
+ kasan_set_free_info(cache, object, tag);
+
quarantine_put(get_free_info(cache, object), cache);
return IS_ENABLED(CONFIG_KASAN_GENERIC);
@@ -518,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_SHADOW_SCALE_SIZE);
- redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+ redzone_end = (unsigned long)ptr + page_size(page);
kasan_unpoison_shadow(ptr, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
@@ -554,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip)
kasan_report_invalid_free(ptr, ip);
return;
}
- kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
- KASAN_FREE_PAGE);
+ kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
} else {
__kasan_slab_free(page->slab_cache, ptr, ip, false);
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 014f19e76247..35cff6bbb716 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,9 +95,19 @@ struct kasan_track {
depot_stack_handle_t stack;
};
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- struct kasan_track free_track;
+ struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+ u8 free_track_idx;
+#endif
};
struct qlist_node {
@@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
+struct page *kasan_addr_to_page(const void *addr);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 0e5f965f1882..621782100eaa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
}
}
-static struct page *addr_to_page(const void *addr)
+struct page *kasan_addr_to_page(const void *addr)
{
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory))
@@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
+static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
+
static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr)
+ const void *addr, u8 tag)
{
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
if (cache->flags & SLAB_KASAN) {
+ struct kasan_track *free_track;
+
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
- print_track(&alloc_info->free_track, "Freed");
+ free_track = kasan_get_free_track(cache, object, tag);
+ print_track(free_track, "Freed");
pr_err("\n");
}
@@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr)
print_decoded_frame_descr(frame_descr);
}
-static void print_address_description(void *addr)
+static void print_address_description(void *addr, u8 tag)
{
- struct page *page = addr_to_page(addr);
+ struct page *page = kasan_addr_to_page(addr);
dump_stack();
pr_err("\n");
@@ -355,7 +378,7 @@ static void print_address_description(void *addr)
struct kmem_cache *cache = page->slab_cache;
void *object = nearest_obj(cache, page, addr);
- describe_object(cache, object, addr);
+ describe_object(cache, object, addr, tag);
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
@@ -435,13 +458,14 @@ static bool report_enabled(void)
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
+ u8 tag = get_tag(object);
+ object = reset_tag(object);
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(get_tag(object), reset_tag(object));
- object = reset_tag(object);
+ print_tags(tag, object);
pr_err("\n");
- print_address_description(object);
+ print_address_description(object, tag);
pr_err("\n");
print_shadow_for_address(object);
end_report(&flags);
@@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
pr_err("\n");
if (addr_has_shadow(untagged_addr)) {
- print_address_description(untagged_addr);
+ print_address_description(untagged_addr, get_tag(tagged_addr));
pr_err("\n");
print_shadow_for_address(info.first_bad_addr);
} else {
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
index 8eaf5f722271..969ae08f59d7 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/tags_report.c
@@ -36,6 +36,30 @@
const char *get_bug_type(struct kasan_access_info *info)
{
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = get_alloc_info(cache, object);
+
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ return "out-of-bounds";
+ }
+
+#endif
return "invalid-access";
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..a8a57bebb5fa 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -48,6 +48,7 @@ enum scan_result {
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
+ SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
+#define MAX_PTE_MAPPED_THP 8
+
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
@@ -86,6 +89,10 @@ struct mm_slot {
struct hlist_node hash;
struct list_head mm_node;
struct mm_struct *mm;
+
+ /* pte-mapped THP in this mm */
+ int nr_pte_mapped_thp;
+ unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
@@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file)) {
+
+ if (shmem_file(vma->vm_file) ||
+ (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ vma->vm_file &&
+ (vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long hstart, hend;
/*
- * khugepaged does not yet work on non-shmem files or special
- * mappings. And file-private shmem THP is not supported.
+ * khugepaged only supports read-only files for non-shmem files.
+ * khugepaged does not yet work on special mappings. And
+ * file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
@@ -710,7 +722,7 @@ static bool khugepaged_scan_abort(int nid)
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
@@ -1016,12 +1028,13 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1248,6 +1261,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
}
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct mm_slot *mm_slot;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in with
+ * as pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct page *hpage = NULL;
+ pte_t *start_pte, *pte;
+ pmd_t *pmd, _pmd;
+ spinlock_t *ptl;
+ int count = 0;
+ int i;
+
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ return;
+
+ /*
+ * This vm_flags may not have VM_HUGEPAGE if the page was not
+ * collapsed by this mm. But we can still collapse if the page is
+ * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+ * will not fail the vma for missing VM_HUGEPAGE
+ */
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ return;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd)
+ return;
+
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+ /* step 1: check all mapped PTEs are to the right huge page */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ /* empty pte, skip */
+ if (pte_none(*pte))
+ continue;
+
+ /* page swapped out, abort */
+ if (!pte_present(*pte))
+ goto abort;
+
+ page = vm_normal_page(vma, addr, *pte);
+
+ if (!page || !PageCompound(page))
+ goto abort;
+
+ if (!hpage) {
+ hpage = compound_head(page);
+ /*
+ * The mapping of the THP should not change.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may
+ * change the page table, but the new page will
+ * not pass PageCompound() check.
+ */
+ if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+ goto abort;
+ }
+
+ /*
+ * Confirm the page maps to the correct subpage.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may change
+ * the page table, but the new page will not pass
+ * PageCompound() check.
+ */
+ if (WARN_ON(hpage + i != page))
+ goto abort;
+ count++;
+ }
+
+ /* step 2: adjust rmap */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ if (pte_none(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ page_remove_rmap(page, false);
+ }
+
+ pte_unmap_unlock(start_pte, ptl);
+
+ /* step 3: set proper refcount and mm_counters. */
+ if (hpage) {
+ page_ref_sub(hpage, count);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ }
+
+ /* step 4: collapse pmd */
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ return;
+
+abort:
+ pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+ int i;
+
+ if (likely(mm_slot->nr_pte_mapped_thp == 0))
+ return 0;
+
+ if (!down_write_trylock(&mm->mmap_sem))
+ return -EBUSY;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+ mm_slot->nr_pte_mapped_thp = 0;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1256,7 +1422,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- /* probably overkill */
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth investing
+ * down_write(mmap_sem) as PMD-mapping is likely to be split
+ * later.
+ *
+ * Not that vma->anon_vma check is racy: it can be set up after
+ * the check but before we took mmap_sem by the fault path.
+ * But page lock would prevent establishing any new ptes of the
+ * page, so we are safe.
+ *
+ * An alternative would be drop the check, but check that page
+ * table is clear before calling pmdp_collapse_flush() under
+ * ptl. It has higher chance to recover THP for the VMA, but
+ * has higher cost too.
+ */
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1269,9 +1450,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
/*
* We need exclusive mmap_sem to retract page table.
- * If trylock fails we would end up with pte-mapped THP after
- * re-fault. Not ideal, but it's more important to not disturb
- * the system too much.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_sem while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
*/
if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1281,18 +1463,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
up_write(&vma->vm_mm->mmap_sem);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
}
}
i_mmap_unlock_write(mapping);
}
/**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
- * + swap in pages if necessary;
+ * + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
@@ -1304,10 +1489,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_shmem(struct mm_struct *mm,
- struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
struct page **hpage, int node)
{
+ struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
struct mem_cgroup *memcg;
@@ -1315,7 +1501,9 @@ static void collapse_shmem(struct mm_struct *mm,
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
+ bool is_shmem = shmem_file(file);
+ VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
@@ -1347,7 +1535,8 @@ static void collapse_shmem(struct mm_struct *mm,
} while (1);
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (is_shmem)
+ __SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
@@ -1362,41 +1551,64 @@ static void collapse_shmem(struct mm_struct *mm,
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
- if (!page) {
- /*
- * Stop if extent has been truncated or hole-punched,
- * and is now completely empty.
- */
- if (index == start) {
- if (!xas_next_entry(&xas, end - 1)) {
- result = SCAN_TRUNCATED;
+ if (is_shmem) {
+ if (!page) {
+ /*
+ * Stop if extent has been truncated or
+ * hole-punched, and is now completely
+ * empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
goto xa_locked;
}
- xas_set(&xas, index);
+ xas_store(&xas, new_page);
+ nr_none++;
+ continue;
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
- nr_none++;
- continue;
- }
-
- if (xa_is_value(page) || !PageUptodate(page)) {
- xas_unlock_irq(&xas);
- /* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
- result = SCAN_FAIL;
- goto xa_unlocked;
+ } else { /* !is_shmem */
+ if (!page || xa_is_value(page)) {
+ xas_unlock_irq(&xas);
+ page_cache_sync_readahead(mapping, &file->f_ra,
+ file, index,
+ PAGE_SIZE);
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
+ page = find_lock_page(mapping, index);
+ if (unlikely(page == NULL)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
}
- } else if (trylock_page(page)) {
- get_page(page);
- xas_unlock_irq(&xas);
- } else {
- result = SCAN_PAGE_LOCK;
- goto xa_locked;
}
/*
@@ -1404,7 +1616,12 @@ static void collapse_shmem(struct mm_struct *mm,
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ /* make sure the page is up to date */
+ if (unlikely(!PageUptodate(page))) {
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
/*
* If file was truncated then extended, or hole-punched, before
@@ -1420,11 +1637,27 @@ static void collapse_shmem(struct mm_struct *mm,
goto out_unlock;
}
+ if (!is_shmem && PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+ * since first write.
+ */
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ goto out_unlock;
+ }
+
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
@@ -1454,7 +1687,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
@@ -1462,12 +1695,20 @@ out_unlock:
goto xa_unlocked;
}
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ if (is_shmem)
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ else {
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+ filemap_nr_thps_inc(mapping);
+ }
+
if (nr_none) {
struct zone *zone = page_zone(new_page);
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ if (is_shmem)
+ __mod_node_page_state(zone->zone_pgdat,
+ NR_SHMEM, nr_none);
}
xa_locked:
@@ -1505,10 +1746,15 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+
+ if (is_shmem) {
+ set_page_dirty(new_page);
+ lru_cache_add_anon(new_page);
+ } else {
+ lru_cache_add_file(new_page);
+ }
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
- lru_cache_add_anon(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1523,7 +1769,9 @@ xa_unlocked:
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
+
+ if (is_shmem)
+ shmem_uncharge(mapping->host, nr_none);
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
@@ -1563,11 +1811,11 @@ out:
/* TODO: tracepoints */
}
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
+ struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
@@ -1606,7 +1854,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
break;
}
- if (page_count(page) != 1 + page_mapcount(page)) {
+ if (page_count(page) !=
+ 1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
@@ -1631,19 +1880,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
- collapse_shmem(mm, mapping, start, hpage, node);
+ collapse_file(mm, file, start, hpage, node);
}
}
/* TODO: tracepoints */
}
#else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
BUILD_BUG();
}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ return 0;
+}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1668,6 +1921,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
+ khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = mm_slot->mm;
/*
@@ -1713,17 +1967,18 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (shmem_file(vma->vm_file)) {
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (!shmem_huge_enabled(vma))
+
+ if (shmem_file(vma->vm_file)
+ && !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
- khugepaged_scan_shmem(mm, file->f_mapping,
- pgoff, hpage);
+ khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index dbbd518fb6b3..244607663363 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -114,7 +114,7 @@
/* GFP bitmask for kmemleak internal allocations */
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
- __GFP_NOWARN | __GFP_NOFAIL)
+ __GFP_NOWARN)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
@@ -168,6 +168,8 @@ struct kmemleak_object {
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN (1 << 3)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
@@ -183,6 +185,10 @@ struct kmemleak_object {
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
@@ -193,13 +199,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled;
+static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
-/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
@@ -227,49 +231,6 @@ static bool kmemleak_found_leaks;
static bool kmemleak_verbose;
module_param_named(verbose, kmemleak_verbose, bool, 0600);
-/*
- * Early object allocation/freeing logging. Kmemleak is initialized after the
- * kernel allocator. However, both the kernel allocator and kmemleak may
- * allocate memory blocks which need to be tracked. Kmemleak defines an
- * arbitrary buffer to hold the allocation/freeing information before it is
- * fully initialized.
- */
-
-/* kmemleak operation type for early logging */
-enum {
- KMEMLEAK_ALLOC,
- KMEMLEAK_ALLOC_PERCPU,
- KMEMLEAK_FREE,
- KMEMLEAK_FREE_PART,
- KMEMLEAK_FREE_PERCPU,
- KMEMLEAK_NOT_LEAK,
- KMEMLEAK_IGNORE,
- KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN,
- KMEMLEAK_SET_EXCESS_REF
-};
-
-/*
- * Structure holding the information passed to kmemleak callbacks during the
- * early logging.
- */
-struct early_log {
- int op_type; /* kmemleak operation type */
- int min_count; /* minimum reference count */
- const void *ptr; /* allocated/freed memory block */
- union {
- size_t size; /* memory block size */
- unsigned long excess_ref; /* surplus reference passing */
- };
- unsigned long trace[MAX_TRACE]; /* stack trace */
- unsigned int trace_len; /* stack trace length */
-};
-
-/* early logging buffer and current position */
-static struct early_log
- early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
-static int crt_early_log __initdata;
-
static void kmemleak_disable(void);
/*
@@ -450,6 +411,54 @@ static int get_object(struct kmemleak_object *object)
}
/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ /* try the slab allocator first */
+ if (object_cache) {
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ if (object)
+ return object;
+ }
+
+ /* slab allocation failed, try the memory pool */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ object = list_first_entry_or_null(&mem_pool_free_list,
+ typeof(*object), object_list);
+ if (object)
+ list_del(&object->object_list);
+ else if (mem_pool_free_count)
+ object = &mem_pool[--mem_pool_free_count];
+ else
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+ kmem_cache_free(object_cache, object);
+ return;
+ }
+
+ /* add the object to the memory pool free list */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ list_add(&object->object_list, &mem_pool_free_list);
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
* RCU callback to free a kmemleak_object.
*/
static void free_object_rcu(struct rcu_head *rcu)
@@ -467,7 +476,7 @@ static void free_object_rcu(struct rcu_head *rcu)
hlist_del(&area->node);
kmem_cache_free(scan_area_cache, area);
}
- kmem_cache_free(object_cache, object);
+ mem_pool_free(object);
}
/*
@@ -485,7 +494,15 @@ static void put_object(struct kmemleak_object *object)
/* should only get here after delete_object was called */
WARN_ON(object->flags & OBJECT_ALLOCATED);
- call_rcu(&object->rcu, free_object_rcu);
+ /*
+ * It may be too early for the RCU callbacks, however, there is no
+ * concurrent object_list traversal when !object_cache and all objects
+ * came from the memory pool. Free the object directly.
+ */
+ if (object_cache)
+ call_rcu(&object->rcu, free_object_rcu);
+ else
+ free_object_rcu(&object->rcu);
}
/*
@@ -510,6 +527,16 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
}
/*
+ * Remove an object from the object_tree_root and object_list. Must be called
+ * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ */
+static void __remove_object(struct kmemleak_object *object)
+{
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+}
+
+/*
* Look up an object in the object search tree and remove it from both
* object_tree_root and object_list. The returned object's use_count should be
* at least 1, as initially set by create_object().
@@ -521,10 +548,8 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
write_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- if (object) {
- rb_erase(&object->rb_node, &object_tree_root);
- list_del_rcu(&object->object_list);
- }
+ if (object)
+ __remove_object(object);
write_unlock_irqrestore(&kmemleak_lock, flags);
return object;
@@ -550,7 +575,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
- object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
@@ -689,9 +714,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
/*
* Create one or two objects that may result from the memory block
* split. Note that partial freeing is only done by free_bootmem() and
- * this happens before kmemleak_init() is called. The path below is
- * only executed during early log recording in kmemleak_init(), so
- * GFP_KERNEL is enough.
+ * this happens before kmemleak_init() is called.
*/
start = object->pointer;
end = object->pointer + object->size;
@@ -763,7 +786,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
- struct kmemleak_scan_area *area;
+ struct kmemleak_scan_area *area = NULL;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -772,13 +795,16 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- if (!area) {
- pr_warn("Cannot allocate a scan area\n");
- goto out;
- }
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
spin_lock_irqsave(&object->lock, flags);
+ if (!area) {
+ pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+ /* mark the object for full scan to avoid false positives */
+ object->flags |= OBJECT_FULL_SCAN;
+ goto out_unlock;
+ }
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
@@ -795,7 +821,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
-out:
put_object(object);
}
@@ -845,86 +870,6 @@ static void object_no_scan(unsigned long ptr)
put_object(object);
}
-/*
- * Log an early kmemleak_* call to the early_log buffer. These calls will be
- * processed later once kmemleak is fully initialized.
- */
-static void __init log_early(int op_type, const void *ptr, size_t size,
- int min_count)
-{
- unsigned long flags;
- struct early_log *log;
-
- if (kmemleak_error) {
- /* kmemleak stopped recording, just count the requests */
- crt_early_log++;
- return;
- }
-
- if (crt_early_log >= ARRAY_SIZE(early_log)) {
- crt_early_log++;
- kmemleak_disable();
- return;
- }
-
- /*
- * There is no need for locking since the kernel is still in UP mode
- * at this stage. Disabling the IRQs is enough.
- */
- local_irq_save(flags);
- log = &early_log[crt_early_log];
- log->op_type = op_type;
- log->ptr = ptr;
- log->size = size;
- log->min_count = min_count;
- log->trace_len = __save_stack_trace(log->trace);
- crt_early_log++;
- local_irq_restore(flags);
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc(struct early_log *log)
-{
- struct kmemleak_object *object;
- unsigned long flags;
- int i;
-
- if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
- return;
-
- /*
- * RCU locking needed to ensure object is not freed via put_object().
- */
- rcu_read_lock();
- object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_ATOMIC);
- if (!object)
- goto out;
- spin_lock_irqsave(&object->lock, flags);
- for (i = 0; i < log->trace_len; i++)
- object->trace[i] = log->trace[i];
- object->trace_len = log->trace_len;
- spin_unlock_irqrestore(&object->lock, flags);
-out:
- rcu_read_unlock();
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc_percpu(struct early_log *log)
-{
- unsigned int cpu;
- const void __percpu *ptr = log->ptr;
-
- for_each_possible_cpu(cpu) {
- log->ptr = per_cpu_ptr(ptr, cpu);
- early_alloc(log);
- }
-}
-
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
@@ -946,8 +891,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -975,8 +918,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1001,11 +942,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp
create_object((unsigned long)area->addr, size, 2, gfp);
object_set_excess_ref((unsigned long)area,
(unsigned long)area->addr);
- } else if (kmemleak_early_log) {
- log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
- /* reusing early_log.size for storing area->addr */
- log_early(KMEMLEAK_SET_EXCESS_REF,
- area, (unsigned long)area->addr, 0);
}
}
EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
@@ -1023,8 +959,6 @@ void __ref kmemleak_free(const void *ptr)
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -1043,8 +977,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -1065,8 +997,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
@@ -1117,8 +1047,6 @@ void __ref kmemleak_not_leak(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -1137,8 +1065,6 @@ void __ref kmemleak_ignore(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1159,8 +1085,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -1179,8 +1103,6 @@ void __ref kmemleak_no_scan(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
@@ -1408,7 +1330,8 @@ static void scan_object(struct kmemleak_object *object)
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
- if (hlist_empty(&object->area_list)) {
+ if (hlist_empty(&object->area_list) ||
+ object->flags & OBJECT_FULL_SCAN) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
void *next;
@@ -1919,12 +1842,16 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
- struct kmemleak_object *object;
+ struct kmemleak_object *object, *tmp;
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
+ /*
+ * Kmemleak has already been disabled, no need for RCU list traversal
+ * or kmemleak_lock held.
+ */
+ list_for_each_entry_safe(object, tmp, &object_list, object_list) {
+ __remove_object(object);
+ __delete_object(object);
+ }
}
/*
@@ -1993,50 +1920,27 @@ static int __init kmemleak_boot_config(char *str)
}
early_param("kmemleak", kmemleak_boot_config);
-static void __init print_log_trace(struct early_log *log)
-{
- pr_notice("Early log backtrace:\n");
- stack_trace_print(log->trace, log->trace_len, 2);
-}
-
/*
* Kmemleak initialization.
*/
void __init kmemleak_init(void)
{
- int i;
- unsigned long flags;
-
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
- kmemleak_early_log = 0;
kmemleak_disable();
return;
}
#endif
+ if (kmemleak_error)
+ return;
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
- crt_early_log);
-
- /* the kernel is still in UP mode, so disabling the IRQs is enough */
- local_irq_save(flags);
- kmemleak_early_log = 0;
- if (kmemleak_error) {
- local_irq_restore(flags);
- return;
- } else {
- kmemleak_enabled = 1;
- kmemleak_free_enabled = 1;
- }
- local_irq_restore(flags);
-
/* register the data/bss sections */
create_object((unsigned long)_sdata, _edata - _sdata,
KMEMLEAK_GREY, GFP_ATOMIC);
@@ -2047,57 +1951,6 @@ void __init kmemleak_init(void)
create_object((unsigned long)__start_ro_after_init,
__end_ro_after_init - __start_ro_after_init,
KMEMLEAK_GREY, GFP_ATOMIC);
-
- /*
- * This is the point where tracking allocations is safe. Automatic
- * scanning is started during the late initcall. Add the early logged
- * callbacks to the kmemleak infrastructure.
- */
- for (i = 0; i < crt_early_log; i++) {
- struct early_log *log = &early_log[i];
-
- switch (log->op_type) {
- case KMEMLEAK_ALLOC:
- early_alloc(log);
- break;
- case KMEMLEAK_ALLOC_PERCPU:
- early_alloc_percpu(log);
- break;
- case KMEMLEAK_FREE:
- kmemleak_free(log->ptr);
- break;
- case KMEMLEAK_FREE_PART:
- kmemleak_free_part(log->ptr, log->size);
- break;
- case KMEMLEAK_FREE_PERCPU:
- kmemleak_free_percpu(log->ptr);
- break;
- case KMEMLEAK_NOT_LEAK:
- kmemleak_not_leak(log->ptr);
- break;
- case KMEMLEAK_IGNORE:
- kmemleak_ignore(log->ptr);
- break;
- case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
- break;
- case KMEMLEAK_NO_SCAN:
- kmemleak_no_scan(log->ptr);
- break;
- case KMEMLEAK_SET_EXCESS_REF:
- object_set_excess_ref((unsigned long)log->ptr,
- log->excess_ref);
- break;
- default:
- kmemleak_warn("Unknown early log operation: %d\n",
- log->op_type);
- }
-
- if (kmemleak_warning) {
- print_log_trace(log);
- kmemleak_warning = 0;
- }
- }
}
/*
@@ -2126,7 +1979,8 @@ static int __init kmemleak_late_init(void)
mutex_unlock(&scan_mutex);
}
- pr_info("Kernel memory leak detector initialized\n");
+ pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+ mem_pool_free_count);
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..dbee2eb4dd05 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page)
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..94c343b4c968 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,15 +11,18 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
+#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@@ -29,6 +32,11 @@
#include "internal.h"
+struct madvise_walk_private {
+ struct mmu_gather *tlb;
+ bool pageout;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -40,6 +48,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
case MADV_FREE:
return 0;
default:
@@ -105,28 +115,14 @@ static long madvise_behavior(struct vm_area_struct *vma,
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
}
@@ -152,15 +148,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
goto out;
}
error = __split_vma(mm, vma, start, 1);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
if (end != vma->vm_end) {
@@ -169,15 +158,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
goto out;
}
error = __split_vma(mm, vma, end, 0);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
success:
@@ -185,6 +167,14 @@ success:
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
+
+out_convert_errno:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
out:
return error;
}
@@ -225,19 +215,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
return 0;
}
-static void force_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- struct mm_walk walk = {
- .mm = vma->vm_mm,
- .pmd_entry = swapin_walk_pmd_entry,
- .private = vma,
- };
-
- walk_page_range(start, end, &walk);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+ .pmd_entry = swapin_walk_pmd_entry,
+};
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -275,11 +255,13 @@ static long madvise_willneed(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
+ loff_t offset;
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- force_swapin_readahead(vma, start, end);
+ walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
@@ -298,12 +280,276 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ /*
+ * Filesystem's fadvise may need to take various locks. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ get_file(file);
+ up_read(&current->mm->mmap_sem);
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ return 0;
+}
+
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct madvise_walk_private *private = walk->private;
+ struct mmu_gather *tlb = private->tlb;
+ bool pageout = private->pageout;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page = NULL;
+ LIST_HEAD(page_list);
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+regular_page:
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it makes confuse idle-page tracking
+ * because they will miss recent referenced history.
+ */
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = false,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
- force_page_cache_readahead(file->f_mapping, file, start, end - start);
return 0;
}
@@ -440,20 +686,9 @@ next:
return 0;
}
-static void madvise_free_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- struct mm_walk free_walk = {
- .pmd_entry = madvise_free_pte_range,
- .mm = vma->vm_mm,
- .private = tlb,
- };
-
- tlb_start_vma(tlb, vma);
- walk_page_range(addr, end, &free_walk);
- tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+};
static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
@@ -480,7 +715,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- madvise_free_page_range(&tlb, vma, range.start, range.end);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);
@@ -519,7 +757,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
int behavior)
{
*prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -541,7 +779,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -695,6 +933,10 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -716,6 +958,8 @@ madvise_behavior_valid(int behavior)
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -810,6 +1054,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
size_t len;
struct blk_plug plug;
+ start = untagged_addr(start);
+
if (!madvise_behavior_valid(behavior))
return error;
diff --git a/mm/memblock.c b/mm/memblock.c
index 7d4f61ae666a..c4b16cae2bc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
align = SMP_CACHE_BYTES;
}
- if (end > memblock.current_limit)
- end = memblock.current_limit;
-
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
@@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal(
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
+
alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
/* retry allocation without lower limit */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cdbb7a84cb6e..46ad252e6d6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -57,6 +57,7 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
@@ -87,6 +88,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -313,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -436,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -486,7 +484,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageHead(page) && PageSlab(page))
+ if (PageSlab(page) && !PageTail(page))
memcg = memcg_from_slab_page(page);
else
memcg = READ_ONCE(page->mem_cgroup);
@@ -752,15 +750,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -768,6 +764,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+ struct page *page = virt_to_head_page(p);
+ pg_data_t *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = memcg_from_slab_page(page);
+
+ /* Untracked pages have no memcg, no lruvec. Update only the node */
+ if (!memcg || memcg == root_mem_cgroup) {
+ __mod_node_page_state(pgdat, idx, val);
+ } else {
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ __mod_lruvec_state(lruvec, idx, val);
+ }
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
@@ -944,7 +960,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1130,26 +1146,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
css_put(&prev->css);
}
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
{
- struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
int i;
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgruop1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1532,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -2229,21 +2269,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
@@ -2318,11 +2359,67 @@ static void high_work_func(struct work_struct *work)
}
/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below.
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
@@ -2331,8 +2428,75 @@ void mem_cgroup_handle_over_high(void)
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if it was a
+ * threshold of 1 page
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge patch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2371,6 +2535,15 @@ retry:
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
@@ -2784,6 +2957,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
@@ -3221,6 +3404,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3270,7 +3519,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
+ /*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -3398,6 +3654,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
@@ -4062,6 +4321,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4145,6 +4406,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4565,11 +4950,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
}
}
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
- mem_cgroup_id_get_many(memcg, 1);
-}
-
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
mem_cgroup_id_put_many(memcg, 1);
@@ -4653,6 +5033,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -4661,6 +5047,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4704,6 +5091,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4833,7 +5228,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
@@ -5029,6 +5429,8 @@ static int mem_cgroup_move_account(struct page *page,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5052,11 +5454,15 @@ static int mem_cgroup_move_account(struct page *page,
anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(pgdat, from);
+ to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -5068,16 +5474,24 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && !list_empty(page_deferred_list(page))) {
+ spin_lock(&from->deferred_split_queue.split_queue_lock);
+ list_del_init(page_deferred_list(page));
+ from->deferred_split_queue.split_queue_len--;
+ spin_unlock(&from->deferred_split_queue.split_queue_lock);
+ }
+#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5086,6 +5500,17 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && list_empty(page_deferred_list(page))) {
+ spin_lock(&to->deferred_split_queue.split_queue_lock);
+ list_add_tail(page_deferred_list(page),
+ &to->deferred_split_queue.split_queue);
+ to->deferred_split_queue.split_queue_len++;
+ spin_unlock(&to->deferred_split_queue.split_queue_lock);
+ }
+#endif
+
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -5244,17 +5669,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -5523,13 +5947,12 @@ put: /* get_mctgt_type() gets the page */
return ret;
}
+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
@@ -5555,7 +5978,8 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
@@ -6257,7 +6681,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
unsigned int nr_pages = 1;
if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
+ nr_pages = compound_nr(page);
ug->nr_huge += nr_pages;
}
if (PageAnon(page))
@@ -6269,7 +6693,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
}
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += compound_nr(page);
__ClearPageKmemcg(page);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7ef849da8278..3151c87dff73 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -199,7 +199,6 @@ struct to_kill {
struct task_struct *tsk;
unsigned long addr;
short size_shift;
- char addr_valid;
};
/*
@@ -324,22 +323,27 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
}
tk->addr = page_address_in_vma(p, vma);
- tk->addr_valid = 1;
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
else
tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
- * In theory we don't have to kill when the page was
- * munmaped. But it could be also a mremap. Since that's
- * likely very rare kill anyways just out of paranoia, but use
- * a SIGKILL because the error is not contained anymore.
+ * Send SIGKILL if "tk->addr == -EFAULT". Also, as
+ * "tk->size_shift" is always non-zero for !is_zone_device_page(),
+ * so "tk->size_shift == 0" effectively checks no mapping on
+ * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
+ * to a process' address space, it's possible not all N VMAs
+ * contain mappings for the page, but at least one VMA does.
+ * Only deliver SIGBUS with payload derived from the VMA that
+ * has a mapping for the page.
*/
- if (tk->addr == -EFAULT || tk->size_shift == 0) {
+ if (tk->addr == -EFAULT) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
- tk->addr_valid = 0;
+ } else if (tk->size_shift == 0) {
+ kfree(tk);
+ return;
}
get_task_struct(tsk);
tk->tsk = tsk;
@@ -366,7 +370,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* make sure the process doesn't catch the
* signal and then access the memory. Just kill it.
*/
- if (fail || tk->addr_valid == 0) {
+ if (fail || tk->addr == -EFAULT) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
@@ -1253,17 +1257,19 @@ int memory_failure(unsigned long pfn, int flags)
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
- if (!pfn_valid(pfn)) {
+ p = pfn_to_online_page(pfn);
+ if (!p) {
+ if (pfn_valid(pfn)) {
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ }
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
return -ENXIO;
}
- pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags, pgmap);
-
- p = pfn_to_page(pfn);
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..b1ca51a079f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
@@ -1026,6 +1026,9 @@ again:
if (pte_none(ptent))
continue;
+ if (need_resched())
+ break;
+
if (pte_present(ptent)) {
struct page *page;
@@ -1093,7 +1096,6 @@ again:
if (unlikely(details))
continue;
- entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry))
rss[MM_SWAPENTS]--;
else if (is_migration_entry(entry)) {
@@ -1124,8 +1126,11 @@ again:
if (force_flush) {
force_flush = 0;
tlb_flush_mmu(tlb);
- if (addr != end)
- goto again;
+ }
+
+ if (addr != end) {
+ cond_resched();
+ goto again;
}
return addr;
@@ -2196,6 +2201,10 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ if (vmf->vma->vm_file &&
+ IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
+ return VM_FAULT_SIGBUS;
+
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a9bbddb0e55..3b62a9ff8ea0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -132,7 +132,6 @@ static void release_memory_resource(struct resource *res)
return;
release_resource(res);
kfree(res);
- return;
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -437,67 +436,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- int nid = pgdat->node_id;
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- if (pfn_to_nid(pfn) != nid)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
static void __remove_zone(struct zone *zone, unsigned long start_pfn,
@@ -508,7 +473,7 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn,
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
@@ -633,33 +598,30 @@ static void generic_online_page(struct page *page, unsigned int order)
#endif
}
-static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
-{
- unsigned long end = start + nr_pages;
- int order, onlined_pages = 0;
-
- while (start < end) {
- order = min(MAX_ORDER - 1,
- get_order(PFN_PHYS(end) - PFN_PHYS(start)));
- (*online_page_callback)(pfn_to_page(start), order);
-
- onlined_pages += (1UL << order);
- start += (1UL << order);
- }
- return onlined_pages;
-}
-
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long onlined_pages = *(unsigned long *)arg;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ int order;
- if (PageReserved(pfn_to_page(start_pfn)))
- onlined_pages += online_pages_blocks(start_pfn, nr_pages);
+ /*
+ * Online the pages. The callback might decide to keep some pages
+ * PG_reserved (to add them to the buddy later), but we still account
+ * them as being online/belonging to this zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+ order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+ /* __free_pages_core() wants pfns to be aligned to the order */
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+ order = 0;
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ }
- online_mem_sections(start_pfn, start_pfn + nr_pages);
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += nr_pages;
return 0;
}
@@ -715,8 +677,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
@@ -805,20 +772,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- struct zone *zone;
-
- zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
- return zone;
-}
-
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -841,7 +794,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
put_device(&mem->dev);
/* associate pfn range with the zone */
- zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -865,6 +819,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ /* not a single memory resource was applicable */
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
goto failed_addition;
@@ -878,27 +833,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
shuffle_zone(zone);
- if (onlined_pages) {
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
- }
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ else
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
- if (onlined_pages) {
- kswapd_run(nid);
- kcompactd_run(nid);
- }
+ kswapd_run(nid);
+ kcompactd_run(nid);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
- if (onlined_pages)
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &arg);
mem_hotplug_done();
return 0;
@@ -934,8 +884,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
if (!pgdat)
return NULL;
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
+ int cpu;
/*
* Reset the nr_zones, order and classzone_idx before reuse.
* Note that kswapd will init kswapd_classzone_idx properly
@@ -944,6 +897,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
}
/* we can use NODE_DATA(nid) from here */
@@ -953,7 +912,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -979,7 +937,6 @@ static void rollback_node_hotadd(int nid)
arch_refresh_nodedata(nid, NULL);
free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
- return;
}
@@ -1311,7 +1268,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
head = compound_head(page);
if (page_huge_active(head))
return pfn;
- skip = (1 << compound_order(head)) - (page - head);
+ skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
return 0;
@@ -1349,7 +1306,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
@@ -1664,7 +1621,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
@@ -1689,6 +1646,18 @@ static int check_cpu_on_node(pg_data_t *pgdat)
return 0;
}
+static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
+{
+ int nid = *(int *)arg;
+
+ /*
+ * If a memory block belongs to multiple nodes, the stored nid is not
+ * reliable. However, such blocks are always online (e.g., cannot get
+ * offlined) and, therefore, are still spanned by the node.
+ */
+ return mem->nid == nid ? -EEXIST : 0;
+}
+
/**
* try_offline_node
* @nid: the node ID
@@ -1701,25 +1670,24 @@ static int check_cpu_on_node(pg_data_t *pgdat)
void try_offline_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = pgdat->node_start_pfn;
- unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
-
- if (!present_section_nr(section_nr))
- continue;
+ int rc;
- if (pfn_to_nid(pfn) != nid)
- continue;
+ /*
+ * If the node still spans pages (especially ZONE_DEVICE), don't
+ * offline it. A node spans memory after move_pfn_range_to_zone(),
+ * e.g., after the memory block was onlined.
+ */
+ if (pgdat->node_spanned_pages)
+ return;
- /*
- * some memory sections of this node are not removed, and we
- * can't offline node now.
- */
+ /*
+ * Especially offline memory blocks might not be spanned by the
+ * node. They will get spanned by the node once they get onlined.
+ * However, they link to the node in sysfs and can get onlined later.
+ */
+ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
+ if (rc)
return;
- }
if (check_cpu_on_node(pgdat))
return;
@@ -1802,7 +1770,7 @@ void __remove_memory(int nid, u64 start, u64 size)
{
/*
- * trigger BUG() is some memory is not offlined prior to calling this
+ * trigger BUG() if some memory is not offlined prior to calling this
* function
*/
if (try_remove_memory(nid, start, size))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f48693f75b37..e08c94170ae4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -68,7 +68,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
@@ -403,7 +403,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
},
};
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
struct queue_pages {
@@ -429,11 +429,14 @@ static inline bool queue_pages_required(struct page *page,
}
/*
- * queue_pages_pmd() has three possible return values:
- * 1 - pages are placed on the right node or queued successfully.
- * 0 - THP was split.
- * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
- * page was already on a node that does not follow the policy.
+ * queue_pages_pmd() has four possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 2 - THP was split.
+ * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
+ * existing page was already on a node that does not follow the
+ * policy.
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -451,23 +454,20 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+ ret = 2;
goto out;
}
- if (!queue_pages_required(page, qp)) {
- ret = 1;
+ if (!queue_pages_required(page, qp))
goto unlock;
- }
- ret = 1;
flags = qp->flags;
/* go to thp migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- if (!vma_migratable(walk->vma)) {
- ret = -EIO;
+ if (!vma_migratable(walk->vma) ||
+ migrate_page_add(page, qp->pagelist, flags)) {
+ ret = 1;
goto unlock;
}
-
- migrate_page_add(page, qp->pagelist, flags);
} else
ret = -EIO;
unlock:
@@ -479,6 +479,13 @@ out:
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
+ *
+ * queue_pages_pte_range() has three possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ * on a node that does not follow the policy.
*/
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -488,17 +495,17 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
+ bool has_unmovable = false;
pte_t *pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret > 0)
- return 0;
- else if (ret < 0)
+ if (ret != 2)
return ret;
}
+ /* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
@@ -519,14 +526,28 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!queue_pages_required(page, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- if (!vma_migratable(vma))
+ /* MPOL_MF_STRICT must be specified if we get here */
+ if (!vma_migratable(vma)) {
+ has_unmovable = true;
break;
- migrate_page_add(page, qp->pagelist, flags);
+ }
+
+ /*
+ * Do not abort immediately since there may be
+ * temporary off LRU pages in the range. Still
+ * need migrate other LRU pages.
+ */
+ if (migrate_page_add(page, qp->pagelist, flags))
+ has_unmovable = true;
} else
break;
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
+
+ if (has_unmovable)
+ return 1;
+
return addr != end ? -EIO : 0;
}
@@ -634,12 +655,26 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
return 1;
}
+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+};
+
/*
* Walk through page tables and collect pages to be migrated.
*
* If pages found in a given range are on a set of nodes (determined by
* @nodes and @flags,) it's isolated and queued to the pagelist which is
- * passed via @private.)
+ * passed via @private.
+ *
+ * queue_pages_range() has three possible return values:
+ * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 0 - queue pages successfully or no misplaced page.
+ * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
+ * memory range specified by nodemask and maxnode points outside
+ * your accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -652,15 +687,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.nmask = nodes,
.prev = NULL,
};
- struct mm_walk queue_pages_walk = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
- .test_walk = queue_pages_test_walk,
- .mm = mm,
- .private = &qp,
- };
- return walk_page_range(start, end, &queue_pages_walk);
+ return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
}
/*
@@ -940,7 +968,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
/*
* page migration, thp tail pages can be passed.
*/
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
struct page *head = compound_head(page);
@@ -953,8 +981,19 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_cache(head),
hpage_nr_pages(head));
+ } else if (flags & MPOL_MF_STRICT) {
+ /*
+ * Non-movable page may reach here. And, there may be
+ * temporary off LRU pages or non-LRU movable pages.
+ * Treat them as unmovable pages since they can't be
+ * isolated, so they can't be moved at the moment. It
+ * should return -EIO for this case too.
+ */
+ return -EIO;
}
}
+
+ return 0;
}
/* page allocation callback for NUMA node migration */
@@ -1157,9 +1196,10 @@ static struct page *new_page(struct page *page, unsigned long start)
}
#else
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
+ return -EIO;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1182,6 +1222,7 @@ static long do_mbind(unsigned long start, unsigned long len,
struct mempolicy *new;
unsigned long end;
int err;
+ int ret;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1243,10 +1284,15 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- err = queue_pages_range(mm, start, end, nmask,
+ ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- if (!err)
- err = mbind_range(mm, start, end, new);
+
+ if (ret < 0) {
+ err = ret;
+ goto up_out;
+ }
+
+ err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
@@ -1259,13 +1305,16 @@ static long do_mbind(unsigned long start, unsigned long len,
putback_movable_pages(&pagelist);
}
- if (nr_failed && (flags & MPOL_MF_STRICT))
+ if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
- } else
- putback_movable_pages(&pagelist);
+ } else {
+up_out:
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+ }
up_write(&mm->mmap_sem);
- mpol_out:
+mpol_out:
mpol_put(new);
return err;
}
@@ -1360,6 +1409,7 @@ static long kernel_mbind(unsigned long start, unsigned long len,
int err;
unsigned short mode_flags;
+ start = untagged_addr(start);
mode_flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if (mode >= MPOL_MAX)
@@ -1467,10 +1517,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
if (nodes_empty(*new))
goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
err = security_task_movememory(task);
if (err)
goto out_put;
@@ -1517,6 +1563,8 @@ static int kernel_get_mempolicy(int __user *policy,
int uninitialized_var(pval);
nodemask_t nodes;
+ addr = untagged_addr(addr);
+
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
@@ -2087,6 +2135,17 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
mpol_cond_put(pol);
page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE, order);
+
+ /*
+ * If hugepage allocations are configured to always
+ * synchronous compact or the vma has been madvised
+ * to prefer hugepage backing, retry allowing remote
+ * memory as well.
+ */
+ if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_NORETRY, order);
+
goto out;
}
}
diff --git a/mm/memremap.c b/mm/memremap.c
new file mode 100644
index 000000000000..03ccbdfeb697
--- /dev/null
+++ b/mm/memremap.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/pfn_t.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/xarray.h>
+
+static DEFINE_XARRAY(pgmap_array);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+
+static void devmap_managed_enable_put(void)
+{
+ if (atomic_dec_and_test(&devmap_managed_enable))
+ static_branch_disable(&devmap_managed_key);
+}
+
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+ if (!pgmap->ops || !pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return -EINVAL;
+ }
+
+ if (atomic_inc_return(&devmap_managed_enable) == 1)
+ static_branch_enable(&devmap_managed_key);
+ return 0;
+}
+#else
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+ return -EINVAL;
+}
+static void devmap_managed_enable_put(void)
+{
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+static void pgmap_array_delete(struct resource *res)
+{
+ xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ NULL, GFP_KERNEL);
+ synchronize_rcu();
+}
+
+static unsigned long pfn_first(struct dev_pagemap *pgmap)
+{
+ return PHYS_PFN(pgmap->res.start) +
+ vmem_altmap_offset(pgmap_altmap(pgmap));
+}
+
+static unsigned long pfn_end(struct dev_pagemap *pgmap)
+{
+ const struct resource *res = &pgmap->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+static unsigned long pfn_next(unsigned long pfn)
+{
+ if (pfn % 1024 == 0)
+ cond_resched();
+ return pfn + 1;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->kill)
+ pgmap->ops->kill(pgmap);
+ else
+ percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->cleanup) {
+ pgmap->ops->cleanup(pgmap);
+ } else {
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(pgmap->ref);
+ }
+ /*
+ * Undo the pgmap ref assignment for the internal case as the
+ * caller may re-enable the same pgmap.
+ */
+ if (pgmap->ref == &pgmap->internal_ref)
+ pgmap->ref = NULL;
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ struct resource *res = &pgmap->res;
+ struct page *first_page;
+ unsigned long pfn;
+ int nid;
+
+ dev_pagemap_kill(pgmap);
+ for_each_device_pfn(pfn, pgmap)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ /* make sure to access a memmap that was actually initialized */
+ first_page = pfn_to_page(pfn_first(pgmap));
+
+ /* pages are dead and unused, undo the arch mapping */
+ nid = page_to_nid(first_page);
+
+ mem_hotplug_begin();
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), NULL);
+ } else {
+ arch_remove_memory(nid, res->start, resource_size(res),
+ pgmap_altmap(pgmap));
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ }
+ mem_hotplug_done();
+
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
+ pgmap_array_delete(res);
+ WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
+ devmap_managed_enable_put();
+}
+EXPORT_SYMBOL_GPL(memunmap_pages);
+
+static void devm_memremap_pages_release(void *data)
+{
+ memunmap_pages(data);
+}
+
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+ struct dev_pagemap *pgmap =
+ container_of(ref, struct dev_pagemap, internal_ref);
+
+ complete(&pgmap->done);
+}
+
+/*
+ * Not device managed version of dev_memremap_pages, undone by
+ * memunmap_pages(). Please use dev_memremap_pages if you have a struct
+ * device available.
+ */
+void *memremap_pages(struct dev_pagemap *pgmap, int nid)
+{
+ struct resource *res = &pgmap->res;
+ struct dev_pagemap *conflict_pgmap;
+ struct mhp_restrictions restrictions = {
+ /*
+ * We do not want any optional features only our own memmap
+ */
+ .altmap = pgmap_altmap(pgmap),
+ };
+ pgprot_t pgprot = PAGE_KERNEL;
+ int error, is_ram;
+ bool need_devmap_managed = true;
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+ WARN(1, "Device private memory not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+ WARN(1, "Missing migrate_to_ram method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_FS_DAX:
+ if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+ IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ WARN(1, "File system DAX not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_DEVDAX:
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ need_devmap_managed = false;
+ break;
+ default:
+ WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+ break;
+ }
+
+ if (!pgmap->ref) {
+ if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+ return ERR_PTR(-EINVAL);
+
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->internal_ref,
+ dev_pagemap_percpu_release, 0, GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
+ pgmap->ref = &pgmap->internal_ref;
+ } else {
+ if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ if (need_devmap_managed) {
+ error = devmap_managed_enable_get(pgmap);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ error = -ENOMEM;
+ goto err_array;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ error = -ENOMEM;
+ goto err_array;
+ }
+
+ is_ram = region_intersects(res->start, resource_size(res),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+ is_ram == REGION_MIXED ? "mixed" : "ram", res);
+ error = -ENXIO;
+ goto err_array;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
+ PHYS_PFN(res->end), pgmap, GFP_KERNEL));
+ if (error)
+ goto err_array;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
+ resource_size(res));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), &restrictions);
+ } else {
+ error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, res->start, resource_size(res),
+ &restrictions);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), restrictions.altmap);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
+ return __va(res->start);
+
+ err_add_memory:
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
+ err_pfn_remap:
+ pgmap_array_delete(res);
+ err_array:
+ dev_pagemap_kill(pgmap);
+ dev_pagemap_cleanup(pgmap);
+ devmap_managed_enable_put();
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(memremap_pages);
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @pgmap: pointer to a struct dev_pagemap
+ *
+ * Notes:
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ * by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case
+ * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
+ *
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ * 'live' on entry and will be killed and reaped at
+ * devm_memremap_pages_release() time, or if this routine fails.
+ *
+ * 4/ res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ int error;
+ void *ret;
+
+ ret = memremap_pages(pgmap, dev_to_node(dev));
+ if (IS_ERR(ret))
+ return ret;
+
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
+
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+ struct dev_pagemap *pgmap)
+{
+ resource_size_t phys = PFN_PHYS(pfn);
+
+ /*
+ * In the cached case we're already holding a live reference.
+ */
+ if (pgmap) {
+ if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ return pgmap;
+ put_dev_pagemap(pgmap);
+ }
+
+ /* fall back to slow path lookup */
+ rcu_read_lock();
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
+ if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ pgmap = NULL;
+ rcu_read_unlock();
+
+ return pgmap;
+}
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void __put_devmap_managed_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ mem_cgroup_uncharge(page);
+
+ /*
+ * When a device_private page is freed, the page->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of page->mapping may still identify the page as
+ * an anonymous page. Ultimately, this entire field is just
+ * stale and wrong, and it will cause errors if not cleared.
+ * One example is:
+ *
+ * migrate_vma_pages()
+ * migrate_vma_insert_page()
+ * page_add_new_anon_rmap()
+ * __page_set_anon_rmap()
+ * ...checks page->mapping, via PageAnon(page) call,
+ * and incorrectly concludes that the page is an
+ * anonymous page. Therefore, it incorrectly,
+ * silently fails to set up the new anon rmap.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear page->mapping.
+ */
+ if (is_device_private_page(page))
+ page->mapping = NULL;
+
+ page->pgmap->ops->page_free(page);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(__put_devmap_managed_page);
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/migrate.c b/mm/migrate.c
index 8992741f10aa..4fe45d1428c8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
@@ -459,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
@@ -767,12 +768,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
- spin_unlock(&mapping->private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
+ spin_unlock(&mapping->private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -805,6 +806,8 @@ recheck_buffers:
rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
+ if (check_refs)
+ spin_unlock(&mapping->private_lock);
bh = head;
do {
unlock_buffer(bh);
@@ -1609,7 +1612,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
if (get_user(node, nodes + i))
goto out_flush;
- addr = (unsigned long)p;
+ addr = (unsigned long)untagged_addr(p);
err = -ENODEV;
if (node < 0 || node >= MAX_NUMNODES)
@@ -1889,7 +1892,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
if (isolate_lru_page(page))
@@ -2117,17 +2120,7 @@ out_unlock:
#endif /* CONFIG_NUMA */
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
-struct migrate_vma {
- struct vm_area_struct *vma;
- unsigned long *dst;
- unsigned long *src;
- unsigned long cpages;
- unsigned long npages;
- unsigned long start;
- unsigned long end;
-};
-
+#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
struct mm_walk *walk)
@@ -2225,17 +2218,15 @@ again:
pte_t pte;
pte = *ptep;
- pfn = pte_pfn(pte);
if (pte_none(pte)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
if (!pte_present(pte)) {
- mpfn = pfn = 0;
+ mpfn = 0;
/*
* Only care about unaddressable device page special
@@ -2247,15 +2238,15 @@ again:
goto next;
page = device_private_entry_to_page(entry);
- mpfn = migrate_pfn(page_to_pfn(page))|
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
@@ -2265,10 +2256,9 @@ again:
/* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = pfn = 0;
+ mpfn = 0;
goto next;
}
- pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
@@ -2327,6 +2317,11 @@ next:
return 0;
}
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
+
/*
* migrate_vma_collect() - collect pages over a range of virtual addresses
* @migrate: migrate struct containing all migration information
@@ -2338,24 +2333,15 @@ next:
static void migrate_vma_collect(struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- struct mm_walk mm_walk;
-
- mm_walk.pmd_entry = migrate_vma_collect_pmd;
- mm_walk.pte_entry = NULL;
- mm_walk.pte_hole = migrate_vma_collect_hole;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.vma = migrate->vma;
- mm_walk.mm = migrate->vma->vm_mm;
- mm_walk.private = migrate;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
- migrate->start,
- migrate->end);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
+ migrate->vma->vm_mm, migrate->start, migrate->end);
mmu_notifier_invalidate_range_start(&range);
- walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(&range);
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
@@ -2578,6 +2564,110 @@ restore:
}
}
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of memory virtual address range by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
+ * (destination pages must have their struct pages locked, via lock_page()).
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unback virtual
+ * address. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fails and thus inside the device driver must check if the
+ * migration was successful for those entries after calling migrate_vma_pages()
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_sem is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_prepare(args);
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
+
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2709,7 +2799,7 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/*
+/**
* migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
@@ -2717,7 +2807,7 @@ abort:
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
-static void migrate_vma_pages(struct migrate_vma *migrate)
+void migrate_vma_pages(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
@@ -2791,8 +2881,9 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
+EXPORT_SYMBOL(migrate_vma_pages);
-/*
+/**
* migrate_vma_finalize() - restore CPU page table entry
* @migrate: migrate struct containing all migration information
*
@@ -2803,7 +2894,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
*/
-static void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_vma_finalize(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
unsigned long i;
@@ -2846,124 +2937,5 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
-
-/*
- * migrate_vma() - migrate a range of memory inside vma
- *
- * @ops: migration callback for allocating destination memory and copying
- * @vma: virtual memory area containing the range to be migrated
- * @start: start address of the range to migrate (inclusive)
- * @end: end address of the range to migrate (exclusive)
- * @src: array of hmm_pfn_t containing source pfns
- * @dst: array of hmm_pfn_t containing destination pfns
- * @private: pointer passed back to each of the callback
- * Returns: 0 on success, error code otherwise
- *
- * This function tries to migrate a range of memory virtual address range, using
- * callbacks to allocate and copy memory from source to destination. First it
- * collects all the pages backing each virtual address in the range, saving this
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
- * in the corresponding src array entry. It then restores any pages that are
- * pinned, by remapping and unlocking those pages.
- *
- * At this point it calls the alloc_and_copy() callback. For documentation on
- * what is expected from that callback, see struct migrate_vma_ops comments in
- * include/linux/migrate.h
- *
- * After the alloc_and_copy() callback, this function goes over each entry in
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then the function tries to migrate struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the struct
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
- * array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * It then calls the finalize_and_map() callback. See comments for "struct
- * migrate_vma_ops", in include/linux/migrate.h for details about
- * finalize_and_map() behavior.
- *
- * After the finalize_and_map() callback, for successfully migrated pages, this
- * function updates the CPU page table to point to new pages, otherwise it
- * restores the CPU page table to point to the original source pages.
- *
- * Function returns 0 after the above steps, even if no pages were migrated
- * (The function only returns an error if any of the arguments are invalid.)
- *
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
- * unsigned long entries.
- */
-int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private)
-{
- struct migrate_vma migrate;
-
- /* Sanity check the arguments */
- start &= PAGE_MASK;
- end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma))
- return -EINVAL;
- if (start < vma->vm_start || start >= vma->vm_end)
- return -EINVAL;
- if (end <= vma->vm_start || end > vma->vm_end)
- return -EINVAL;
- if (!ops || !src || !dst || start >= end)
- return -EINVAL;
-
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
- migrate.src = src;
- migrate.dst = dst;
- migrate.start = start;
- migrate.npages = 0;
- migrate.cpages = 0;
- migrate.end = end;
- migrate.vma = vma;
-
- /* Collect, and try to unmap source pages */
- migrate_vma_collect(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Lock and isolate page */
- migrate_vma_prepare(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Unmap pages */
- migrate_vma_unmap(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /*
- * At this point pages are locked and unmapped, and thus they have
- * stable content and can safely be copied to destination memory that
- * is allocated by the callback.
- *
- * Note that migration can fail in migrate_vma_struct_page() for each
- * individual page.
- */
- ops->alloc_and_copy(vma, src, dst, start, end, private);
-
- /* This does the real migration of struct page */
- migrate_vma_pages(&migrate);
-
- ops->finalize_and_map(vma, src, dst, start, end, private);
-
- /* Unlock and remap pages */
- migrate_vma_finalize(&migrate);
-
- return 0;
-}
-EXPORT_SYMBOL(migrate_vma);
-#endif /* defined(MIGRATE_VMA_HELPER) */
+EXPORT_SYMBOL(migrate_vma_finalize);
+#endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/mincore.c b/mm/mincore.c
index 4fe91d497436..49b6fa2f6aa1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -10,7 +10,7 @@
*/
#include <linux/pagemap.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
@@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}
+static const struct mm_walk_ops mincore_walk_ops = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+};
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
struct vm_area_struct *vma;
unsigned long end;
int err;
- struct mm_walk mincore_walk = {
- .pmd_entry = mincore_pte_range,
- .pte_hole = mincore_unmapped_range,
- .hugetlb_entry = mincore_hugetlb,
- .private = vec,
- };
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
@@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
memset(vec, 1, pages);
return pages;
}
- mincore_walk.mm = vma->vm_mm;
- err = walk_page_range(addr, end, &mincore_walk);
+ err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
if (err < 0)
return err;
return (end - addr) >> PAGE_SHIFT;
@@ -257,6 +256,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
unsigned long pages;
unsigned char *tmp;
+ start = untagged_addr(start);
+
/* Check the start address: needs to be page-aligned.. */
if (start & ~PAGE_MASK)
return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index a90099da4fb4..a72c1eeded77 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -674,6 +674,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
unsigned long lock_limit;
int error = -ENOMEM;
+ start = untagged_addr(start);
+
if (!can_do_mlock())
return -EPERM;
@@ -735,6 +737,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
+ start = untagged_addr(start);
+
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7e8c3e8ae75f..a7d8c84d19b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -201,6 +201,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool downgraded = false;
LIST_HEAD(uf);
+ brk = untagged_addr(brk);
+
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -289,9 +291,9 @@ out:
return retval;
}
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
- unsigned long max, prev_end, subtree_gap;
+ unsigned long gap, prev_end;
/*
* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -299,14 +301,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
* an unmapped area; whereas when expanding we only require one.
* That's a little inconsistent, but keeps the code here simpler.
*/
- max = vm_start_gap(vma);
+ gap = vm_start_gap(vma);
if (vma->vm_prev) {
prev_end = vm_end_gap(vma->vm_prev);
- if (max > prev_end)
- max -= prev_end;
+ if (gap > prev_end)
+ gap -= prev_end;
else
- max = 0;
+ gap = 0;
}
+ return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -322,7 +331,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
return max;
}
-#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
@@ -428,8 +436,9 @@ static void validate_mm(struct mm_struct *mm)
#define validate_mm(mm) do { } while (0)
#endif
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+ struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_gap)
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -439,8 +448,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
- * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exactly what we want.
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+ * a callback function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
@@ -1358,6 +1367,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
@@ -1483,8 +1495,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
- if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
- return -EACCES;
+ if (prot & PROT_WRITE) {
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+ if (IS_SWAPFILE(file->f_mapping->host))
+ return -ETXTBSY;
+ }
/*
* Make sure we don't allow writing to an append-only
@@ -1573,6 +1589,8 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
struct file *file = NULL;
unsigned long retval;
+ addr = untagged_addr(addr);
+
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
@@ -2270,12 +2288,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
if (vma) {
*pprev = vma->vm_prev;
} else {
- struct rb_node *rb_node = mm->mm_rb.rb_node;
- *pprev = NULL;
- while (rb_node) {
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
- rb_node = rb_node->rb_right;
- }
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
@@ -2874,6 +2889,7 @@ EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
+ addr = untagged_addr(addr);
profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 8c943a6e1696..7d70e5c78f97 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
- /* keep the page table cache within bounds */
- check_pgt_cache();
#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index b5670620aea0..9a889e456168 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,17 +21,11 @@
/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);
-/*
- * This function allows mmu_notifier::release callback to delay a call to
- * a function that will free appropriate resources. The function must be
- * quick and must not block.
- */
-void mmu_notifier_call_srcu(struct rcu_head *rcu,
- void (*func)(struct rcu_head *rcu))
-{
- call_srcu(&srcu, rcu, func);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
+ .name = "mmu_notifier_invalidate_range_start"
+};
+#endif
/*
* This function can't run concurrently against mmu_notifier_register
@@ -174,11 +168,19 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start) {
- int _ret = mn->ops->invalidate_range_start(mn, range);
+ int _ret;
+
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
+ _ret = mn->ops->invalidate_range_start(mn, range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
mn->ops->invalidate_range_start, _ret,
!mmu_notifier_range_blockable(range) ? "non-" : "");
+ WARN_ON(mmu_notifier_range_blockable(range) ||
+ _ret != -EAGAIN);
ret = _ret;
}
}
@@ -187,7 +189,6 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
return ret;
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
bool only_end)
@@ -195,6 +196,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
struct mmu_notifier *mn;
int id;
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
/*
@@ -214,12 +216,17 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
mn->ops->invalidate_range(mn, range->mm,
range->start,
range->end);
- if (mn->ops->invalidate_range_end)
+ if (mn->ops->invalidate_range_end) {
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
mn->ops->invalidate_range_end(mn, range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
+ }
}
srcu_read_unlock(&srcu, id);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -234,35 +241,49 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
srcu_read_unlock(&srcu, id);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
-static int do_mmu_notifier_register(struct mmu_notifier *mn,
- struct mm_struct *mm,
- int take_mmap_sem)
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
- struct mmu_notifier_mm *mmu_notifier_mm;
+ struct mmu_notifier_mm *mmu_notifier_mm = NULL;
int ret;
+ lockdep_assert_held_write(&mm->mmap_sem);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
- ret = -ENOMEM;
- mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
- if (unlikely(!mmu_notifier_mm))
- goto out;
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ fs_reclaim_release(GFP_KERNEL);
+ }
- if (take_mmap_sem)
- down_write(&mm->mmap_sem);
- ret = mm_take_all_locks(mm);
- if (unlikely(ret))
- goto out_clean;
+ mn->mm = mm;
+ mn->users = 1;
+
+ if (!mm->mmu_notifier_mm) {
+ /*
+ * kmalloc cannot be called under mm_take_all_locks(), but we
+ * know that mm->mmu_notifier_mm can't change while we hold
+ * the write side of the mmap_sem.
+ */
+ mmu_notifier_mm =
+ kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (!mmu_notifier_mm)
+ return -ENOMEM;
- if (!mm_has_notifiers(mm)) {
INIT_HLIST_HEAD(&mmu_notifier_mm->list);
spin_lock_init(&mmu_notifier_mm->lock);
-
- mm->mmu_notifier_mm = mmu_notifier_mm;
- mmu_notifier_mm = NULL;
}
+
+ ret = mm_take_all_locks(mm);
+ if (unlikely(ret))
+ goto out_clean;
+
+ /* Pairs with the mmdrop in mmu_notifier_unregister_* */
mmgrab(mm);
/*
@@ -273,48 +294,118 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
* We can't race against any other mmu notifier method either
* thanks to mm_take_all_locks().
*/
+ if (mmu_notifier_mm)
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+
spin_lock(&mm->mmu_notifier_mm->lock);
hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return 0;
+
out_clean:
- if (take_mmap_sem)
- up_write(&mm->mmap_sem);
kfree(mmu_notifier_mm);
-out:
- BUG_ON(atomic_read(&mm->mm_users) <= 0);
return ret;
}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
-/*
+/**
+ * mmu_notifier_register - Register a notifier on a mm
+ * @mn: The notifier to attach
+ * @mm: The mm to attach the notifier to
+ *
* Must not hold mmap_sem nor any other VM related lock when calling
* this registration function. Must also ensure mm_users can't go down
* to zero while this runs to avoid races with mmu_notifier_release,
* so mm has to be current->mm or the mm should be pinned safely such
* as with get_task_mm(). If the mm is not current->mm, the mm_users
* pin should be released by calling mmput after mmu_notifier_register
- * returns. mmu_notifier_unregister must be always called to
- * unregister the notifier. mm_count is automatically pinned to allow
- * mmu_notifier_unregister to safely run at any time later, before or
- * after exit_mmap. ->release will always be called before exit_mmap
- * frees the pages.
+ * returns.
+ *
+ * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
+ * unregister the notifier.
+ *
+ * While the caller has a mmu_notifier get the mn->mm pointer will remain
+ * valid, and can be converted to an active mm pointer via mmget_not_zero().
*/
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
- return do_mmu_notifier_register(mn, mm, 1);
+ int ret;
+
+ down_write(&mm->mmap_sem);
+ ret = __mmu_notifier_register(mn, mm);
+ up_write(&mm->mmap_sem);
+ return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
-/*
- * Same as mmu_notifier_register but here the caller must hold the
- * mmap_sem in write mode.
+static struct mmu_notifier *
+find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
+{
+ struct mmu_notifier *mn;
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops != ops)
+ continue;
+
+ if (likely(mn->users != UINT_MAX))
+ mn->users++;
+ else
+ mn = ERR_PTR(-EOVERFLOW);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ return mn;
+ }
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ return NULL;
+}
+
+/**
+ * mmu_notifier_get_locked - Return the single struct mmu_notifier for
+ * the mm & ops
+ * @ops: The operations struct being subscribe with
+ * @mm : The mm to attach notifiers too
+ *
+ * This function either allocates a new mmu_notifier via
+ * ops->alloc_notifier(), or returns an already existing notifier on the
+ * list. The value of the ops pointer is used to determine when two notifiers
+ * are the same.
+ *
+ * Each call to mmu_notifier_get() must be paired with a call to
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
+ *
+ * While the caller has a mmu_notifier get the mm pointer will remain valid,
+ * and can be converted to an active mm pointer via mmget_not_zero().
*/
-int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
+ struct mm_struct *mm)
{
- return do_mmu_notifier_register(mn, mm, 0);
+ struct mmu_notifier *mn;
+ int ret;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ if (mm->mmu_notifier_mm) {
+ mn = find_get_mmu_notifier(mm, ops);
+ if (mn)
+ return mn;
+ }
+
+ mn = ops->alloc_notifier(mm);
+ if (IS_ERR(mn))
+ return mn;
+ mn->ops = ops;
+ ret = __mmu_notifier_register(mn, mm);
+ if (ret)
+ goto out_free;
+ return mn;
+out_free:
+ mn->ops->free_notifier(mn);
+ return ERR_PTR(ret);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_mm_destroy(struct mm_struct *mm)
@@ -375,24 +466,74 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
-/*
- * Same as mmu_notifier_unregister but no callback and no srcu synchronization.
+static void mmu_notifier_free_rcu(struct rcu_head *rcu)
+{
+ struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
+ struct mm_struct *mm = mn->mm;
+
+ mn->ops->free_notifier(mn);
+ /* Pairs with the get in __mmu_notifier_register() */
+ mmdrop(mm);
+}
+
+/**
+ * mmu_notifier_put - Release the reference on the notifier
+ * @mn: The notifier to act on
+ *
+ * This function must be paired with each mmu_notifier_get(), it releases the
+ * reference obtained by the get. If this is the last reference then process
+ * to free the notifier will be run asynchronously.
+ *
+ * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
+ * when the mm_struct is destroyed. Instead free_notifier is always called to
+ * release any resources held by the user.
+ *
+ * As ops->release is not guaranteed to be called, the user must ensure that
+ * all sptes are dropped, and no new sptes can be established before
+ * mmu_notifier_put() is called.
+ *
+ * This function can be called from the ops->release callback, however the
+ * caller must still ensure it is called pairwise with mmu_notifier_get().
+ *
+ * Modules calling this function must call mmu_notifier_synchronize() in
+ * their __exit functions to ensure the async work is completed.
*/
-void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
+void mmu_notifier_put(struct mmu_notifier *mn)
{
+ struct mm_struct *mm = mn->mm;
+
spin_lock(&mm->mmu_notifier_mm->lock);
- /*
- * Can not use list_del_rcu() since __mmu_notifier_release
- * can delete it before we hold the lock.
- */
+ if (WARN_ON(!mn->users) || --mn->users)
+ goto out_unlock;
hlist_del_init_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
- BUG_ON(atomic_read(&mm->mm_count) <= 0);
- mmdrop(mm);
+ call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
+ return;
+
+out_unlock:
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_put);
+
+/**
+ * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
+ *
+ * This function ensures that all outstanding async SRU work from
+ * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
+ * associated with an unused mmu_notifier will no longer be called.
+ *
+ * Before using the caller must ensure that all of its mmu_notifiers have been
+ * fully released via mmu_notifier_put().
+ *
+ * Modules using the mmu_notifier_put() API should call this in their __exit
+ * function to avoid module unloading races.
+ */
+void mmu_notifier_synchronize(void)
+{
+ synchronize_srcu(&srcu);
}
-EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bf38dfbbb4b4..7967825f6d33 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -9,7 +9,7 @@
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
@@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next,
return 0;
}
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long newflags)
-{
- pgprot_t new_pgprot = vm_get_page_prot(newflags);
- struct mm_walk prot_none_walk = {
- .pte_entry = prot_none_pte_entry,
- .hugetlb_entry = prot_none_hugetlb_entry,
- .test_walk = prot_none_test,
- .mm = current->mm,
- .private = &new_pgprot,
- };
-
- return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+};
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
if (arch_has_pfn_modify_check() &&
(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
- error = prot_none_walk(vma, start, end, newflags);
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+ error = walk_page_range(current->mm, start, end,
+ &prot_none_walk_ops, &new_pgprot);
if (error)
return error;
}
@@ -465,6 +459,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
+ start = untagged_addr(start);
+
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
diff --git a/mm/mremap.c b/mm/mremap.c
index fc241d23cd97..1fc8a29fbe3f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -606,6 +606,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
+ addr = untagged_addr(addr);
+ new_addr = untagged_addr(new_addr);
+
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
diff --git a/mm/msync.c b/mm/msync.c
index ef30a429623a..c3bd3e75f687 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
int unmapped_error = 0;
int error = -EINVAL;
+ start = untagged_addr(start);
+
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (offset_in_page(start))
diff --git a/mm/nommu.c b/mm/nommu.c
index fed1b6e9c89b..99b7ec318824 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
/**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a0bdc6..71e3acea7817 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
/**
* oom_cpuset_eligible() - check task eligiblity for kill
* @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
@@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
@@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
if (cpuset_limited) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
@@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
@@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- message, task_pid_nr(victim), victim->comm,
- K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)),
- K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
task_unlock(victim);
/*
@@ -1068,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc)
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1804f64ff43c..50055d2e4ea8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
+ mem_cgroup_flush_foreign(wb);
+
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 272c6de1bf4e..f391c0c4ed1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -670,6 +670,7 @@ out:
void free_compound_page(struct page *page)
{
+ mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page));
}
@@ -1174,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- arch_free_page(page, order);
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order, 0);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
@@ -1941,6 +1948,14 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
+ * The number of managed pages has changed due to the initialisation
+ * so the pcpu batch and high limits needs to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -2238,27 +2253,12 @@ static int move_freepages(struct zone *zone,
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2273,6 +2273,10 @@ static int move_freepages(struct zone *zone,
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
@@ -3522,7 +3526,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -3724,10 +3728,6 @@ try_this_zone:
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
-
- if (!__ratelimit(&show_mem_rs))
- return;
/*
* This documents exceptions given to allocations in certain
@@ -3748,8 +3748,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
@@ -3966,14 +3965,22 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
goto check_priority;
/*
+ * compaction was skipped because there are not enough order-0 pages
+ * to work with, so we retry only if it looks like reclaim can help.
+ */
+ if (compaction_needs_reclaim(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
+
+ /*
* make sure the compaction wasn't deferred or didn't bail out early
* due to locks contention before we declare that we should give up.
- * But do not retry if the given zonelist is not suitable for
- * compaction.
+ * But the next retry should use a higher priority if allowed, so
+ * we don't just keep bailing out endlessly.
*/
if (compaction_withdrawn(compact_result)) {
- ret = compaction_zonelist_suitable(ac, order, alloc_flags);
- goto out;
+ goto check_priority;
}
/*
@@ -4469,6 +4476,30 @@ retry_cpuset:
if (page)
goto got_pg;
+ if (order >= pageblock_order && (gfp_mask & __GFP_IO) &&
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)) {
+ /*
+ * If allocating entire pageblock(s) and compaction
+ * failed because all zones are below low watermarks
+ * or is prohibited because it recently failed at this
+ * order, fail immediately unless the allocator has
+ * requested compaction and reclaim retry.
+ *
+ * Reclaim is
+ * - potentially very expensive because zones are far
+ * below their low watermarks or this is part of very
+ * bursty high order allocations,
+ * - not guaranteed to help because isolate_freepages()
+ * may not iterate over freed pages as part of its
+ * linear scan, and
+ * - unlikely to make entire pageblocks free on its
+ * own.
+ */
+ if (compact_result == COMPACT_SKIPPED ||
+ compact_result == COMPACT_DEFERRED)
+ goto nopage;
+ }
+
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
@@ -5982,7 +6013,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
}
}
- pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
size, jiffies_to_msecs(jiffies - start));
}
@@ -6649,9 +6680,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
- spin_lock_init(&pgdat->split_queue_lock);
- INIT_LIST_HEAD(&pgdat->split_queue);
- pgdat->split_queue_len = 0;
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -8207,7 +8240,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- skip_pages = (1 << compound_order(head)) - (page - head);
+ skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
continue;
}
@@ -8484,7 +8517,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
@@ -8498,7 +8530,6 @@ void __meminit zone_pcp_update(struct zone *zone)
per_cpu_ptr(zone->pageset, cpu));
mutex_unlock(&pcp_batch_high_lock);
}
-#endif
void zone_pcp_reset(struct zone *zone)
{
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 5f5769c7db3b..4ade843ff588 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -67,8 +67,9 @@ static struct page_ext_operations *page_ext_ops[] = {
#endif
};
+unsigned long page_ext_size = sizeof(struct page_ext);
+
static unsigned long total_usage;
-static unsigned long extra_mem;
static bool __init invoke_need_callbacks(void)
{
@@ -78,9 +79,8 @@ static bool __init invoke_need_callbacks(void)
for (i = 0; i < entries; i++) {
if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
- page_ext_ops[i]->offset = sizeof(struct page_ext) +
- extra_mem;
- extra_mem += page_ext_ops[i]->size;
+ page_ext_ops[i]->offset = page_ext_size;
+ page_ext_size += page_ext_ops[i]->size;
need = true;
}
}
@@ -99,14 +99,9 @@ static void __init invoke_init_callbacks(void)
}
}
-static unsigned long get_entry_size(void)
-{
- return sizeof(struct page_ext) + extra_mem;
-}
-
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
- return base + get_entry_size() * index;
+ return base + page_ext_size * index;
}
#if !defined(CONFIG_SPARSEMEM)
@@ -156,7 +151,7 @@ static int __init alloc_node_page_ext(int nid)
!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
nr_pages += MAX_ORDER_NR_PAGES;
- table_size = get_entry_size() * nr_pages;
+ table_size = page_ext_size * nr_pages;
base = memblock_alloc_try_nid(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
@@ -234,7 +229,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
if (section->page_ext)
return 0;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
base = alloc_page_ext(table_size, nid);
/*
@@ -254,7 +249,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
* we need to apply a mask.
*/
pfn &= PAGE_SECTION_MASK;
- section->page_ext = (void *)base - get_entry_size() * pfn;
+ section->page_ext = (void *)base - page_ext_size * pfn;
total_usage += table_size;
return 0;
}
@@ -267,7 +262,7 @@ static void free_page_ext(void *addr)
struct page *page = virt_to_page(addr);
size_t table_size;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
kmemleak_free(addr);
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600f9131..60a66a58b9bf 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -73,6 +73,7 @@ static void swap_slot_free_notify(struct page *page)
{
struct swap_info_struct *sis;
struct gendisk *disk;
+ swp_entry_t entry;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -104,11 +105,10 @@ static void swap_slot_free_notify(struct page *page)
* we again wish to reclaim it.
*/
disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
+ entry.val = page_private(page);
+ if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
unsigned long offset;
- entry.val = page_private(page);
offset = swp_offset(entry);
SetPageDirty(page);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index addcbb2ae4e4..18ecde9f45b2 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -24,9 +24,10 @@ struct page_owner {
short last_migrate_reason;
gfp_t gfp_mask;
depot_stack_handle_t handle;
+ depot_stack_handle_t free_handle;
};
-static bool page_owner_disabled = true;
+static bool page_owner_enabled = false;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
@@ -41,7 +42,7 @@ static int __init early_page_owner_param(char *buf)
return -EINVAL;
if (strcmp(buf, "on") == 0)
- page_owner_disabled = false;
+ page_owner_enabled = true;
return 0;
}
@@ -49,10 +50,7 @@ early_param("page_owner", early_page_owner_param);
static bool need_page_owner(void)
{
- if (page_owner_disabled)
- return false;
-
- return true;
+ return page_owner_enabled;
}
static __always_inline depot_stack_handle_t create_dummy_stack(void)
@@ -81,7 +79,7 @@ static noinline void register_early_stack(void)
static void init_page_owner(void)
{
- if (page_owner_disabled)
+ if (!page_owner_enabled)
return;
register_dummy_stack();
@@ -102,19 +100,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-void __reset_page_owner(struct page *page, unsigned int order)
-{
- int i;
- struct page_ext *page_ext;
-
- for (i = 0; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
- if (unlikely(!page_ext))
- continue;
- __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
- }
-}
-
static inline bool check_recursive_alloc(unsigned long *entries,
unsigned int nr_entries,
unsigned long ip)
@@ -154,18 +139,44 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
return handle;
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
{
+ int i;
+ struct page_ext *page_ext;
+ depot_stack_handle_t handle = 0;
struct page_owner *page_owner;
- page_owner = get_page_owner(page_ext);
- page_owner->handle = handle;
- page_owner->order = order;
- page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+ for (i = 0; i < (1 << order); i++) {
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+ struct page_ext *page_ext, depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
+{
+ struct page_owner *page_owner;
+ int i;
- __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = handle;
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = -1;
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+
+ page_ext = page_ext_next(page_ext);
+ }
}
noinline void __set_page_owner(struct page *page, unsigned int order,
@@ -178,7 +189,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -202,10 +213,11 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- page_owner = get_page_owner(page_ext);
- page_owner->order = 0;
- for (i = 1; i < (1 << order); i++)
- __copy_page_owner(page, page + i);
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
+ page_ext = page_ext_next(page_ext);
+ }
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
@@ -235,6 +247,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -258,7 +271,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}
@@ -266,13 +280,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
+ /* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
@@ -294,7 +308,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (unlikely(!page_ext))
continue;
- if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
@@ -405,20 +419,34 @@ void __dump_page_owner(struct page *page)
mt = gfpflags_to_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
- pr_alert("page_owner info is not active (free page?)\n");
+ pr_alert("page_owner info is not present (never set?)\n");
return;
}
+ if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ pr_alert("page_owner tracks the page as allocated\n");
+ else
+ pr_alert("page_owner tracks the page as freed\n");
+
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
handle = READ_ONCE(page_owner->handle);
if (!handle) {
- pr_alert("page_owner info is not active (free page?)\n");
- return;
+ pr_alert("page_owner allocation stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
}
- nr_entries = stack_depot_fetch(handle, &entries);
- pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
- stack_trace_print(entries, nr_entries, 0);
+ handle = READ_ONCE(page_owner->free_handle);
+ if (!handle) {
+ pr_alert("page_owner free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ pr_alert("page last free stack trace:\n");
+ stack_trace_print(entries, nr_entries, 0);
+ }
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -481,9 +509,23 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Although we do have the info about past allocation of free
+ * pages, it's not relevant for current memory usage.
+ */
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ continue;
+
page_owner = get_page_owner(page_ext);
/*
+ * Don't print "tail" pages of high-order allocations as that
+ * would inflate the stats.
+ */
+ if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+ continue;
+
+ /*
* Access to page_ext->handle isn't synchronous so we should
* be careful to access it.
*/
@@ -562,7 +604,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle, 0, 0);
+ __set_page_owner_handle(page, page_ext, early_handle,
+ 0, 0);
count++;
}
cond_resched();
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 21d4f97cb49b..34b9181ee5d1 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -101,7 +101,7 @@ static void unpoison_page(struct page *page)
/*
* Page poisoning when enabled poisons each and every page
* that is freed to buddy. Thus no extra check is done to
- * see if a page was posioned.
+ * see if a page was poisoned.
*/
check_poison_mem(addr, PAGE_SIZE);
kunmap_atomic(addr);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e71288..eff4b4520c8d 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address,
- PAGE_SIZE << compound_order(page));
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3084ff2569d..d48c2a986ea3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
@@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
pte_t *pte;
int err = 0;
+ const struct mm_walk_ops *ops = walk->ops;
pte = pte_offset_map(pmd, addr);
for (;;) {
- err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
@@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
{
pmd_t *pmd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pmd = pmd_offset(pud, addr);
@@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
again:
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
@@ -47,8 +49,8 @@ again:
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
- if (walk->pmd_entry)
- err = walk->pmd_entry(pmd, addr, next, walk);
+ if (ops->pmd_entry)
+ err = ops->pmd_entry(pmd, addr, next, walk);
if (err)
break;
@@ -56,7 +58,7 @@ again:
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
- if (!walk->pte_entry)
+ if (!ops->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
@@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
{
pud_t *pud;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pud = pud_offset(p4d, addr);
@@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
again:
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pud_entry) {
+ if (ops->pud_entry) {
spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
- err = walk->pud_entry(pud, addr, next, walk);
+ err = ops->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
@@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (pud_none(*pud))
goto again;
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
{
pgd_t *pgd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
do {
@@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
pte = huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
- err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
- else if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
+ else if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
@@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;
- if (walk->test_walk)
- return walk->test_walk(start, end, walk);
+ if (ops->test_walk)
+ return ops->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
@@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end,
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
- if (walk->pte_hole)
- err = walk->pte_hole(start, end, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
@@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
- if (walk->hugetlb_entry)
+ if (walk->ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
@@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end,
/**
* walk_page_range - walk page table with caller specific callbacks
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @walk: mm_walk structure defining the callbacks and the target address space
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
*
- * Recursively walk the page table tree of the process represented by @walk->mm
+ * Recursively walk the page table tree of the process represented by @mm
* within the virtual address range [@start, @end). During walking, we can do
* some caller-specific works for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
@@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end,
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @walk->private should be helpful.
+ * caller-specific data to callbacks, @private should be helpful.
*
* Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold
- * @walk->mm->mmap_sem, because these function traverse vma list and/or
- * access to vma's data.
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ * because these function traverse vma list and/or access to vma's data.
*/
-int walk_page_range(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .private = private,
+ };
if (start >= end)
return -EINVAL;
- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;
- VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+ lockdep_assert_held(&walk.mm->mmap_sem);
- vma = find_vma(walk->mm, start);
+ vma = find_vma(walk.mm, start);
do {
if (!vma) { /* after the last vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
- walk->vma = vma;
+ walk.vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;
- err = walk_page_test(start, next, walk);
+ err = walk_page_test(start, next, &walk);
if (err > 0) {
/*
* positive return values are purely for
@@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end,
if (err < 0)
break;
}
- if (walk->vma || walk->pte_hole)
- err = __walk_page_range(start, next, walk);
+ if (walk.vma || walk.ops->pte_hole)
+ err = __walk_page_range(start, next, &walk);
if (err)
break;
} while (start = next, start < end);
return err;
}
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private)
{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
int err;
- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- VM_BUG_ON(!vma);
- walk->vma = vma;
- err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ lockdep_assert_held(&walk.mm->mmap_sem);
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
return 0;
if (err < 0)
return err;
- return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+ return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 9821241fdede..7e06a1e58720 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2125,7 +2125,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
void *ptr;
int unit;
- base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+ base_size = ALIGN(struct_size(ai, groups, nr_groups),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
@@ -2220,7 +2220,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
- * perpcu area. This function is to be called from arch percpu area
+ * percpu area. This function is to be called from arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
@@ -2267,12 +2267,9 @@ static void pcpu_dump_alloc_info(const char *lvl,
* share the same vm, but use offset regions in the area allocation map.
* The chunk serving the dynamic region is circulated in the chunk slots
* and available for dynamic allocation like any other chunk.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
*/
-int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
- void *base_addr)
+void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size;
@@ -2457,7 +2454,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done */
pcpu_base_addr = base_addr;
- return 0;
}
#ifdef CONFIG_SMP
@@ -2710,7 +2706,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size;
unsigned long max_distance;
- int group, i, highest_group, rc;
+ int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@@ -2795,7 +2791,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
- rc = pcpu_setup_first_chunk(ai, base);
+ pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
@@ -2839,7 +2835,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
int unit_pages;
size_t pages_size;
struct page **pages;
- int unit, i, j, rc;
+ int unit, i, j, rc = 0;
int upa;
int nr_g0_units;
@@ -2920,7 +2916,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
- rc = pcpu_setup_first_chunk(ai, vm.addr);
+ pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
@@ -3014,8 +3010,7 @@ void __init setup_per_cpu_areas(void)
ai->groups[0].nr_units = 1;
ai->groups[0].cpu_map[0] = 0;
- if (pcpu_setup_first_chunk(ai, fc) < 0)
- panic("Failed to initialize percpu areas.");
+ pcpu_setup_first_chunk(ai, fc);
pcpu_free_alloc_info(ai);
}
diff --git a/mm/quicklist.c b/mm/quicklist.c
deleted file mode 100644
index 5e98ac78e410..000000000000
--- a/mm/quicklist.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Quicklist support.
- *
- * Quicklists are light weight lists of pages that have a defined state
- * on alloc and free. Pages must be in the quicklist specific defined state
- * (zero by default) when the page is freed. It seems that the initial idea
- * for such lists first came from Dave Miller and then various other people
- * improved on it.
- *
- * Copyright (C) 2007 SGI,
- * Christoph Lameter <cl@linux.com>
- * Generalized, added support for multiple lists and
- * constructors / destructors.
- */
-#include <linux/kernel.h>
-
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/quicklist.h>
-
-DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
-
-#define FRACTION_OF_NODE_MEM 16
-
-static unsigned long max_pages(unsigned long min_pages)
-{
- unsigned long node_free_pages, max;
- int node = numa_node_id();
- struct zone *zones = NODE_DATA(node)->node_zones;
- int num_cpus_on_node;
-
- node_free_pages =
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
-
- max = node_free_pages / FRACTION_OF_NODE_MEM;
-
- num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
- max /= num_cpus_on_node;
-
- return max(max, min_pages);
-}
-
-static long min_pages_to_free(struct quicklist *q,
- unsigned long min_pages, long max_free)
-{
- long pages_to_free;
-
- pages_to_free = q->nr_pages - max_pages(min_pages);
-
- return min(pages_to_free, max_free);
-}
-
-/*
- * Trim down the number of pages in the quicklist
- */
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free)
-{
- long pages_to_free;
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- if (q->nr_pages > min_pages) {
- pages_to_free = min_pages_to_free(q, min_pages, max_free);
-
- while (pages_to_free > 0) {
- /*
- * We pass a gfp_t of 0 to quicklist_alloc here
- * because we will never call into the page allocator.
- */
- void *p = quicklist_alloc(nr, 0, NULL);
-
- if (dtor)
- dtor(p);
- free_page((unsigned long)p);
- pages_to_free--;
- }
- }
- put_cpu_var(quicklist);
-}
-
-unsigned long quicklist_total_size(void)
-{
- unsigned long count = 0;
- int cpu;
- struct quicklist *ql, *q;
-
- for_each_online_cpu(cpu) {
- ql = per_cpu(quicklist, cpu);
- for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
- count += q->nr_pages;
- }
- return count;
-}
-
diff --git a/mm/rmap.c b/mm/rmap.c
index e5dfe2ae6b0d..0c7b2a9400d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -61,6 +61,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
@@ -898,15 +899,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart;
int ret = 0;
- cstart = address = pvmw.address;
+ address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -933,7 +932,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -1192,8 +1190,10 @@ void page_add_file_rmap(struct page *page, bool compound)
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1232,8 +1232,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
@@ -1374,8 +1376,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1475,7 +1476,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
*/
+ subpage = page;
goto discard;
}
@@ -1516,8 +1525,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
- int nr = 1 << compound_order(page);
- hugetlb_count_sub(nr, mm);
+ hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
diff --git a/mm/shmem.c b/mm/shmem.c
index 626d8c74b973..220be9fa2c41 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -37,6 +37,7 @@
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -107,6 +108,20 @@ struct shmem_falloc {
pgoff_t nr_unswapped; /* how often writepage refused to swap out */
};
+struct shmem_options {
+ unsigned long long blocks;
+ unsigned long long inodes;
+ struct mempolicy *mpol;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ int huge;
+ int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+};
+
#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
@@ -594,7 +609,7 @@ static int shmem_add_to_page_cache(struct page *page,
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
- unsigned long nr = 1UL << compound_order(page);
+ unsigned long nr = compound_nr(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -616,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page,
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
@@ -1719,7 +1734,7 @@ unlock:
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vmf and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1869,7 +1884,7 @@ alloc_nohuge:
lru_cache_add_anon(page);
spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
+ info->alloced += compound_nr(page);
inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1910,7 +1925,7 @@ clear:
struct page *head = compound_head(page);
int i;
- for (i = 0; i < (1 << compound_order(head)); i++) {
+ for (i = 0; i < compound_nr(head); i++) {
clear_highpage(head + i);
flush_dcache_page(head + i);
}
@@ -1937,7 +1952,7 @@ clear:
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
if (PageTransHuge(page)) {
unlock_page(page);
@@ -3349,16 +3364,132 @@ static const struct export_operations shmem_export_ops = {
.fh_to_dentry = shmem_fh_to_dentry,
};
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
- bool remount)
+enum shmem_param {
+ Opt_gid,
+ Opt_huge,
+ Opt_mode,
+ Opt_mpol,
+ Opt_nr_blocks,
+ Opt_nr_inodes,
+ Opt_size,
+ Opt_uid,
+};
+
+static const struct fs_parameter_spec shmem_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_enum ("huge", Opt_huge),
+ fsparam_u32oct("mode", Opt_mode),
+ fsparam_string("mpol", Opt_mpol),
+ fsparam_string("nr_blocks", Opt_nr_blocks),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+static const struct fs_parameter_enum shmem_param_enums[] = {
+ { Opt_huge, "never", SHMEM_HUGE_NEVER },
+ { Opt_huge, "always", SHMEM_HUGE_ALWAYS },
+ { Opt_huge, "within_size", SHMEM_HUGE_WITHIN_SIZE },
+ { Opt_huge, "advise", SHMEM_HUGE_ADVISE },
+ {}
+};
+
+const struct fs_parameter_description shmem_fs_parameters = {
+ .name = "tmpfs",
+ .specs = shmem_param_specs,
+ .enums = shmem_param_enums,
+};
+
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
- char *this_char, *value, *rest;
- struct mempolicy *mpol = NULL;
- uid_t uid;
- gid_t gid;
+ struct shmem_options *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ unsigned long long size;
+ char *rest;
+ int opt;
+
+ opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_size:
+ size = memparse(param->string, &rest);
+ if (*rest == '%') {
+ size <<= PAGE_SHIFT;
+ size *= totalram_pages();
+ do_div(size, 100);
+ rest++;
+ }
+ if (*rest)
+ goto bad_value;
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_blocks:
+ ctx->blocks = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_inodes:
+ ctx->inodes = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_INODES;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 07777;
+ break;
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_value;
+ break;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_value;
+ break;
+ case Opt_huge:
+ ctx->huge = result.uint_32;
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ has_transparent_hugepage()))
+ goto unsupported_parameter;
+ ctx->seen |= SHMEM_SEEN_HUGE;
+ break;
+ case Opt_mpol:
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ mpol_put(ctx->mpol);
+ ctx->mpol = NULL;
+ if (mpol_parse_str(param->string, &ctx->mpol))
+ goto bad_value;
+ break;
+ }
+ goto unsupported_parameter;
+ }
+ return 0;
+
+unsupported_parameter:
+ return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
+bad_value:
+ return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
+}
+
+static int shmem_parse_options(struct fs_context *fc, void *data)
+{
+ char *options = data;
+
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
while (options != NULL) {
- this_char = options;
+ char *this_char = options;
for (;;) {
/*
* NUL-terminate this option: unfortunately,
@@ -3374,139 +3505,83 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
break;
}
}
- if (!*this_char)
- continue;
- if ((value = strchr(this_char,'=')) != NULL) {
- *value++ = 0;
- } else {
- pr_err("tmpfs: No value for mount option '%s'\n",
- this_char);
- goto error;
- }
-
- if (!strcmp(this_char,"size")) {
- unsigned long long size;
- size = memparse(value,&rest);
- if (*rest == '%') {
- size <<= PAGE_SHIFT;
- size *= totalram_pages();
- do_div(size, 100);
- rest++;
+ if (*this_char) {
+ char *value = strchr(this_char,'=');
+ size_t len = 0;
+ int err;
+
+ if (value) {
+ *value++ = '\0';
+ len = strlen(value);
}
- if (*rest)
- goto bad_val;
- sbinfo->max_blocks =
- DIV_ROUND_UP(size, PAGE_SIZE);
- } else if (!strcmp(this_char,"nr_blocks")) {
- sbinfo->max_blocks = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"nr_inodes")) {
- sbinfo->max_inodes = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"mode")) {
- if (remount)
- continue;
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"uid")) {
- if (remount)
- continue;
- uid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->uid = make_kuid(current_user_ns(), uid);
- if (!uid_valid(sbinfo->uid))
- goto bad_val;
- } else if (!strcmp(this_char,"gid")) {
- if (remount)
- continue;
- gid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->gid = make_kgid(current_user_ns(), gid);
- if (!gid_valid(sbinfo->gid))
- goto bad_val;
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
- } else if (!strcmp(this_char, "huge")) {
- int huge;
- huge = shmem_parse_huge(value);
- if (huge < 0)
- goto bad_val;
- if (!has_transparent_hugepage() &&
- huge != SHMEM_HUGE_NEVER)
- goto bad_val;
- sbinfo->huge = huge;
-#endif
-#ifdef CONFIG_NUMA
- } else if (!strcmp(this_char,"mpol")) {
- mpol_put(mpol);
- mpol = NULL;
- if (mpol_parse_str(value, &mpol))
- goto bad_val;
-#endif
- } else {
- pr_err("tmpfs: Bad mount option %s\n", this_char);
- goto error;
+ err = vfs_parse_fs_string(fc, this_char, value, len);
+ if (err < 0)
+ return err;
}
}
- sbinfo->mpol = mpol;
return 0;
-
-bad_val:
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
- value, this_char);
-error:
- mpol_put(mpol);
- return 1;
-
}
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+/*
+ * Reconfigure a shmem filesystem.
+ *
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
+ * are in use; but we must separately disallow unlimited->limited, because in
+ * that case we have no record of how much is already in use.
+ */
+static int shmem_reconfigure(struct fs_context *fc)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- struct shmem_sb_info config = *sbinfo;
+ struct shmem_options *ctx = fc->fs_private;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
- int error = -EINVAL;
-
- config.mpol = NULL;
- if (shmem_parse_options(data, &config, true))
- return error;
+ const char *err;
spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
- goto out;
- if (config.max_inodes < inodes)
- goto out;
- /*
- * Those tests disallow limited->unlimited while any are in use;
- * but we must separately disallow unlimited->limited, because
- * in that case we have no record of how much is already in use.
- */
- if (config.max_blocks && !sbinfo->max_blocks)
- goto out;
- if (config.max_inodes && !sbinfo->max_inodes)
- goto out;
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
+ if (!sbinfo->max_blocks) {
+ err = "Cannot retroactively limit size";
+ goto out;
+ }
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ ctx->blocks) > 0) {
+ err = "Too small a size for current use";
+ goto out;
+ }
+ }
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
+ if (!sbinfo->max_inodes) {
+ err = "Cannot retroactively limit inodes";
+ goto out;
+ }
+ if (ctx->inodes < inodes) {
+ err = "Too few inodes for current use";
+ goto out;
+ }
+ }
- error = 0;
- sbinfo->huge = config.huge;
- sbinfo->max_blocks = config.max_blocks;
- sbinfo->max_inodes = config.max_inodes;
- sbinfo->free_inodes = config.max_inodes - inodes;
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
+ sbinfo->max_blocks = ctx->blocks;
+ if (ctx->seen & SHMEM_SEEN_INODES) {
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_inodes = ctx->inodes - inodes;
+ }
/*
* Preserve previous mempolicy unless mpol remount option was specified.
*/
- if (config.mpol) {
+ if (ctx->mpol) {
mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
+ ctx->mpol = NULL;
}
+ spin_unlock(&sbinfo->stat_lock);
+ return 0;
out:
spin_unlock(&sbinfo->stat_lock);
- return error;
+ return invalf(fc, "tmpfs: %s", err);
}
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@ -3547,8 +3622,9 @@ static void shmem_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
int err = -ENOMEM;
@@ -3559,9 +3635,6 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
if (!sbinfo)
return -ENOMEM;
- sbinfo->mode = 0777 | S_ISVTX;
- sbinfo->uid = current_fsuid();
- sbinfo->gid = current_fsgid();
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
@@ -3571,12 +3644,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
* but the internal instance is left unlimited.
*/
if (!(sb->s_flags & SB_KERNMOUNT)) {
- sbinfo->max_blocks = shmem_default_max_blocks();
- sbinfo->max_inodes = shmem_default_max_inodes();
- if (shmem_parse_options(data, sbinfo, false)) {
- err = -EINVAL;
- goto failed;
- }
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
+ ctx->blocks = shmem_default_max_blocks();
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
+ ctx->inodes = shmem_default_max_inodes();
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3585,11 +3656,18 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#else
sb->s_flags |= SB_NOUSER;
#endif
+ sbinfo->max_blocks = ctx->blocks;
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->mode = ctx->mode;
+ sbinfo->huge = ctx->huge;
+ sbinfo->mpol = ctx->mpol;
+ ctx->mpol = NULL;
spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
- sbinfo->free_inodes = sbinfo->max_inodes;
spin_lock_init(&sbinfo->shrinklist_lock);
INIT_LIST_HEAD(&sbinfo->shrinklist);
@@ -3622,6 +3700,31 @@ failed:
return err;
}
+static int shmem_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, shmem_fill_super);
+}
+
+static void shmem_free_fc(struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+
+ if (ctx) {
+ mpol_put(ctx->mpol);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations shmem_fs_context_ops = {
+ .free = shmem_free_fc,
+ .get_tree = shmem_get_tree,
+#ifdef CONFIG_TMPFS
+ .parse_monolithic = shmem_parse_options,
+ .parse_param = shmem_parse_one,
+ .reconfigure = shmem_reconfigure,
+#endif
+};
+
static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
@@ -3738,7 +3841,6 @@ static const struct super_operations shmem_ops = {
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
- .remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
.evict_inode = shmem_evict_inode,
@@ -3759,16 +3861,30 @@ static const struct vm_operations_struct shmem_vm_ops = {
#endif
};
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+int shmem_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
+ struct shmem_options *ctx;
+
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->mode = 0777 | S_ISVTX;
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+
+ fc->fs_private = ctx;
+ fc->ops = &shmem_fs_context_ops;
+ return 0;
}
static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .mount = shmem_mount,
+ .init_fs_context = shmem_init_fs_context,
+#ifdef CONFIG_TMPFS
+ .parameters = &shmem_fs_parameters,
+#endif
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -3912,7 +4028,8 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
- .mount = ramfs_mount,
+ .init_fs_context = ramfs_init_fs_context,
+ .parameters = &ramfs_fs_parameters,
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 3ce12481b1dc..b3fe97fd6654 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -33,7 +33,7 @@ __meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
}
static bool shuffle_param;
-extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
? 'Y' : 'N');
diff --git a/mm/slab.c b/mm/slab.c
index 9df370558e5d..66e5d8032bae 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4206,9 +4206,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
/**
* __ksize -- Uninstrumented ksize.
+ * @objp: pointer to the object
*
* Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
* safety checks as ksize() with KASAN instrumentation enabled.
+ *
+ * Return: size of the actual memory used by @objp in bytes
*/
size_t __ksize(const void *objp)
{
diff --git a/mm/slab.h b/mm/slab.h
index 9057b8056b07..b2b01694dc43 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,6 +30,69 @@ struct kmem_cache {
struct list_head list; /* List of all slab caches on the system */
};
+#else /* !CONFIG_SLOB */
+
+struct memcg_cache_array {
+ struct rcu_head rcu;
+ struct kmem_cache *entries[0];
+};
+
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
+ *
+ * Root and child caches hold different metadata.
+ *
+ * @root_cache: Common to root and child caches. NULL for root, pointer to
+ * the root cache for children.
+ *
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches. This table is
+ * used to index child cachces during allocation and cleared
+ * early during shutdown.
+ *
+ * @root_caches_node: List node for slab_root_caches list.
+ *
+ * @children: List of all child caches. While the child caches are also
+ * reachable through @memcg_caches, a child cache remains on
+ * this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg: Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
+ */
+struct memcg_cache_params {
+ struct kmem_cache *root_cache;
+ union {
+ struct {
+ struct memcg_cache_array __rcu *memcg_caches;
+ struct list_head __root_caches_node;
+ struct list_head children;
+ bool dying;
+ };
+ struct {
+ struct mem_cgroup *memcg;
+ struct list_head children_node;
+ struct list_head kmem_caches_node;
+ struct percpu_ref refcnt;
+
+ void (*work_fn)(struct kmem_cache *);
+ union {
+ struct rcu_head rcu_head;
+ struct work_struct work;
+ };
+ };
+ };
+};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -174,6 +237,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
+void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
@@ -259,8 +323,8 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
* Expects a pointer to a slab page. Please note, that PageSlab() check
* isn't sufficient, as it returns true also for tail compound slab pages,
* which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageHead() and PageSlab()
- * checks.
+ * So this function assumes that the page can pass PageSlab() && !PageTail()
+ * check.
*
* The kmem_cache can be reparented asynchronously. The caller must ensure
* the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807490fe217a..f9fb27b4c843 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -178,10 +178,13 @@ static int init_memcg_params(struct kmem_cache *s,
static void destroy_memcg_params(struct kmem_cache *s)
{
- if (is_root_cache(s))
+ if (is_root_cache(s)) {
kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
- else
+ } else {
+ mem_cgroup_put(s->memcg_params.memcg);
+ WRITE_ONCE(s->memcg_params.memcg, NULL);
percpu_ref_exit(&s->memcg_params.refcnt);
+ }
}
static void free_memcg_params(struct rcu_head *rcu)
@@ -253,8 +256,6 @@ static void memcg_unlink_cache(struct kmem_cache *s)
} else {
list_del(&s->memcg_params.children_node);
list_del(&s->memcg_params.kmem_caches_node);
- mem_cgroup_put(s->memcg_params.memcg);
- WRITE_ONCE(s->memcg_params.memcg, NULL);
}
}
#else
@@ -981,6 +982,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/**
+ * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
+ * @s: The cache pointer
+ */
+void kmem_cache_shrink_all(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+
+ if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
+ kmem_cache_shrink(s);
+ return;
+ }
+
+ get_online_cpus();
+ get_online_mems();
+ kasan_cache_shrink(s);
+ __kmem_cache_shrink(s);
+
+ /*
+ * We have to take the slab_mutex to protect from the memcg list
+ * modification.
+ */
+ mutex_lock(&slab_mutex);
+ for_each_memcg_cache(c, s) {
+ /*
+ * Don't need to shrink deactivated memcg caches.
+ */
+ if (s->flags & SLAB_DEACTIVATED)
+ continue;
+ kasan_cache_shrink(c);
+ __kmem_cache_shrink(c);
+ }
+ mutex_unlock(&slab_mutex);
+ put_online_mems();
+ put_online_cpus();
+}
+
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -993,10 +1031,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int useroffset, unsigned int usersize)
{
int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
s->name = name;
s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
s->useroffset = useroffset;
s->usersize = usersize;
@@ -1250,12 +1297,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
- void *ret;
+ void *ret = NULL;
struct page *page;
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
- ret = page ? page_address(page) : NULL;
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ret, size, 1, flags);
diff --git a/mm/slob.c b/mm/slob.c
index 7f421d0ca9ab..fa53e9f73893 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- void *page;
+ struct page *page;
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
return page_address(page);
}
static void slob_free_pages(void *b, int order)
{
+ struct page *sp = virt_to_page(b);
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- free_pages((unsigned long)b, order);
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
}
/*
@@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order)
* @sp: Page to look in.
* @size: Size of the allocation.
* @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
* @page_removed_from_list: Return parameter.
*
* Tries to find a chunk of memory at least @size bytes big within @page.
@@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order)
* true (set to false otherwise).
*/
static void *slob_page_alloc(struct page *sp, size_t size, int align,
- bool *page_removed_from_list)
+ int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
@@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
if (align) {
- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
delta = aligned - cur;
}
if (avail >= units + delta) { /* room enough? */
@@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
/*
* slob_alloc: entry point into the slob allocator.
*/
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
{
struct page *sp;
struct list_head *slob_list;
@@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- b = slob_page_alloc(sp, size, align, &page_removed_from_list);
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
if (!b)
continue;
@@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align, &_unused);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
@@ -451,7 +469,7 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
fs_reclaim_acquire(gfp);
fs_reclaim_release(gfp);
- if (size < PAGE_SIZE - align) {
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
if (!size)
return ZERO_SIZE_PTR;
- m = slob_alloc(size + align, gfp, align, node);
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
if (!m)
return NULL;
*m = size;
- ret = (void *)m + align;
+ ret = (void *)m + minalign;
trace_kmalloc_node(caller, ret,
- size, size + align, gfp, node);
+ size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -521,8 +548,13 @@ void kfree(const void *block)
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
- } else
- __free_pages(sp, compound_order(sp));
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
+
+ }
}
EXPORT_SYMBOL(kfree);
@@ -539,7 +571,7 @@ size_t __ksize(const void *block)
sp = virt_to_page(block);
if (unlikely(!PageSlab(sp)))
- return PAGE_SIZE << compound_order(sp);
+ return page_size(sp);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
@@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node);
+ b = slob_alloc(c->size, flags, c->align, node, 0);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
diff --git a/mm/slub.c b/mm/slub.c
index e6c030e47364..e72e802fc569 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = PAGE_SIZE << compound_order(page);
+ length = page_size(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
if (!(s->flags & SLAB_POISON))
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ memset(addr, POISON_INUSE, page_size(page));
metadata_access_disable();
}
@@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
-static inline void setup_page_debug(struct kmem_cache *s,
- void *addr, int order) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1432,10 +1433,15 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *old_tail = *tail ? *tail : *head;
int rsize;
- if (slab_want_init_on_free(s))
- do {
- object = next;
- next = get_freepointer(s, object);
+ /* Head and tail of the reconstructed freelist */
+ *head = NULL;
+ *tail = NULL;
+
+ do {
+ object = next;
+ next = get_freepointer(s, object);
+
+ if (slab_want_init_on_free(s)) {
/*
* Clear the object and the metadata, but don't touch
* the redzone.
@@ -1445,27 +1451,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
: 0;
memset((char *)object + s->inuse, 0,
s->size - s->inuse - rsize);
- set_freepointer(s, object, next);
- } while (object != old_tail);
-/*
- * Compiler cannot detect this function can be removed if slab_free_hook()
- * evaluates to nothing. Thus, catch all relevant config debug options here.
- */
-#if defined(CONFIG_LOCKDEP) || \
- defined(CONFIG_DEBUG_KMEMLEAK) || \
- defined(CONFIG_DEBUG_OBJECTS_FREE) || \
- defined(CONFIG_KASAN)
-
- next = *head;
-
- /* Head and tail of the reconstructed freelist */
- *head = NULL;
- *tail = NULL;
-
- do {
- object = next;
- next = get_freepointer(s, object);
+ }
/* If object's reuse doesn't have to be delayed */
if (!slab_free_hook(s, object)) {
/* Move object to the new freelist */
@@ -1480,9 +1467,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
*tail = NULL;
return *head != NULL;
-#else
- return true;
-#endif
}
static void *setup_object(struct kmem_cache *s, struct page *page,
@@ -1635,7 +1619,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
- int idx, order;
+ int idx;
bool shuffle;
flags &= gfp_allowed_mask;
@@ -1669,7 +1653,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
- order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
@@ -1679,7 +1662,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
- setup_page_debug(s, start, order);
+ setup_page_debug(s, page, start);
shuffle = shuffle_freelist(s, page);
@@ -2000,6 +1983,7 @@ static inline unsigned long next_tid(unsigned long tid)
return tid + TID_STEP;
}
+#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
return tid % TID_STEP;
@@ -2009,6 +1993,7 @@ static inline unsigned long tid_to_event(unsigned long tid)
{
return tid / TID_STEP;
}
+#endif
static inline unsigned int init_tid(int cpu)
{
@@ -2666,6 +2651,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
}
/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+}
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -2753,12 +2749,8 @@ redo:
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
- /*
- * If the object has been wiped upon free, make sure it's fully
- * initialized by zeroing out freelist pointer.
- */
- if (unlikely(slab_want_init_on_free(s)) && object)
- memset(object + s->offset, 0, sizeof(void *));
+
+ maybe_wipe_obj_freeptr(s, object);
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
@@ -3172,10 +3164,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
goto error;
c = this_cpu_ptr(s->cpu_slab);
+ maybe_wipe_obj_freeptr(s, p[i]);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
p[i] = object;
+ maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
local_irq_enable();
@@ -3815,11 +3810,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
return kmalloc_large_node_hook(ptr, size, flags);
}
@@ -3926,7 +3925,7 @@ size_t __ksize(const void *object)
if (unlikely(!PageSlab(page))) {
WARN_ON(!PageCompound(page));
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
return slab_ksize(page->slab_cache);
@@ -3945,9 +3944,13 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -4832,7 +4835,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4873,7 +4886,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -5294,7 +5306,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink(s);
+ kmem_cache_shrink_all(s);
else
return -EINVAL;
return length;
diff --git a/mm/sparse.c b/mm/sparse.c
index 72f010d9bff5..f6891c1992b1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -11,6 +11,8 @@
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include "internal.h"
#include <asm/dma.h>
@@ -217,7 +219,7 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
-void subsection_mask_set(unsigned long *map, unsigned long pfn,
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
unsigned long nr_pages)
{
int idx = subsection_map_index(pfn);
@@ -470,6 +472,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+ WARN_ON(!sparsemap_buf || size == 0);
+ memblock_free_early(__pa(sparsemap_buf), size);
+}
+
static void __init sparse_buffer_init(unsigned long size, int nid)
{
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
@@ -486,7 +494,7 @@ static void __init sparse_buffer_fini(void)
unsigned long size = sparsemap_buf_end - sparsemap_buf;
if (sparsemap_buf && size > 0)
- memblock_free_early(__pa(sparsemap_buf), size);
+ sparse_buffer_free(size);
sparsemap_buf = NULL;
}
@@ -495,11 +503,15 @@ void * __meminit sparse_buffer_alloc(unsigned long size)
void *ptr = NULL;
if (sparsemap_buf) {
- ptr = PTR_ALIGN(sparsemap_buf, size);
+ ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
if (ptr + size > sparsemap_buf_end)
ptr = NULL;
- else
+ else {
+ /* Free redundant aligned space */
+ if ((unsigned long)(ptr - sparsemap_buf) > 0)
+ sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
sparsemap_buf = ptr + size;
+ }
}
return ptr;
}
@@ -867,7 +879,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
*/
page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
- ms = __pfn_to_section(start_pfn);
+ ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
section_mark_present(ms);
@@ -884,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
int i;
- if (!memmap)
- return;
-
/*
* A further optimization is to have per section refcounted
* num_poisoned_pages. But that would need more space per memmap, so
@@ -898,7 +907,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
- atomic_long_sub(1, &num_poisoned_pages);
+ num_poisoned_pages_dec();
ClearPageHWPoison(&memmap[i]);
}
}
diff --git a/mm/swap.c b/mm/swap.c
index ae300397dfda..38c3fa4308e2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -71,12 +72,12 @@ static void __page_cache_release(struct page *page)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}
__ClearPageWaiters(page);
- mem_cgroup_uncharge(page);
}
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
+ mem_cgroup_uncharge(page);
free_unref_page(page);
}
@@ -515,7 +516,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
@@ -523,13 +523,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
+ add_page_to_lru_list(page, lruvec, lru);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- list_move_tail(&page->lru, &lruvec->lists[lru]);
+ add_page_to_lru_list_tail(page, lruvec, lru);
__count_vm_event(PGROTATED);
}
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
}
}
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
@@ -844,17 +886,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
get_page(page_tail);
list_add_tail(&page_tail->lru, list);
} else {
- struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
* so we must account for each subpage individually.
*
- * Use the standard add function to put page_tail on the list,
- * but then correct its position so they all end up in order.
+ * Put page_tail on the list at the correct position
+ * so they all end up in order.
*/
- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
- list_head = page_tail->lru.prev;
- list_move_tail(&page_tail->lru, list_head);
+ add_page_to_lru_list_tail(page_tail, lruvec,
+ page_lru(page_tail));
}
if (!PageUnevictable(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8368621a0fc7..8e7ce9a9bc5e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = 1UL << compound_order(page);
+ unsigned long i, nr = compound_nr(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
@@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0789a762ce2f..dab43523afdd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2368,9 +2368,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
- * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
- * prevents root from shooting her foot off by ftruncating an in-use swapfile,
- * which will scribble on the fs.
+ * For all swap devices we set S_SWAPFILE across the life of the swapon. This
+ * prevents users from writing to the swap device, which will corrupt memory.
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
@@ -2661,13 +2660,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
+
set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
- } else {
- inode_lock(inode);
- inode->i_flags &= ~S_SWAPFILE;
- inode_unlock(inode);
}
+
+ inode_lock(inode);
+ inode->i_flags &= ~S_SWAPFILE;
+ inode_unlock(inode);
filp_close(swap_file, NULL);
/*
@@ -2890,11 +2890,11 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- inode_lock(inode);
- if (IS_SWAPFILE(inode))
- return -EBUSY;
- } else
- return -EINVAL;
+ }
+
+ inode_lock(inode);
+ if (IS_SWAPFILE(inode))
+ return -EBUSY;
return 0;
}
@@ -3275,6 +3275,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap;
+ /*
+ * Flush any pending IO and dirty mappings before we start using this
+ * swap device.
+ */
+ inode->i_flags |= S_SWAPFILE;
+ error = inode_drain_writes(inode);
+ if (error) {
+ inode->i_flags &= ~S_SWAPFILE;
+ goto bad_swap;
+ }
+
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
@@ -3295,8 +3306,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
- if (S_ISREG(inode->i_mode))
- inode->i_flags |= S_SWAPFILE;
error = 0;
goto out;
bad_swap:
@@ -3318,7 +3327,7 @@ bad_swap:
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file) {
- if (inode && S_ISREG(inode->i_mode)) {
+ if (inode) {
inode_unlock(inode);
inode = NULL;
}
@@ -3331,7 +3340,7 @@ out:
}
if (name)
putname(name);
- if (inode && S_ISREG(inode->i_mode))
+ if (inode)
inode_unlock(inode);
if (!error)
enable_swap_slots_cache();
diff --git a/mm/truncate.c b/mm/truncate.c
index 8563339041f6..dd9ebc1da356 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -592,6 +592,16 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
unlock_page(page);
continue;
}
+
+ /* Take a pin outside pagevec */
+ get_page(page);
+
+ /*
+ * Drop extra pins before trying to invalidate
+ * the huge page.
+ */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
}
ret = invalidate_inode_page(page);
@@ -602,6 +612,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
*/
if (!ret)
deactivate_file_page(page);
+ if (PageTransHuge(page))
+ put_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 2a09796edef8..660717a1ea5c 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -147,7 +148,7 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
bool to_user)
{
/* Reject if object wraps past end of memory. */
- if (ptr + n < ptr)
+ if (ptr + (n - 1) < ptr)
usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);
/* Reject if NULL or ZERO-allocation. */
@@ -227,7 +228,12 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
if (!virt_addr_valid(ptr))
return;
- page = virt_to_head_page(ptr);
+ /*
+ * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+ * highmem page or fallback to virt_to_page(). The following
+ * is effectively a highmem-aware virt_to_head_page().
+ */
+ page = compound_head(kmap_to_page((void *)ptr));
if (PageSlab(page)) {
/* Check slab allocator for flags and size. */
diff --git a/mm/util.c b/mm/util.c
index e6351a80f248..3ad6db9a722e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,6 +16,13 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -293,7 +300,105 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+ unsigned long random_variable = 0;
+
+ if (current->flags & PF_RANDOMIZE) {
+ random_variable = get_random_long();
+ random_variable &= STACK_RND_MASK;
+ random_variable <<= PAGE_SHIFT;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ return PAGE_ALIGN(stack_top) + random_variable;
+#else
+ return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /* Is the current task 32bit ? */
+ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ if (is_compat_task())
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+ return rnd << PAGE_SHIFT;
+}
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP (SZ_128M)
+#define MAX_GAP (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_guard_gap;
+
+ /* Account for stack randomization if necessary */
+ if (current->flags & PF_RANDOMIZE)
+ pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -521,7 +626,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
@@ -783,3 +888,16 @@ out_mm:
out:
return res;
}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4fa8d84599b0..a3c70e275f4e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -329,8 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
-#define VM_LAZY_FREE 0x02
-#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
@@ -398,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va)
get_subtree_max_size(va->rb_node.rb_right));
}
-RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
- struct vmap_area, rb_node, unsigned long, subtree_max_size,
- compute_subtree_max_size)
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
@@ -1116,7 +1113,7 @@ retry:
va->va_start = addr;
va->va_end = addr + size;
- va->flags = 0;
+ va->vm = NULL;
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1259,6 +1256,12 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
return false;
/*
+ * First make sure the mappings are removed from all page-tables
+ * before they are freed.
+ */
+ vmalloc_sync_all();
+
+ /*
* TODO: to calculate a flush range without looping.
* The list can be up to lazy_max_pages() elements.
*/
@@ -1276,7 +1279,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- __free_vmap_area(va);
+ /*
+ * Finally insert or merge lazily-freed area. It is
+ * detached and there is no need to "unlink" it from
+ * anything.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
@@ -1318,6 +1328,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
{
unsigned long nr_lazy;
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
@@ -1912,7 +1926,6 @@ void __init vmalloc_init(void)
if (WARN_ON_ONCE(!va))
continue;
- va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
@@ -2010,7 +2023,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
- va->flags |= VM_VM_AREA;
spin_unlock(&vmap_area_lock);
}
@@ -2115,10 +2127,10 @@ struct vm_struct *find_vm_area(const void *addr)
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA)
- return va->vm;
+ if (!va)
+ return NULL;
- return NULL;
+ return va->vm;
}
/**
@@ -2137,14 +2149,12 @@ struct vm_struct *remove_vm_area(const void *addr)
might_sleep();
- va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr);
+ if (va && va->vm) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
kasan_free_shadow(vm);
@@ -2152,6 +2162,8 @@ struct vm_struct *remove_vm_area(const void *addr)
return vm;
}
+
+ spin_unlock(&vmap_area_lock);
return NULL;
}
@@ -2396,7 +2408,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
- area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
@@ -2404,13 +2415,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
- area->pages = pages;
- if (!area->pages) {
+
+ if (!pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
+ area->pages = pages;
+ area->nr_pages = nr_pages;
+
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
@@ -2845,7 +2859,7 @@ long vread(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -2925,7 +2939,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -2987,7 +3001,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!area)
return -EINVAL;
- if (!(area->flags & VM_USERMAP))
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
if (kaddr + size > area->addr + get_vm_area_size(area))
@@ -3038,6 +3052,9 @@ EXPORT_SYMBOL(remap_vmalloc_range);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
+ *
+ * The purpose of this function is to make sure the vmalloc area
+ * mappings are identical in all page-tables in the system.
*/
void __weak vmalloc_sync_all(void)
{
@@ -3270,9 +3287,19 @@ retry:
goto overflow;
/*
+ * If required width exeeds current VA block, move
+ * base downwards and then recheck.
+ */
+ if (base + end > va->va_end) {
+ base = pvm_determine_end_from_reverse(&va, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
* If this VA does not fit, move base downwards and recheck.
*/
- if (base + start < va->va_start || base + end > va->va_end) {
+ if (base + start < va->va_start) {
va = node_to_va(rb_prev(&va->rb_node));
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
@@ -3431,6 +3458,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
}
}
+static void show_purge_info(struct seq_file *m)
+{
+ struct llist_node *head;
+ struct vmap_area *va;
+
+ head = READ_ONCE(vmap_purge_list.first);
+ if (head == NULL)
+ return;
+
+ llist_for_each_entry(va, head, purge_list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+}
+
static int s_show(struct seq_file *m, void *p)
{
struct vmap_area *va;
@@ -3439,14 +3482,13 @@ static int s_show(struct seq_file *m, void *p)
va = list_entry(p, struct vmap_area, list);
/*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
+ * s_show can encounter race with remove_vm_area, !vm on behalf
+ * of vmap area is being tear down or vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ if (!va->vm) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start,
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+ va->va_end - va->va_start);
return 0;
}
@@ -3477,11 +3519,24 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
+ if (v->flags & VM_DMA_COHERENT)
+ seq_puts(m, " dma-coherent");
+
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
seq_putc(m, '\n');
+
+ /*
+ * As a final step, dump "unpurged" areas. Note,
+ * that entire "/proc/vmallocinfo" output will not
+ * be address sorted, because the purge list is not
+ * sorted.
+ */
+ if (list_is_last(&va->list, &vmap_area_list))
+ show_purge_info(m);
+
return 0;
}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index f3b50811497a..4bac22fe1aa2 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* "hierarchy" or "local").
*
* To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
*/
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
@@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
- enum vmpressure_levels level = -1;
+ enum vmpressure_levels level;
char *spec, *spec_orig;
char *token;
int ret = 0;
@@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
/* Find required level */
token = strsep(&spec, ",");
- level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
- if (level < 0) {
- ret = level;
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
goto out;
- }
+ level = ret;
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
- if (mode < 0) {
- ret = mode;
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
goto out;
- }
+ mode = ret;
}
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
+ ret = 0;
out:
kfree(spec_orig);
return ret;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 44df66a98f2a..ee4eecc7e1c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,9 +88,6 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
- /* e.g. boosted watermark reclaim leaves slabs alone */
- unsigned int may_shrinkslab:1;
-
/*
* Cgroups are not reclaimed below their configured memory.low,
* unless we threaten to OOM. If any cgroups are skipped due to
@@ -174,11 +171,22 @@ int vm_swappiness = 60;
*/
unsigned long vm_total_pages;
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
/*
* We allow subsystems to populate their shrinker-related
* LRU lists before register_shrinker_prepared() is called
@@ -230,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
idr_remove(&shrinker_idr, id);
up_write(&shrinker_rwsem);
}
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return 0;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-static void set_task_reclaim_state(struct task_struct *task,
- struct reclaim_state *rs)
-{
- /* Check for an overwrite */
- WARN_ON_ONCE(rs && task->reclaim_state);
-
- /* Check for the nulling of an already-nulled member */
- WARN_ON_ONCE(!rs && !task->reclaim_state);
- task->reclaim_state = rs;
-}
-
-#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
@@ -308,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat,
}
#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
static bool global_reclaim(struct scan_control *sc)
{
return true;
@@ -357,12 +351,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size;
+ unsigned long lru_size = 0;
int zid;
- if (!mem_cgroup_disabled())
- lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
- else
+ if (!mem_cgroup_disabled()) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ } else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
@@ -594,7 +589,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
@@ -602,7 +597,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
unsigned long ret, freed = 0;
int i;
- if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
@@ -628,6 +623,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
continue;
}
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, map->map);
@@ -664,13 +664,13 @@ unlock:
up_read(&shrinker_rwsem);
return freed;
}
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
@@ -699,7 +699,14 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
unsigned long ret, freed = 0;
struct shrinker *shrinker;
- if (!mem_cgroup_is_root(memcg))
+ /*
+ * The root memcg might be allocated even though memcg is disabled
+ * via "cgroup_disable=memory" boot parameter. This could make
+ * mem_cgroup_is_root() return false, then just run memcg slab
+ * shrink, but skip global shrink. This may result in premature
+ * oom.
+ */
+ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
@@ -926,10 +933,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
- refcount = 1 + HPAGE_PMD_NR;
- else
- refcount = 2;
+ refcount = 1 + compound_nr(page);
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
@@ -1117,7 +1121,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -1131,7 +1135,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
@@ -1145,7 +1149,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
VM_BUG_ON_PAGE(PageActive(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
/* Account the number of base pages even though THP */
sc->nr_scanned += nr_pages;
@@ -1262,7 +1266,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
@@ -1483,10 +1487,9 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page))) {
- mem_cgroup_uncharge(page);
+ if (unlikely(PageTransHuge(page)))
(*get_compound_page_dtor(page))(page);
- } else
+ else
list_add(&page->lru, &free_pages);
continue;
@@ -1701,7 +1704,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
VM_BUG_ON_PAGE(!PageLRU(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
@@ -1907,7 +1910,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&pgdat->lru_lock);
} else
@@ -2141,6 +2143,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = -1;
+ unsigned long nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == -1) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = -1;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
@@ -2399,17 +2457,70 @@ out:
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decremeting
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2429,7 +2540,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2438,7 +2549,7 @@ out:
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2582,7 +2693,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
- unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
@@ -2593,40 +2703,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
if (!in_reclaim_compaction(sc))
return false;
- /* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
- /*
- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
- * full LRU list has been scanned and we are still failing
- * to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
- } else {
- /*
- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
- * fail without consequence, stop if we failed to reclaim
- * any pages from the last SWAP_CLUSTER_MAX number of
- * pages that were scanned. This will return to the
- * caller faster at the risk reclaim/compaction and
- * the resulting allocation attempt fails
- */
- if (!nr_reclaimed)
- return false;
- }
-
/*
- * If we have not reclaimed enough pages for compaction and the
- * inactive lists are large enough, continue reclaiming
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk reclaim/compaction and the resulting allocation attempt
+ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
*/
- pages_for_compaction = compact_gap(sc->order);
- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed < pages_for_compaction &&
- inactive_lru_pages > pages_for_compaction)
- return true;
+ if (!nr_reclaimed)
+ return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2643,7 +2731,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
;
}
}
- return true;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
}
static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
@@ -2660,10 +2758,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .pgdat = pgdat,
- .priority = sc->priority,
- };
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
@@ -2672,7 +2766,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ memcg = mem_cgroup_iter(root, NULL, NULL);
do {
unsigned long lru_pages;
unsigned long reclaimed;
@@ -2699,6 +2793,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
@@ -2707,31 +2808,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- if (sc->may_shrinkslab) {
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
- }
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- /*
- * Kswapd have to scan all memory cgroups to fulfill
- * the overall scan target for the node.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!current_is_kswapd() &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2808,7 +2893,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ sc));
/*
* Kswapd gives up on balancing particular nodes after too
@@ -3187,7 +3272,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
- .may_shrinkslab = 1,
};
/*
@@ -3219,6 +3303,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_MEMCG
+/* Only used by soft limit reclaim. Do not reuse for anything else. */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
@@ -3231,11 +3316,11 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
- .may_shrinkslab = 1,
};
unsigned long lru_pages;
- set_task_reclaim_state(current, &sc.reclaim_state);
+ WARN_ON_ONCE(!current->reclaim_state);
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -3253,7 +3338,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
- set_task_reclaim_state(current, NULL);
*nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
@@ -3279,7 +3363,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = may_swap,
- .may_shrinkslab = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
@@ -3591,7 +3674,6 @@ restart:
*/
sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
- sc.may_shrinkslab = !nr_boost_reclaim;
/*
* Do some background aging of the anon list, to give
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16ca6996..a8222041bd44 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = {
"nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
+ "nr_file_hugepages",
+ "nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
"nr_unstable",
"nr_vmscan_write",
@@ -1381,12 +1383,29 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
unsigned long freecount = 0;
struct free_area *area;
struct list_head *curr;
+ bool overflow = false;
area = &(zone->free_area[order]);
- list_for_each(curr, &area->free_list[mtype])
- freecount++;
- seq_printf(m, "%6lu ", freecount);
+ list_for_each(curr, &area->free_list[mtype]) {
+ /*
+ * Cap the free_list iteration because it might
+ * be really large and we are under a spinlock
+ * so a long time spent here could trigger a
+ * hard lockup detector. Anyway this is a
+ * debugging tool so knowing there is a handful
+ * of pages of this order should be more than
+ * sufficient.
+ */
+ if (++freecount >= 100000) {
+ overflow = true;
+ break;
+ }
+ }
+ seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
}
seq_putc(m, '\n');
}
@@ -1970,7 +1989,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
diff --git a/mm/workingset.c b/mm/workingset.c
index e0b4edcb88c8..c963831d354f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -380,14 +380,12 @@ void workingset_update_node(struct xa_node *node)
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_page_state(virt_to_page(node),
- WORKINGSET_NODES);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_page_state(virt_to_page(node),
- WORKINGSET_NODES);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
}
}
}
@@ -480,7 +478,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
list_lru_isolate(lru, item);
- __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
spin_unlock(lru_lock);
@@ -503,7 +501,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* shadow entries we were tracking ...
*/
xas_store(&xas, NULL);
- __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 1a029a7432ee..6d3d3f698ebb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -295,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
}
/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page,
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_pool *pool, gfp_t gfp)
{
struct z3fold_header *zhdr = page_address(page);
- struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
-
- if (!slots)
- return NULL;
+ struct z3fold_buddy_slots *slots;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
@@ -310,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ if (headless)
+ return zhdr;
+
+ slots = alloc_slots(pool, gfp);
+ if (!slots)
+ return NULL;
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -366,9 +369,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+ struct z3fold_buddy_slots *slots,
+ enum buddy bud)
{
- struct z3fold_buddy_slots *slots;
unsigned long h = (unsigned long)zhdr;
int idx = 0;
@@ -385,11 +389,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
if (bud == LAST)
h |= (zhdr->last_chunks << BUDDY_SHIFT);
- slots = zhdr->slots;
slots->slot[idx] = h;
return (unsigned long)&slots->slot[idx];
}
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return __encode_handle(zhdr, zhdr->slots, bud);
+}
+
/* Returns the z3fold page where a given handle is stored */
static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
@@ -624,6 +632,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
}
if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_CLAIMED, &page->private) ||
test_bit(PAGE_STALE, &page->private))) {
z3fold_page_unlock(zhdr);
return;
@@ -817,9 +826,19 @@ out:
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
- z3fold_unregister_migration(pool);
- destroy_workqueue(pool->release_wq);
+
+ /*
+ * We need to destroy pool->compact_wq before pool->release_wq,
+ * as any pending work on pool->compact_wq will call
+ * queue_work(pool->release_wq, &pool->work).
+ *
+ * There are still outstanding pages until both workqueues are drained,
+ * so we cannot unregister migration until then.
+ */
+
destroy_workqueue(pool->compact_wq);
+ destroy_workqueue(pool->release_wq);
+ z3fold_unregister_migration(pool);
kfree(pool);
}
@@ -914,7 +933,7 @@ retry:
if (!page)
return -ENOMEM;
- zhdr = init_z3fold_page(page, pool, gfp);
+ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
if (!zhdr) {
__free_page(page);
return -ENOMEM;
@@ -979,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -989,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -1025,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page has not been claimed by us */
z3fold_page_unlock(zhdr);
return;
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1041,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -1090,6 +1115,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
struct list_head *pos;
+ struct z3fold_buddy_slots slots;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -1108,16 +1134,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
/* this bit could have been set by free, in which case
* we pass over to the next page in the pool.
*/
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ page = NULL;
continue;
+ }
- if (unlikely(PageIsolated(page)))
+ if (unlikely(PageIsolated(page))) {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ page = NULL;
continue;
+ }
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
break;
- zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr)) {
+ clear_bit(PAGE_CLAIMED, &page->private);
zhdr = NULL;
continue; /* can't evict at this point */
}
@@ -1135,26 +1167,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (!test_bit(PAGE_HEADLESS, &page->private)) {
/*
- * We need encode the handles before unlocking, since
- * we can race with free that will set
- * (first|last)_chunks to 0
+ * We need encode the handles before unlocking, and
+ * use our local slots structure because z3fold_free
+ * can zero out zhdr->slots and we can't do much
+ * about that
*/
first_handle = 0;
last_handle = 0;
middle_handle = 0;
if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
*/
z3fold_page_unlock(zhdr);
} else {
- first_handle = encode_handle(zhdr, HEADLESS);
+ first_handle = __encode_handle(zhdr, &slots, HEADLESS);
last_handle = middle_handle = 0;
}
@@ -1184,9 +1220,9 @@ next:
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
} else {
z3fold_page_lock(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
@@ -1201,6 +1237,7 @@ next:
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
}
/* We started off locked to we need to lock the pool back */
@@ -1305,7 +1342,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
- if (test_bit(PAGE_HEADLESS, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private))
return false;
zhdr = page_address(page);
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd9107857d..863669212070 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,6 +239,22 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
+ * zpool_malloc_support_movable() - Check if the zpool support
+ * allocate movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns if the zpool support allocate movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if if the zpool support allocate movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+ return zpool->driver->malloc_support_movable;
+}
+
+/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 57fbb7ced69f..2b2b9aae8a3c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -54,6 +54,7 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -268,6 +269,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -438,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool)
}
static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .type = "zsmalloc",
+ .owner = THIS_MODULE,
+ .create = zs_zpool_create,
+ .destroy = zs_zpool_destroy,
+ .malloc_support_movable = true,
+ .malloc = zs_zpool_malloc,
+ .free = zs_zpool_free,
+ .map = zs_zpool_map,
+ .unmap = zs_zpool_unmap,
+ .total_size = zs_zpool_total_size,
};
MODULE_ALIAS("zpool-zsmalloc");
@@ -471,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage)
return zspage->inuse;
}
-static inline void set_zspage_inuse(struct zspage *zspage, int val)
-{
- zspage->inuse = val;
-}
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
@@ -1862,6 +1864,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1931,6 +1958,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2030,8 +2058,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2077,13 +2113,12 @@ static void zs_page_putback(struct page *page)
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
spin_unlock(&class->lock);
}
@@ -2107,8 +2142,36 @@ static int zs_register_migration(struct zs_pool *pool)
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2346,6 +2409,10 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pool->migration_wait);
+#endif
+
if (create_cache(pool))
goto err;
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744a76cb..46a322316e52 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
/* extract swpentry from data */
zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
swpentry = zhdr->swpentry; /* here */
- zpool_unmap_handle(pool, handle);
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
@@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
+ zpool_unmap_handle(pool, handle);
return 0;
}
spin_unlock(&tree->lock);
@@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
- ZPOOL_MM_RO) + sizeof(struct zswap_header);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
dst = kmap_atomic(page);
tfm = *get_cpu_ptr(entry->pool->tfm);
ret = crypto_comp_decompress(tfm, src, entry->length,
dst, &dlen);
put_cpu_ptr(entry->pool->tfm);
kunmap_atomic(dst);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -940,6 +938,7 @@ fail:
spin_unlock(&tree->lock);
end:
+ zpool_unmap_handle(pool, handle);
return ret;
}
@@ -997,6 +996,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ gfp_t gfp;
/* THP isn't supported */
if (PageTransHuge(page)) {
@@ -1070,9 +1070,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
- __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
- &handle);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(entry->pool->zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;