aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/backing-dev.c22
-rw-r--r--mm/cma.c83
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/filemap.c99
-rw-r--r--mm/gup.c12
-rw-r--r--mm/gup_benchmark.c4
-rw-r--r--mm/hmm.c573
-rw-r--r--mm/huge_memory.c26
-rw-r--r--mm/internal.h5
-rw-r--r--mm/khugepaged.c67
-rw-r--r--mm/ksm.c7
-rw-r--r--mm/memcontrol.c39
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory_hotplug.c5
-rw-r--r--mm/mempolicy.c40
-rw-r--r--mm/migrate.c421
-rw-r--r--mm/mmap.c98
-rw-r--r--mm/mprotect.c9
-rw-r--r--mm/oom_kill.c81
-rw-r--r--mm/page-writeback.c61
-rw-r--r--mm/page_alloc.c119
-rw-r--r--mm/page_isolation.c3
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c7
-rw-r--r--mm/shmem.c60
-rw-r--r--mm/slab.c3
-rw-r--r--mm/slub.c57
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap_state.c17
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/truncate.c22
-rw-r--r--mm/util.c15
-rw-r--r--mm/vmscan.c231
-rw-r--r--mm/vmstat.c5
-rw-r--r--mm/workingset.c22
-rw-r--r--mm/z3fold.c53
37 files changed, 1364 insertions, 922 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..e14c01513bfd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT
default n
depends on NO_BOOTMEM
depends on !FLATMEM
+ depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 08b9aab631ab..7441bd93b732 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -115,6 +115,7 @@ static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
bdi, &bdi_debug_stats_fops);
if (!bdi->debug_stats) {
debugfs_remove(bdi->debug_dir);
+ bdi->debug_dir = NULL;
return -ENOMEM;
}
@@ -383,7 +384,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
* the barrier provided by test_and_clear_bit() above.
*/
smp_wmb();
- clear_bit(WB_shutting_down, &wb->state);
+ clear_and_wake_up_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -1020,23 +1021,18 @@ EXPORT_SYMBOL(congestion_wait);
/**
* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @pgdat: A pgdat to check if it is heavily congested
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
- * In the event of a congested backing_dev (any backing_dev) and the given
- * @pgdat has experienced recent congestion, this waits for up to @timeout
- * jiffies for either a BDI to exit congestion of the given @sync queue
- * or a write to complete.
- *
- * In the absence of pgdat congestion, cond_resched() is called to yield
- * the processor if necessary but otherwise does not sleep.
+ * In the event of a congested backing_dev (any backing_dev) this waits
+ * for up to @timeout jiffies for either a BDI to exit congestion of the
+ * given @sync queue or a write to complete.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep.
*/
-long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
+long wait_iff_congested(int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
@@ -1044,12 +1040,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
wait_queue_head_t *wqh = &congestion_wqh[sync];
/*
- * If there is no congestion, or heavy congestion is not being
- * encountered in the current pgdat, yield if necessary instead
+ * If there is no congestion, yield if necessary instead
* of sleeping on the congestion queue
*/
- if (atomic_read(&nr_wb_congested[sync]) == 0 ||
- !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
+ if (atomic_read(&nr_wb_congested[sync]) == 0) {
cond_resched();
/* In case we scheduled, work out time remaining */
diff --git a/mm/cma.c b/mm/cma.c
index 5809bbe360d7..aa40e6c7b042 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -39,6 +39,7 @@
#include <trace/events/cma.h>
#include "cma.h"
+#include "internal.h"
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
@@ -109,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma)
if (!cma->bitmap)
return -ENOMEM;
- WARN_ON_ONCE(!pfn_valid(pfn));
- zone = page_zone(pfn_to_page(pfn));
-
do {
unsigned j;
base_pfn = pfn;
+ if (!pfn_valid(base_pfn))
+ goto err;
+
+ zone = page_zone(pfn_to_page(base_pfn));
for (j = pageblock_nr_pages; j; --j, pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
+ if (!pfn_valid(pfn))
+ goto err;
+
/*
- * alloc_contig_range requires the pfn range
- * specified to be in the same zone. Make this
- * simple by forcing the entire CMA resv range
- * to be in the same zone.
+ * In init_cma_reserved_pageblock(), present_pages
+ * is adjusted with assumption that all pages in
+ * the pageblock come from a single zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
+ goto err;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -139,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-not_in_zone:
+err:
pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
@@ -149,6 +152,41 @@ not_in_zone:
static int __init cma_init_reserved_areas(void)
{
int i;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ if (!cma_area_count)
+ return 0;
+
+ for_each_online_pgdat(pgdat) {
+ unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+
+ zone = &pgdat->node_zones[ZONE_MOVABLE];
+
+ /*
+ * In this case, we cannot adjust the zone range
+ * since it is now maximum node span and we don't
+ * know original zone range.
+ */
+ if (populated_zone(zone))
+ continue;
+
+ for (i = 0; i < cma_area_count; i++) {
+ if (pfn_to_nid(cma_areas[i].base_pfn) !=
+ pgdat->node_id)
+ continue;
+
+ start_pfn = min(start_pfn, cma_areas[i].base_pfn);
+ end_pfn = max(end_pfn, cma_areas[i].base_pfn +
+ cma_areas[i].count);
+ }
+
+ if (!end_pfn)
+ continue;
+
+ zone->zone_start_pfn = start_pfn;
+ zone->spanned_pages = end_pfn - start_pfn;
+ }
for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]);
@@ -157,9 +195,32 @@ static int __init cma_init_reserved_areas(void)
return ret;
}
+ /*
+ * Reserved pages for ZONE_MOVABLE are now activated and
+ * this would change ZONE_MOVABLE's managed page counter and
+ * the other zones' present counter. We need to re-calculate
+ * various zone information that depends on this initialization.
+ */
+ build_all_zonelists(NULL);
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ zone_pcp_reset(zone);
+ setup_zone_pageset(zone);
+ } else
+ zone_pcp_update(zone);
+
+ set_zone_contiguous(zone);
+ }
+
+ /*
+ * We need to re-init per zone wmark by calling
+ * init_per_zone_wmark_min() but doesn't call here because it is
+ * registered on core_initcall and it will be called later than us.
+ */
+
return 0;
}
-core_initcall(cma_init_reserved_areas);
+pure_initcall(cma_init_reserved_areas);
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
diff --git a/mm/compaction.c b/mm/compaction.c
index 88d01a50a015..028b7210a669 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1166,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc)
* from the isolated freelists in the block we are migrating to.
*/
static struct page *compaction_alloc(struct page *migratepage,
- unsigned long data,
- int **result)
+ unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
@@ -1451,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
- * ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- ALLOC_CMA, wmark_target))
+ 0, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/filemap.c b/mm/filemap.c
index 693f62212a59..0604cb02e6f3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@
* ->i_mmap_rwsem (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
- * ->mapping->tree_lock
+ * ->i_pages lock
*
* ->i_mutex
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
@@ -74,7 +74,7 @@
* ->mmap_sem
* ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
- * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
+ * ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
@@ -84,7 +84,7 @@
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
- * ->mapping->tree_lock (__sync_single_inode)
+ * ->i_pages lock (__sync_single_inode)
*
* ->i_mmap_rwsem
* ->anon_vma.lock (vma_adjust)
@@ -95,11 +95,11 @@
* ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
- * ->tree_lock (try_to_unmap_one)
+ * ->i_pages lock (try_to_unmap_one)
* ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
* ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
- * ->tree_lock (page_remove_rmap->set_page_dirty)
+ * ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
* ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
@@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index, 0,
+ error = __radix_tree_create(&mapping->i_pages, page->index, 0,
&node, &slot);
if (error)
return error;
if (*slot) {
void *p;
- p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ p = radix_tree_deref_slot_protected(slot,
+ &mapping->i_pages.xa_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
@@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (shadowp)
*shadowp = p;
}
- __radix_tree_replace(&mapping->page_tree, node, slot, page,
+ __radix_tree_replace(&mapping->i_pages, node, slot, page,
workingset_lookup_update(mapping));
mapping->nrpages++;
return 0;
@@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping,
struct radix_tree_node *node;
void **slot;
- __radix_tree_lookup(&mapping->page_tree, page->index + i,
+ __radix_tree_lookup(&mapping->i_pages, page->index + i,
&node, &slot);
VM_BUG_ON_PAGE(!node && nr != 1, page);
- radix_tree_clear_tags(&mapping->page_tree, node, slot);
- __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+ radix_tree_clear_tags(&mapping->i_pages, node, slot);
+ __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
workingset_lookup_update(mapping));
}
@@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
+ * is safe. The caller must hold the i_pages lock.
*/
void __delete_from_page_cache(struct page *page, void *shadow)
{
@@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page)
unsigned long flags;
BUG_ON(!PageLocked(page));
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
__delete_from_page_cache(page, NULL);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
page_cache_free_page(mapping, page);
}
@@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
- * The function walks over mapping->page_tree and removes pages passed in @pvec
- * from the radix tree. The function expects @pvec to be sorted by page index.
- * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * The function walks over mapping->i_pages and removes pages passed in @pvec
+ * from the mapping. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the radix
- * tree as well.
+ * @pvec and takes care to delete all corresponding tail pages from the
+ * mapping as well.
*
- * The function expects mapping->tree_lock to be held.
+ * The function expects the i_pages lock to be held.
*/
static void
page_cache_tree_delete_batch(struct address_space *mapping,
@@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
pgoff_t start;
start = pvec->pages[0]->index;
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (i >= pagevec_count(pvec) && !tail_pages)
break;
page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (radix_tree_exceptional_entry(page))
continue;
if (!tail_pages) {
@@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping,
} else {
tail_pages--;
}
- radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
- __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
+ radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
+ __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
workingset_lookup_update(mapping));
total_pages++;
}
@@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping,
if (!pagevec_count(pvec))
return;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_tree_delete_batch(mapping, pvec);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -785,7 +786,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
- error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
@@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
__delete_from_page_cache(old, NULL);
error = page_cache_tree_insert(mapping, new, NULL);
BUG_ON(error);
@@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
__inc_node_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_node_page_state(new, NR_SHMEM);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
@@ -841,7 +842,7 @@ static int __add_to_page_cache_locked(struct page *page,
return error;
}
- error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
+ error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
if (error) {
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
@@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page,
page->mapping = mapping;
page->index = offset;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
error = page_cache_tree_insert(mapping, page, shadowp);
radix_tree_preload_end();
if (unlikely(error))
@@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting. */
if (!huge)
__inc_node_page_state(page, NR_FILE_PAGES);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
@@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page,
err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
for (i = 0; i < max_scan; i++) {
struct page *page;
- page = radix_tree_lookup(&mapping->page_tree, index);
+ page = radix_tree_lookup(&mapping->i_pages, index);
if (!page || radix_tree_exceptional_entry(page))
break;
index++;
@@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
for (i = 0; i < max_scan; i++) {
struct page *page;
- page = radix_tree_lookup(&mapping->page_tree, index);
+ page = radix_tree_lookup(&mapping->i_pages, index);
if (!page || radix_tree_exceptional_entry(page))
break;
index--;
@@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
rcu_read_lock();
repeat:
page = NULL;
- pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
if (unlikely(!page))
@@ -1584,8 +1585,7 @@ no_page:
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
- err = add_to_page_cache_lru(page, mapping, offset,
- gfp_mask & GFP_RECLAIM_MASK);
+ err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
if (unlikely(err)) {
put_page(page);
page = NULL;
@@ -1633,7 +1633,7 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -1710,7 +1710,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
struct page *head, *page;
if (iter.index > end)
@@ -1795,7 +1795,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
+ radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -1875,8 +1875,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree,
- &iter, *index, tag) {
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
struct page *head, *page;
if (iter.index > end)
@@ -1969,8 +1968,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree,
- &iter, start, tag) {
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
@@ -2388,7 +2386,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
@@ -2624,8 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
struct page *head, *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
- start_pgoff) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
if (iter.index > end_pgoff)
break;
repeat:
@@ -2721,7 +2718,6 @@ out:
sb_end_pagefault(inode->i_sb);
return ret;
}
-EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
@@ -2752,6 +2748,10 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
#else
+int filemap_page_mkwrite(struct vm_fault *vmf)
+{
+ return -ENOSYS;
+}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
return -ENOSYS;
@@ -2762,6 +2762,7 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
}
#endif /* CONFIG_MMU */
+EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
diff --git a/mm/gup.c b/mm/gup.c
index f296df6cf666..541904a7c60f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -544,6 +544,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
+ if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
@@ -1740,7 +1743,9 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
- * the regular GUP. It will only return non-negative values.
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
@@ -1806,9 +1811,12 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
+ if (nr_pages <= 0)
+ return 0;
+
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
(void __user *)start, len)))
- return 0;
+ return -EFAULT;
if (gup_fast_permitted(start, nr_pages, write)) {
local_irq_disable();
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 5c8e2abeaa15..0f44759486e2 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -23,7 +23,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
struct page **pages;
nr_pages = gup->size / PAGE_SIZE;
- pages = kvmalloc(sizeof(void *) * nr_pages, GFP_KERNEL);
+ pages = kvzalloc(sizeof(void *) * nr_pages, GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -41,6 +41,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
}
nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i);
+ if (nr <= 0)
+ break;
i += nr;
}
end_time = ktime_get();
diff --git a/mm/hmm.c b/mm/hmm.c
index 320545b98ff5..486dc394a5a3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm,
up_read(&hmm->mirrors_sem);
}
+static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct hmm_mirror *mirror;
+ struct hmm *hmm = mm->hmm;
+
+ down_write(&hmm->mirrors_sem);
+ mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
+ list);
+ while (mirror) {
+ list_del_init(&mirror->list);
+ if (mirror->ops->release) {
+ /*
+ * Drop mirrors_sem so callback can wait on any pending
+ * work that might itself trigger mmu_notifier callback
+ * and thus would deadlock with us.
+ */
+ up_write(&hmm->mirrors_sem);
+ mirror->ops->release(mirror);
+ down_write(&hmm->mirrors_sem);
+ }
+ mirror = list_first_entry_or_null(&hmm->mirrors,
+ struct hmm_mirror, list);
+ }
+ up_write(&hmm->mirrors_sem);
+}
+
static void hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+ .release = hmm_release,
.invalidate_range_start = hmm_invalidate_range_start,
.invalidate_range_end = hmm_invalidate_range_end,
};
@@ -206,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
+again:
mirror->hmm = hmm_register(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
+ if (mirror->hmm->mm == NULL) {
+ /*
+ * A racing hmm_mirror_unregister() is about to destroy the hmm
+ * struct. Try again to allocate a new one.
+ */
+ up_write(&mirror->hmm->mirrors_sem);
+ mirror->hmm = NULL;
+ goto again;
+ } else {
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
+ }
return 0;
}
@@ -227,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- struct hmm *hmm = mirror->hmm;
+ bool should_unregister = false;
+ struct mm_struct *mm;
+ struct hmm *hmm;
+ if (mirror->hmm == NULL)
+ return;
+
+ hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
- list_del(&mirror->list);
+ list_del_init(&mirror->list);
+ should_unregister = list_empty(&hmm->mirrors);
+ mirror->hmm = NULL;
+ mm = hmm->mm;
+ hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
+
+ if (!should_unregister || mm == NULL)
+ return;
+
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+ kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
@@ -240,110 +299,275 @@ struct hmm_vma_walk {
unsigned long last;
bool fault;
bool block;
- bool write;
};
-static int hmm_vma_do_fault(struct mm_walk *walk,
- unsigned long addr,
- hmm_pfn_t *pfn)
+static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
+ bool write_fault, uint64_t *pfn)
{
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
int r;
flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
- flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+ flags |= write_fault ? FAULT_FLAG_WRITE : 0;
r = handle_mm_fault(vma, addr, flags);
if (r & VM_FAULT_RETRY)
return -EBUSY;
if (r & VM_FAULT_ERROR) {
- *pfn = HMM_PFN_ERROR;
+ *pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
return -EAGAIN;
}
-static void hmm_pfns_special(hmm_pfn_t *pfns,
- unsigned long addr,
- unsigned long end)
-{
- for (; addr < end; addr += PAGE_SIZE, pfns++)
- *pfns = HMM_PFN_SPECIAL;
-}
-
static int hmm_pfns_bad(unsigned long addr,
unsigned long end,
struct mm_walk *walk)
{
- struct hmm_range *range = walk->private;
- hmm_pfn_t *pfns = range->pfns;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ uint64_t *pfns = range->pfns;
unsigned long i;
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = HMM_PFN_ERROR;
+ pfns[i] = range->values[HMM_PFN_ERROR];
return 0;
}
-static void hmm_pfns_clear(hmm_pfn_t *pfns,
- unsigned long addr,
- unsigned long end)
-{
- for (; addr < end; addr += PAGE_SIZE, pfns++)
- *pfns = 0;
-}
-
-static int hmm_vma_walk_hole(unsigned long addr,
- unsigned long end,
- struct mm_walk *walk)
+/*
+ * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @fault: should we fault or not ?
+ * @write_fault: write fault ?
+ * @walk: mm_walk structure
+ * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ *
+ * This function will be called whenever pmd_none() or pte_none() returns true,
+ * or whenever there is no page directory covering the virtual address range.
+ */
+static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
+ bool fault, bool write_fault,
+ struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- hmm_pfn_t *pfns = range->pfns;
+ uint64_t *pfns = range->pfns;
unsigned long i;
hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++) {
- pfns[i] = HMM_PFN_EMPTY;
- if (hmm_vma_walk->fault) {
+ pfns[i] = range->values[HMM_PFN_NONE];
+ if (fault || write_fault) {
int ret;
- ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ ret = hmm_vma_do_fault(walk, addr, write_fault,
+ &pfns[i]);
if (ret != -EAGAIN)
return ret;
}
}
- return hmm_vma_walk->fault ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EAGAIN : 0;
}
-static int hmm_vma_walk_clear(unsigned long addr,
- unsigned long end,
- struct mm_walk *walk)
+static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ uint64_t pfns, uint64_t cpu_flags,
+ bool *fault, bool *write_fault)
{
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- hmm_pfn_t *pfns = range->pfns;
+
+ *fault = *write_fault = false;
+ if (!hmm_vma_walk->fault)
+ return;
+
+ /* We aren't ask to do anything ... */
+ if (!(pfns & range->flags[HMM_PFN_VALID]))
+ return;
+ /* If this is device memory than only fault if explicitly requested */
+ if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
+ /* Do we fault on device memory ? */
+ if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
+ *write_fault = pfns & range->flags[HMM_PFN_WRITE];
+ *fault = true;
+ }
+ return;
+ }
+
+ /* If CPU page table is not valid then we need to fault */
+ *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
+ /* Need to write fault ? */
+ if ((pfns & range->flags[HMM_PFN_WRITE]) &&
+ !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
+ *write_fault = true;
+ *fault = true;
+ }
+}
+
+static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ const uint64_t *pfns, unsigned long npages,
+ uint64_t cpu_flags, bool *fault,
+ bool *write_fault)
+{
unsigned long i;
- hmm_vma_walk->last = addr;
+ if (!hmm_vma_walk->fault) {
+ *fault = *write_fault = false;
+ return;
+ }
+
+ for (i = 0; i < npages; ++i) {
+ hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
+ fault, write_fault);
+ if ((*fault) || (*write_fault))
+ return;
+ }
+}
+
+static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ bool fault, write_fault;
+ unsigned long i, npages;
+ uint64_t *pfns;
+
i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
- pfns[i] = 0;
- if (hmm_vma_walk->fault) {
- int ret;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+}
- ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
- if (ret != -EAGAIN)
- return ret;
+static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
+{
+ if (pmd_protnone(pmd))
+ return 0;
+ return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_handle_pmd(struct mm_walk *walk,
+ unsigned long addr,
+ unsigned long end,
+ uint64_t *pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long pfn, npages, i;
+ bool fault, write_fault;
+ uint64_t cpu_flags;
+
+ npages = (end - addr) >> PAGE_SHIFT;
+ cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
+ &fault, &write_fault);
+
+ if (pmd_protnone(pmd) || fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ hmm_vma_walk->last = end;
+ return 0;
+}
+
+static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
+{
+ if (pte_none(pte) || !pte_present(pte))
+ return 0;
+ return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, pmd_t *pmdp, pte_t *ptep,
+ uint64_t *pfn)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ bool fault, write_fault;
+ uint64_t cpu_flags;
+ pte_t pte = *ptep;
+ uint64_t orig_pfn = *pfn;
+
+ *pfn = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+
+ if (pte_none(pte)) {
+ if (fault || write_fault)
+ goto fault;
+ return 0;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (!non_swap_entry(entry)) {
+ if (fault || write_fault)
+ goto fault;
+ return 0;
}
+
+ /*
+ * This is a special swap entry, ignore migration, use
+ * device and report anything else as error.
+ */
+ if (is_device_private_entry(entry)) {
+ cpu_flags = range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_DEVICE_PRIVATE];
+ cpu_flags |= is_write_device_private_entry(entry) ?
+ range->flags[HMM_PFN_WRITE] : 0;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault)
+ goto fault;
+ *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn |= cpu_flags;
+ return 0;
+ }
+
+ if (is_migration_entry(entry)) {
+ if (fault || write_fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(vma->vm_mm,
+ pmdp, addr);
+ return -EAGAIN;
+ }
+ return 0;
+ }
+
+ /* Report error for everything else */
+ *pfn = range->values[HMM_PFN_ERROR];
+ return -EFAULT;
}
- return hmm_vma_walk->fault ? -EAGAIN : 0;
+ if (fault || write_fault)
+ goto fault;
+
+ *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ return 0;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault any virtual address we were asked to fault */
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -353,26 +577,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
- hmm_pfn_t *pfns = range->pfns;
+ uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
- bool write_fault;
- hmm_pfn_t flag;
pte_t *ptep;
i = (addr - range->start) >> PAGE_SHIFT;
- flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
- write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
again:
if (pmd_none(*pmdp))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
+ if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
return hmm_pfns_bad(start, end, walk);
if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- unsigned long pfn;
pmd_t pmd;
/*
@@ -388,17 +606,8 @@ again:
barrier();
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
- if (pmd_protnone(pmd))
- return hmm_vma_walk_clear(start, end, walk);
- if (write_fault && !pmd_write(pmd))
- return hmm_vma_walk_clear(start, end, walk);
-
- pfn = pmd_pfn(pmd) + pte_index(addr);
- flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
- return 0;
+ return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
if (pmd_bad(*pmdp))
@@ -406,79 +615,43 @@ again:
ptep = pte_offset_map(pmdp, addr);
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
- pte_t pte = *ptep;
-
- pfns[i] = 0;
+ int r;
- if (pte_none(pte)) {
- pfns[i] = HMM_PFN_EMPTY;
- if (hmm_vma_walk->fault)
- goto fault;
- continue;
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+ if (r) {
+ /* hmm_vma_handle_pte() did unmap pte directory */
+ hmm_vma_walk->last = addr;
+ return r;
}
-
- if (!pte_present(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (!non_swap_entry(entry)) {
- if (hmm_vma_walk->fault)
- goto fault;
- continue;
- }
-
- /*
- * This is a special swap entry, ignore migration, use
- * device and report anything else as error.
- */
- if (is_device_private_entry(entry)) {
- pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
- if (is_write_device_private_entry(entry)) {
- pfns[i] |= HMM_PFN_WRITE;
- } else if (write_fault)
- goto fault;
- pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
- pfns[i] |= flag;
- } else if (is_migration_entry(entry)) {
- if (hmm_vma_walk->fault) {
- pte_unmap(ptep);
- hmm_vma_walk->last = addr;
- migration_entry_wait(vma->vm_mm,
- pmdp, addr);
- return -EAGAIN;
- }
- continue;
- } else {
- /* Report error for everything else */
- pfns[i] = HMM_PFN_ERROR;
- }
- continue;
- }
-
- if (write_fault && !pte_write(pte))
- goto fault;
-
- pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
- pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
- continue;
-
-fault:
- pte_unmap(ptep);
- /* Fault all pages in range */
- return hmm_vma_walk_clear(start, end, walk);
}
pte_unmap(ptep - 1);
+ hmm_vma_walk->last = addr;
return 0;
}
+static void hmm_pfns_clear(struct hmm_range *range,
+ uint64_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = range->values[HMM_PFN_NONE];
+}
+
+static void hmm_pfns_special(struct hmm_range *range)
+{
+ unsigned long addr = range->start, i = 0;
+
+ for (; addr < range->end; addr += PAGE_SIZE, i++)
+ range->pfns[i] = range->values[HMM_PFN_SPECIAL];
+}
+
/*
* hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @vma: virtual memory area containing the virtual address range
- * @range: used to track snapshot validity
- * @start: range virtual start address (inclusive)
- * @end: range virtual end address (exclusive)
- * @entries: array of hmm_pfn_t: provided by the caller, filled in by function
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
+ * @range: range being snapshotted
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * vma permission, 0 success
*
* This snapshots the CPU page table for a range of virtual addresses. Snapshot
* validity is tracked by range struct. See hmm_vma_range_done() for further
@@ -491,26 +664,17 @@ fault:
* NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
* MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
*/
-int hmm_vma_get_pfns(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns)
+int hmm_vma_get_pfns(struct hmm_range *range)
{
+ struct vm_area_struct *vma = range->vma;
struct hmm_vma_walk hmm_vma_walk;
struct mm_walk mm_walk;
struct hmm *hmm;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
- hmm_pfns_special(pfns, start, end);
- return -EINVAL;
- }
-
/* Sanity check, this really should not happen ! */
- if (start < vma->vm_start || start >= vma->vm_end)
+ if (range->start < vma->vm_start || range->start >= vma->vm_end)
return -EINVAL;
- if (end < vma->vm_start || end > vma->vm_end)
+ if (range->end < vma->vm_start || range->end > vma->vm_end)
return -EINVAL;
hmm = hmm_register(vma->vm_mm);
@@ -520,10 +684,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
if (!hmm->mmu_notifier.ops)
return -EINVAL;
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(range);
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it does
+ * not allow write access, either. Architecture that allow
+ * write without read access are not supported by HMM, because
+ * operations such has atomic access would not work.
+ */
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
+ return -EPERM;
+ }
+
/* Initialize range to track CPU page table update */
- range->start = start;
- range->pfns = pfns;
- range->end = end;
spin_lock(&hmm->lock);
range->valid = true;
list_add_rcu(&range->list, &hmm->ranges);
@@ -541,14 +719,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
mm_walk.pmd_entry = hmm_vma_walk_pmd;
mm_walk.pte_hole = hmm_vma_walk_hole;
- walk_page_range(start, end, &mm_walk);
+ walk_page_range(range->start, range->end, &mm_walk);
return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
/*
* hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @vma: virtual memory area containing the virtual address range
* @range: range being tracked
* Returns: false if range data has been invalidated, true otherwise
*
@@ -568,10 +745,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
*
* There are two ways to use this :
* again:
- * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
* trans = device_build_page_table_update_transaction(pfns);
* device_page_table_lock();
- * if (!hmm_vma_range_done(vma, range)) {
+ * if (!hmm_vma_range_done(range)) {
* device_page_table_unlock();
* goto again;
* }
@@ -579,13 +756,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
* device_page_table_unlock();
*
* Or:
- * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
* device_page_table_lock();
- * hmm_vma_range_done(vma, range);
- * device_update_page_table(pfns);
+ * hmm_vma_range_done(range);
+ * device_update_page_table(range->pfns);
* device_page_table_unlock();
*/
-bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
+bool hmm_vma_range_done(struct hmm_range *range)
{
unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
struct hmm *hmm;
@@ -595,7 +772,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
return false;
}
- hmm = hmm_register(vma->vm_mm);
+ hmm = hmm_register(range->vma->vm_mm);
if (!hmm) {
memset(range->pfns, 0, sizeof(*range->pfns) * npages);
return false;
@@ -611,36 +788,34 @@ EXPORT_SYMBOL(hmm_vma_range_done);
/*
* hmm_vma_fault() - try to fault some address in a virtual address range
- * @vma: virtual memory area containing the virtual address range
- * @range: use to track pfns array content validity
- * @start: fault range virtual start address (inclusive)
- * @end: fault range virtual end address (exclusive)
- * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted
- * @write: is it a write fault
+ * @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
* Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
*
* This is similar to a regular CPU page fault except that it will not trigger
* any memory migration if the memory being faulted is not accessible by CPUs.
*
- * On error, for one virtual address in the range, the function will set the
- * hmm_pfn_t error flag for the corresponding pfn entry.
+ * On error, for one virtual address in the range, the function will mark the
+ * corresponding HMM pfn entry with an error flag.
*
* Expected use pattern:
* retry:
* down_read(&mm->mmap_sem);
* // Find vma and address device wants to fault, initialize hmm_pfn_t
* // array accordingly
- * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry);
+ * ret = hmm_vma_fault(range, write, block);
* switch (ret) {
* case -EAGAIN:
- * hmm_vma_range_done(vma, range);
+ * hmm_vma_range_done(range);
* // You might want to rate limit or yield to play nicely, you may
* // also commit any valid pfn in the array assuming that you are
* // getting true from hmm_vma_range_monitor_end()
* goto retry;
* case 0:
* break;
+ * case -ENOMEM:
+ * case -EINVAL:
+ * case -EPERM:
* default:
* // Handle error !
* up_read(&mm->mmap_sem)
@@ -648,7 +823,7 @@ EXPORT_SYMBOL(hmm_vma_range_done);
* }
* // Take device driver lock that serialize device page table update
* driver_lock_device_page_table_update();
- * hmm_vma_range_done(vma, range);
+ * hmm_vma_range_done(range);
* // Commit pfns we got from hmm_vma_fault()
* driver_unlock_device_page_table_update();
* up_read(&mm->mmap_sem)
@@ -658,51 +833,54 @@ EXPORT_SYMBOL(hmm_vma_range_done);
*
* YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns,
- bool write,
- bool block)
+int hmm_vma_fault(struct hmm_range *range, bool block)
{
+ struct vm_area_struct *vma = range->vma;
+ unsigned long start = range->start;
struct hmm_vma_walk hmm_vma_walk;
struct mm_walk mm_walk;
struct hmm *hmm;
int ret;
/* Sanity check, this really should not happen ! */
- if (start < vma->vm_start || start >= vma->vm_end)
+ if (range->start < vma->vm_start || range->start >= vma->vm_end)
return -EINVAL;
- if (end < vma->vm_start || end > vma->vm_end)
+ if (range->end < vma->vm_start || range->end > vma->vm_end)
return -EINVAL;
hmm = hmm_register(vma->vm_mm);
if (!hmm) {
- hmm_pfns_clear(pfns, start, end);
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
return -ENOMEM;
}
/* Caller must have registered a mirror using hmm_mirror_register() */
if (!hmm->mmu_notifier.ops)
return -EINVAL;
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(range);
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it does
+ * not allow write access, either. Architecture that allow
+ * write without read access are not supported by HMM, because
+ * operations such has atomic access would not work.
+ */
+ hmm_pfns_clear(range, range->pfns, range->start, range->end);
+ return -EPERM;
+ }
+
/* Initialize range to track CPU page table update */
- range->start = start;
- range->pfns = pfns;
- range->end = end;
spin_lock(&hmm->lock);
range->valid = true;
list_add_rcu(&range->list, &hmm->ranges);
spin_unlock(&hmm->lock);
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
- hmm_pfns_special(pfns, start, end);
- return 0;
- }
-
hmm_vma_walk.fault = true;
- hmm_vma_walk.write = write;
hmm_vma_walk.block = block;
hmm_vma_walk.range = range;
mm_walk.private = &hmm_vma_walk;
@@ -717,7 +895,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
mm_walk.pte_hole = hmm_vma_walk_hole;
do {
- ret = walk_page_range(start, end, &mm_walk);
+ ret = walk_page_range(start, range->end, &mm_walk);
start = hmm_vma_walk.last;
} while (ret == -EAGAIN);
@@ -725,8 +903,9 @@ int hmm_vma_fault(struct vm_area_struct *vma,
unsigned long i;
i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
- hmm_vma_range_done(vma, range);
+ hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
+ range->end);
+ hmm_vma_range_done(range);
}
return ret;
}
@@ -845,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data)
hmm_devmem_radix_release(resource);
}
-static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
-}
-
static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
resource_size_t key, align_start, align_size, align_end;
@@ -892,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
struct hmm_devmem *dup;
- rcu_read_lock();
- dup = hmm_devmem_find(key);
- rcu_read_unlock();
+ dup = radix_tree_lookup(&hmm_devmem_radix,
+ key >> PA_SECTION_SHIFT);
if (dup) {
dev_err(device, "%s: collides with mapping for %s\n",
__func__, dev_name(dup->device));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0ae8d1d4329..a3a1815f8e11 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
- true)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1317,7 +1316,7 @@ alloc:
}
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
- huge_gfp | __GFP_NORETRY, &memcg, true))) {
+ huge_gfp, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
@@ -2402,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+
+ /*
+ * always add to the tail because some iterators expect new
+ * pages to show after the currently processed elements - e.g.
+ * migrate_pages
+ */
lru_add_page_tail(head, page_tail, lruvec, list);
}
@@ -2445,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
} else {
/* Additional pin to radix tree */
page_ref_add(head, 2);
- spin_unlock(&head->mapping->tree_lock);
+ xa_unlock(&head->mapping->i_pages);
}
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
@@ -2653,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping) {
void **pslot;
- spin_lock(&mapping->tree_lock);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ xa_lock(&mapping->i_pages);
+ pslot = radix_tree_lookup_slot(&mapping->i_pages,
page_index(head));
/*
* Check if the head page is present in radix tree.
* We assume all tail are present too, if head is there.
*/
if (radix_tree_deref_slot_protected(pslot,
- &mapping->tree_lock) != head)
+ &mapping->i_pages.xa_lock) != head)
goto fail;
}
@@ -2695,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
ret = -EBUSY;
@@ -2920,7 +2925,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
pmde = maybe_pmd_mkwrite(pmde, vma);
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
- page_add_anon_rmap(new, vma, mmun_start, true);
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, mmun_start, true);
+ else
+ page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(new);
diff --git a/mm/internal.h b/mm/internal.h
index e6bd35182dae..62d8c34e63d5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
enum ttu_flags;
struct tlbflush_unmap_batch;
@@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
+extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e42568284e06..d7b2a4bf8671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- /* Do not oom kill for khugepaged charges */
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
- &memcg, true))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
@@ -1326,9 +1324,7 @@ static void collapse_shmem(struct mm_struct *mm,
goto out;
}
- /* Do not oom kill for khugepaged charges */
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
- &memcg, true))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
@@ -1348,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm,
*/
index = start;
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
int n = min(iter.index, end) - index;
/*
@@ -1362,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
nr_none += n;
for (; index < min(iter.index, end); index++) {
- radix_tree_insert(&mapping->page_tree, index,
+ radix_tree_insert(&mapping->i_pages, index,
new_page + (index % HPAGE_PMD_NR));
}
@@ -1371,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm,
break;
page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
goto tree_unlocked;
}
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
} else if (trylock_page(page)) {
get_page(page);
} else {
@@ -1389,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
/*
- * The page must be locked, so we can drop the tree_lock
+ * The page must be locked, so we can drop the i_pages lock
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1400,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_TRUNCATED;
goto out_unlock;
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
@@ -1410,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm,
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ slot = radix_tree_lookup_slot(&mapping->i_pages, index);
VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock), page);
+ &mapping->i_pages.xa_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@@ -1435,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(&mapping->page_tree, slot,
+ radix_tree_replace_slot(&mapping->i_pages, slot,
new_page + (index % HPAGE_PMD_NR));
slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
putback_lru_page(page);
out_isolate_failed:
unlock_page(page);
@@ -1468,14 +1464,14 @@ out_unlock:
}
for (; index < end; index++) {
- radix_tree_insert(&mapping->page_tree, index,
+ radix_tree_insert(&mapping->i_pages, index,
new_page + (index % HPAGE_PMD_NR));
}
nr_none += n;
}
tree_locked:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
tree_unlocked:
if (result == SCAN_SUCCEED) {
@@ -1524,9 +1520,8 @@ tree_unlocked:
} else {
/* Something went wrong: rollback changes to the radix-tree */
shmem_uncharge(mapping->host, nr_none);
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
- start) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= end)
break;
page = list_first_entry_or_null(&pagelist,
@@ -1536,8 +1531,7 @@ tree_unlocked:
break;
nr_none--;
/* Put holes back where they were */
- radix_tree_delete(&mapping->page_tree,
- iter.index);
+ radix_tree_delete(&mapping->i_pages, iter.index);
continue;
}
@@ -1546,16 +1540,15 @@ tree_unlocked:
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(&mapping->page_tree,
- slot, page);
+ radix_tree_replace_slot(&mapping->i_pages, slot, page);
slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
putback_lru_page(page);
unlock_page(page);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
VM_BUG_ON(nr_none);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/* Unfreeze new_page, caller would take care about freeing it */
page_ref_unfreeze(new_page, 1);
@@ -1583,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= start + HPAGE_PMD_NR)
break;
@@ -1883,8 +1876,16 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- for_each_populated_zone(zone)
+ for_each_populated_zone(zone) {
+ /*
+ * We don't need to worry about fragmentation of
+ * ZONE_MOVABLE since it only has movable pages.
+ */
+ if (zone_idx(zone) > gfp_zone(GFP_USER))
+ continue;
+
nr_zones++;
+ }
/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
recommended_min = pageblock_nr_pages * nr_zones * 2;
diff --git a/mm/ksm.c b/mm/ksm.c
index e8d6c6210b80..e3cbf9a92f3c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
} else {
newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
vma->vm_page_prot));
+ /*
+ * We're replacing an anonymous page with a zero page, which is
+ * not anonymous. We need to do proper accounting otherwise we
+ * will get wrong values in /proc, and a BUG message in dmesg
+ * when tearing down the mm.
+ */
+ dec_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ec024b862ac..2bd3df3d101a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1485,7 +1485,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom)
+ if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
return;
/*
* We are in the middle of the charge context here, so we
@@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
}
}
- for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
long x;
x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
@@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
do {
if (page_counter_read(&memcg->memory) <= memcg->high)
continue;
- mem_cgroup_event(memcg, MEMCG_HIGH);
+ memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
} while ((memcg = parent_mem_cgroup(memcg)));
}
@@ -1949,7 +1949,7 @@ retry:
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- mem_cgroup_event(mem_over_limit, MEMCG_MAX);
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
@@ -1992,7 +1992,7 @@ retry:
if (fatal_signal_pending(current))
goto force;
- mem_cgroup_event(mem_over_limit, MEMCG_OOM);
+ memcg_memory_event(mem_over_limit, MEMCG_OOM);
mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
@@ -2192,7 +2192,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
{
struct memcg_kmem_cache_create_work *cw;
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+ cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
@@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
struct mem_cgroup *iter;
int i;
- memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
+ memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] += memcg_sum_events(iter, i);
}
}
@@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ if (!pn)
+ return;
+
free_percpu(pn->lruvec_stat_cpu);
kfree(pn);
}
@@ -5178,7 +5181,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
continue;
}
- mem_cgroup_event(memcg, MEMCG_OOM);
+ memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
}
@@ -5191,10 +5194,14 @@ static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
- seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
- seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
- seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
+ seq_printf(m, "low %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
return 0;
@@ -5204,7 +5211,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[MEMCG_NR_EVENTS];
+ unsigned long events[NR_VM_EVENT_ITEMS];
int i;
/*
@@ -5967,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
/*
* Interrupts should be disabled here because the caller holds the
- * mapping->tree_lock lock which is taken with interrupts-off. It is
+ * i_pages lock which is taken with interrupts-off. It is
* important here to have the interrupts disabled because it is the
- * only synchronisation we have for udpating the per-CPU variables.
+ * only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2d4bf647cf01..9d142b9b86dc 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1487,7 +1487,7 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private, int **x)
+static struct page *new_page(struct page *p, unsigned long private)
{
int nid = page_to_nid(p);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cc6dfa5832ca..f74826cdceea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1329,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private,
- int **result)
+static struct page *new_node_page(struct page *page, unsigned long private)
{
int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
@@ -1373,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (isolate_huge_page(page, &source))
move_pages -= 1 << compound_order(head);
continue;
- } else if (thp_migration_supported() && PageTransHuge(page))
+ } else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
+ hpage_nr_pages(page) - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01cbb7078d6c..9ac49ef17b4e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
goto out;
}
- if (!thp_migration_supported()) {
- get_page(page);
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- goto out;
- }
if (!queue_pages_required(page, qp)) {
ret = 1;
goto unlock;
@@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
-retry:
+
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -511,22 +502,6 @@ retry:
continue;
if (!queue_pages_required(page, qp))
continue;
- if (PageTransCompound(page) && !thp_migration_supported()) {
- get_page(page);
- pte_unmap_unlock(pte, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- /* Failed to split -- skip. */
- if (ret) {
- pte = pte_offset_map_lock(walk->mm, pmd,
- addr, &ptl);
- continue;
- }
- goto retry;
- }
-
migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
}
}
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+/* page allocation callback for NUMA node migration */
+struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
- else if (thp_migration_supported() && PageTransHuge(page)) {
+ else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_pages_node(node,
@@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+ err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long uninitialized_var(address);
@@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
if (PageHuge(page)) {
return alloc_huge_page_vma(page_hstate(compound_head(page)),
vma, address);
- } else if (thp_migration_supported() && PageTransHuge(page)) {
+ } else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
@@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 003886606a22..8c0af0f7cab1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
oldzone = page_zone(page);
newzone = page_zone(newpage);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ pslot = radix_tree_lookup_slot(&mapping->i_pages,
page_index(page));
- expected_count += 1 + page_has_private(page);
+ expected_count += hpage_nr_pages(page) + page_has_private(page);
if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_deref_slot_protected(pslot,
+ &mapping->i_pages.xa_lock) != page) {
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
page_ref_unfreeze(page, expected_count);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -504,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- get_page(newpage); /* add cache reference */
+ page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -522,16 +523,27 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ if (PageTransHuge(page)) {
+ int i;
+ int index = page_index(page);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ pslot = radix_tree_lookup_slot(&mapping->i_pages,
+ index + i);
+ radix_tree_replace_slot(&mapping->i_pages, pslot,
+ newpage + i);
+ }
+ }
/*
* Drop cache reference from old page by unfreezing
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
/* Leave irq disabled to prevent preemption while updating stats */
/*
@@ -574,20 +586,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
int expected_count;
void **pslot;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->page_tree,
- page_index(page));
+ pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return -EAGAIN;
}
@@ -596,11 +607,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return MIGRATEPAGE_SUCCESS;
}
@@ -1137,10 +1148,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
enum migrate_reason reason)
{
int rc = MIGRATEPAGE_SUCCESS;
- int *result = NULL;
struct page *newpage;
- newpage = get_new_page(page, private, &result);
+ if (!thp_migration_supported() && PageTransHuge(page))
+ return -ENOMEM;
+
+ newpage = get_new_page(page, private);
if (!newpage)
return -ENOMEM;
@@ -1161,14 +1174,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
- lock_page(page);
- rc = split_huge_page(page);
- unlock_page(page);
- if (rc)
- goto out;
- }
-
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1231,12 +1236,6 @@ put_new:
put_page(newpage);
}
- if (result) {
- if (rc)
- *result = rc;
- else
- *result = page_to_nid(newpage);
- }
return rc;
}
@@ -1264,7 +1263,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
- int *result = NULL;
int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
@@ -1281,7 +1279,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOSYS;
}
- new_hpage = get_new_page(hpage, private, &result);
+ new_hpage = get_new_page(hpage, private);
if (!new_hpage)
return -ENOMEM;
@@ -1345,12 +1343,6 @@ out:
else
putback_active_hugepage(new_hpage);
- if (result) {
- if (rc)
- *result = rc;
- else
- *result = page_to_nid(new_hpage);
- }
return rc;
}
@@ -1395,6 +1387,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
+retry:
cond_resched();
if (PageHuge(page))
@@ -1408,6 +1401,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
switch(rc) {
case -ENOMEM:
+ /*
+ * THP migration might be unsupported or the
+ * allocation could've failed so we should
+ * retry on the same page with the THP split
+ * to base pages.
+ *
+ * Head page is retried immediately and tail
+ * pages are added to the tail of the list so
+ * we encounter them after the rest of the list
+ * is processed.
+ */
+ if (PageTransHuge(page)) {
+ lock_page(page);
+ rc = split_huge_page_to_list(page, from);
+ unlock_page(page);
+ if (!rc) {
+ list_safe_reset_next(page, page2, lru);
+ goto retry;
+ }
+ }
nr_failed++;
goto out;
case -EAGAIN:
@@ -1444,141 +1457,101 @@ out:
}
#ifdef CONFIG_NUMA
-/*
- * Move a list of individual pages
- */
-struct page_to_node {
- unsigned long addr;
- struct page *page;
- int node;
- int status;
-};
-static struct page *new_page_node(struct page *p, unsigned long private,
- int **result)
+static int store_status(int __user *status, int start, int value, int nr)
{
- struct page_to_node *pm = (struct page_to_node *)private;
-
- while (pm->node != MAX_NUMNODES && pm->page != p)
- pm++;
+ while (nr-- > 0) {
+ if (put_user(value, status + start))
+ return -EFAULT;
+ start++;
+ }
- if (pm->node == MAX_NUMNODES)
- return NULL;
+ return 0;
+}
- *result = &pm->status;
+static int do_move_pages_to_node(struct mm_struct *mm,
+ struct list_head *pagelist, int node)
+{
+ int err;
- if (PageHuge(p))
- return alloc_huge_page_node(page_hstate(compound_head(p)),
- pm->node);
- else if (thp_migration_supported() && PageTransHuge(p)) {
- struct page *thp;
+ if (list_empty(pagelist))
+ return 0;
- thp = alloc_pages_node(pm->node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(pm->node,
- GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
+ err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
+ MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(pagelist);
+ return err;
}
/*
- * Move a set of pages as indicated in the pm array. The addr
- * field must be set to the virtual address of the page to be moved
- * and the node number must contain a valid target node.
- * The pm array ends with node = MAX_NUMNODES.
+ * Resolves the given address to a struct page, isolates it from the LRU and
+ * puts it to the given pagelist.
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been
+ * queued or the page doesn't need to be migrated because it is already on
+ * the target node
*/
-static int do_move_page_to_node_array(struct mm_struct *mm,
- struct page_to_node *pm,
- int migrate_all)
+static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+ int node, struct list_head *pagelist, bool migrate_all)
{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned int follflags;
int err;
- struct page_to_node *pp;
- LIST_HEAD(pagelist);
down_read(&mm->mmap_sem);
+ err = -EFAULT;
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ goto out;
- /*
- * Build a list of pages to migrate
- */
- for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
- struct vm_area_struct *vma;
- struct page *page;
- struct page *head;
- unsigned int follflags;
-
- err = -EFAULT;
- vma = find_vma(mm, pp->addr);
- if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
- goto set_status;
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ follflags = FOLL_GET | FOLL_DUMP;
+ page = follow_page(vma, addr, follflags);
- /* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- if (!thp_migration_supported())
- follflags |= FOLL_SPLIT;
- page = follow_page(vma, pp->addr, follflags);
-
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto set_status;
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto out;
- err = -ENOENT;
- if (!page)
- goto set_status;
+ err = -ENOENT;
+ if (!page)
+ goto out;
- err = page_to_nid(page);
+ err = 0;
+ if (page_to_nid(page) == node)
+ goto out_putpage;
- if (err == pp->node)
- /*
- * Node already in the right place
- */
- goto put_and_set;
+ err = -EACCES;
+ if (page_mapcount(page) > 1 && !migrate_all)
+ goto out_putpage;
- err = -EACCES;
- if (page_mapcount(page) > 1 &&
- !migrate_all)
- goto put_and_set;
-
- if (PageHuge(page)) {
- if (PageHead(page)) {
- isolate_huge_page(page, &pagelist);
- err = 0;
- pp->page = page;
- }
- goto put_and_set;
+ if (PageHuge(page)) {
+ if (PageHead(page)) {
+ isolate_huge_page(page, pagelist);
+ err = 0;
}
+ } else {
+ struct page *head;
- pp->page = compound_head(page);
head = compound_head(page);
err = isolate_lru_page(head);
- if (!err) {
- list_add_tail(&head->lru, &pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
- }
-put_and_set:
- /*
- * Either remove the duplicate refcount from
- * isolate_lru_page() or drop the page ref if it was
- * not isolated.
- */
- put_page(page);
-set_status:
- pp->status = err;
- }
-
- err = 0;
- if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node, NULL,
- (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
if (err)
- putback_movable_pages(&pagelist);
- }
+ goto out_putpage;
+ err = 0;
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
+ }
+out_putpage:
+ /*
+ * Either remove the duplicate refcount from
+ * isolate_lru_page() or drop the page ref if it was
+ * not isolated.
+ */
+ put_page(page);
+out:
up_read(&mm->mmap_sem);
return err;
}
@@ -1593,79 +1566,82 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
const int __user *nodes,
int __user *status, int flags)
{
- struct page_to_node *pm;
- unsigned long chunk_nr_pages;
- unsigned long chunk_start;
- int err;
-
- err = -ENOMEM;
- pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
- if (!pm)
- goto out;
+ int current_node = NUMA_NO_NODE;
+ LIST_HEAD(pagelist);
+ int start, i;
+ int err = 0, err1;
migrate_prep();
- /*
- * Store a chunk of page_to_node array in a page,
- * but keep the last one as a marker
- */
- chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
+ for (i = start = 0; i < nr_pages; i++) {
+ const void __user *p;
+ unsigned long addr;
+ int node;
- for (chunk_start = 0;
- chunk_start < nr_pages;
- chunk_start += chunk_nr_pages) {
- int j;
-
- if (chunk_start + chunk_nr_pages > nr_pages)
- chunk_nr_pages = nr_pages - chunk_start;
-
- /* fill the chunk pm with addrs and nodes from user-space */
- for (j = 0; j < chunk_nr_pages; j++) {
- const void __user *p;
- int node;
-
- err = -EFAULT;
- if (get_user(p, pages + j + chunk_start))
- goto out_pm;
- pm[j].addr = (unsigned long) p;
-
- if (get_user(node, nodes + j + chunk_start))
- goto out_pm;
-
- err = -ENODEV;
- if (node < 0 || node >= MAX_NUMNODES)
- goto out_pm;
-
- if (!node_state(node, N_MEMORY))
- goto out_pm;
-
- err = -EACCES;
- if (!node_isset(node, task_nodes))
- goto out_pm;
+ err = -EFAULT;
+ if (get_user(p, pages + i))
+ goto out_flush;
+ if (get_user(node, nodes + i))
+ goto out_flush;
+ addr = (unsigned long)p;
+
+ err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_flush;
+ if (!node_state(node, N_MEMORY))
+ goto out_flush;
- pm[j].node = node;
+ err = -EACCES;
+ if (!node_isset(node, task_nodes))
+ goto out_flush;
+
+ if (current_node == NUMA_NO_NODE) {
+ current_node = node;
+ start = i;
+ } else if (node != current_node) {
+ err = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (err)
+ goto out;
+ err = store_status(status, start, current_node, i - start);
+ if (err)
+ goto out;
+ start = i;
+ current_node = node;
}
- /* End marker for this chunk */
- pm[chunk_nr_pages].node = MAX_NUMNODES;
-
- /* Migrate this chunk */
- err = do_move_page_to_node_array(mm, pm,
- flags & MPOL_MF_MOVE_ALL);
- if (err < 0)
- goto out_pm;
+ /*
+ * Errors in the page lookup or isolation are not fatal and we simply
+ * report them via status
+ */
+ err = add_page_for_migration(mm, addr, current_node,
+ &pagelist, flags & MPOL_MF_MOVE_ALL);
+ if (!err)
+ continue;
- /* Return status information */
- for (j = 0; j < chunk_nr_pages; j++)
- if (put_user(pm[j].status, status + j + chunk_start)) {
- err = -EFAULT;
- goto out_pm;
- }
- }
- err = 0;
+ err = store_status(status, i, err, 1);
+ if (err)
+ goto out_flush;
-out_pm:
- free_page((unsigned long)pm);
+ err = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (err)
+ goto out;
+ if (i > start) {
+ err = store_status(status, start, current_node, i - start);
+ if (err)
+ goto out;
+ }
+ current_node = NUMA_NO_NODE;
+ }
+out_flush:
+ if (list_empty(&pagelist))
+ return err;
+
+ /* Make sure we do not overwrite the existing error */
+ err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ if (!err1)
+ err1 = store_status(status, start, current_node, i - start);
+ if (!err)
+ err = err1;
out:
return err;
}
@@ -1866,8 +1842,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
}
static struct page *alloc_misplaced_dst_page(struct page *page,
- unsigned long data,
- int **result)
+ unsigned long data)
{
int nid = (int) data;
struct page *newpage;
@@ -1987,6 +1962,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
/*
+ * Also do not migrate dirty pages as not all filesystems can move
+ * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+ */
+ if (page_is_file_cache(page) && PageDirty(page))
+ goto out;
+
+ /*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
@@ -2339,7 +2321,8 @@ again:
ptep_get_and_clear(mm, addr, ptep);
/* Setup special migration page table entry */
- entry = make_migration_entry(page, pte_write(pte));
+ entry = make_migration_entry(page, mpfn &
+ MIGRATE_PFN_WRITE);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
diff --git a/mm/mmap.c b/mm/mmap.c
index f2154fc2548b..fc41c0543d7f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -100,11 +100,20 @@ pgprot_t protection_map[16] __ro_after_init = {
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
+#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return prot;
+}
+#endif
+
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- return __pgprot(pgprot_val(protection_map[vm_flags &
+ pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
pgprot_val(arch_vm_get_page_prot(vm_flags)));
+
+ return arch_filter_pgprot(ret);
}
EXPORT_SYMBOL(vm_get_page_prot);
@@ -1315,6 +1324,35 @@ static inline int mlock_future_check(struct mm_struct *mm,
return 0;
}
+static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ if (S_ISBLK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ /* Special "we do even unsigned file positions" case */
+ if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ return 0;
+
+ /* Yes, random drivers might want more. But I'm tired of buggy drivers */
+ return ULONG_MAX;
+}
+
+static inline bool file_mmap_ok(struct file *file, struct inode *inode,
+ unsigned long pgoff, unsigned long len)
+{
+ u64 maxsize = file_mmap_size_max(file, inode);
+
+ if (maxsize && len > maxsize)
+ return false;
+ maxsize -= len;
+ if (pgoff > maxsize >> PAGE_SHIFT)
+ return false;
+ return true;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1342,6 +1380,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
+ /* force arch specific MAP_FIXED handling in get_unmapped_area */
+ if (flags & MAP_FIXED_NOREPLACE)
+ flags |= MAP_FIXED;
+
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
@@ -1365,6 +1407,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
+ if (flags & MAP_FIXED_NOREPLACE) {
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (vma && vma->vm_start <= addr)
+ return -EEXIST;
+ }
+
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
@@ -1389,6 +1438,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
struct inode *inode = file_inode(file);
unsigned long flags_mask;
+ if (!file_mmap_ok(file, inode, pgoff, len))
+ return -EOVERFLOW;
+
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
@@ -3004,6 +3056,32 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Manually reap the mm to free as much memory as possible.
+ * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+ * this mm from further consideration. Taking mm->mmap_sem for
+ * write after setting MMF_OOM_SKIP will guarantee that the oom
+ * reaper will not run on this mm again after mmap_sem is
+ * dropped.
+ *
+ * Nothing can be holding mm->mmap_sem here and the above call
+ * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+ * __oom_reap_task_mm() will not block.
+ *
+ * This needs to be done before calling munlock_vma_pages_all(),
+ * which clears VM_LOCKED, otherwise the oom reaper cannot
+ * reliably test it.
+ */
+ mutex_lock(&oom_lock);
+ __oom_reap_task_mm(mm);
+ mutex_unlock(&oom_lock);
+
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3025,24 +3103,6 @@ void exit_mmap(struct mm_struct *mm)
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c1d6af7455da..625608bc8962 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,6 +27,7 @@
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
+#include <linux/mm_inline.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
@@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page_mapcount(page) != 1)
continue;
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (page_is_file_cache(page) && PageDirty(page))
+ continue;
+
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..8ba6cb88cf58 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
{
- struct mmu_gather tlb;
struct vm_area_struct *vma;
+
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over a reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (!can_madv_dontneed_vma(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+ }
+ }
+}
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
bool ret = true;
/*
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
- * __oom_reap_task_mm exit_mm
+ * oom_reap_task_mm exit_mm
* mmget_not_zero
* mmput
* atomic_dec_and_test
@@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
trace_start_task_reaping(tsk->pid);
- /*
- * Tell all users of get_user/copy_from_user etc... that the content
- * is no longer stable. No barriers really needed because unmapping
- * should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
-
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
- continue;
+ __oom_reap_task_mm(mm);
- /*
- * Only anonymous pages have a good chance to be dropped
- * without additional steps which we cannot afford as we
- * are OOM already.
- *
- * We do not even care about fs backed pages because all
- * which are reclaimable have already been reclaimed and
- * we do not want to block exit_mmap by keeping mm ref
- * count elevated without a good reason.
- */
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
-
- tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
- }
- }
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk)
struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 586f31261c83..337c6afb3345 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2099,7 +2099,8 @@ void __init page_writeback_init(void)
* so that it can tag pages faster than a dirtying process can create them).
*/
/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
+ * latency.
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping,
struct radix_tree_iter iter;
void **slot;
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
PAGECACHE_TAG_DIRTY) {
if (iter.index > end)
break;
- radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ radix_tree_iter_tag_set(&mapping->i_pages, &iter,
PAGECACHE_TAG_TOWRITE);
tagged++;
if ((tagged % WRITEBACK_TAG_BATCH) != 0)
continue;
slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
cond_resched();
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
@@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page)
return 1;
}
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree, page_index(page),
+ radix_tree_tag_set(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
if (mapping->host) {
@@ -2501,13 +2502,13 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2613,15 +2614,15 @@ void __cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
lock_page_memcg(page);
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, wb);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
unlock_page_memcg(page);
} else {
ClearPageDirty(page);
@@ -2653,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
/*
* Yes, Virginia, this is indeed insane.
@@ -2690,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
return TestClearPageDirty(page);
@@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page)
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page)
PAGECACHE_TAG_WRITEBACK))
sb_clear_inode_writeback(mapping->host);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestClearPageWriteback(page);
}
@@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
@@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_set(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
@@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
if (!keep_write)
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_TOWRITE);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestSetPageWriteback(page);
}
@@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback);
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- return radix_tree_tagged(&mapping->page_tree, tag);
+ return radix_tree_tagged(&mapping->i_pages, tag);
}
EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b97b8ece4a9..905db9d7962f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,7 +46,6 @@
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
-#include <xen/xen.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
@@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
- 256,
+ [ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
- 256,
+ [ZONE_DMA32] = 256,
#endif
+ [ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
- 32,
+ [ZONE_HIGHMEM] = 0,
#endif
- 32,
+ [ZONE_MOVABLE] = 0,
};
EXPORT_SYMBOL(totalram_pages);
@@ -316,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
/* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
- /* Xen PV domains need page structures early */
- if (xen_pv_domain())
- return true;
(*nr_initialised)++;
if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
@@ -1746,16 +1743,38 @@ void __init page_alloc_init_late(void)
}
#ifdef CONFIG_CMA
+static void __init adjust_present_page_count(struct page *page, long count)
+{
+ struct zone *zone = page_zone(page);
+
+ /* We don't need to hold a lock since it is boot-up process */
+ zone->present_pages += count;
+}
+
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
+ unsigned long pfn = page_to_pfn(page);
struct page *p = page;
+ int nid = page_to_nid(page);
+
+ /*
+ * ZONE_MOVABLE will steal present pages from other zones by
+ * changing page links so page_zone() is changed. Before that,
+ * we need to adjust previous zone's page count first.
+ */
+ adjust_present_page_count(page, -pageblock_nr_pages);
do {
__ClearPageReserved(p);
set_page_count(p, 0);
- } while (++p, --i);
+
+ /* Steal pages from other zones */
+ set_page_links(p, ZONE_MOVABLE, nid, pfn);
+ } while (++p, ++pfn, --i);
+
+ adjust_present_page_count(page, pageblock_nr_pages);
set_pageblock_migratetype(page, MIGRATE_CMA);
@@ -2870,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = min_wmark_pages(zone) + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3146,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
@@ -3178,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
#ifdef CONFIG_CMA
- if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ if (!list_empty(&area->free_list[MIGRATE_CMA]))
return true;
- }
#endif
if (alloc_harder &&
!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3201,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
- long cma_pages = 0;
-
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
/*
* Fast check for order-0 only. If this fails then the reserves
@@ -3216,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
return true;
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3852,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
-#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
return alloc_flags;
}
@@ -4322,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
- *alloc_flags |= ALLOC_CMA;
-
return true;
}
@@ -4734,6 +4731,13 @@ long si_mem_available(void)
min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
wmark_low);
+ /*
+ * Part of the kernel memory, which can be released under memory
+ * pressure.
+ */
+ available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+ PAGE_SHIFT;
+
if (available < 0)
available = 0;
return available;
@@ -6200,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
+ unsigned long node_end_pfn = 0;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6227,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long movable_size = 0;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
+ if (zone_end_pfn(zone) > node_end_pfn)
+ node_end_pfn = zone_end_pfn(zone);
+
/*
* Adjust freesize so that it accounts for how much memory
@@ -6278,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone_seqlock_init(zone);
zone_pcp_init(zone);
- if (!size)
+ /*
+ * The size of the CMA area is unknown now so we need to
+ * prepare the memory for the usemap at maximum.
+ */
+ if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
+ pgdat->node_spanned_pages) {
+ movable_size = node_end_pfn - pgdat->node_start_pfn;
+ }
+
+ if (!size && !movable_size)
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
+ if (movable_size) {
+ zone->zone_start_pfn = pgdat->node_start_pfn;
+ zone->spanned_pages = movable_size;
+ setup_usemap(pgdat, zone,
+ pgdat->node_start_pfn, movable_size);
+ init_currently_empty_zone(zone,
+ pgdat->node_start_pfn, movable_size);
+ } else {
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
+ }
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -7125,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void)
struct zone *lower_zone;
idx--;
-
- if (sysctl_lowmem_reserve_ratio[idx] < 1)
- sysctl_lowmem_reserve_ratio[idx] = 1;
-
lower_zone = pgdat->node_zones + idx;
- lower_zone->lowmem_reserve[j] = managed_pages /
- sysctl_lowmem_reserve_ratio[idx];
+
+ if (sysctl_lowmem_reserve_ratio[idx] < 1) {
+ sysctl_lowmem_reserve_ratio[idx] = 0;
+ lower_zone->lowmem_reserve[j] = 0;
+ } else {
+ lower_zone->lowmem_reserve[j] =
+ managed_pages / sysctl_lowmem_reserve_ratio[idx];
+ }
managed_pages += lower_zone->managed_pages;
}
}
@@ -7922,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 61dee77bb211..43e085608846 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -309,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-struct page *alloc_migrate_target(struct page *page, unsigned long private,
- int **resultp)
+struct page *alloc_migrate_target(struct page *page, unsigned long private)
{
return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 4d57b4644f98..539bbb6c1fad 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ page = radix_tree_lookup(&mapping->i_pages, page_offset);
rcu_read_unlock();
if (page && !radix_tree_exceptional_entry(page))
continue;
diff --git a/mm/rmap.c b/mm/rmap.c
index 9122787c4947..8d5337fed37b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,11 +32,11 @@
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- * mapping->tree_lock (widely used)
+ * i_pages lock (widely used)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
+ * i_pages lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
@@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (!pvmw.pte && (flags & TTU_MIGRATION)) {
VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
- if (!PageAnon(page))
- continue;
-
set_pmd_migration_entry(&pvmw, page);
continue;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 4424fc0c33aa..9d6c7e595415 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+ item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
if (!item)
return -ENOENT;
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->page_tree, node, pslot,
+ __radix_tree_replace(&mapping->i_pages, node, pslot,
replacement, NULL);
return 0;
}
@@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
void *item;
rcu_read_lock();
- item = radix_tree_lookup(&mapping->page_tree, index);
+ item = radix_tree_lookup(&mapping->i_pages, index);
rcu_read_unlock();
return item == swp_to_radix_entry(swap);
}
@@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (PageTransHuge(page)) {
void __rcu **results;
pgoff_t idx;
int i;
error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->page_tree,
+ if (radix_tree_gang_lookup_slot(&mapping->i_pages,
&results, &idx, index, 1) &&
idx < index + HPAGE_PMD_NR) {
error = -EEXIST;
@@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page,
if (!error) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->page_tree,
+ error = radix_tree_insert(&mapping->i_pages,
index + i, page + i);
VM_BUG_ON(error);
}
count_vm_event(THP_FILE_ALLOC);
}
} else if (!expected) {
- error = radix_tree_insert(&mapping->page_tree, index, page);
+ error = radix_tree_insert(&mapping->i_pages, index, page);
} else {
error = shmem_radix_tree_replace(mapping, index, expected,
page);
@@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page,
__inc_node_page_state(page, NR_SHMEM_THPS);
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
} else {
page->mapping = NULL;
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
page_ref_sub(page, nr);
}
return error;
@@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
VM_BUG_ON_PAGE(PageCompound(page), page);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
__dec_node_page_state(page, NR_FILE_PAGES);
__dec_node_page_state(page, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
}
@@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping,
{
void *old;
- spin_lock_irq(&mapping->tree_lock);
- old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
+ old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
+ xa_unlock_irq(&mapping->i_pages);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
if (iter.index >= end)
break;
@@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->i_pages, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
hindex = round_down(index, HPAGE_PMD_NR);
rcu_read_lock();
- if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
+ if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
rcu_read_unlock();
return NULL;
@@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
- spin_lock_irq(&swap_mapping->tree_lock);
+ xa_lock_irq(&swap_mapping->i_pages);
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
if (!error) {
__inc_node_page_state(newpage, NR_FILE_PAGES);
__dec_node_page_state(oldpage, NR_FILE_PAGES);
}
- spin_unlock_irq(&swap_mapping->tree_lock);
+ xa_unlock_irq(&swap_mapping->i_pages);
if (unlikely(error)) {
/*
@@ -2634,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
@@ -2642,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping)
continue;
}
} else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_set(&mapping->page_tree, iter.index,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages, iter.index,
SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
if (need_resched()) {
@@ -2677,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+ if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED))
break;
if (!scan)
@@ -2687,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
@@ -2713,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping)
error = -EBUSY;
}
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(&mapping->page_tree,
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_clear(&mapping->i_pages,
iter.index, SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
continue_resched:
if (need_resched()) {
slot = radix_tree_iter_resume(slot, &iter);
diff --git a/mm/slab.c b/mm/slab.c
index e3a9b8e23306..2f308253c3d7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4086,7 +4086,8 @@ next:
next_reap_node();
out:
/* Set up the next iteration */
- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
+ schedule_delayed_work_on(smp_processor_id(), work,
+ round_jiffies_relative(REAPTIMEOUT_AC));
}
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
diff --git a/mm/slub.c b/mm/slub.c
index 4fb037c98782..44aa7847324a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1363,10 +1363,8 @@ static __always_inline void kfree_hook(void *x)
kasan_kfree_large(x, _RET_IP_);
}
-static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
{
- void *freeptr;
-
kmemleak_free_recursive(x, s->flags);
/*
@@ -1386,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
- freeptr = get_freepointer(s, x);
- /*
- * kasan_slab_free() may put x into memory quarantine, delaying its
- * reuse. In this case the object's freelist pointer is changed.
- */
- kasan_slab_free(s, x, _RET_IP_);
- return freeptr;
+ /* KASAN might put x into memory quarantine, delaying its reuse */
+ return kasan_slab_free(s, x, _RET_IP_);
}
-static inline void slab_free_freelist_hook(struct kmem_cache *s,
- void *head, void *tail)
+static inline bool slab_free_freelist_hook(struct kmem_cache *s,
+ void **head, void **tail)
{
/*
* Compiler cannot detect this function can be removed if slab_free_hook()
@@ -1407,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
defined(CONFIG_DEBUG_OBJECTS_FREE) || \
defined(CONFIG_KASAN)
- void *object = head;
- void *tail_obj = tail ? : head;
- void *freeptr;
+ void *object;
+ void *next = *head;
+ void *old_tail = *tail ? *tail : *head;
+
+ /* Head and tail of the reconstructed freelist */
+ *head = NULL;
+ *tail = NULL;
do {
- freeptr = slab_free_hook(s, object);
- } while ((object != tail_obj) && (object = freeptr));
+ object = next;
+ next = get_freepointer(s, object);
+ /* If object's reuse doesn't have to be delayed */
+ if (!slab_free_hook(s, object)) {
+ /* Move object to the new freelist */
+ set_freepointer(s, object, *head);
+ *head = object;
+ if (!*tail)
+ *tail = object;
+ }
+ } while (object != old_tail);
+
+ if (*head == *tail)
+ *tail = NULL;
+
+ return *head != NULL;
+#else
+ return true;
#endif
}
@@ -2968,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
void *head, void *tail, int cnt,
unsigned long addr)
{
- slab_free_freelist_hook(s, head, tail);
/*
- * slab_free_freelist_hook() could have put the items into quarantine.
- * If so, no need to free them.
+ * With KASAN enabled slab_free_freelist_hook modifies the freelist
+ * to remove objects, whose reuse must be delayed.
*/
- if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
- return;
- do_slab_free(s, page, head, tail, cnt, addr);
+ if (slab_free_freelist_hook(s, &head, &tail))
+ do_slab_free(s, page, head, tail, cnt, addr);
}
#ifdef CONFIG_KASAN
diff --git a/mm/sparse.c b/mm/sparse.c
index 62eef264a7bd..73dc2fcc0eab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -629,7 +629,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(pfn);
struct mem_section *ms;
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f233dccd3b1b..07f9aa2340c3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
SetPageSwapCache(page);
address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
+ xa_lock_irq(&address_space->i_pages);
for (i = 0; i < nr; i++) {
set_page_private(page + i, entry.val + i);
- error = radix_tree_insert(&address_space->page_tree,
+ error = radix_tree_insert(&address_space->i_pages,
idx + i, page + i);
if (unlikely(error))
break;
@@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON(error == -EEXIST);
set_page_private(page + i, 0UL);
while (i--) {
- radix_tree_delete(&address_space->page_tree, idx + i);
+ radix_tree_delete(&address_space->i_pages, idx + i);
set_page_private(page + i, 0UL);
}
ClearPageSwapCache(page);
page_ref_sub(page, nr);
}
- spin_unlock_irq(&address_space->tree_lock);
+ xa_unlock_irq(&address_space->i_pages);
return error;
}
@@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page)
address_space = swap_address_space(entry);
idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
- radix_tree_delete(&address_space->page_tree, idx + i);
+ radix_tree_delete(&address_space->i_pages, idx + i);
set_page_private(page + i, 0);
}
ClearPageSwapCache(page);
@@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
+ xa_lock_irq(&address_space->i_pages);
__delete_from_swap_cache(page);
- spin_unlock_irq(&address_space->tree_lock);
+ xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
page_ref_sub(page, hpage_nr_pages(page));
@@ -628,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
return -ENOMEM;
for (i = 0; i < nr; i++) {
space = spaces + i;
- INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
+ INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
atomic_set(&space->i_mmap_writable, 0);
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
mapping_set_no_writeback_tags(space);
- spin_lock_init(&space->tree_lock);
}
nr_swapper_spaces[type] = nr;
rcu_assign_pointer(swapper_spaces[type], spaces);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7a33717d079..cc2cf04d9018 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-struct plist_head *swap_avail_heads;
+static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
maxpages = swp_offset(pte_to_swp_entry(
swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
last_page = swap_header->info.last_page;
+ if (!last_page) {
+ pr_warn("Empty swap-file\n");
+ return 0;
+ }
if (last_page > maxpages) {
pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
maxpages << (PAGE_SHIFT - 10),
diff --git a/mm/truncate.c b/mm/truncate.c
index c34e2fd4f583..1d2fb2dca96f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
struct radix_tree_node *node;
void **slot;
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+ if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot))
return;
if (*slot != entry)
return;
- __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
+ __radix_tree_replace(&mapping->i_pages, node, slot, NULL,
workingset_update_node);
mapping->nrexceptional--;
}
@@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
}
/*
@@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
dax = dax_mapping(mapping);
lock = !dax && indices[j] < end;
if (lock)
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
}
if (lock)
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
pvec->nr = j;
}
@@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping)
* modification that does not see AS_EXITING is
* completed before starting the final truncate.
*/
- spin_lock_irq(&mapping->tree_lock);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
truncate_inode_pages(mapping, 0);
}
@@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
put_page(page); /* pagecache ref */
return 1;
failed:
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
return 0;
}
diff --git a/mm/util.c b/mm/util.c
index 029fc2f3b395..45fc3169e7b0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
@@ -297,8 +297,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
- * If the architecture not support this function, simply return with no
- * page pinned
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
*/
int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
@@ -668,6 +670,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
+ * Part of the kernel memory, which can be released
+ * under memory pressure.
+ */
+ free += global_node_page_state(
+ NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+
+ /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4390a8d5be41..9b697323a88c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -116,6 +116,16 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
+
+ struct {
+ unsigned int dirty;
+ unsigned int unqueued_dirty;
+ unsigned int congested;
+ unsigned int writeback;
+ unsigned int immediate;
+ unsigned int file_taken;
+ unsigned int taken;
+ } nr;
};
#ifdef ARCH_HAS_PREFETCH
@@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc)
#endif
return false;
}
+
+static void set_memcg_congestion(pg_data_t *pgdat,
+ struct mem_cgroup *memcg,
+ bool congested)
+{
+ struct mem_cgroup_per_node *mn;
+
+ if (!memcg)
+ return;
+
+ mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ WRITE_ONCE(mn->congested, congested);
+}
+
+static bool memcg_congested(pg_data_t *pgdat,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *mn;
+
+ mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ return READ_ONCE(mn->congested);
+
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
@@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc)
{
return true;
}
+
+static inline void set_memcg_congestion(struct pglist_data *pgdat,
+ struct mem_cgroup *memcg, bool congested)
+{
+}
+
+static inline bool memcg_congested(struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
+{
+ return false;
+
+}
#endif
/*
@@ -258,7 +303,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
/*
* Add a shrinker callback to be called from the vm.
*/
-int register_shrinker(struct shrinker *shrinker)
+int prealloc_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
@@ -268,10 +313,29 @@ int register_shrinker(struct shrinker *shrinker)
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+ return 0;
+}
+
+void free_prealloced_shrinker(struct shrinker *shrinker)
+{
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+}
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
+}
+
+int register_shrinker(struct shrinker *shrinker)
+{
+ int err = prealloc_shrinker(shrinker);
+
+ if (err)
+ return err;
+ register_shrinker_prepared(shrinker);
return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -648,7 +712,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
/*
* The non racy check for a busy page.
*
@@ -672,7 +736,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
- * and thus under tree_lock, then this ordering is not required.
+ * and thus under the i_pages lock, then this ordering is not required.
*/
if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
refcount = 1 + HPAGE_PMD_NR;
@@ -690,7 +754,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
@@ -711,13 +775,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* only page cache pages found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
- * same page_tree.
+ * same address_space.
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
if (freepage != NULL)
freepage(page);
@@ -726,7 +790,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
return 0;
}
@@ -857,17 +921,6 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
-struct reclaim_stat {
- unsigned nr_dirty;
- unsigned nr_unqueued_dirty;
- unsigned nr_congested;
- unsigned nr_writeback;
- unsigned nr_immediate;
- unsigned nr_activate;
- unsigned nr_ref_keep;
- unsigned nr_unmap_fail;
-};
-
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -926,7 +979,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
- * The number of dirty pages determines if a zone is marked
+ * The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
@@ -1755,23 +1808,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
free_unref_page_list(&page_list);
/*
- * If reclaim is isolating dirty pages under writeback, it implies
- * that the long-lived page allocation rate is exceeding the page
- * laundering rate. Either the global limits are not being effective
- * at throttling processes due to the page distribution throughout
- * zones or there is heavy usage of a slow backing device. The
- * only option is to throttle from reclaim context which is not ideal
- * as there is no guarantee the dirtying process is throttled in the
- * same way balance_dirty_pages() manages.
- *
- * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
- * of pages under pages flagged for immediate reclaim and stall if any
- * are encountered in the nr_immediate check below.
- */
- if (stat.nr_writeback && stat.nr_writeback == nr_taken)
- set_bit(PGDAT_WRITEBACK, &pgdat->flags);
-
- /*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
@@ -1785,48 +1821,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (stat.nr_unqueued_dirty == nr_taken)
wakeup_flusher_threads(WB_REASON_VMSCAN);
- /*
- * Legacy memcg will stall in page writeback so avoid forcibly
- * stalling here.
- */
- if (sane_reclaim(sc)) {
- /*
- * Tag a zone as congested if all the dirty pages scanned were
- * backed by a congested BDI and wait_iff_congested will stall.
- */
- if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
- set_bit(PGDAT_CONGESTED, &pgdat->flags);
-
- /* Allow kswapd to start writing pages during reclaim. */
- if (stat.nr_unqueued_dirty == nr_taken)
- set_bit(PGDAT_DIRTY, &pgdat->flags);
-
- /*
- * If kswapd scans pages marked marked for immediate
- * reclaim and under writeback (nr_immediate), it implies
- * that pages are cycling through the LRU faster than
- * they are written so also forcibly stall.
- */
- if (stat.nr_immediate && current_may_throttle())
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
- /*
- * Stall direct reclaim for IO completions if underlying BDIs or zone
- * is congested. Allow kswapd to continue until it starts encountering
- * unqueued dirty pages or cycling through the LRU too quickly.
- */
- if (!sc->hibernation_mode && !current_is_kswapd() &&
- current_may_throttle())
- wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ sc->nr.taken += nr_taken;
+ if (file)
+ sc->nr.file_taken += nr_taken;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
- nr_scanned, nr_reclaimed,
- stat.nr_dirty, stat.nr_writeback,
- stat.nr_congested, stat.nr_immediate,
- stat.nr_activate, stat.nr_ref_keep,
- stat.nr_unmap_fail,
- sc->priority, file);
+ nr_scanned, nr_reclaimed, &stat, sc->priority, file);
return nr_reclaimed;
}
@@ -2507,6 +2512,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return true;
}
+static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+{
+ return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
+ (memcg && memcg_congested(pgdat, memcg));
+}
+
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2522,6 +2533,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
+ memset(&sc->nr, 0, sizeof(sc->nr));
+
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
@@ -2536,7 +2549,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->memcg_low_skipped = 1;
continue;
}
- mem_cgroup_event(memcg, MEMCG_LOW);
+ memcg_memory_event(memcg, MEMCG_LOW);
}
reclaimed = sc->nr_reclaimed;
@@ -2587,6 +2600,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
+ if (current_is_kswapd()) {
+ /*
+ * If reclaim is isolating dirty pages under writeback,
+ * it implies that the long-lived page allocation rate
+ * is exceeding the page laundering rate. Either the
+ * global limits are not being effective at throttling
+ * processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing
+ * device. The only option is to throttle from reclaim
+ * context which is not ideal as there is no guarantee
+ * the dirtying process is throttled in the same way
+ * balance_dirty_pages() manages.
+ *
+ * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+ * count the number of pages under pages flagged for
+ * immediate reclaim and stall if any are encountered
+ * in the nr_immediate check below.
+ */
+ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+ /*
+ * Tag a node as congested if all the dirty pages
+ * scanned were backed by a congested BDI and
+ * wait_iff_congested will stall.
+ */
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+ /* Allow kswapd to start writing pages during reclaim.*/
+ if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+ /*
+ * If kswapd scans pages marked marked for immediate
+ * reclaim and under writeback (nr_immediate), it
+ * implies that pages are cycling through the LRU
+ * faster than they are written so also forcibly stall.
+ */
+ if (sc->nr.immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ /*
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling in wait_iff_congested().
+ */
+ if (!global_reclaim(sc) && sane_reclaim(sc) &&
+ sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_memcg_congestion(pgdat, root, true);
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs
+ * and node is congested. Allow kswapd to continue until it
+ * starts encountering unqueued dirty pages or cycling through
+ * the LRU too quickly.
+ */
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+ wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
@@ -2802,6 +2876,7 @@ retry:
continue;
last_pgdat = zone->zone_pgdat;
snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+ set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
}
delayacct_freepages_end();
@@ -3808,7 +3883,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
- * Free memory by calling shrink zone with increasing
+ * Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
*/
do {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 33581be705f0..a2b9518980ce 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "", /* nr_indirectly_reclaimable */
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1739,6 +1740,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ /* Skip hidden vmstat items. */
+ if (*vmstat_text[off] == '\0')
+ return 0;
+
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index b7d616a3bbbe..40ee02c83978 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
* @mapping: address space the page was backing
* @page: the page being evicted
*
- * Returns a shadow entry to be stored in @mapping->page_tree in place
+ * Returns a shadow entry to be stored in @mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
@@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node)
*
* Avoid acquiring the list_lru lock when the nodes are
* already where they should be. The list_empty() test is safe
- * as node->private_list is protected by &mapping->tree_lock.
+ * as node->private_list is protected by the i_pages lock.
*/
if (node->count && node->count == node->exceptional) {
if (list_empty(&node->private_list))
@@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long nodes;
unsigned long cache;
- /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ /* list_lru lock nests inside the IRQ-safe i_pages lock */
local_irq_disable();
nodes = list_lru_shrink_count(&shadow_nodes, sc);
local_irq_enable();
@@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchroneously maintain
- * the shadow node LRU under the mapping->tree_lock and the
+ * the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
* address_space that has radix tree nodes on the LRU.
*
- * We can then safely transition to the mapping->tree_lock to
+ * We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
node = container_of(item, struct radix_tree_node, private_list);
- mapping = container_of(node->root, struct address_space, page_tree);
+ mapping = container_of(node->root, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
- if (!spin_trylock(&mapping->tree_lock)) {
+ if (!xa_trylock(&mapping->i_pages)) {
spin_unlock(lru_lock);
ret = LRU_RETRY;
goto out;
@@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->page_tree, node,
+ __radix_tree_delete_node(&mapping->i_pages, node,
workingset_lookup_update(mapping));
out_invalid:
- spin_unlock(&mapping->tree_lock);
+ xa_unlock(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
local_irq_enable();
@@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
{
unsigned long ret;
- /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ /* list_lru lock nests inside the IRQ-safe i_pages lock */
local_irq_disable();
ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
local_irq_enable();
@@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = {
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
- * mapping->tree_lock.
+ * i_pages lock.
*/
static struct lock_class_key shadow_nodes_key;
diff --git a/mm/z3fold.c b/mm/z3fold.c
index f579ad4a8100..4b366d181f35 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -144,7 +144,8 @@ enum z3fold_page_flags {
PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
NEEDS_COMPACTING,
- PAGE_STALE
+ PAGE_STALE,
+ UNDER_RECLAIM
};
/*****************
@@ -173,6 +174,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
+ clear_bit(UNDER_RECLAIM, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -467,6 +469,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ if (!pool->unbuddied)
+ goto out_pool;
for_each_possible_cpu(cpu) {
struct list_head *unbuddied =
per_cpu_ptr(pool->unbuddied, cpu);
@@ -479,7 +483,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->name = name;
pool->compact_wq = create_singlethread_workqueue(pool->name);
if (!pool->compact_wq)
- goto out;
+ goto out_unbuddied;
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
@@ -489,8 +493,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
out_wq:
destroy_workqueue(pool->compact_wq);
-out:
+out_unbuddied:
+ free_percpu(pool->unbuddied);
+out_pool:
kfree(pool);
+out:
return NULL;
}
@@ -533,7 +540,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
enum buddy bud;
- bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
+ bool can_sleep = gfpflags_allow_blocking(gfp);
if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
@@ -751,6 +758,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
+ if (test_bit(UNDER_RECLAIM, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
@@ -835,6 +846,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
+ set_bit(UNDER_RECLAIM, &page->private);
+ break;
}
list_del_init(&page->lru);
@@ -882,25 +895,35 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- spin_unlock(&pool->lock);
free_z3fold_page(page);
return 0;
}
- } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
- atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
- return 0;
+ } else {
+ z3fold_page_lock(zhdr);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return 0;
+ }
+ /*
+ * if we are here, the page is still not completely
+ * free. Take the global pool lock then to be able
+ * to add it back to the lru list
+ */
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
}
- /*
- * Add to the beginning of LRU.
- * Pool lock has to be kept here to ensure the page has
- * not already been released
- */
- list_add(&page->lru, &pool->lru);
+ /* We started off locked to we need to lock the pool back */
+ spin_lock(&pool->lock);
}
spin_unlock(&pool->lock);
return -EAGAIN;