aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/backing-dev.c21
-rw-r--r--mm/cleancache.c2
-rw-r--r--mm/cma.c83
-rw-r--r--mm/compaction.c4
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/gup.c3
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/internal.h6
-rw-r--r--mm/kasan/kasan.c66
-rw-r--r--mm/ksm.c23
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempool.c108
-rw-r--r--mm/migrate.c4
-rw-r--r--mm/mmap.c78
-rw-r--r--mm/oom_kill.c81
-rw-r--r--mm/page_alloc.c99
-rw-r--r--mm/readahead.c39
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c21
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c62
-rw-r--r--mm/z3fold.c42
29 files changed, 439 insertions, 350 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..3e0b6e87f65d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -266,7 +266,7 @@ config ARCH_ENABLE_THP_MIGRATION
bool
config PHYS_ADDR_T_64BIT
- def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+ def_bool 64BIT
config BOUNCE
bool "Enable bounce buffers"
@@ -305,7 +305,7 @@ config KSM
the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.txt for more information: KSM is inactive
+ See Documentation/vm/ksm.rst for more information: KSM is inactive
until a program has madvised that an area is MADV_MERGEABLE, and
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
@@ -530,7 +530,7 @@ config MEM_SOFT_DIRTY
into a page just as regular dirty bit, but unlike the latter
it can be cleared by hands.
- See Documentation/vm/soft-dirty.txt for more details.
+ See Documentation/admin-guide/mm/soft-dirty.rst for more details.
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
@@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT
default n
depends on NO_BOOTMEM
depends on !FLATMEM
+ depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
@@ -656,7 +657,8 @@ config IDLE_PAGE_TRACKING
be useful to tune memory cgroup limits and/or for job placement
within a compute cluster.
- See Documentation/vm/idle_page_tracking.txt for more details.
+ See Documentation/admin-guide/mm/idle_page_tracking.rst for
+ more details.
# arch_add_memory() comprehends device memory
config ARCH_HAS_ZONE_DEVICE
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 023190c69dce..8fe3ebd6ac00 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -115,6 +115,7 @@ static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
bdi, &bdi_debug_stats_fops);
if (!bdi->debug_stats) {
debugfs_remove(bdi->debug_dir);
+ bdi->debug_dir = NULL;
return -ENOMEM;
}
@@ -383,7 +384,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
* the barrier provided by test_and_clear_bit() above.
*/
smp_wmb();
- clear_bit(WB_shutting_down, &wb->state);
+ clear_and_wake_up_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -411,6 +412,7 @@ static void wb_exit(struct bdi_writeback *wb)
* protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
+static struct workqueue_struct *cgwb_release_wq;
/**
* wb_congested_get_create - get or create a wb_congested
@@ -521,7 +523,7 @@ static void cgwb_release(struct percpu_ref *refcnt)
{
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
refcnt);
- schedule_work(&wb->release_work);
+ queue_work(cgwb_release_wq, &wb->release_work);
}
static void cgwb_kill(struct bdi_writeback *wb)
@@ -783,6 +785,21 @@ static void cgwb_bdi_register(struct backing_dev_info *bdi)
spin_unlock_irq(&cgwb_lock);
}
+static int __init cgwb_init(void)
+{
+ /*
+ * There can be many concurrent release work items overwhelming
+ * system_wq. Put them in a separate wq and limit concurrency.
+ * There's no point in executing many of these in parallel.
+ */
+ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
+ if (!cgwb_release_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+subsys_initcall(cgwb_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
diff --git a/mm/cleancache.c b/mm/cleancache.c
index f7b9fdc79d97..126548b5a292 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -3,7 +3,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of cleancache. See
- * Documentation/vm/cleancache.txt for more information.
+ * Documentation/vm/cleancache.rst for more information.
*
* Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
diff --git a/mm/cma.c b/mm/cma.c
index aa40e6c7b042..5809bbe360d7 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -39,7 +39,6 @@
#include <trace/events/cma.h>
#include "cma.h"
-#include "internal.h"
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
@@ -110,25 +109,23 @@ static int __init cma_activate_area(struct cma *cma)
if (!cma->bitmap)
return -ENOMEM;
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
do {
unsigned j;
base_pfn = pfn;
- if (!pfn_valid(base_pfn))
- goto err;
-
- zone = page_zone(pfn_to_page(base_pfn));
for (j = pageblock_nr_pages; j; --j, pfn++) {
- if (!pfn_valid(pfn))
- goto err;
-
+ WARN_ON_ONCE(!pfn_valid(pfn));
/*
- * In init_cma_reserved_pageblock(), present_pages
- * is adjusted with assumption that all pages in
- * the pageblock come from a single zone.
+ * alloc_contig_range requires the pfn range
+ * specified to be in the same zone. Make this
+ * simple by forcing the entire CMA resv range
+ * to be in the same zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto err;
+ goto not_in_zone;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -142,7 +139,7 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-err:
+not_in_zone:
pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
@@ -152,41 +149,6 @@ err:
static int __init cma_init_reserved_areas(void)
{
int i;
- struct zone *zone;
- pg_data_t *pgdat;
-
- if (!cma_area_count)
- return 0;
-
- for_each_online_pgdat(pgdat) {
- unsigned long start_pfn = UINT_MAX, end_pfn = 0;
-
- zone = &pgdat->node_zones[ZONE_MOVABLE];
-
- /*
- * In this case, we cannot adjust the zone range
- * since it is now maximum node span and we don't
- * know original zone range.
- */
- if (populated_zone(zone))
- continue;
-
- for (i = 0; i < cma_area_count; i++) {
- if (pfn_to_nid(cma_areas[i].base_pfn) !=
- pgdat->node_id)
- continue;
-
- start_pfn = min(start_pfn, cma_areas[i].base_pfn);
- end_pfn = max(end_pfn, cma_areas[i].base_pfn +
- cma_areas[i].count);
- }
-
- if (!end_pfn)
- continue;
-
- zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = end_pfn - start_pfn;
- }
for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]);
@@ -195,32 +157,9 @@ static int __init cma_init_reserved_areas(void)
return ret;
}
- /*
- * Reserved pages for ZONE_MOVABLE are now activated and
- * this would change ZONE_MOVABLE's managed page counter and
- * the other zones' present counter. We need to re-calculate
- * various zone information that depends on this initialization.
- */
- build_all_zonelists(NULL);
- for_each_populated_zone(zone) {
- if (zone_idx(zone) == ZONE_MOVABLE) {
- zone_pcp_reset(zone);
- setup_zone_pageset(zone);
- } else
- zone_pcp_update(zone);
-
- set_zone_contiguous(zone);
- }
-
- /*
- * We need to re-init per zone wmark by calling
- * init_per_zone_wmark_min() but doesn't call here because it is
- * registered on core_initcall and it will be called later than us.
- */
-
return 0;
}
-pure_initcall(cma_init_reserved_areas);
+core_initcall(cma_init_reserved_areas);
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
diff --git a/mm/compaction.c b/mm/compaction.c
index 028b7210a669..29bd1df18b98 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1450,12 +1450,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
+ * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+ * suitable migration targets
*/
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- 0, wmark_target))
+ ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fec8b5044040..4f5476a0f955 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -3,7 +3,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of frontswap. See
- * Documentation/vm/frontswap.txt for more information.
+ * Documentation/vm/frontswap.rst for more information.
*
* Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
diff --git a/mm/gup.c b/mm/gup.c
index 76af4cfeaf68..541904a7c60f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -544,6 +544,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
+ if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
diff --git a/mm/hmm.c b/mm/hmm.c
index 486dc394a5a3..e63e353830e8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,7 +37,7 @@
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
/*
- * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
+ * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
*/
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3a1815f8e11..ac5591d8622c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1185,7 +1185,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
* mmu_notifier_invalidate_range_end() happens which can lead to a
* device seeing memory write in different order than CPU.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
@@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
* replacing a zero pmd write protected page with a zero pte write
* protected page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
pmdp_huge_clear_flush(vma, haddr, pmd);
@@ -2431,7 +2431,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
- __ClearPageDirty(head + i);
+ ClearPageDirty(head + i);
__delete_from_page_cache(head + i, NULL);
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 218679138255..129088710510 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3291,7 +3291,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
@@ -4357,7 +4357,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..9e3654d70289 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-extern int __do_page_cache_readahead(struct address_space *mapping,
+extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size);
@@ -168,9 +168,6 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -498,6 +495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index bc0e68f7dc75..f185455b3406 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -792,6 +792,40 @@ DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one is
+ * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
+ * pud_bad(), if pud is bad then it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -813,6 +847,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
case MEM_GOING_ONLINE: {
void *ret;
+ /*
+ * If shadow is mapped already than it must have been mapped
+ * during the boot. This could happen if we onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
shadow_end, GFP_KERNEL,
PAGE_KERNEL, VM_NO_GUARD,
@@ -824,8 +866,26 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
kmemleak_ignore(ret);
return NOTIFY_OK;
}
- case MEM_OFFLINE:
- vfree((void *)shadow_start);
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
}
return NOTIFY_OK;
@@ -838,5 +898,5 @@ static int __init kasan_memhotplug_init(void)
return 0;
}
-module_init(kasan_memhotplug_init);
+core_initcall(kasan_memhotplug_init);
#endif
diff --git a/mm/ksm.c b/mm/ksm.c
index e3cbf9a92f3c..7d6558f3bac9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -51,7 +51,9 @@
#define DO_NUMA(x) do { } while (0)
#endif
-/*
+/**
+ * DOC: Overview
+ *
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
*
@@ -67,6 +69,21 @@
* this tree is fully assured to be working (except when pages are unmapped),
* and therefore this tree is called the stable tree.
*
+ * The stable tree node includes information required for reverse
+ * mapping from a KSM page to virtual addresses that map this page.
+ *
+ * In order to avoid large latencies of the rmap walks on KSM pages,
+ * KSM maintains two types of nodes in the stable tree:
+ *
+ * * the regular nodes that keep the reverse mapping structures in a
+ * linked list
+ * * the "chains" that link nodes ("dups") that represent the same
+ * write protected memory content, but each "dup" corresponds to a
+ * different KSM page copy of that content
+ *
+ * Internally, the regular nodes, "dups" and "chains" are represented
+ * using the same :c:type:`struct stable_node` structure.
+ *
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
* be "unchanged for a period of time". The unstable tree sorts these pages
@@ -1049,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are downgrading page table to read
* only not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
@@ -1145,7 +1162,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3d101a..1695f38630f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
if (ret)
goto out_put_css;
- efile.file->f_op->poll(efile.file, &event->pt);
+ vfs_poll(efile.file, &event->pt);
spin_lock(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f74826cdceea..25982467800b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1158,7 +1158,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
* nodes have to go through register_node.
* TODO clean up this mess.
*/
- ret = link_mem_sections(nid, start_pfn, nr_pages);
+ ret = link_mem_sections(nid, start_pfn, nr_pages, false);
register_fail:
/*
* If sysfs file of new node can't create, cpu on the node
diff --git a/mm/mempool.c b/mm/mempool.c
index 5c9dce34719b..b54f2c20e5e0 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -138,6 +138,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
}
/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool: pointer to the memory pool which was initialized with
+ * mempool_init().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+ while (pool->curr_nr) {
+ void *element = remove_element(pool, GFP_KERNEL);
+ pool->free(element, pool->pool_data);
+ }
+ kfree(pool->elements);
+ pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
* mempool_destroy - deallocate a memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
@@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool)
if (unlikely(!pool))
return;
- while (pool->curr_nr) {
- void *element = remove_element(pool, GFP_KERNEL);
- pool->free(element, pool->pool_data);
- }
- kfree(pool->elements);
+ mempool_exit(pool);
kfree(pool);
}
EXPORT_SYMBOL(mempool_destroy);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+ init_waitqueue_head(&pool->wait);
+
+ pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ gfp_mask, node_id);
+ if (!pool->elements)
+ return -ENOMEM;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ void *element;
+
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (unlikely(!element)) {
+ mempool_exit(pool);
+ return -ENOMEM;
+ }
+ add_element(pool, element);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes the pool in (i.e. embedded in another
+ * structure).
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+ pool_data, GFP_KERNEL, NUMA_NO_NODE);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
/**
* mempool_create - create a memory pool
* @min_nr: the minimum number of elements guaranteed to be
@@ -186,35 +258,17 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
+
pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
if (!pool)
return NULL;
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
- gfp_mask, node_id);
- if (!pool->elements) {
+
+ if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+ gfp_mask, node_id)) {
kfree(pool);
return NULL;
}
- spin_lock_init(&pool->lock);
- pool->min_nr = min_nr;
- pool->pool_data = pool_data;
- init_waitqueue_head(&pool->wait);
- pool->alloc = alloc_fn;
- pool->free = free_fn;
- /*
- * First pre-allocate the guaranteed number of buffers.
- */
- while (pool->curr_nr < pool->min_nr) {
- void *element;
-
- element = pool->alloc(gfp_mask, pool->pool_data);
- if (unlikely(!element)) {
- mempool_destroy(pool);
- return NULL;
- }
- add_element(pool, element);
- }
return pool;
}
EXPORT_SYMBOL(mempool_create_node);
diff --git a/mm/migrate.c b/mm/migrate.c
index 568433023831..8c0af0f7cab1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -528,14 +528,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
int i;
int index = page_index(page);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
pslot = radix_tree_lookup_slot(&mapping->i_pages,
index + i);
radix_tree_replace_slot(&mapping->i_pages, pslot,
newpage + i);
}
- } else {
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
}
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d5968d1e8e3..d817764a9974 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1324,6 +1324,35 @@ static inline int mlock_future_check(struct mm_struct *mm,
return 0;
}
+static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ if (S_ISBLK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ /* Special "we do even unsigned file positions" case */
+ if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ return 0;
+
+ /* Yes, random drivers might want more. But I'm tired of buggy drivers */
+ return ULONG_MAX;
+}
+
+static inline bool file_mmap_ok(struct file *file, struct inode *inode,
+ unsigned long pgoff, unsigned long len)
+{
+ u64 maxsize = file_mmap_size_max(file, inode);
+
+ if (maxsize && len > maxsize)
+ return false;
+ maxsize -= len;
+ if (pgoff > maxsize >> PAGE_SHIFT)
+ return false;
+ return true;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1409,6 +1438,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
struct inode *inode = file_inode(file);
unsigned long flags_mask;
+ if (!file_mmap_ok(file, inode, pgoff, len))
+ return -EOVERFLOW;
+
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
@@ -2796,7 +2828,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
current->comm, current->pid);
if (prot)
@@ -3024,6 +3056,32 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Manually reap the mm to free as much memory as possible.
+ * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+ * this mm from further consideration. Taking mm->mmap_sem for
+ * write after setting MMF_OOM_SKIP will guarantee that the oom
+ * reaper will not run on this mm again after mmap_sem is
+ * dropped.
+ *
+ * Nothing can be holding mm->mmap_sem here and the above call
+ * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+ * __oom_reap_task_mm() will not block.
+ *
+ * This needs to be done before calling munlock_vma_pages_all(),
+ * which clears VM_LOCKED, otherwise the oom reaper cannot
+ * reliably test it.
+ */
+ mutex_lock(&oom_lock);
+ __oom_reap_task_mm(mm);
+ mutex_unlock(&oom_lock);
+
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3045,24 +3103,6 @@ void exit_mmap(struct mm_struct *mm)
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..8ba6cb88cf58 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
{
- struct mmu_gather tlb;
struct vm_area_struct *vma;
+
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over a reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (!can_madv_dontneed_vma(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+ }
+ }
+}
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
bool ret = true;
/*
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
- * __oom_reap_task_mm exit_mm
+ * oom_reap_task_mm exit_mm
* mmget_not_zero
* mmput
* atomic_dec_and_test
@@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
trace_start_task_reaping(tsk->pid);
- /*
- * Tell all users of get_user/copy_from_user etc... that the content
- * is no longer stable. No barriers really needed because unmapping
- * should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
-
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
- continue;
+ __oom_reap_task_mm(mm);
- /*
- * Only anonymous pages have a good chance to be dropped
- * without additional steps which we cannot afford as we
- * are OOM already.
- *
- * We do not even care about fs backed pages because all
- * which are reclaimable have already been reclaimed and
- * we do not want to block exit_mmap by keeping mm ref
- * count elevated without a good reason.
- */
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
-
- tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
- }
- }
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk)
struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 905db9d7962f..22320ea27489 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1743,38 +1743,16 @@ void __init page_alloc_init_late(void)
}
#ifdef CONFIG_CMA
-static void __init adjust_present_page_count(struct page *page, long count)
-{
- struct zone *zone = page_zone(page);
-
- /* We don't need to hold a lock since it is boot-up process */
- zone->present_pages += count;
-}
-
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
- unsigned long pfn = page_to_pfn(page);
struct page *p = page;
- int nid = page_to_nid(page);
-
- /*
- * ZONE_MOVABLE will steal present pages from other zones by
- * changing page links so page_zone() is changed. Before that,
- * we need to adjust previous zone's page count first.
- */
- adjust_present_page_count(page, -pageblock_nr_pages);
do {
__ClearPageReserved(p);
set_page_count(p, 0);
-
- /* Steal pages from other zones */
- set_page_links(p, ZONE_MOVABLE, nid, pfn);
- } while (++p, ++pfn, --i);
-
- adjust_present_page_count(page, pageblock_nr_pages);
+ } while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
@@ -2889,7 +2867,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* exists.
*/
watermark = min_wmark_pages(zone) + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -3165,6 +3143,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
@@ -3191,8 +3175,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
#ifdef CONFIG_CMA
- if (!list_empty(&area->free_list[MIGRATE_CMA]))
+ if ((alloc_flags & ALLOC_CMA) &&
+ !list_empty(&area->free_list[MIGRATE_CMA])) {
return true;
+ }
#endif
if (alloc_harder &&
!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
@@ -3212,6 +3198,13 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
/*
* Fast check for order-0 only. If this fails then the reserves
@@ -3220,7 +3213,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
return true;
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
@@ -3856,6 +3849,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+#ifdef CONFIG_CMA
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
return alloc_flags;
}
@@ -4322,6 +4319,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
+ if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+ *alloc_flags |= ALLOC_CMA;
+
return true;
}
@@ -6204,7 +6204,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long node_end_pfn = 0;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6232,13 +6231,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long movable_size = 0;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
- if (zone_end_pfn(zone) > node_end_pfn)
- node_end_pfn = zone_end_pfn(zone);
-
/*
* Adjust freesize so that it accounts for how much memory
@@ -6287,30 +6282,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone_seqlock_init(zone);
zone_pcp_init(zone);
- /*
- * The size of the CMA area is unknown now so we need to
- * prepare the memory for the usemap at maximum.
- */
- if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE &&
- pgdat->node_spanned_pages) {
- movable_size = node_end_pfn - pgdat->node_start_pfn;
- }
-
- if (!size && !movable_size)
+ if (!size)
continue;
set_pageblock_order();
- if (movable_size) {
- zone->zone_start_pfn = pgdat->node_start_pfn;
- zone->spanned_pages = movable_size;
- setup_usemap(pgdat, zone,
- pgdat->node_start_pfn, movable_size);
- init_currently_empty_zone(zone,
- pgdat->node_start_pfn, movable_size);
- } else {
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
- }
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -7621,11 +7598,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned long pfn, iter, found;
/*
- * For avoiding noise data, lru_add_drain_all() should be called
- * If ZONE_MOVABLE, the zone never contains unmovable pages
+ * TODO we could make this much more efficient by not checking every
+ * page in the range if we know all of them are in MOVABLE_ZONE and
+ * that the movable zone guarantees that pages are migratable but
+ * the later is not the case right now unfortunatelly. E.g. movablecore
+ * can still lead to having bootmem allocations in zone_movable.
*/
- if (zone_idx(zone) == ZONE_MOVABLE)
- return false;
/*
* CMA allocations (alloc_contig_range) really need to mark isolate
@@ -7646,7 +7624,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
page = pfn_to_page(check);
if (PageReserved(page))
- return true;
+ goto unmovable;
/*
* Hugepages are not in LRU lists, but they're movable.
@@ -7696,9 +7674,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* page at boot.
*/
if (found > count)
- return true;
+ goto unmovable;
}
return false;
+unmovable:
+ WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+ return true;
}
bool is_pageblock_removable_nolock(struct page *page)
@@ -7951,7 +7932,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
-#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
diff --git a/mm/readahead.c b/mm/readahead.c
index 539bbb6c1fad..e273f0de3376 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -140,23 +140,23 @@ out:
}
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
- * the pages first, then submits them all for I/O. This avoids the very bad
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read,
- unsigned long lookahead_size)
+unsigned int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
- int ret = 0;
+ unsigned int nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
@@ -177,8 +177,18 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->i_pages, page_offset);
rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page))
+ if (page && !radix_tree_exceptional_entry(page)) {
+ /*
+ * Page already present? Kick off the current batch of
+ * contiguous pages before continuing with the next
+ * batch.
+ */
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool, nr_pages,
+ gfp_mask);
+ nr_pages = 0;
continue;
+ }
page = __page_cache_alloc(gfp_mask);
if (!page)
@@ -187,7 +197,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
- ret++;
+ nr_pages++;
}
/*
@@ -195,11 +205,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- if (ret)
- read_pages(mapping, filp, &page_pool, ret, gfp_mask);
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
BUG_ON(!list_empty(&page_pool));
out:
- return ret;
+ return nr_pages;
}
/*
@@ -223,16 +233,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
- int err;
-
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- err = __do_page_cache_readahead(mapping, filp,
- offset, this_chunk, 0);
- if (err < 0)
- return err;
+ __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
offset += this_chunk;
nr_to_read -= this_chunk;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8d5337fed37b..6db729dc4c50 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
if (ret)
(*cleaned)++;
@@ -1599,7 +1599,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* point at new page while a device still is using this
* page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
dec_mm_counter(mm, mm_counter_file(page));
}
@@ -1609,7 +1609,7 @@ discard:
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
diff --git a/mm/sparse.c b/mm/sparse.c
index 62eef264a7bd..73dc2fcc0eab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -629,7 +629,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(pfn);
struct mem_section *ms;
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc2cf04d9018..78a015fcec3b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3112,6 +3112,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
+ bool inced_nr_rotate_swap = false;
if (swap_flags & ~SWAP_FLAGS_VALID)
return -EINVAL;
@@ -3215,8 +3216,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
- } else
+ } else {
atomic_inc(&nr_rotate_swap);
+ inced_nr_rotate_swap = true;
+ }
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
@@ -3307,6 +3310,8 @@ bad_swap:
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
+ if (inced_nr_rotate_swap)
+ atomic_dec(&nr_rotate_swap);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
inode_unlock(inode);
diff --git a/mm/util.c b/mm/util.c
index 45fc3169e7b0..c2d0a7cdb189 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -621,7 +621,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729cc956..63a5f502da08 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2751,25 +2751,14 @@ static const struct seq_operations vmalloc_op = {
.show = s_show,
};
-static int vmalloc_open(struct inode *inode, struct file *file)
+static int __init proc_vmalloc_init(void)
{
if (IS_ENABLED(CONFIG_NUMA))
- return seq_open_private(file, &vmalloc_op,
- nr_node_ids * sizeof(unsigned int));
+ proc_create_seq_private("vmallocinfo", S_IRUSR, NULL,
+ &vmalloc_op,
+ nr_node_ids * sizeof(unsigned int), NULL);
else
- return seq_open(file, &vmalloc_op);
-}
-
-static const struct file_operations proc_vmalloc_operations = {
- .open = vmalloc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-static int __init proc_vmalloc_init(void)
-{
- proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
+ proc_create_seq("vmallocinfo", S_IRUSR, NULL, &vmalloc_op);
return 0;
}
module_init(proc_vmalloc_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9b697323a88c..9270a4370d54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,7 +1418,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
return ret;
mapping = page_mapping(page);
- migrate_dirty = mapping && mapping->a_ops->migratepage;
+ migrate_dirty = !mapping || mapping->a_ops->migratepage;
unlock_page(page);
if (!migrate_dirty)
return ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 536332e988b8..75eda9c2b260 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
- "nr_indirectly_reclaimable",
+ "", /* nr_indirectly_reclaimable */
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1516,18 +1516,6 @@ static const struct seq_operations fragmentation_op = {
.show = frag_show,
};
-static int fragmentation_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &fragmentation_op);
-}
-
-static const struct file_operations buddyinfo_file_operations = {
- .open = fragmentation_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static const struct seq_operations pagetypeinfo_op = {
.start = frag_start,
.next = frag_next,
@@ -1535,18 +1523,6 @@ static const struct seq_operations pagetypeinfo_op = {
.show = pagetypeinfo_show,
};
-static int pagetypeinfo_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &pagetypeinfo_op);
-}
-
-static const struct file_operations pagetypeinfo_file_operations = {
- .open = pagetypeinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
int zid;
@@ -1663,18 +1639,6 @@ static const struct seq_operations zoneinfo_op = {
.show = zoneinfo_show,
};
-static int zoneinfo_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &zoneinfo_op);
-}
-
-static const struct file_operations zoneinfo_file_operations = {
- .open = zoneinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
enum writeback_stat_item {
NR_DIRTY_THRESHOLD,
NR_DIRTY_BG_THRESHOLD,
@@ -1740,6 +1704,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ /* Skip hidden vmstat items. */
+ if (*vmstat_text[off] == '\0')
+ return 0;
+
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
@@ -1758,18 +1726,6 @@ static const struct seq_operations vmstat_op = {
.stop = vmstat_stop,
.show = vmstat_show,
};
-
-static int vmstat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &vmstat_op);
-}
-
-static const struct file_operations vmstat_file_operations = {
- .open = vmstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
@@ -2016,10 +1972,10 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
- proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
- proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
- proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
- proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
+ proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+ proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+ proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
+ proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index c0bca6153b95..4b366d181f35 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -144,7 +144,8 @@ enum z3fold_page_flags {
PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
NEEDS_COMPACTING,
- PAGE_STALE
+ PAGE_STALE,
+ UNDER_RECLAIM
};
/*****************
@@ -173,6 +174,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
+ clear_bit(UNDER_RECLAIM, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -756,6 +758,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
+ if (test_bit(UNDER_RECLAIM, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
@@ -840,6 +846,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
+ set_bit(UNDER_RECLAIM, &page->private);
+ break;
}
list_del_init(&page->lru);
@@ -887,25 +895,35 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- spin_unlock(&pool->lock);
free_z3fold_page(page);
return 0;
}
- } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
- atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ } else {
+ z3fold_page_lock(zhdr);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return 0;
+ }
+ /*
+ * if we are here, the page is still not completely
+ * free. Take the global pool lock then to be able
+ * to add it back to the lru list
+ */
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
- return 0;
+ z3fold_page_unlock(zhdr);
}
- /*
- * Add to the beginning of LRU.
- * Pool lock has to be kept here to ensure the page has
- * not already been released
- */
- list_add(&page->lru, &pool->lru);
+ /* We started off locked to we need to lock the pool back */
+ spin_lock(&pool->lock);
}
spin_unlock(&pool->lock);
return -EAGAIN;